# Source code for xrd_tools.plugins.refinement_profex

"""An xrd-tools refinement interface plugin that integrates Profex/BGMN."""
import logging
import os
import re
import shutil
import subprocess
from dataclasses import dataclass

import pandas as pd
from uncertainties import UFloat, ufloat

from xrd_tools import refinement_interface_factory, utils
from xrd_tools.refinement import RefinementResult
from xrd_tools.refinement_interface import AppNotInstalledError, RefinementInterface

# Settings for the plugin loader:
#   name under which ``register()`` registers this interface at the factory.
NAME_REFINEMENT_INTERFACE = "profex"

# Settings for the plugin itself:
#   command used to locate and start the refinement app (also shown in logs
#   and error messages)
REFINEMENT_APPLICATION = "profex"
#   'software' argument handed over to the RefinementResult object
SOFTWARE = "BGMN"
#   defaults for the refinement input data file written by
#   ``ProfexInterface.create_input_data`` (filename suffix, header row on/off,
#   column delimiter)
INPUT_DATA_SUFFIX = ""
INPUT_DATA_HEADER = False
INPUT_DATA_DELIMITER = " "
#   marker line preceding the global parameter section in the ``*.lst`` file
HEADER_GLOBAL_PARAMETER = "Global parameters and GOALs"
#   NOTE(review): presumably markers delimiting the per-phase sections of the
#   ``*.lst`` file; they are not referenced in the code visible here -- confirm.
HEADER_PHASES = "Local parameters and GOALs for phase "
FOOTER_PHASES = "Atomic positions for phase "

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


@dataclass
class ProfexInterface(RefinementInterface):
    """Profex/BGMN refinement interface.

    Args:
        measurement_id (str): ID of the measurement to be refined with profex.
        data (pd.Series): Series containing the x/y data of the measurement.
            The index represents the 2θ angle in °.
        dir_refinement (str): Path to the refinement project directory.
        encoding (str): Encoding used in refinement input data file.
        input_data_suffix (str): Suffix appended to ``measurement_id`` to generate
            refinement input- and project- filenames. Since the input data filename
            (without extension) is used as project name by profex, it is recommended
            to provide no suffix for the refinement files.
        input_data_delimiter (str): Delimiter to be used for creation of refinement
            input data file (profex can not read-in comma separated values).
        input_data_header (bool): Option to ignore header for creation of refinement
            input data file.
        name_phases (str): Key under which the list of refined phase names is
            stored in the dictionary parsed from the ``*.dia`` header.
    """

    measurement_id: str
    data: pd.Series
    dir_refinement: str
    encoding: str
    input_data_suffix: str = INPUT_DATA_SUFFIX
    input_data_header: bool = INPUT_DATA_HEADER
    input_data_delimiter: str = INPUT_DATA_DELIMITER
    name_phases: str = "phases"

    def _string_converter(self, value: str) -> float | int | str:
        """Convert a string into an ``int`` or a ``float`` where possible.

        Args:
            value: The string to convert.

        Returns:
            The value as ``int`` if it parses as one, otherwise as ``float`` if
            it parses as one, otherwise the unchanged input.
        """
        # Try the narrowest numeric type first so "3" stays an int.
        for cast in (int, float):
            try:
                return cast(value)
            except ValueError:
                continue
        return value

    def _get_app_path(self) -> str | None:
        """Path to the profex binary, used to check if profex is installed.

        Returns:
            The result of ``shutil.which`` for the refinement application, which
            is ``None`` if the command is not found.
        """
        return shutil.which(REFINEMENT_APPLICATION)

    def _get_project_sibling(self, extension: str) -> str:
        """Path of the refinement project file with its extension replaced.

        Args:
            extension: New extension including the leading dot, e.g. ``".lst"``.

        Returns:
            ``file_refinement_project`` with its extension swapped for ``extension``.
        """
        # Only the extension is exchanged; a plain ``str.replace("dia", ...)``
        # would also rewrite "dia" inside directory names or the measurement ID.
        root, _ = os.path.splitext(self.file_refinement_project)
        return root + extension

    def _get_dia_header(self) -> dict[str, float | int | str | list[str]]:
        """Read the header of the ``*.dia`` file, excluding ``TITLE``.

        The first line of the file is split at single spaces into ``KEY=VALUE``
        tokens. Values of keys starting with ``STRUC`` are collected, in order of
        appearance, as phase names under the key ``name_phases``. All other
        values are converted to ``int``/``float`` where possible.

        Returns:
            Dictionary of the header entries plus the list of phase names.
        """
        with open(self.file_refinement_project, "r") as fobj:
            tokens = fobj.readline().strip("\n").split(" ")
        header = {}
        phases = []
        # The first token is skipped on purpose (it holds the TITLE).
        for token in tokens[1:]:
            key, separator, value = token.partition("=")
            if not separator:
                # Not a KEY=VALUE token (e.g. empty string from a double space).
                continue
            if key.startswith("STRUC"):
                # Phase names stay strings, even if they look numeric.
                phases.append(value)
            else:
                header[key] = self._string_converter(value)
        header[self.name_phases] = phases
        return header

    def _get_refinement_statistics(self) -> dict[str, float]:
        r"""Extract the refinement statistics from the ``*.lst`` file,

        as well as the zero shift (``EPS1``) and sample displacement (``EPS2``)
        from the first line of the ``*.par`` file.

        Calculates goodness of fit (GoF) and chi_sqrd from the statistics as follows:
        $$\left(\frac{Rwp}{Rexp}\right)^2 = GoF^2 = \chi^2$$

        Returns:
            Dictionary with the keys ``Rp``, ``Rpb``, ``R``, ``Rwp``, ``Rexp``,
            ``GoF``, ``chi_sqrd``, ``zero_shift`` and ``sample_discplacement``.

        Raises:
            ValueError: If the R-factors are not found in the ``*.lst`` file or
                ``EPS1``/``EPS2`` are not found in the ``*.par`` file.
        """
        file_lst = self._get_project_sibling(".lst")
        with open(file_lst) as fobj:
            file_contents = fobj.read()
        # Extract Rp, Rpb, R, Rwp, and Rexp (percent values, decimal point escaped)
        names = ("Rp", "Rpb", "R", "Rwp", "Rexp")
        number = r"(\d+(?:\.\d+)?)"
        pattern = r"%\s+".join(f"{name}={number}" for name in names) + "%"
        match = re.search(pattern, file_contents)
        if match is None:
            raise ValueError(f"No refinement statistics found in {file_lst!r}.")
        stats = dict(zip(names, (float(group) for group in match.groups())))
        stats["GoF"] = stats["Rwp"] / stats["Rexp"]
        stats["chi_sqrd"] = stats["GoF"] ** 2

        file_par = self._get_project_sibling(".par")
        with open(file_par, "r") as fobj:
            line = fobj.readline().strip("\n")
        # Look the values up by name so their order in the file does not matter.
        shifts = dict(re.findall(r"(EPS1|EPS2)=(\S+)", line))
        missing = [name for name in ("EPS1", "EPS2") if name not in shifts]
        if missing:
            raise ValueError(
                f"Parameter(s) {', '.join(missing)} not found in {file_par!r}."
            )
        stats["zero_shift"] = float(shifts["EPS1"])
        # The misspelled key is kept: it is consumed as-is by ``get_refinement_result``.
        stats["sample_discplacement"] = float(shifts["EPS2"])
        return stats

    def _get_global_parameters(self) -> dict[str, tuple[float, float]]:
        """Extract the global refinement parameter from the ``*.lst`` file.

        Parsing starts after the line containing ``HEADER_GLOBAL_PARAMETER`` and
        stops at the first blank line. Only the first ``NAME=value+-uncertainty``
        entry of each line is evaluated.

        Returns:
            Dictionary mapping the parameter name to a ``(value, uncertainty)`` tuple.

        Raises:
            ValueError: If the global parameter section is not found.
        """
        file_lst = self._get_project_sibling(".lst")
        pattern = re.compile(r"([A-Z0-9]+)=([\d+\.\d]+)\+-(\d+\.\d+)")
        with open(file_lst) as fobj:
            lines = fobj.readlines()
        # Locate the first line after the section header
        start_line = next(
            (i + 1 for i, line in enumerate(lines) if HEADER_GLOBAL_PARAMETER in line),
            None,
        )
        if start_line is None:
            raise ValueError(
                f"Section {HEADER_GLOBAL_PARAMETER!r} not found in {file_lst!r}."
            )
        results = {}
        for line in lines[start_line:]:
            if not line.strip():
                # A blank line terminates the section.
                break
            match = pattern.search(line)
            if match:
                parameter, value, uncertainty = match.groups()
                results[parameter] = float(value), float(uncertainty)
        return results

    def create_input_data(self) -> None:
        """Create refinement input data for profex.

        Writes ``data`` to ``file_refinement_input`` using the configured
        delimiter, header option and encoding.
        """
        self.data.to_csv(
            self.file_refinement_input,
            sep=self.input_data_delimiter,
            header=self.input_data_header,
            encoding=self.encoding,
        )
        logger.debug(
            f"Created input data for profex refinement of measurement {self.measurement_id}."
        )

    def get_cif_files(self) -> dict[str, str]:
        """Get a dictionary with the name and the path to the cif files of the refined phases.

        Returns:
            Dictionary mapping each phase name to the path of a ``*.cif`` file
            in ``dir_refinement`` whose name starts with ``measurement_id``.

        Raises:
            ValueError: If the number of CIF files differs from the number of
                refined phases.
        """
        phases = self._get_dia_header()[self.name_phases]
        # ``os.listdir`` returns entries in arbitrary order; sort for a
        # reproducible result. NOTE: phases and files are paired by position only.
        cif_files = sorted(
            os.path.join(self.dir_refinement, file)
            for file in os.listdir(self.dir_refinement)
            if file.startswith(self.measurement_id) and file.endswith(".cif")
        )
        if len(phases) != len(cif_files):
            raise ValueError(
                f"Amount of CIF files ({len(cif_files)}) not matching to amount of refined phases ({len(phases)})."
            )
        return dict(zip(phases, cif_files))

    def get_refined_data(self, i_calc: str, i_bg: str) -> pd.DataFrame:
        """Get a pandas DataFrame containing the refined data series.

        Args:
            i_calc (str): Column name for the calculated intensities.
            i_bg (str): Column name for the background intensities.

        Returns:
            pd.DataFrame: A DataFrame with index set to 2θ and an index name
            as found in the provided data. The columns correspond to:
            - I_calc with column name provided as argument,
            - I_bg with column name provided as argument, and
            - a further column for each phase refined, named with its name as defined
              in profex.
        """
        # Check for the project file before it is opened for the first time.
        # NOTE(review): presumably raises if the file is missing -- confirm in utils.
        utils.ensure_file_exists(self.file_refinement_project)
        # Extract the refined structures from the header of the dia file
        phases = self._get_dia_header()[self.name_phases]
        columns = [self.data.name, i_calc, i_bg, *phases]
        # Read remaining data from dia file, set index and add columns
        df = pd.read_csv(
            self.file_refinement_project,
            sep=" ",
            skiprows=1,
            header=None,
            skipinitialspace=True,
            index_col=0,
            names=columns,
        )
        df.index.name = self.data.index.name
        # The column named like the provided data is not part of the result.
        return df.drop([self.data.name], axis=1)

    def get_refinement_result(self) -> RefinementResult:
        """Get a RefinementResults object for the refinement of the measurement.

        GoF and chi_sqrd are rounded to two decimals; ``version`` is passed as
        ``None``.
        """
        stats = self._get_refinement_statistics()
        return RefinementResult(
            r_wp=stats["Rwp"],
            r_exp=stats["Rexp"],
            gof=round(stats["GoF"], 2),
            chi_sqrd=round(stats["chi_sqrd"], 2),
            zero_shift=stats["zero_shift"],
            sample_discplacement=stats["sample_discplacement"],
            composition=self.get_composition(),
            software=SOFTWARE,
            version=None,
        )

    # TODO: Ensure the mass fractions are assigned to the correct phases!
    def get_composition(self, mass_frac_prefix: str = "Q") -> dict[str, UFloat]:
        """Get the composition of the refined sample.

        Args:
            mass_frac_prefix: Prefix identifying the mass fraction entries among
                the global parameters.

        Returns:
            Dictionary mapping each phase name to its mass fraction including
            uncertainty. Phases and mass fractions are paired by position.
        """
        mass_fracs = [
            ufloat(value, uncertainty)
            for name, (value, uncertainty) in self._get_global_parameters().items()
            if name.startswith(mass_frac_prefix)
        ]
        phases = self.get_phases()
        if len(phases) != len(mass_fracs):
            # ``zip`` truncates to the shorter sequence; make that visible.
            logger.warning(
                f"Amount of mass fractions ({len(mass_fracs)}) not matching to amount of refined phases ({len(phases)})."
            )
        return dict(zip(phases, mass_fracs))

    def get_phases(self) -> list[str]:
        """Get a list with phases determined in the refined sample."""
        return list(self._get_dia_header()[self.name_phases])

    @property
    def file_refinement_input(self) -> str:
        """Path to ``*.xy`` refinement input data file.

        It is constructed as follows:
        ``<dir_refinement>/<measurement_id><input_data_suffix>.xy``
        """
        return os.path.join(
            self.dir_refinement,
            self.measurement_id + self.input_data_suffix + ".xy",
        )

    @property
    def file_refinement_project(self) -> str:
        """Path to ``*.dia`` refinement project file.

        It is constructed as follows:
        ``<dir_refinement>/<measurement_id><input_data_suffix>.dia``
        """
        return os.path.join(
            self.dir_refinement,
            self.measurement_id + self.input_data_suffix + ".dia",
        )

    def open_refinement(self) -> None:
        """Open the refinement project with profex.

        The ``*.dia`` project file is opened if it exists, otherwise the
        ``*.xy`` input data file. The call blocks until the process exits.

        Raises:
            AppNotInstalledError: If profex is not installed on the machine.
        """
        if self._get_app_path() is None:
            raise AppNotInstalledError(
                f"Refinement application {REFINEMENT_APPLICATION!r} not installed."
            )
        # Get refinement project or input-data file
        if os.path.isfile(self.file_refinement_project):
            refinement_file = self.file_refinement_project
        else:
            refinement_file = self.file_refinement_input
        # Open refinement file with profex
        logger.debug(
            f"Refining measurement {self.measurement_id!r} with {REFINEMENT_APPLICATION!r}..."
        )
        completed = subprocess.run(
            [REFINEMENT_APPLICATION, refinement_file],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if completed.returncode != 0:
            # Do not raise (callers may rely on best-effort), but leave a trace.
            logger.warning(
                f"{REFINEMENT_APPLICATION!r} exited with return code {completed.returncode}."
            )


def register() -> None:
    """Register the module as *xrd-tools* refinement interface plugin at its factory.

    The interface class ``ProfexInterface`` is registered under the name
    ``NAME_REFINEMENT_INTERFACE``.
    """
    refinement_interface_factory.register(NAME_REFINEMENT_INTERFACE, ProfexInterface)