Source code for WaterNetworkAnalysis.WaterNetworkAnalysis

from __future__ import annotations

import os
import stat

import MDAnalysis
import MDAnalysis as mda
import MDAnalysis.analysis
import MDAnalysis.analysis.align
import MDAnalysis.core.topologyattrs
import numpy as np
from ConservedWaterSearch.utils import (
    get_orientations_from_positions,
    read_results,
)
from MDAnalysis.analysis.density import Density, DensityAnalysis
from MDAnalysis.topology.guessers import guess_types
from scipy.ndimage import gaussian_filter


def generate_water_selection_string():
    """Returns a selection string for selecting water molecules.

    This function generates a string that can be used in MDAnalysis's
    select_atoms method to select water molecules from a molecular dynamics
    simulation. The selection string is based on common residue names used
    for water molecules across different simulation packages and water models.

    Returns:
        str: Selection command in form of a string, which can be used with
             MDAnalysis's select_atoms method.

    Example::

        # Generate water selection string
        water_selection_string = generate_water_selection_string()

        # Use the selection string to select water molecules
        u = mda.Universe('topology_file.top', 'trajectory_file.trr')
        water_molecules = u.select_atoms(water_selection_string)
    """
    # List of common residue names for water
    water_resnames = [
        "SOL",
        "WAT",
        "H2O",
        "TIP3",
        "TIP4",
        "TIP5",
        "SPC",
        "SPCE",
        "HOH",
        "DOD",
        "T3P",
        "T4P",
    ]

    # Creating the selection string
    return " or ".join(f"resname {resname}" for resname in water_resnames)



[docs]
def get_selection_string_from_resnums(
    resids: list[int], selection_type: str = "MDA"
) -> str:
    """Return selection string for given residue ids.

    Returns the selection command string for different programs based on
    amioacid residue IDs list given.

    Args:
        resids (list[int]): list of aminoacid residue ids.
        selection_type (str, optional): selection program language.
            Options:"MDA", "PYMOL", "PROBIS" and "NGLVIEW" Defaults to
            "MDA".

    Returns:
        str: selection command in form of a string

    Example::

        # list of resids
        resids = [8,12,143,144] # print PYMOL selection string
        get_selection_string_from_resnums(resids, selection_type =
        "PYMOL")
    """
    selection: str = ""
    for i in resids:
        if selection_type == "MDA":
            selection += "resnum " + str(i) + " or "
        if selection_type == "PYMOL":
            selection += str(i) + "+"
        if selection_type == "PROBIS":
            selection += str(i) + ","
        if selection_type == "NGLVIEW":
            selection += str(i) + " or "
    if selection_type in ("MDA", "NGLVIEW"):
        selection = selection[: len(selection) - 3]
    if selection_type == "PYMOL":
        selection = selection[: len(selection) - 1]
        selection = "resi " + selection
    if selection_type == "PROBIS":
        selection = selection[: len(selection) - 1]
        selection = (
            ' -motif1 "[:A and ('
            + selection
            + ')]" -motif2 "[:A and ('
            + selection
            + ')]" '
        )
    return selection




[docs]
def get_center_of_selection(
    selection: str, trajectory: str, topology: str | None = None
) -> np.ndarray:
    """Compute centre of selection with MDAnalysis.

    Calculates coordinates in xyz of the centre of selection using
    MDAnalysis.

    Args:
        selection (str): selection string for MDAnalysis
        trajectory (str): trajectory filename - anything that is
            accepted by MDAnalysis should work
        topology (str | None, optional): topology filename. Defaults to
            None.

    Returns:
        np.ndarray:
            returns array that contains coordinates of center of selection

    Example::

        # find center of active site defined by residue ids
        resids = [8,12,143,144]
        get_center_of_selection(get_selection_string_from_resnums(resids))
    """
    if topology:
        u: mda.Universe = mda.Universe(topology, trajectory)
    else:
        u = mda.Universe(trajectory)
    s: mda.AtomGroup = u.select_atoms(selection)
    return s.center(None)




[docs]
def calculate_oxygen_density_map(
    selection_center: np.ndarray,
    trajectory: str,
    topology: str | None = None,
    dist: float = 12.0,
    delta: float = 0.4,
    every: int = 1,
    SOL: str | None = None,
    OW: str | None = None,
    output_name: str = "water.dx",
) -> Density:
    """Generate oxygen density maps.

    Generate oxygen density maps using MDAnalysis.

    Args:
        selection_center (np.ndarray): center of selection
            around which waters will be selected.
        trajectory (str): trajectory filename.
        topology (str | None, optional): Topology filename if available.
            Defaults to None.
        dist (float, optional): distance around selection center inside
            which the oxygen will be selected. Defaults to 12.0.
        delta (float, optional): bin size for density map. Defaults to
            0.4 Angstroms.
        every (int, optional): Take every ``n_every`` snapshot instead
            of taking all the snapshots (every = 1) for alignment.
            Defaults to 1.
        SOL (str, optional): Residue name of the water residue. If ``None`` it
            will be determined automatically. Defaults to None.
        OW (str, optional): Name of the oxygen atom. If ``None`` it will
            be determined automatically. Defaults to None.
        output_name (str, optional): name of the output file, it should
            end with '.dx' . Defaults to "water.dx".

    Returns:
        Density:
            returns MDA Density object containing the density map

    Example::

        # Generate water oxygen density map near active site
        resids = [8,12,143,144] calculate_oxygen_density_map(
            get_center_of_selection(
                get_selection_string_from_resnums(resids)), trajectory =
                'trajectory.pdb'
            )
        )
    """
    if topology:
        u: mda.Universe = mda.Universe(topology, trajectory)
    else:
        u = mda.Universe(trajectory)
    if SOL is None:
        water_selection = generate_water_selection_string()
    else:
        water_selection = "resname " + SOL
    if OW is None:
        oxygen_selection = "element O"
        guessed_and_read_props = [
            type(i)
            for i in list(u._topology.read_attributes)
            + list(u._topology.guessed_attributes)
        ]
        if MDAnalysis.core.topologyattrs.Elements not in guessed_and_read_props:
            u.add_TopologyAttr("elements", guess_types(u.atoms.names))
    else:
        oxygen_selection = "name " + OW
    ss: mda.AtomGroup = u.select_atoms(
        (
            f"{oxygen_selection} and {water_selection} and point "
            f"{selection_center[0]} {selection_center[1]} {selection_center[2]} "
            f"{dist}"
        ),
        updating=True,
    )
    D: DensityAnalysis = DensityAnalysis(
        ss,
        delta=delta,
        gridcenter=selection_center,
        xdim=dist * 2,
        ydim=dist * 2,
        zdim=dist * 2,
    )
    D.run(step=every)
    # oxygen vdw radius
    vdw_radius = 1.52
    sigma = vdw_radius / delta
    probability_density = D.results.density.grid / D.results.density.grid.sum()
    smoothed_density = gaussian_filter(probability_density, sigma)
    smoothed_density = smoothed_density / np.max(smoothed_density)
    # Put smoothed density into a Density object
    smoothed_density_obj = Density(
        grid=smoothed_density,
        edges=D._edges,
        parameters={"isDensity": True},
    )
    smoothed_density_obj.export(output_name, type="double")
    return smoothed_density_obj




[docs]
def make_results_pdb_MDA(
    water_type: list[str],
    waterO: np.ndarray,
    waterH1: np.ndarray,
    waterH2: np.ndarray,
    output_fname: str,
    protein_file: str | None = None,
    ligand_name: str | None = None,
    mode: str = "SOL",
) -> None:
    """Generate pdb file with clustering results.

    The water molecules determined by the clustering procedure are
    written in a pdb file which also contains protein and the ligand.
    Waters are labeled based on their hydrogen orientations (FCW for
    fully conserved, HCW for half conserved and WCW for weakly
    conserved). Uses MDAnalysis for construction of the pdb file.
    First 4 arguments of the function can be read from the results file
    by using cws.utils.read_results() or directly from the
    ``cws.water_clustering.WaterClustering`` class.

    Args:
        water_type (list[str]): List of water types.
        waterO (np.ndarray): numpy array containing coordinates
            of conserved waters' oxygens.
        waterH1 (np.ndarray): numpy array containing coordinates
            of conserved waters' first hydrogen
        waterH2 (np.ndarray): numpy array containing coordinates
            of conserved waters' second hydrogen
        output_fname (str): name of the output pdb file. Must end in '.pdb'.
        protein_file (str | None, optional): file which contains protein
            and ligand. It should be aligned in the same way as the
            trajectory used for calculation of conserved waters. If None no
            protein is saved. Defaults to None.
        ligand_name (str | None, optional): residue name of the ligand.
            If None no ligand is saved. Defaults to None.
        mode (str, optional): mode in which conserved waters will be
            saved. Options:

                - "SOL" - default mode. Saves water molecules as SOL so that
                  visualisation softwares can recognise them as waters. No
                  distinction is made between different types of conserved
                  waters.
                - "cathegorise" - cathegorises the waters according to
                  hydrogen orienation into fully conserved (FCW),
                  half-coserved (HCW) and weakly conserved (WCW). This mode
                  makes visualisers not able to recognise the waters as
                  water/sol but usefull for interpreting results.


    Example::

        # Generate pdb results file
        make_results_pdb_MDA(
            *cws.utils.read_results(),
            output_fname = 'results.pdb',
            mode = 'cathegorise'
        )
    """
    rescntr: int = 2
    if protein_file is not None:
        us: mda.Universe = mda.Universe(protein_file)
        protein: mda.AtomGroup = us.select_atoms("protein")
        # TODO for multiple confs
        ligand: list = []
        if ligand_name:
            ligand.append(us.select_atoms("resname " + str(ligand_name)))
            # ne radi?
            # if (ligand[0]==None):
            #    print ("no ligand with given name "+str(ligand_name))
        rescntr += len(ligand) + len(protein.residues)
    # waters
    n_residues: int = len(water_type)
    n_atoms: int = n_residues * 3
    # create resindex list
    resindices: np.ndarray = np.repeat(range(n_residues), 3)
    # all water molecules belong to 1 segment
    segindices: list[int] = [0] * n_residues
    waters = mda.Universe.empty(
        n_atoms,
        n_residues=n_residues,
        atom_resindex=resindices,
        residue_segindex=segindices,
        trajectory=True,
    )  # necessary for adding coordintes
    waters.add_TopologyAttr("name", ["O", "H1", "H2"] * n_residues)
    waters.add_TopologyAttr("type", ["O", "H", "H"] * n_residues)
    # waters.add_TopologyAttr('segid', ['SOL'])
    coordinates: list[list[float]] = []
    rns: list[str] = []
    resid: list[int] = []
    looper = zip(waterO, waterH1, waterH2, water_type, strict=False)
    for Opos, H1pos, H2pos, tip in looper:
        h2o = np.array(
            [
                [Opos[0], Opos[1], Opos[2]],  # oxygen
                [H1pos[0], H1pos[1], H1pos[2]],  # hydrogen
                [H2pos[0], H2pos[1], H2pos[2]],  # hydrogen
            ]
        )
        coordinates.extend(h2o)
        if mode == "cathegorise":
            if tip == "FCW":
                rns.append("FCW")
            if tip == "HCW":
                rns.append("HCW")
            if tip == "WCW":
                rns.append("WCW")
            if tip == "onlyO":
                rns.append("onlyO")
        elif mode == "SOL":
            rns.append("SOL")
        resid.append(rescntr)
        rescntr += 1
    waters.add_TopologyAttr("resname", rns)
    waters.add_TopologyAttr("resid", resid)
    coord_array = np.array(coordinates)
    waters.atoms.positions = coord_array
    bonds: list = []
    for o in range(0, n_atoms, 3):
        bonds.extend([(o, o + 1), (o, o + 2)])
    waters.add_TopologyAttr("bonds", bonds)
    if protein_file is not None:
        newU = mda.Merge(
            protein.atoms,
            *ligand,
            waters.atoms,
        )
    else:
        newU = mda.Merge(waters.atoms)
    newU.dimensions = 3
    if output_fname is None:
        if mode == "SOL":
            output_fname = "final_results_SOL.pdb"
        if mode == "cathegorise":
            output_fname = "final_results_cathegorise.pdb"
    newU.atoms.write(output_fname)




[docs]
def extract_waters_from_trajectory(
    selection_center: np.ndarray,
    trajectory: str,
    topology: str | None = None,
    dist: float = 12.0,
    every: int = 1,
    SOL: str | None = None,
    OW: str | None = None,
    HW: str | None = None,
    extract_only_O: bool = False,
    save_file: str | None = None,
) -> np.ndarray | tuple[np.ndarray, np.ndarray]:
    """Extract waters for clustering analysis.

    Calculates water (oxygen and hydrogen) coordinates for all the
    waters in the aligned trajectory using MDAnalysis for further use in water
    clustering. The trajectory should be aligned previously.

    Args:
        selection_center (np.ndarray): coordinates of selection
            center around which waters will be selected.
        trajectory (str): Trajectory file name.
        topology (str | None, optional): Topology file name. Defaults to None.
        dist (float, optional): Distance around the center of selection
            inside which water molecules will be sampled. Defaults to 12.0.
        every (int, optional): Take every ``every`` snapshot instead of
            taking all the snapshots (every = 1) for alignment. Defaults
            to 1.
        SOL (str, optional): Residue name of the water residue. If ``None`` it
            will be determined automatically. Defaults to None.
        OW (str, optional): Name of the oxygen atom. If ``None`` it will
            be determined automatically. Defaults to None.
        HW (str, optional): Name of the hydrogen atom in water. Names
            checked will be the provided name and the name with a 1 or 2
            appended. If ``None`` it will be determined automatically.
            Defaults to None.
        extract_only_O (bool, optional): If ``True`` only oxygen atom
            positions. Defaults to False.
        save_file (str | None, optional): File to which coordinates will
            be saved. If none doesn't save to a file. Defaults to None.

    Returns:
        np.ndarray or tuple[np.ndarray, np.ndarray]:
            returns xyz numpy arrays that contain coordinates of oxygens,
            and combined array of hydrogen 1 and hydrogen 2 coordinates.
            If ``extract_only_O`` is True, returns only oxygen coordinates.

    Example::

        # Generate water coordinates for clustering analysis
        resids = [8,12,143,144]
        coordO, coordH = extract_waters_from_trajectory(
            get_center_of_selection(get_selection_string_from_resnums(resids)),
            trajectory = 'trajectory.xtc',
            topology = 'topology.tpr'
        )
    """
    if topology:
        u = mda.Universe(topology, trajectory)
    else:
        u = mda.Universe(trajectory)
    if SOL is None:
        water_selection = generate_water_selection_string()
    else:
        water_selection = "resname " + SOL
    if OW is None:
        oxygen_selection = "element O"
    else:
        oxygen_selection = "name " + OW
    if HW is None:
        hydrogen_selection = "element H"
    else:
        hydrogen_selection = "name " + HW + " or name " + HW + "1 or name " + HW + "2"
    if (HW is None and extract_only_O is False) or OW is None:
        guessed_and_read_props = [
            type(i)
            for i in list(u._topology.read_attributes)
            + list(u._topology.guessed_attributes)
        ]
        if MDAnalysis.core.topologyattrs.Elements not in guessed_and_read_props:
            u.add_TopologyAttr("elements", guess_types(u.atoms.names))
    if extract_only_O is False:
        coordsH = []
    coordsO = []
    mismatch_detected = False
    if is_pdb_file(trajectory):
        # Detect if any frame has a different number of atoms
        expected_n_atoms = len(u.atoms)
        try:
            for ts in u.trajectory:
                if ts.n_atoms != expected_n_atoms:
                    mismatch_detected = True
                    break
        except ValueError:
            mismatch_detected = True
    if mismatch_detected:
        model_counter = 0
        inside_model = False
        buffer_lines = []

        with open(trajectory) as f:
            for line in f:
                # Start of a new MODEL block
                if line.startswith("MODEL"):
                    if model_counter % every == 0:
                        inside_model = True
                        model_counter += 1
                        buffer_lines = [line]
                    continue

                # End of that MODEL block
                if inside_model and line.startswith("ENDMDL"):
                    buffer_lines.append(line)
                    # We have collected a full MODEL … ENDMDL block in buffer_lines

                    # Write it to a temp PDB file
                    frame_name = f"_temp_frame_{model_counter:03d}.pdb"
                    with open(frame_name, "w") as tmpf:
                        tmpf.writelines(buffer_lines)
                    tmpf.close()

                    # Load and process that single frame PDB
                    u_frame = mda.Universe(frame_name)
                    oxygens = u_frame.select_atoms(
                        f"{oxygen_selection} and {water_selection} and point "
                        f"{selection_center[0]} {selection_center[1]} "
                        f"{selection_center[2]} {dist}"
                    )
                    for oxygen in oxygens:
                        coordsO.append(oxygen.position.copy())
                        if not extract_only_O:
                            hydrogens = u_frame.select_atoms(
                                f"resid {oxygen.resid} and "
                                f"({hydrogen_selection}) and "
                                f"({water_selection})"
                            )
                            if len(hydrogens) != 2:
                                msg = (
                                    f"Water {oxygen.resid} has too many hydrogens "
                                    f"({len(hydrogens)}) in frame {model_counter}."
                                )
                                os.remove(frame_name)
                                raise Exception(msg)
                            for hpos in hydrogens.positions:
                                coordsH.append(hpos.copy())

                    # Delete the temp file before reading the next model
                    os.remove(frame_name)

                    # Reset for next MODEL block
                    inside_model = False
                    buffer_lines = []
                    continue

                # If we are inside a MODEL…ENDMDL block, keep adding lines
                if inside_model:
                    buffer_lines.append(line)

    else:
        # loop over
        for _ in u.trajectory[::every]:
            oxygens = u.select_atoms(
                (
                    f"{oxygen_selection} and {water_selection} and point "
                    f"{selection_center[0]} {selection_center[1]} {selection_center[2]}"
                    f" {dist}"
                ),
            )
            for oxygen in oxygens:
                coordsO.append(oxygen.position)
                if extract_only_O is False:
                    hydrogens = u.select_atoms(
                        f"resid {oxygen.resid} and "
                        f"({hydrogen_selection}) and "
                        f"({water_selection})"
                    )
                    if len(hydrogens) != 2:
                        exception_string: str = (
                            f"Water {oxygen.resid} has too many"
                            f" hydrogens ({len(hydrogens)})."
                        )
                        raise Exception(exception_string)
                    for hydrogen_positions in hydrogens.positions:
                        coordsH.append(hydrogen_positions)
    Odata: np.ndarray = np.asarray(coordsO)
    if extract_only_O is False:
        coordsH = np.asarray(coordsH)
        Opos, H1, H2 = get_orientations_from_positions(Odata, coordsH)
        # SAVEs full XYZ coordinates, not O coordinates and h orientations!!!!!
        if save_file is not None:
            np.savetxt(save_file, np.c_[Opos, H1, H2])
        return Odata, coordsH
    if save_file is not None:
        np.savetxt(save_file, Odata)
    return Odata



def __align_mda(
    unaligned_trj_file: str,
    pdb_to_align_to: str,
    output_trj_file: str,
    topology: str | None = None,
    align_selection: str = "protein",
) -> None:
    """Alignment via MDAnalysis.

    This function is not ment to be used directly. Use align_trajectory instead.

    Args:
        unaligned_trj_file (str): trajectory file containing unaligned
            trajectory.
        pdb_to_align_to (str): pdb file that containes the reference
            state to align to.
        output_trj_file (str): output file trajectory will be saved to.
        topology (str | None, optional): File containing topology.
            Defaults to None.
        align_selection (str, optional): selection to align to. Defaults
            to "protein".
    """
    if topology:
        mob: mda.Universe = mda.Universe(topology, unaligned_trj_file)
    else:
        mob = mda.Universe(unaligned_trj_file)
    ref: mda.Universe = mda.Universe(pdb_to_align_to)
    mda.analysis.align.AlignTraj(
        mob, ref, select=align_selection, filename=output_trj_file
    ).run()


def __align_probis(
    unaligned_trj_file: str,
    pdb_to_align_to: str,
    output_trj_file: str,
    probis_exec: str,
    topology: str | None = None,
    align_selection: str = "protein",
) -> None:
    """Alignment function that uses probis.

    This function is not ment to be used directly. Use align_trajectory
    instead.Uses probis algorithm for alignment. See
    `this link <http://insilab.org/probis-algorithm/>`__ for more info.

    Args:
        unaligned_trj_file (str): trajectory file containing unaligned
            trajectory.
        pdb_to_align_to (str): pdb file that containes the reference
            state to align to.
        output_trj_file (str): output file trajectory will be saved to.
        probis_exec (str | None, optional): Executable to run probis
            algorithm.
        topology (str | None, optional): File containing topology.
            Defaults to None.
        align_selection (str, optional): selection to align to. Defaults
            to "protein".
    """
    if probis_exec is None:
        msg = (
            "probis_exec must be provided or probis must be installed in the "
            "current working directory."
        )
        raise Exception(msg)
    st: os.stat_result = os.stat(probis_exec)
    os.chmod(probis_exec, st.st_mode | stat.S_IEXEC)
    if align_selection == "protein":
        sele: str = ""
    else:
        sele = align_selection
    if topology:
        u: mda.Universe = mda.Universe(topology, unaligned_trj_file)
    else:
        u = mda.Universe(unaligned_trj_file)
    mobile_pdb: str = "mobile.pdb"
    u.select_atoms(align_selection).segments.segids = "A"
    u.add_TopologyAttr("chainIDs")
    u.select_atoms(align_selection).chainIDs = "A"
    forwrite = u.atoms
    with mda.Writer(output_trj_file, multiframe=True) as W:
        for snap in u.trajectory:
            with mda.Writer(mobile_pdb) as mw:
                mw.write(forwrite)
            # string to run
            runstring: str = (
                probis_exec
                + " -compare -super "
                + sele
                + " -f1 "
                + str(pdb_to_align_to)
                + " -c1 A -f2 "
                + str(mobile_pdb)
                + " -c2 A"
            )
            if probis_exec == "probis":
                runstring = "./" + runstring
            # run probis
            print(runstring)
            os.system(runstring)
            # read in rota.pdb files and remove all other files
            newal: mda.Universe = mda.Universe(
                pdb_to_align_to[:4] + "A_" + mobile_pdb[:4] + "A.0.rota.pdb"
            )
            # remove aligned pdb files
            removestring: str = "rm *.rota.pdb probis*.tmp " + mobile_pdb
            os.system(removestring)
            # add cell dimensions and other missing info?
            newal.dimensions = snap.dimensions
            # append snapshot to new trajectory
            selnew = newal.atoms
            W.write(selnew)



[docs]
def align_trajectory(
    trajectory: str,
    output_trj_file: str,
    align_target_file_name: str,
    topology: str | None = None,
    every: int = 1,
    align_mode: str = "mda",
    align_target: int | None = -1,
    align_selection: str = "protein",
    probis_exec: str | None = None,
) -> None:
    """Align the trajectory.

    Before running water clustering for identification of conserved
    water molecules the trajectory should be aligned first. Alignment
    can be done via MDAnalysis or using the probis algorithm. Whole
    protein is aligned by default. To select the align reference state
    either select an integer for ``align_target`` and specify a file name to
    which the align target will be saved to with ``align_target_file_name``
    OR set align_target to ``None`` and ``align_target_file_name`` will be read
    and used as align target.

    The trajectory or topology should contain information on bond topology
    for alignment. Supported topology file types:

        DATA
        DMS
        GSD
        MMTF
        MOL2
        PARMED
        PDB
        ENT
        PSF
        TOP
        PRMTOP
        PARM7
        TPR
        TXYZ
        ARC
        XML
        XPDB

    Alternatively the whole trajectory can be provided in some of the above
    given file types as well.

    Args:
        trajectory (str): File name containing unaligned trajectory.
        output_trj_file (str): output file name for aligned trajectory.
        align_target_file_name (str): File name for saving the align
            target (usually pdb) if ``align_target`` is int. If align target is
            ``None``, the align target will be read from this file instead.
        topology (str | None, optional): Topology file name. Defaults to
            ``None``.
        every (int, optional): Take every ``every`` snapshot instead
            of taking all the snapshots (every = 1) for alignment. Defaults
            to 1.
        align_mode (str, optional): Align algorithm to use. "mda"
            uses MDAnalysis while "probis" uses the probis algorithm.
            Defaults to "mda".
        align_target (int | None, optional): Align target. If ``None`` the
            align target is read from the ``align_target_file_name``. If a
            number is given uses the given snapshot of the trajectory as
            the align target. If -1 uses the last snapshot. Defaults to
            -1.
        align_selection (str, optional): Selection to align to. Defaults
            to "protein".
        probis_exec (str | None, optional): location of probis
            executable if probis is used. Must be provided if ``align_mode`` is
            "probis".

    Example::

        # align the trajectory and save to a file
        align_trajectory(
            trajectory="trajectory.xtc",
            output_trj_file="aligned_trajectory.xtc",
            align_target_file_name='aligned.pdb', align_mode="mda",
            align_target=0, align_selection="protein",
            topology="topology.tpr",
        )
    """
    # check that output_trj_file name is not the same as trajectory, topology or
    # aligntarget
    if output_trj_file in {trajectory, topology, align_target_file_name}:
        exception_string = (
            "output_trj_file name cannot be the same as "
            "trajectory, topology or align_target_file_name"
        )
        raise Exception(exception_string)
    if align_target_file_name in {trajectory, topology}:
        exception_string = (
            "align_target_file_name name cannot be the same as " "trajectory or topology"
        )
        raise Exception(exception_string)
    if topology is not None:
        mob = mda.Universe(topology, trajectory)
        ref: mda.Universe = mda.Universe(topology, trajectory)
    else:
        mob: mda.Universe = mda.Universe(trajectory)
        ref = mda.Universe(trajectory)
    # center the box to center of mass
    if topology is None:
        if not (
            trajectory.upper().endswith(
                (
                    "DATA",
                    "DMS",
                    "GSD",
                    "MMTF",
                    "MOL2",
                    "PARMED",
                    "PDB",
                    "ENT",
                    "PSF",
                    "TOP",
                    "PRMTOP",
                    "PARM7",
                    "TPR",
                    "TXYZ",
                    "ARC",
                    "XML",
                    "XPDB",
                    "PDB",
                )
            )
        ):
            exception_string: str = (
                "unsupported trajectory file format. bond "
                "topology information is needed for alignment. Consider "
                "providing a topology file."
            )
    elif isinstance(topology, str):
        if not (
            topology.upper().endswith(
                (
                    "DATA",
                    "DMS",
                    "GSD",
                    "MMTF",
                    "MOL2",
                    "PARMED",
                    "PDB",
                    "ENT",
                    "PSF",
                    "TOP",
                    "PRMTOP",
                    "PARM7",
                    "TPR",
                    "TXYZ",
                    "ARC",
                    "XML",
                    "XPDB",
                    "PDB",
                )
            )
        ):
            exception_string: str = (
                "unsupported topology file type. Bond information is "
                "needed for alignment."
            )
            raise Exception(exception_string)
    ref.select_atoms(align_selection).segments.segids = "A"
    ref.add_TopologyAttr("chainIDs")
    ref.select_atoms(align_selection).chainIDs = "A"
    if isinstance(align_target, int):
        wr = ref.atoms
        wr.write(align_target_file_name, frames=ref.trajectory[[align_target]])
    if every > 1:
        unaligned_trj_file: str = (
            trajectory[: trajectory.rfind(".") - 1]
            + f"every_{every}"
            + trajectory[trajectory.rfind(".") :]
        )
        with mda.Writer(unaligned_trj_file, multiframe=True) as W:
            for _ in mob.trajectory[::every]:
                W.write(mob.atoms)
    elif every == 1:
        unaligned_trj_file: str = trajectory
    else:
        exception_string: str = "every must be a positive integer"
        raise Exception(exception_string)
    if align_mode == "probis":
        __align_probis(
            unaligned_trj_file=unaligned_trj_file,
            pdb_to_align_to=align_target_file_name,
            output_trj_file=output_trj_file,
            probis_exec=probis_exec,
            topology=topology,
            align_selection=align_selection,
        )
    elif align_mode == "mda":
        __align_mda(
            unaligned_trj_file=unaligned_trj_file,
            pdb_to_align_to=align_target_file_name,
            output_trj_file=output_trj_file,
            topology=topology,
            align_selection=align_selection,
        )
    else:
        exception_string: str = "wrong alignemnt mode, must be probis or mda"
        raise Exception(exception_string)




[docs]
def align_and_extract_waters(
    center_for_water_selection: np.ndarray,
    trajectory: str,
    aligned_trajectory_filename: str,
    align_target_file_name: str,
    topology: str | None = None,
    every: int = 1,
    align_mode: str = "mda",
    align_target: int | None = -1,
    align_selection: str = "protein",
    probis_exec: str | None = None,
    dist: float = 12.0,
    SOL: str = "SOL",
    OW: str = "OW",
    HW: str = "HW",
) -> tuple[np.ndarray]:
    """Align and extracts waters from trajectory.

    Aligns the trajectory first and then extracts water molecules for
    further water clustering analysis. If trajectory has already been
    aligned, one can use :py:meth:`extract_waters_from_trajectory` to
    extract the water molecules for water clustering analysis.

    Args:
        center_for_water_selection (np.ndarray): Coordiantes
            around which all water molecules inside a radius ``dist``
            will be seleceted for water clustering analysis.
        trajectory (str): File name of the trajectory from which waters
            will be extracted.
        aligned_trajectory_filename (str): File name to which aligned
            trajectory will be saved.
        align_target_file_name (str): File name for saving the align
            target (usually pdb) if ``align_target`` is ``int``. If
            align target is ``None``, the align target will be read
            from this file instead!
        topology (str | None, optional): Topology file name. Defaults
            to ``None``.
        every (int, optional): Take every ``every`` snapshot instead of
            taking all the snapshots (every = 1) for alignment.
            Defaults to 1.
        align_mode (str, optional): Align algorithm to use. "mda" uses
            MDAnalysis while "probis" uses the probis algorithm.
            Defaults to "mda".
        align_target (int | None, optional): Align target. If ``None``
            the align target is read from the align_target_file_name.
            If a number is given uses the given snapshot of the
            trajectory as the align target. If -1 uses the last
            snapshot. Defaults to -1.
        align_selection (str, optional): Selection to align to. Defaults
            to "protein".
        probis_exec (str | None, optional): location of probis
            executable if probis is used. If None it is downloaded
            from the internet. Defaults to None.
        dist (float, optional): Radius around
            ``center_for_water_selection`` to be used for extraction of
            water molecules. Defaults to 12.0.
        SOL (str, optional): Residue name for waters. Defaults to "SOL".
        OW (str, optional): Name of the oxygen atom. Defaults to "OW".
        HW (str, optional): Name of the hydrogen atom. Defaults to "HW".

    Returns:
        tuple[np.ndarray, np.ndarray]:
            Returns coordinates of oxygen atoms, first
            hydrogen atom and second hydrogen atom in three seperate numpy
            arrays. Each row in each array makes up coordinates of a single
            water molecule.

    Example::

        # Generate water coordinates for clustering analysis from unaligned trajectory
        resids = [8,12,143,144]
        align_and_extract_waters(
            get_center_of_selection(get_selection_string_from_resnums(resids)),
            trajectory = 'trajectory.xtc',
            aligned_trajectory_filename = 'aligned_trj.xtc',
            align_target_file_name = 'aligned.pdb',
            topology = 'topology.tpr',
            every = 1,
            align_mode = "mda",
            align_target= 0,
            align_selection = "protein",
            dist = 10.0,
        )
    """
    align_trajectory(
        trajectory=trajectory,
        output_trj_file=aligned_trajectory_filename,
        align_target_file_name=align_target_file_name,
        topology=topology,
        every=every,
        align_mode=align_mode,
        align_target=align_target,
        align_selection=align_selection,
        probis_exec=probis_exec,
    )
    return extract_waters_from_trajectory(
        center_for_water_selection,
        trajectory=aligned_trajectory_filename,
        dist=dist,
        topology=topology,
        SOL=SOL,
        OW=OW,
        HW=HW,
    )




[docs]
def read_results_and_make_pdb(
    fname: str = "Clustering_results.dat",
    typefname: str = "Type_Clustering_results.dat",
    protein_file: str = "aligned.pdb",
    ligand_name: str | None = None,
    output_fname: str | None = None,
    mode: str = "SOL",
) -> None:
    """Read results from files and generate a pdb file.

    Args:
        fname (str, optional): File name with clustering results -
            coordinates of water molecules. Defaults to "Clustering_results.dat".
        typefname (str, optional): File name which contains the types of
            each water molecule. Defaults to "Type_Clustering_results.dat".
        protein_file (str, optional): File name which contains the
            reference structure trajectory has been aligned to. Defaults
            to "aligned.pdb".
        ligand_name (str | None, optional): Residue name for the ligand.
            If none is given, no ligand is extracted and
            visualised/saved in the pdb file. Defaults to None.
        output_fname (str | None, optional): Name of the output file
            (pdb prefered). Defaults to None.
        mode (str, optional): mode in which conserved waters will
            besaved. Options:

                - "SOL" - default mode. Saves water molecules as SOL so that
                  visualisation softwares can recognise them as waters. No
                  distinction is made between different types of conserved
                  waters.
                - "cathegorise" - cathegorises the waters according to
                  hydrogen orienation into fully conserved (FCW),
                  half-coserved (HCW) and weakly conserved (WCW). This mode
                  makes visualisers not able to recognise the waters as
                  water/sol but usefull for interpreting results.

    Example::

        # Generate pdb results file
        read_results_and_make_pdb(
            fname = 'Results.dat',
            typefname = 'TypeResults.dat',
            protein_file = 'aligned_protein.pdb',
            ligand_name = 'UBY',
            output_fname = 'results.pdb',
            mode = 'cathegorise',
        )
    """
    water_type, waterO, waterH1, waterH2 = read_results(fname, typefname)
    make_results_pdb_MDA(
        water_type=water_type,
        waterO=waterO,
        waterH1=waterH1,
        waterH2=waterH2,
        protein_file=protein_file,
        ligand_name=ligand_name,
        output_fname=output_fname,
        mode=mode,
    )



def is_pdb_file(filepath: str) -> bool:
    """Return True if `filepath` is probably a PDB.

    If it's binary (like XTC/DCD), this will return False safely.
    """
    ext = os.path.splitext(filepath)[1].lower()
    if ext in {".pdb", ".ent", ".pdb1", ".pdb2"}:
        return True

    # If extension isn't a known PDB, peek at first few lines:
    try:
        with open(filepath, encoding="utf-8") as f:
            for _ in range(10):
                line = next(f)
                if line.startswith(("ATOM  ", "HETATM", "HEADER", "TITLE")):
                    return True
    except (UnicodeDecodeError, OSError, StopIteration):
        # If it fails to decode (binary), or can't open, it's not a PDB.
        return False

    return False