Source code for pympfit.mpfit._mpfit

import warnings
from typing import TYPE_CHECKING

import numpy as np
from openff.recharge.charges.library import LibraryChargeParameter
from openff.recharge.utilities.toolkits import molecule_to_tagged_smiles
from openff.toolkit.utils.exceptions import AtomMappingWarning
from openff.units import unit

from pympfit.gdma.storage import MoleculeGDMARecord
from pympfit.mbis.storage import MoleculeMBISRecord
from pympfit.mpfit.solvers import MPFITSolver
from pympfit.optimize import MPFITObjective

# Type alias for records that can be used with MPFIT
MultipoleRecord = MoleculeGDMARecord | MoleculeMBISRecord

if TYPE_CHECKING:
    from openff.recharge.charges.vsite import VirtualSiteCollection
    from openff.toolkit import Molecule

    from pympfit.mpfit.solvers import ConstrainedMPFITSolver


def _generate_dummy_values(smiles: str) -> list[float]:
    """Generate a list of dummy values for a ``LibraryChargeParameter``.

    The values sum to the correct total charge.
    """
    from openff.toolkit import Molecule

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", AtomMappingWarning)
        molecule: Molecule = Molecule.from_smiles(smiles, allow_undefined_stereo=True)

    total_charge = molecule.total_charge.m_as(unit.elementary_charge)
    per_atom_charge = total_charge / molecule.n_atoms

    n_values = len(set(molecule.properties["atom_map"].values()))
    return [per_atom_charge] * n_values


[docs] def molecule_to_mpfit_library_charge(molecule: "Molecule") -> LibraryChargeParameter: """Create a library charge parameter from a molecule. Parameters ---------- molecule The molecule to create the SMILES pattern from. Returns ------- The library charge parameter with one charge value per atom. """ hydrogen_indices = [ i for i, atom in enumerate(molecule.atoms) if atom.atomic_number == 1 ] other_indices = [ i for i, atom in enumerate(molecule.atoms) if atom.atomic_number != 1 ] atom_indices = list(range(1, molecule.n_atoms + 1)) tagged_smiles = molecule_to_tagged_smiles(molecule, atom_indices) return LibraryChargeParameter( smiles=tagged_smiles, value=_generate_dummy_values(tagged_smiles), provenance={ "hydrogen-indices": hydrogen_indices, "other-indices": other_indices, }, )
def _fit_single_conformer( multipole_record: MultipoleRecord, solver: "MPFITSolver", vsite_collection: "VirtualSiteCollection | None" = None, fit_limit: int | None = None, ) -> tuple[np.ndarray, np.ndarray | None]: """Fit charges for a single conformer. Parameters ---------- multipole_record The multipole record (GDMA or MBIS) for this conformer. solver The solver to use for fitting. vsite_collection Optional virtual site collection defining extra charge sites. fit_limit Optional maximum multipole rank (0-indexed) for fitting. When provided and less than the record's available rank, the multipole tensor is truncated so only terms up to this rank are used. Returns ------- Tuple of (atom_charges, vsite_charges). vsite_charges is None if no vsites are present. """ from openff.toolkit import Molecule # Generate objective term for this single conformer objective_terms_and_masks = list( MPFITObjective.compute_objective_terms( [multipole_record], vsite_collection=vsite_collection, return_quse_masks=True, fit_limit=fit_limit, ) ) term, mask_dict = objective_terms_and_masks[0] quse_masks = mask_dict["quse_masks"] n_vsites = mask_dict.get("n_vsites", 0) # Solve for this conformer all_charges = solver.solve( np.array(term.atom_charge_design_matrix, dtype=object), np.array(term.reference_values, dtype=object), ancillary_arrays={"quse_masks": quse_masks, "n_vsites": n_vsites}, ) # Split into atom and vsite charges molecule = Molecule.from_mapped_smiles( multipole_record.tagged_smiles, allow_undefined_stereo=True ) n_atoms = molecule.n_atoms atom_charges = all_charges[:n_atoms].flatten() vsite_charges = all_charges[n_atoms:].flatten() if n_vsites > 0 else None return atom_charges, vsite_charges
[docs] def generate_mpfit_charge_parameter( multipole_records: list[MultipoleRecord], solver: MPFITSolver | None = None, vsite_collection: "VirtualSiteCollection | None" = None, fit_limit: int | None = None, ) -> LibraryChargeParameter | tuple[LibraryChargeParameter, np.ndarray]: """Generate point charges that reproduce the distributed multipole analysis data. For multiple conformers, charges are fit independently for each conformer and then averaged. Parameters ---------- multipole_records The records containing the distributed multipole data (GDMA or MBIS). If multiple records are provided, charges are fit independently for each and averaged. solver The solver to use when finding the charges that minimize the MPFIT loss function. By default, the SVD solver is used. vsite_collection Optional virtual site collection defining extra charge sites beyond atoms. When provided, charges are fit at both atom and vsite positions. fit_limit Optional maximum multipole rank (0-indexed) for fitting. When provided and less than the record's available rank, the multipole tensor is truncated so only terms up to this rank are used. Allows running the QM/multipole step once at a high rank (e.g., GDMA limit=8 or MBIS max_radial_moment=4) and fitting charges at multiple lower ranks without rerunning the QM calculation. Returns ------- When vsite_collection is None: LibraryChargeParameter with atom charges. When vsite_collection is provided: Tuple of (LibraryChargeParameter, vsite_charges) where vsite_charges is a numpy array of shape (n_vsites,). """ from openff.toolkit import Molecule from pympfit.mpfit.solvers import MPFITSVDSolver solver = MPFITSVDSolver() if solver is None else solver # Ensure all records are for the same molecule unique_smiles = { Molecule.from_mapped_smiles( record.tagged_smiles, allow_undefined_stereo=True ).to_smiles(mapped=False) for record in multipole_records } if len(unique_smiles) != 1: msg = "all multipole records must be generated for the same molecule" raise ValueError(msg) molecule = Molecule.from_smiles( next(iter(unique_smiles)), allow_undefined_stereo=True ) # Create the charge parameter mpfit_parameter = molecule_to_mpfit_library_charge(molecule) # Fit each conformer independently and average the results all_atom_charges = [] all_vsite_charges = [] for record in multipole_records: atom_charges, vsite_charges = _fit_single_conformer( record, solver, vsite_collection, fit_limit=fit_limit ) all_atom_charges.append(atom_charges) if vsite_charges is not None: all_vsite_charges.append(vsite_charges) # Average atom charges across conformers averaged_atom_charges = np.mean(all_atom_charges, axis=0) mpfit_parameter.value = averaged_atom_charges.tolist() # Return tuple only when vsites are present (backward compatible) if vsite_collection is not None and all_vsite_charges: averaged_vsite_charges = np.mean(all_vsite_charges, axis=0) return mpfit_parameter, averaged_vsite_charges return mpfit_parameter
[docs] def generate_global_atom_type_labels( molecules: list["Molecule"], radius: int = 2, equivalize_between_methyl_carbons: bool = True, equivalize_between_methyl_hydrogens: bool = True, equivalize_between_other_heavy_atoms: bool = True, equivalize_between_other_hydrogen_atoms: bool = True, ) -> list[list[str]]: """Generate consistent atom type labels across multiple molecules. This function implements a hybrid, hierarchical atom typing approach, in which: 1. **Within-molecule** equivalence is set via ``get_atom_symmetries`` for topologically equivalent atoms for the same molecule 2. **Cross-molecule** equivalence is set via Morgan fingerprints, where local chemical environments establish equivalence. Two atoms receive the same label only if they share both the same Morgan environment hash **and** the same within-molecule symmetry group (relative to their Morgan group). Parameters ---------- molecules The molecules to generate labels for. radius The Morgan fingerprint radius (in bonds) controlling cross-molecule equivalence scope. Smaller values (2-3) match shared substructures (e.g., imidazolium ring across EMIM/BMIM). Larger values require more of the surrounding structure to match. equivalize_between_methyl_carbons Whether topologically symmetric methyl(ene) carbons (matched by ``[#6X4H3,#6H4,#6X4H2:1]``) in **different** molecules should be assigned an equivalent charge. equivalize_between_methyl_hydrogens Whether topologically symmetric methyl(ene) hydrogens (attached to a methyl(ene) carbon) in **different** molecules should be assigned an equivalent charge. equivalize_between_other_heavy_atoms Whether topologically symmetric heavy atoms that are not methyl(ene) carbons in **different** molecules should be assigned an equivalent charge. equivalize_between_other_hydrogen_atoms Whether topologically symmetric hydrogens that are not methyl(ene) hydrogens in **different** molecules should be assigned an equivalent charge. Returns ------- A list of label lists, one per molecule. Each inner list contains one string label per atom, ordered by atom index. """ from collections import defaultdict from openff.recharge.utilities.toolkits import get_atom_symmetries from openff.units.elements import SYMBOLS from rdkit.Chem import rdFingerprintGenerator # Global registries for cross-molecule label assignment env_to_label_map: dict[tuple, str] = {} element_counters: defaultdict[str, int] = defaultdict(int) unique_atom_counter = 0 all_molecule_labels: list[list[str]] = [] for molecule in molecules: mol_labels: list[str] = [] atom_symmetries = get_atom_symmetries(molecule) # Atom classification ("within" molecule) methyl_carbons = { i for (i,) in molecule.chemical_environment_matches("[#6X4H3,#6H4,#6X4H2:1]") } methyl_hydrogens = { atom.molecule_atom_index for i in methyl_carbons for atom in molecule.atoms[i].bonded_atoms if atom.atomic_number == 1 } other_heavy_atoms = { i for i, atom in enumerate(molecule.atoms) if atom.atomic_number != 1 and i not in methyl_carbons } other_hydrogen_atoms = { i for i, atom in enumerate(molecule.atoms) if atom.atomic_number == 1 and i not in methyl_hydrogens } # Cross-molecule hashing via Morgan fingerprints rdkit_mol = molecule.to_rdkit() morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=radius) atom_morgan_hashes = [] for i in range(molecule.n_atoms): fp = morgan_gen.GetSparseCountFingerprint(rdkit_mol, fromAtoms=[i]) atom_morgan_hashes.append(tuple(sorted(fp.GetNonzeroElements().items()))) # have atoms shared within molecules take precendence sym_group_to_atoms: defaultdict[int, list[int]] = defaultdict(list) for i, sym in enumerate(atom_symmetries): sym_group_to_atoms[sym].append(i) representative_hash = list(atom_morgan_hashes) for atom_indices in sym_group_to_atoms.values(): rep = atom_morgan_hashes[atom_indices[0]] for idx in atom_indices: representative_hash[idx] = rep # Assign labels per atom for i, atom in enumerate(molecule.atoms): element = SYMBOLS[atom.atomic_number] is_methyl_c = i in methyl_carbons is_methyl_h = i in methyl_hydrogens # Determine if cross-molecule equivalence applies if is_methyl_c: should_equivalize = equivalize_between_methyl_carbons elif is_methyl_h: should_equivalize = equivalize_between_methyl_hydrogens elif i in other_heavy_atoms: should_equivalize = equivalize_between_other_heavy_atoms elif i in other_hydrogen_atoms: should_equivalize = equivalize_between_other_hydrogen_atoms else: should_equivalize = False if should_equivalize: env_key = (element, representative_hash[i]) if env_key not in env_to_label_map: element_counters[element] += 1 env_to_label_map[env_key] = f"{element}{element_counters[element]}" mol_labels.append(env_to_label_map[env_key]) else: # unique label across all molecules unique_atom_counter += 1 mol_labels.append(f"{element}_u{unique_atom_counter}") all_molecule_labels.append(mol_labels) return all_molecule_labels
[docs] def generate_constrained_mpfit_charge_parameter( multipole_records: list[MultipoleRecord], molecules: list["Molecule"], solver: "ConstrainedMPFITSolver | None" = None, atom_type_labels: list[list[str]] | None = None, radius: int = 2, equivalize_between_methyl_carbons: bool = True, equivalize_between_methyl_hydrogens: bool = True, equivalize_between_other_heavy_atoms: bool = True, equivalize_between_other_hydrogen_atoms: bool = True, ) -> list[LibraryChargeParameter]: """Fit constrained charges for one or more molecules. When multiple molecules are provided, atoms in **different** molecules that share the same atom type label are constrained to carry the same charge. For a single molecule, within-molecule symmetry constraints are still applied. Labels can be generated automatically via Morgan fingerprints and within-molecule symmetry, or supplied directly. Parameters ---------- multipole_records One multipole record (GDMA or MBIS) per molecule (one conformer each). molecules The molecules corresponding to each multipole record, in the same order. The formal charge of each molecule is read from ``molecule.total_charge``. solver The constrained solver to use. Defaults to ``ConstrainedSciPySolver``. atom_type_labels Per-molecule atom type labels. If ``None``, labels are generated automatically via ``generate_global_atom_type_labels``. radius Morgan fingerprint radius passed to ``generate_global_atom_type_labels`` when ``atom_type_labels`` is ``None``. equivalize_between_methyl_carbons Whether topologically symmetric methyl(ene) carbons (matched by ``[#6X4H3,#6H4,#6X4H2:1]``) in **different** molecules should be assigned an equivalent charge. equivalize_between_methyl_hydrogens Whether topologically symmetric methyl(ene) hydrogens (attached to a methyl(ene) carbon) in **different** molecules should be assigned an equivalent charge. equivalize_between_other_heavy_atoms Whether topologically symmetric heavy atoms that are not methyl(ene) carbons in **different** molecules should be assigned an equivalent charge. equivalize_between_other_hydrogen_atoms Whether topologically symmetric hydrogens that are not methyl(ene) hydrogens in **different** molecules should be assigned an equivalent charge. Returns ------- One ``LibraryChargeParameter`` per molecule. """ from pympfit.mpfit.solvers import ( ConstrainedMPFITSolver, ConstrainedMPFITState, ConstrainedSciPySolver, build_quse_matrix, ) if len(multipole_records) != len(molecules): msg = ( f"multipole_records has {len(multipole_records)} entries, " f"but molecules has {len(molecules)}" ) raise ValueError(msg) if solver is None: solver = ConstrainedSciPySolver() elif not isinstance(solver, ConstrainedMPFITSolver): msg = ( f"solver must be a ConstrainedMPFITSolver instance, " f"got {type(solver).__name__}" ) raise TypeError(msg) if atom_type_labels is None: atom_type_labels = generate_global_atom_type_labels( molecules, radius=radius, equivalize_between_methyl_carbons=equivalize_between_methyl_carbons, equivalize_between_methyl_hydrogens=equivalize_between_methyl_hydrogens, equivalize_between_other_heavy_atoms=equivalize_between_other_heavy_atoms, equivalize_between_other_hydrogen_atoms=equivalize_between_other_hydrogen_atoms, ) if len(atom_type_labels) != len(molecules): msg = ( f"atom_type_labels has {len(atom_type_labels)} entries, " f"but molecules has {len(molecules)}" ) raise ValueError(msg) flat_labels = tuple( label for mol_labels in atom_type_labels for label in mol_labels ) mol_charges = tuple( mol.total_charge.m_as(unit.elementary_charge) for mol in molecules ) all_xyz = [] all_multipoles = [] all_rvdw = [] all_lmax = [] atom_counts = [] for record in multipole_records: arrays = MPFITObjective.extract_arrays(record) all_xyz.append(arrays["bohr_conformer"]) all_multipoles.append(arrays["multipoles"]) all_rvdw.append(arrays["rvdw"]) all_lmax.append(arrays["lmax"]) atom_counts.append(arrays["n_atoms"]) # Get settings from first record (works for both GDMA and MBIS) first_record = multipole_records[0] if isinstance(first_record, MoleculeGDMARecord): settings = first_record.gdma_settings max_rank = settings.limit # GDMA uses 0-based indexing else: settings = first_record.mbis_settings max_rank = settings.limit - 1 # MBIS uses 1-based indexing xyzcharge = np.vstack(all_xyz) xyzmult = np.vstack(all_xyz) rvdw = np.concatenate(all_rvdw) state = ConstrainedMPFITState( xyzmult=xyzmult, xyzcharge=xyzcharge, multipoles=np.vstack(all_multipoles), quse=build_quse_matrix(xyzmult, xyzcharge, rvdw), atomtype=flat_labels, rvdw=rvdw, lmax=np.concatenate(all_lmax), r1=settings.mpfit_inner_radius, r2=settings.mpfit_outer_radius, maxl=max_rank, atom_counts=tuple(atom_counts), molecule_charges=mol_charges, ) qstore = solver.solve(state) # split charges by molecule → LibraryChargeParameter ── parameters = [] offset = 0 for molecule, n in zip(molecules, atom_counts, strict=False): mol_charges_arr = qstore[offset : offset + n] param = molecule_to_mpfit_library_charge(molecule) param.value = mol_charges_arr.tolist() parameters.append(param) offset += n return parameters