import warnings
from typing import TYPE_CHECKING
import numpy as np
from openff.recharge.charges.library import LibraryChargeParameter
from openff.recharge.utilities.toolkits import molecule_to_tagged_smiles
from openff.toolkit.utils.exceptions import AtomMappingWarning
from openff.units import unit
from pympfit.gdma.storage import MoleculeGDMARecord
from pympfit.mbis.storage import MoleculeMBISRecord
from pympfit.mpfit.solvers import MPFITSolver
from pympfit.optimize import MPFITObjective
# Type alias for records that can be used with MPFIT
MultipoleRecord = MoleculeGDMARecord | MoleculeMBISRecord
if TYPE_CHECKING:
from openff.recharge.charges.vsite import VirtualSiteCollection
from openff.toolkit import Molecule
from pympfit.mpfit.solvers import ConstrainedMPFITSolver
def _generate_dummy_values(smiles: str) -> list[float]:
"""Generate a list of dummy values for a ``LibraryChargeParameter``.
The values sum to the correct total charge.
"""
from openff.toolkit import Molecule
with warnings.catch_warnings():
warnings.simplefilter("ignore", AtomMappingWarning)
molecule: Molecule = Molecule.from_smiles(smiles, allow_undefined_stereo=True)
total_charge = molecule.total_charge.m_as(unit.elementary_charge)
per_atom_charge = total_charge / molecule.n_atoms
n_values = len(set(molecule.properties["atom_map"].values()))
return [per_atom_charge] * n_values
[docs]
def molecule_to_mpfit_library_charge(molecule: "Molecule") -> LibraryChargeParameter:
"""Create a library charge parameter from a molecule.
Parameters
----------
molecule
The molecule to create the SMILES pattern from.
Returns
-------
The library charge parameter with one charge value per atom.
"""
hydrogen_indices = [
i for i, atom in enumerate(molecule.atoms) if atom.atomic_number == 1
]
other_indices = [
i for i, atom in enumerate(molecule.atoms) if atom.atomic_number != 1
]
atom_indices = list(range(1, molecule.n_atoms + 1))
tagged_smiles = molecule_to_tagged_smiles(molecule, atom_indices)
return LibraryChargeParameter(
smiles=tagged_smiles,
value=_generate_dummy_values(tagged_smiles),
provenance={
"hydrogen-indices": hydrogen_indices,
"other-indices": other_indices,
},
)
def _fit_single_conformer(
multipole_record: MultipoleRecord,
solver: "MPFITSolver",
vsite_collection: "VirtualSiteCollection | None" = None,
fit_limit: int | None = None,
) -> tuple[np.ndarray, np.ndarray | None]:
"""Fit charges for a single conformer.
Parameters
----------
multipole_record
The multipole record (GDMA or MBIS) for this conformer.
solver
The solver to use for fitting.
vsite_collection
Optional virtual site collection defining extra charge sites.
fit_limit
Optional maximum multipole rank (0-indexed) for fitting. When
provided and less than the record's available rank, the multipole
tensor is truncated so only terms up to this rank are used.
Returns
-------
Tuple of (atom_charges, vsite_charges). vsite_charges is None if
no vsites are present.
"""
from openff.toolkit import Molecule
# Generate objective term for this single conformer
objective_terms_and_masks = list(
MPFITObjective.compute_objective_terms(
[multipole_record],
vsite_collection=vsite_collection,
return_quse_masks=True,
fit_limit=fit_limit,
)
)
term, mask_dict = objective_terms_and_masks[0]
quse_masks = mask_dict["quse_masks"]
n_vsites = mask_dict.get("n_vsites", 0)
# Solve for this conformer
all_charges = solver.solve(
np.array(term.atom_charge_design_matrix, dtype=object),
np.array(term.reference_values, dtype=object),
ancillary_arrays={"quse_masks": quse_masks, "n_vsites": n_vsites},
)
# Split into atom and vsite charges
molecule = Molecule.from_mapped_smiles(
multipole_record.tagged_smiles, allow_undefined_stereo=True
)
n_atoms = molecule.n_atoms
atom_charges = all_charges[:n_atoms].flatten()
vsite_charges = all_charges[n_atoms:].flatten() if n_vsites > 0 else None
return atom_charges, vsite_charges
[docs]
def generate_mpfit_charge_parameter(
multipole_records: list[MultipoleRecord],
solver: MPFITSolver | None = None,
vsite_collection: "VirtualSiteCollection | None" = None,
fit_limit: int | None = None,
) -> LibraryChargeParameter | tuple[LibraryChargeParameter, np.ndarray]:
"""Generate point charges that reproduce the distributed multipole analysis data.
For multiple conformers, charges are fit independently for each conformer
and then averaged.
Parameters
----------
multipole_records
The records containing the distributed multipole data (GDMA or MBIS).
If multiple records are provided, charges are fit independently for
each and averaged.
solver
The solver to use when finding the charges that minimize the MPFIT loss
function. By default, the SVD solver is used.
vsite_collection
Optional virtual site collection defining extra charge sites beyond
atoms. When provided, charges are fit at both atom and vsite positions.
fit_limit
Optional maximum multipole rank (0-indexed) for fitting. When provided
and less than the record's available rank, the multipole tensor is
truncated so only terms up to this rank are used. Allows running the
QM/multipole step once at a high rank (e.g., GDMA limit=8 or MBIS
max_radial_moment=4) and fitting charges at multiple lower ranks
without rerunning the QM calculation.
Returns
-------
When vsite_collection is None: LibraryChargeParameter with atom charges.
When vsite_collection is provided: Tuple of (LibraryChargeParameter,
vsite_charges) where vsite_charges is a numpy array of shape (n_vsites,).
"""
from openff.toolkit import Molecule
from pympfit.mpfit.solvers import MPFITSVDSolver
solver = MPFITSVDSolver() if solver is None else solver
# Ensure all records are for the same molecule
unique_smiles = {
Molecule.from_mapped_smiles(
record.tagged_smiles, allow_undefined_stereo=True
).to_smiles(mapped=False)
for record in multipole_records
}
if len(unique_smiles) != 1:
msg = "all multipole records must be generated for the same molecule"
raise ValueError(msg)
molecule = Molecule.from_smiles(
next(iter(unique_smiles)), allow_undefined_stereo=True
)
# Create the charge parameter
mpfit_parameter = molecule_to_mpfit_library_charge(molecule)
# Fit each conformer independently and average the results
all_atom_charges = []
all_vsite_charges = []
for record in multipole_records:
atom_charges, vsite_charges = _fit_single_conformer(
record, solver, vsite_collection, fit_limit=fit_limit
)
all_atom_charges.append(atom_charges)
if vsite_charges is not None:
all_vsite_charges.append(vsite_charges)
# Average atom charges across conformers
averaged_atom_charges = np.mean(all_atom_charges, axis=0)
mpfit_parameter.value = averaged_atom_charges.tolist()
# Return tuple only when vsites are present (backward compatible)
if vsite_collection is not None and all_vsite_charges:
averaged_vsite_charges = np.mean(all_vsite_charges, axis=0)
return mpfit_parameter, averaged_vsite_charges
return mpfit_parameter
[docs]
def generate_global_atom_type_labels(
molecules: list["Molecule"],
radius: int = 2,
equivalize_between_methyl_carbons: bool = True,
equivalize_between_methyl_hydrogens: bool = True,
equivalize_between_other_heavy_atoms: bool = True,
equivalize_between_other_hydrogen_atoms: bool = True,
) -> list[list[str]]:
"""Generate consistent atom type labels across multiple molecules.
This function implements a hybrid, hierarchical atom typing approach, in which:
1. **Within-molecule** equivalence is set via
``get_atom_symmetries`` for topologically equivalent
atoms for the same molecule
2. **Cross-molecule** equivalence is set via Morgan
fingerprints, where local chemical environments
establish equivalence.
Two atoms receive the same label only if they share both the same
Morgan environment hash **and** the same within-molecule symmetry
group (relative to their Morgan group).
Parameters
----------
molecules
The molecules to generate labels for.
radius
The Morgan fingerprint radius (in bonds) controlling cross-molecule
equivalence scope. Smaller values (2-3) match shared substructures
(e.g., imidazolium ring across EMIM/BMIM). Larger values require
more of the surrounding structure to match.
equivalize_between_methyl_carbons
Whether topologically symmetric methyl(ene) carbons (matched by
``[#6X4H3,#6H4,#6X4H2:1]``) in **different** molecules should be
assigned an equivalent charge.
equivalize_between_methyl_hydrogens
Whether topologically symmetric methyl(ene) hydrogens (attached to
a methyl(ene) carbon) in **different** molecules should be assigned
an equivalent charge.
equivalize_between_other_heavy_atoms
Whether topologically symmetric heavy atoms that are not
methyl(ene) carbons in **different** molecules should be assigned
an equivalent charge.
equivalize_between_other_hydrogen_atoms
Whether topologically symmetric hydrogens that are not methyl(ene)
hydrogens in **different** molecules should be assigned an
equivalent charge.
Returns
-------
A list of label lists, one per molecule. Each inner list contains
one string label per atom, ordered by atom index.
"""
from collections import defaultdict
from openff.recharge.utilities.toolkits import get_atom_symmetries
from openff.units.elements import SYMBOLS
from rdkit.Chem import rdFingerprintGenerator
# Global registries for cross-molecule label assignment
env_to_label_map: dict[tuple, str] = {}
element_counters: defaultdict[str, int] = defaultdict(int)
unique_atom_counter = 0
all_molecule_labels: list[list[str]] = []
for molecule in molecules:
mol_labels: list[str] = []
atom_symmetries = get_atom_symmetries(molecule)
# Atom classification ("within" molecule)
methyl_carbons = {
i
for (i,) in molecule.chemical_environment_matches("[#6X4H3,#6H4,#6X4H2:1]")
}
methyl_hydrogens = {
atom.molecule_atom_index
for i in methyl_carbons
for atom in molecule.atoms[i].bonded_atoms
if atom.atomic_number == 1
}
other_heavy_atoms = {
i
for i, atom in enumerate(molecule.atoms)
if atom.atomic_number != 1 and i not in methyl_carbons
}
other_hydrogen_atoms = {
i
for i, atom in enumerate(molecule.atoms)
if atom.atomic_number == 1 and i not in methyl_hydrogens
}
# Cross-molecule hashing via Morgan fingerprints
rdkit_mol = molecule.to_rdkit()
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=radius)
atom_morgan_hashes = []
for i in range(molecule.n_atoms):
fp = morgan_gen.GetSparseCountFingerprint(rdkit_mol, fromAtoms=[i])
atom_morgan_hashes.append(tuple(sorted(fp.GetNonzeroElements().items())))
# have atoms shared within molecules take precendence
sym_group_to_atoms: defaultdict[int, list[int]] = defaultdict(list)
for i, sym in enumerate(atom_symmetries):
sym_group_to_atoms[sym].append(i)
representative_hash = list(atom_morgan_hashes)
for atom_indices in sym_group_to_atoms.values():
rep = atom_morgan_hashes[atom_indices[0]]
for idx in atom_indices:
representative_hash[idx] = rep
# Assign labels per atom
for i, atom in enumerate(molecule.atoms):
element = SYMBOLS[atom.atomic_number]
is_methyl_c = i in methyl_carbons
is_methyl_h = i in methyl_hydrogens
# Determine if cross-molecule equivalence applies
if is_methyl_c:
should_equivalize = equivalize_between_methyl_carbons
elif is_methyl_h:
should_equivalize = equivalize_between_methyl_hydrogens
elif i in other_heavy_atoms:
should_equivalize = equivalize_between_other_heavy_atoms
elif i in other_hydrogen_atoms:
should_equivalize = equivalize_between_other_hydrogen_atoms
else:
should_equivalize = False
if should_equivalize:
env_key = (element, representative_hash[i])
if env_key not in env_to_label_map:
element_counters[element] += 1
env_to_label_map[env_key] = f"{element}{element_counters[element]}"
mol_labels.append(env_to_label_map[env_key])
else:
# unique label across all molecules
unique_atom_counter += 1
mol_labels.append(f"{element}_u{unique_atom_counter}")
all_molecule_labels.append(mol_labels)
return all_molecule_labels
[docs]
def generate_constrained_mpfit_charge_parameter(
multipole_records: list[MultipoleRecord],
molecules: list["Molecule"],
solver: "ConstrainedMPFITSolver | None" = None,
atom_type_labels: list[list[str]] | None = None,
radius: int = 2,
equivalize_between_methyl_carbons: bool = True,
equivalize_between_methyl_hydrogens: bool = True,
equivalize_between_other_heavy_atoms: bool = True,
equivalize_between_other_hydrogen_atoms: bool = True,
) -> list[LibraryChargeParameter]:
"""Fit constrained charges for one or more molecules.
When multiple molecules are provided, atoms in **different** molecules
that share the same atom type label are constrained to carry the same
charge. For a single molecule, within-molecule symmetry constraints
are still applied. Labels can be generated automatically via Morgan
fingerprints and within-molecule symmetry, or supplied directly.
Parameters
----------
multipole_records
One multipole record (GDMA or MBIS) per molecule (one conformer each).
molecules
The molecules corresponding to each multipole record, in the same order.
The formal charge of each molecule is read from
``molecule.total_charge``.
solver
The constrained solver to use. Defaults to ``ConstrainedSciPySolver``.
atom_type_labels
Per-molecule atom type labels. If ``None``, labels are generated
automatically via ``generate_global_atom_type_labels``.
radius
Morgan fingerprint radius passed to
``generate_global_atom_type_labels`` when ``atom_type_labels``
is ``None``.
equivalize_between_methyl_carbons
Whether topologically symmetric methyl(ene) carbons (matched by
``[#6X4H3,#6H4,#6X4H2:1]``) in **different** molecules should be
assigned an equivalent charge.
equivalize_between_methyl_hydrogens
Whether topologically symmetric methyl(ene) hydrogens (attached to
a methyl(ene) carbon) in **different** molecules should be assigned
an equivalent charge.
equivalize_between_other_heavy_atoms
Whether topologically symmetric heavy atoms that are not
methyl(ene) carbons in **different** molecules should be assigned
an equivalent charge.
equivalize_between_other_hydrogen_atoms
Whether topologically symmetric hydrogens that are not methyl(ene)
hydrogens in **different** molecules should be assigned an
equivalent charge.
Returns
-------
One ``LibraryChargeParameter`` per molecule.
"""
from pympfit.mpfit.solvers import (
ConstrainedMPFITSolver,
ConstrainedMPFITState,
ConstrainedSciPySolver,
build_quse_matrix,
)
if len(multipole_records) != len(molecules):
msg = (
f"multipole_records has {len(multipole_records)} entries, "
f"but molecules has {len(molecules)}"
)
raise ValueError(msg)
if solver is None:
solver = ConstrainedSciPySolver()
elif not isinstance(solver, ConstrainedMPFITSolver):
msg = (
f"solver must be a ConstrainedMPFITSolver instance, "
f"got {type(solver).__name__}"
)
raise TypeError(msg)
if atom_type_labels is None:
atom_type_labels = generate_global_atom_type_labels(
molecules,
radius=radius,
equivalize_between_methyl_carbons=equivalize_between_methyl_carbons,
equivalize_between_methyl_hydrogens=equivalize_between_methyl_hydrogens,
equivalize_between_other_heavy_atoms=equivalize_between_other_heavy_atoms,
equivalize_between_other_hydrogen_atoms=equivalize_between_other_hydrogen_atoms,
)
if len(atom_type_labels) != len(molecules):
msg = (
f"atom_type_labels has {len(atom_type_labels)} entries, "
f"but molecules has {len(molecules)}"
)
raise ValueError(msg)
flat_labels = tuple(
label for mol_labels in atom_type_labels for label in mol_labels
)
mol_charges = tuple(
mol.total_charge.m_as(unit.elementary_charge) for mol in molecules
)
all_xyz = []
all_multipoles = []
all_rvdw = []
all_lmax = []
atom_counts = []
for record in multipole_records:
arrays = MPFITObjective.extract_arrays(record)
all_xyz.append(arrays["bohr_conformer"])
all_multipoles.append(arrays["multipoles"])
all_rvdw.append(arrays["rvdw"])
all_lmax.append(arrays["lmax"])
atom_counts.append(arrays["n_atoms"])
# Get settings from first record (works for both GDMA and MBIS)
first_record = multipole_records[0]
if isinstance(first_record, MoleculeGDMARecord):
settings = first_record.gdma_settings
max_rank = settings.limit # GDMA uses 0-based indexing
else:
settings = first_record.mbis_settings
max_rank = settings.limit - 1 # MBIS uses 1-based indexing
xyzcharge = np.vstack(all_xyz)
xyzmult = np.vstack(all_xyz)
rvdw = np.concatenate(all_rvdw)
state = ConstrainedMPFITState(
xyzmult=xyzmult,
xyzcharge=xyzcharge,
multipoles=np.vstack(all_multipoles),
quse=build_quse_matrix(xyzmult, xyzcharge, rvdw),
atomtype=flat_labels,
rvdw=rvdw,
lmax=np.concatenate(all_lmax),
r1=settings.mpfit_inner_radius,
r2=settings.mpfit_outer_radius,
maxl=max_rank,
atom_counts=tuple(atom_counts),
molecule_charges=mol_charges,
)
qstore = solver.solve(state)
# split charges by molecule → LibraryChargeParameter ──
parameters = []
offset = 0
for molecule, n in zip(molecules, atom_counts, strict=False):
mol_charges_arr = qstore[offset : offset + n]
param = molecule_to_mpfit_library_charge(molecule)
param.value = mol_charges_arr.tolist()
parameters.append(param)
offset += n
return parameters