Feature Extraction
Transformers
Safetensors
esmfold2
biology
protein-structure
multimodal-protein-model
custom_code
Instructions to use Synthyra/ESMFold2-Fast with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Synthyra/ESMFold2-Fast with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("feature-extraction", model="Synthyra/ESMFold2-Fast", trust_remote_code=True)# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("Synthyra/ESMFold2-Fast", trust_remote_code=True, dtype="auto") - Notebooks
- Google Colab
- Kaggle
| """CCD conformer loading utilities. | |
| Loads idealized conformer coordinates from a CCD pickle file containing RDKit molecules. | |
| Conformer priority follows AF3 Section 2.8: Computed > Ideal > first available. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import pickle | |
| from pathlib import Path | |
| import numpy as np | |
| from huggingface_hub import hf_hub_download | |
| from .esmfold2_constants import RES_TYPE_TO_CCD | |
| if os.environ.get("ESMCFOLD_CCD_PATH"): | |
| CCD_PICKLE_PATH = Path(os.environ["ESMCFOLD_CCD_PATH"]) | |
| else: | |
| CCD_PICKLE_PATH = None | |
| # Lazily loaded CCD dictionary | |
| _CCD_MOLECULES: dict | None = None | |
| # Caches | |
| _CCD_CONFORMERS: dict[str, dict[str, np.ndarray]] = {} | |
| _CCD_ATOM_CACHE: dict[str, list[tuple[str, str, int]]] = {} | |
| _CCD_BONDS_CACHE: dict[str, list[tuple[str, str]]] = {} | |
| _CCD_LEAVING_ATOMS_CACHE: dict[str, set[str]] = {} | |
| _IDEALIZED_POS_CACHE: dict[tuple[int, str], np.ndarray | None] = {} | |
| _LIGAND_IDEALIZED_POS_CACHE: dict[tuple[str, str], np.ndarray | None] = {} | |
| def load_ccd(cache_dir: Path | str | None = None) -> dict: | |
| """Load CCD molecules from pickle file, downloading if needed. | |
| Args: | |
| cache_dir: Directory to cache the downloaded CCD pickle. | |
| If None, uses CCD_PICKLE_PATH env var or downloads to ~/.cache/esmcfold/. | |
| """ | |
| global _CCD_MOLECULES | |
| if _CCD_MOLECULES is not None: | |
| return _CCD_MOLECULES | |
| # Determine pickle path | |
| if CCD_PICKLE_PATH is not None and CCD_PICKLE_PATH.exists(): | |
| pkl_path = CCD_PICKLE_PATH | |
| elif cache_dir is not None: | |
| cache_dir = Path(cache_dir) | |
| cache_dir.mkdir(parents=True, exist_ok=True) | |
| pkl_path = cache_dir / "ccd.pkl" | |
| else: | |
| try: | |
| pkl_path = Path( | |
| hf_hub_download(repo_id="biohub/ESMFold2", filename="ccd.pkl") | |
| ) | |
| except Exception as e: | |
| raise FileNotFoundError( | |
| f"Failed to download CCD pickle file from Hugging Face repository: {e}" | |
| ) | |
| if not pkl_path.exists(): | |
| raise FileNotFoundError( | |
| f"CCD pickle file not found: {pkl_path}. Please set the ESMCFOLD_CCD_PATH environment variable to the path of a valid CCD pickle file or download the file from the Hugging Face repository." | |
| ) | |
| print(f"Loading CCD dictionary from {pkl_path}") | |
| with open(pkl_path, "rb") as f: | |
| _CCD_MOLECULES = pickle.load(f) | |
| if _CCD_MOLECULES is None: | |
| _CCD_MOLECULES = {} | |
| return _CCD_MOLECULES | |
| def _get_ccd_molecules() -> dict: | |
| """Get CCD molecules, loading lazily on first call.""" | |
| global _CCD_MOLECULES | |
| if _CCD_MOLECULES is None: | |
| return load_ccd() | |
| return _CCD_MOLECULES | |
| def _get_ccd_mol_with_significant_h(comp_id: str): | |
| """Get CCD molecule with only chemically significant hydrogens. | |
| Returns (mol, conformer) tuple or (None, None) if not available. | |
| """ | |
| ccd = _get_ccd_molecules() | |
| if comp_id not in ccd: | |
| return None, None | |
| mol = ccd[comp_id] | |
| if mol.GetNumConformers() == 0: | |
| return None, None | |
| # Find the "Computed" conformer (RDKit ETKDGv3), fall back to "Ideal" | |
| conf_idx = 0 | |
| for i, c in enumerate(mol.GetConformers()): | |
| props = c.GetPropsAsDict() | |
| if props.get("name") == "Computed": | |
| conf_idx = i | |
| break | |
| else: | |
| for i, c in enumerate(mol.GetConformers()): | |
| props = c.GetPropsAsDict() | |
| if props.get("name") == "Ideal": | |
| conf_idx = i | |
| break | |
| from rdkit import Chem | |
| mol_no_h = Chem.RemoveHs(mol, sanitize=False) | |
| if mol_no_h.GetNumConformers() == 0: | |
| return None, None | |
| return mol_no_h, mol_no_h.GetConformer( | |
| min(conf_idx, mol_no_h.GetNumConformers() - 1) | |
| ) | |
| def get_ccd_conformer(comp_id: str) -> dict[str, np.ndarray] | None: | |
| """Get idealized conformer as dict of atom_name -> position [3]. | |
| Conformer priority: Computed > Ideal > first available. | |
| """ | |
| if comp_id in _CCD_CONFORMERS: | |
| cached = _CCD_CONFORMERS[comp_id] | |
| return cached if cached else None | |
| mol, conf = _get_ccd_mol_with_significant_h(comp_id) | |
| if mol is None or conf is None: | |
| _CCD_CONFORMERS[comp_id] = {} | |
| return None | |
| conformer: dict[str, np.ndarray] = {} | |
| for atom in mol.GetAtoms(): | |
| props = atom.GetPropsAsDict() | |
| atom_name = props.get("name") | |
| if not isinstance(atom_name, str) or not atom_name: | |
| continue | |
| idx = atom.GetIdx() | |
| pos = conf.GetAtomPosition(idx) | |
| conformer[atom_name] = np.array([pos.x, pos.y, pos.z], dtype=np.float32) | |
| _CCD_CONFORMERS[comp_id] = conformer | |
| return conformer if conformer else None | |
| def get_idealized_atom_pos(res_type: int, atom_name: str) -> np.ndarray | None: | |
| """Get idealized position for a standard residue atom. | |
| Uses res_type index to look up CCD component, then returns position. | |
| Returns None if not found. | |
| """ | |
| cache_key = (res_type, atom_name) | |
| if cache_key in _IDEALIZED_POS_CACHE: | |
| return _IDEALIZED_POS_CACHE[cache_key] | |
| comp_id = RES_TYPE_TO_CCD.get(res_type) | |
| if comp_id: | |
| ccd_conformer = get_ccd_conformer(comp_id) | |
| if ccd_conformer and atom_name in ccd_conformer: | |
| pos = ccd_conformer[atom_name] | |
| _IDEALIZED_POS_CACHE[cache_key] = pos | |
| return pos | |
| _IDEALIZED_POS_CACHE[cache_key] = None | |
| return None | |
| def get_ligand_idealized_atom_pos(res_name: str, atom_name: str) -> np.ndarray | None: | |
| """Get idealized position for a ligand/modified residue atom. | |
| Returns None if not found. | |
| """ | |
| cache_key = (res_name, atom_name) | |
| if cache_key in _LIGAND_IDEALIZED_POS_CACHE: | |
| return _LIGAND_IDEALIZED_POS_CACHE[cache_key] | |
| ccd_conformer = get_ccd_conformer(res_name) | |
| if ccd_conformer and atom_name in ccd_conformer: | |
| pos = ccd_conformer[atom_name] | |
| _LIGAND_IDEALIZED_POS_CACHE[cache_key] = pos | |
| return pos | |
| _LIGAND_IDEALIZED_POS_CACHE[cache_key] = None | |
| return None | |
| def get_ligand_ccd_atoms_with_charges( | |
| comp_id: str, | |
| ) -> list[tuple[str, str, int]] | None: | |
| """Get list of (atom_name, element, charge) for a CCD component. | |
| Uses RDKit RemoveHs(sanitize=False) to keep chemically significant hydrogens. | |
| Returns None if CCD data not available. | |
| """ | |
| if comp_id in _CCD_ATOM_CACHE: | |
| cached = _CCD_ATOM_CACHE[comp_id] | |
| return cached if cached else None | |
| mol, _ = _get_ccd_mol_with_significant_h(comp_id) | |
| if mol is None: | |
| _CCD_ATOM_CACHE[comp_id] = [] | |
| return None | |
| atoms: list[tuple[str, str, int]] = [] | |
| for atom in mol.GetAtoms(): | |
| props = atom.GetPropsAsDict() | |
| atom_name = props.get("name") | |
| if not isinstance(atom_name, str) or not atom_name: | |
| continue | |
| element = atom.GetSymbol() | |
| charge = atom.GetFormalCharge() | |
| atoms.append((atom_name, element, charge)) | |
| _CCD_ATOM_CACHE[comp_id] = atoms | |
| return atoms if atoms else None | |
| def get_ligand_ccd_bonds(comp_id: str) -> list[tuple[str, str]] | None: | |
| """Get list of (atom1_name, atom2_name) bonds for a CCD component. | |
| Returns None if CCD data not available. | |
| """ | |
| if comp_id in _CCD_BONDS_CACHE: | |
| cached = _CCD_BONDS_CACHE[comp_id] | |
| return cached if cached else None | |
| mol, _ = _get_ccd_mol_with_significant_h(comp_id) | |
| if mol is None: | |
| _CCD_BONDS_CACHE[comp_id] = [] | |
| return None | |
| # Get included atom names | |
| included_atoms = set() | |
| for atom in mol.GetAtoms(): | |
| props = atom.GetPropsAsDict() | |
| atom_name = props.get("name") | |
| if isinstance(atom_name, str) and atom_name: | |
| included_atoms.add(atom_name) | |
| bonds: list[tuple[str, str]] = [] | |
| for bond in mol.GetBonds(): | |
| a1 = bond.GetBeginAtom() | |
| a2 = bond.GetEndAtom() | |
| n1 = a1.GetPropsAsDict().get("name") | |
| n2 = a2.GetPropsAsDict().get("name") | |
| if ( | |
| isinstance(n1, str) | |
| and isinstance(n2, str) | |
| and n1 | |
| and n2 | |
| and n1 in included_atoms | |
| and n2 in included_atoms | |
| ): | |
| bonds.append((n1, n2)) | |
| _CCD_BONDS_CACHE[comp_id] = bonds | |
| return bonds if bonds else None | |
| def get_ccd_leaving_atoms(comp_id: str) -> set[str]: | |
| """Get set of atom names marked as leaving atoms in CCD. | |
| Leaving atoms are removed during polymerization (e.g., OP3 in nucleotides). | |
| """ | |
| if comp_id in _CCD_LEAVING_ATOMS_CACHE: | |
| return _CCD_LEAVING_ATOMS_CACHE[comp_id] | |
| ccd = _get_ccd_molecules() | |
| if comp_id not in ccd: | |
| _CCD_LEAVING_ATOMS_CACHE[comp_id] = set() | |
| return set() | |
| mol = ccd[comp_id] | |
| leaving_atoms = set() | |
| for atom in mol.GetAtoms(): | |
| if atom.HasProp("leaving_atom"): | |
| if atom.GetProp("leaving_atom") == "1": | |
| name = atom.GetProp("name") if atom.HasProp("name") else "" | |
| if name: | |
| leaving_atoms.add(name) | |
| _CCD_LEAVING_ATOMS_CACHE[comp_id] = leaving_atoms | |
| return leaving_atoms | |