Spaces:
Running
Running
"""Load CB[7] host-guest features by joining four private HF datasets.

The dashboard does NOT recompute docking/xtb features live. It reads the GEOM
guest table as the left-side base and joins the ligand, pose, and cavity
feature tables on ``inchikey``. A missing feature dataset degrades to
empty/NaN columns so the UI can still list guests; a missing GEOM base is fatal.

Environment variables:

    HF_TOKEN           read token with access to the private datasets
    HF_DS_GEOM         dataset repo id (default SupraBench/SupraDB-GEOM)
    HF_DS_LIGAND       dataset repo id (default SupraBench/SupraDB-LigandScore)
    HF_DS_POSE         dataset repo id (default SupraBench/SupraDB-PoseFeat)
    HF_DS_CAVITY       dataset repo id (default SupraBench/SupraDB-CavityScore)
    HF_DS_<NAME>_FILE  file name inside the repo (default guests.csv for GEOM,
                       features.csv for the other three)

Each dataset also supports LOCAL_GEOM / LOCAL_LIGAND / LOCAL_POSE /
LOCAL_CAVITY CSV overrides for offline development.
"""
| from __future__ import annotations | |
| import functools | |
| import logging | |
| import os | |
| # pandas is imported lazily inside load_features() so that importing data_loader | |
| # (and by extension app/prompts) works without pandas — e.g. in the smoke-test | |
| # environment or any context that only needs FEATURES / build_prompt logic. | |
| _LOG = logging.getLogger(__name__) | |
| _DATASETS = { | |
| "geom": { | |
| "repo_env": "HF_DS_GEOM", | |
| "file_env": "HF_DS_GEOM_FILE", | |
| "local_env": "LOCAL_GEOM", | |
| "repo": "SupraBench/SupraDB-GEOM", | |
| "file": "guests.csv", | |
| "columns": ["inchikey", "name", "smiles", "logka", | |
| "scaffold_family", "known_scaffold", "tmax_known", "is_novel"], | |
| "required": True, | |
| }, | |
| "ligand": { | |
| "repo_env": "HF_DS_LIGAND", | |
| "file_env": "HF_DS_LIGAND_FILE", | |
| "local_env": "LOCAL_LIGAND", | |
| "repo": "SupraBench/SupraDB-LigandScore", | |
| "file": "features.csv", | |
| "columns": [ | |
| "inchikey", | |
| "S_charge", | |
| "S_hydrophobic", | |
| "S_rigidity", | |
| "S_desolvation", | |
| "S_packing", | |
| "S_shape", | |
| "S_conformer_diversity", | |
| "S_boltzmann_concentration", | |
| "S_bad", | |
| ], | |
| "required": False, | |
| }, | |
| "pose": { | |
| "repo_env": "HF_DS_POSE", | |
| "file_env": "HF_DS_POSE_FILE", | |
| "local_env": "LOCAL_POSE", | |
| "repo": "SupraBench/SupraDB-PoseFeat", | |
| "file": "features.csv", | |
| "columns": [ | |
| "inchikey", | |
| "DockingScore", | |
| "Pose_Energy", | |
| "Distance_to_Cavity_Center", | |
| "Distance_to_Portal", | |
| "Insertion_Depth", | |
| "Packing_Coefficient", | |
| "Occupancy", | |
| "Hydrophobic_Occupancy", | |
| "Shape_Complementarity", | |
| "Steric_Clash", | |
| "Guest_CB7_Min_Distance", | |
| "Pose_RMSD_to_Template", | |
| "Portal_Compatibility", | |
| "Positive_Center_to_Portal_Distance", | |
| "Positive_Center_Orientation", | |
| "Charge_Accessibility", | |
| "Portal_Facing_Accessibility", | |
| "HBond_Count", | |
| "HBond_Geometry", | |
| "Carbonyl_Oxygen_Contact_Count", | |
| "Hydrophobic_Contact", | |
| "Polar_Contact_Penalty", | |
| "Bad_Group_Portal_Exposure", | |
| "Desolvation_Penalty", | |
| "boltzmann_weight", | |
| "delta_e", | |
| ], | |
| "required": False, | |
| }, | |
| "cavity": { | |
| "repo_env": "HF_DS_CAVITY", | |
| "file_env": "HF_DS_CAVITY_FILE", | |
| "local_env": "LOCAL_CAVITY", | |
| "repo": "SupraBench/SupraDB-CavityScore", | |
| "file": "features.csv", | |
| "columns": [ | |
| "inchikey", | |
| "S_occupancy", | |
| "S_portal", | |
| "S_accessibility", | |
| "S_orientation", | |
| ], | |
| "required": False, | |
| }, | |
| } | |
| _LOAD_STATUS = {name: False for name in _DATASETS} | |
# 22 surfaced features as (label, column) pairs, in prompt order
# (mirrors gen_label_studio.py). They are listed in two groups and then
# concatenated; the concatenation order is the prompt order.

# Columns that come from the pose dataset's column list.
_POSE_FEATURES = [
    ("Binding energy ΔE_bind", "DockingScore"),
    ("Packing coefficient", "Packing_Coefficient"),
    ("Cavity occupancy", "Occupancy"),
    ("Hydrophobic occupancy", "Hydrophobic_Occupancy"),
    ("Shape complementarity", "Shape_Complementarity"),
    ("Insertion depth", "Insertion_Depth"),
    ("Steric clashes", "Steric_Clash"),
    ("Positive-center-to-portal distance", "Positive_Center_to_Portal_Distance"),
    ("H-bond count", "HBond_Count"),
    ("Carbonyl-oxygen contacts", "Carbonyl_Oxygen_Contact_Count"),
]

# ``S_*`` score columns that come from the ligand and cavity datasets.
_SCORE_FEATURES = [
    ("Charge", "S_charge"),
    ("Hydrophobicity", "S_hydrophobic"),
    ("Rigidity", "S_rigidity"),
    ("Desolvation ease", "S_desolvation"),
    ("Packing quality", "S_packing"),
    ("Hydrophobic cavity filling", "S_occupancy"),
    ("Shape compactness", "S_shape"),
    ("Preorganization", "S_conformer_diversity"),
    ("Portal engagement", "S_portal"),
    ("Positive-center exposure", "S_accessibility"),
    ("Positive-center orientation score", "S_orientation"),
    ("Unfavorable-feature penalty", "S_bad"),
]

FEATURES = [*_POSE_FEATURES, *_SCORE_FEATURES]
@functools.lru_cache(maxsize=1)
def load_features():
    """Return the per-guest feature table (pandas DataFrame), indexed by inchikey.

    GEOM is the left-side base and provides ``inchikey``, ``guest_name``,
    ``smiles``, optional ``logka``, and the optional novelty columns
    (``scaffold_family``, ``known_scaffold``, ``tmax_known``, ``is_novel``).
    Ligand, pose, and cavity datasets are optional feature groups that are
    left-joined on ``inchikey``. Pose rows are defensively collapsed to the row
    with the largest ``boltzmann_weight`` per ``inchikey``.

    The result is memoised with ``functools.lru_cache`` so the datasets are read
    once per process and ``load_features.cache_info()`` /
    ``load_features.cache_clear()`` are available to callers. An exception is
    not cached, so a failed load is retried on the next call. Every caller
    receives the same DataFrame object and should treat it as read-only.

    Returns:
        DataFrame with one row per unique GEOM ``inchikey``; ``inchikey`` is
        both the index and a regular column.

    Raises:
        RuntimeError: the required GEOM dataset could not be read.
        ValueError: the GEOM dataset has no ``inchikey`` column.
    """
    import pandas as pd  # lazy: keeps module importable without pandas installed

    # Runs only on a cache miss, so the flags always describe the load that
    # produced the cached table.
    for name in _LOAD_STATUS:
        _LOAD_STATUS[name] = False

    def _empty(columns: list[str]):
        """Return a zero-row frame with *columns* so the later join yields NaN."""
        return pd.DataFrame(columns=columns)

    def _read_dataset(name: str):
        """Read one dataset from its local CSV override or from the HF Hub.

        Optional datasets fall back to an empty frame on any failure; the
        required dataset raises instead.
        """
        cfg = _DATASETS[name]
        local = os.environ.get(cfg["local_env"])
        try:
            if local:
                if not os.path.exists(local):
                    raise FileNotFoundError(local)
                df = pd.read_csv(local)
            else:
                from huggingface_hub import hf_hub_download

                repo = os.environ.get(cfg["repo_env"], cfg["repo"])
                fname = os.environ.get(cfg["file_env"], cfg["file"])
                token = os.environ.get("HF_TOKEN")
                path = hf_hub_download(
                    repo_id=repo,
                    filename=fname,
                    repo_type="dataset",
                    token=token,
                )
                df = pd.read_csv(path)
        except Exception as exc:
            # Deliberately broad: any failure (missing package, network, auth,
            # parse error) must degrade an optional dataset, not crash the UI.
            _LOAD_STATUS[name] = False
            if cfg["required"]:
                raise RuntimeError(f"failed to load required GEOM dataset: {exc}") from exc
            _LOG.warning("failed to load optional %s dataset; continuing with NaN columns: %s", name, exc)
            return _empty(cfg["columns"])
        if "inchikey" not in df.columns:
            # Without the join key the table is unusable.
            _LOAD_STATUS[name] = False
            if cfg["required"]:
                raise ValueError("required GEOM dataset is missing 'inchikey'")
            _LOG.warning("optional %s dataset is missing 'inchikey'; continuing with NaN columns", name)
            return _empty(cfg["columns"])
        _LOAD_STATUS[name] = True
        return df

    geom = _read_dataset("geom")
    # Normalise the display-name column: accept ``name``, else fall back to the key.
    if "guest_name" not in geom.columns and "name" in geom.columns:
        geom = geom.rename(columns={"name": "guest_name"})
    if "guest_name" not in geom.columns:
        geom["guest_name"] = geom["inchikey"]
    for col in ("smiles", "logka"):
        if col not in geom.columns:
            geom[col] = pd.NA
    geom = geom.drop_duplicates(subset="inchikey", keep="first").reset_index(drop=True)

    # Novelty annotation (computed offline by engineering/annotate_geom_novelty.py
    # and shipped in guests.csv). Missing on older dataset snapshots -> NaN columns
    # so the board degrades to a blank Novelty cell instead of crashing.
    _novelty = ["scaffold_family", "known_scaffold", "tmax_known", "is_novel"]
    for col in _novelty:
        if col not in geom.columns:
            geom[col] = pd.NA

    merged = geom[["inchikey", "guest_name", "smiles", "logka", *_novelty]].copy()
    for name in ("ligand", "pose", "cavity"):
        cfg = _DATASETS[name]
        df = _read_dataset(name)
        if name == "pose" and "boltzmann_weight" in df.columns:
            # Keep the highest-weight pose per guest.
            df = (
                df.sort_values("boltzmann_weight", ascending=False)
                .drop_duplicates(subset="inchikey", keep="first")
                .reset_index(drop=True)
            )
        else:
            df = df.drop_duplicates(subset="inchikey", keep="first").reset_index(drop=True)
        # Guarantee every configured column exists so the merged schema is stable.
        for col in cfg["columns"]:
            if col not in df.columns:
                df[col] = pd.NA
        merged = merged.merge(df[cfg["columns"]], on="inchikey", how="left")
    return merged.set_index("inchikey", drop=False)
def load_status() -> dict[str, bool]:
    """Return a copy of the per-dataset "loaded successfully" flags.

    If the ``load_features`` cache is still empty, a load is triggered first
    so the flags describe an actual load attempt. Any exception raised by
    that load propagates to the caller.
    """
    cache_is_cold = load_features.cache_info().currsize == 0
    if cache_is_cold:
        load_features()
    # Hand back a copy so callers cannot mutate the module-level flags.
    return _LOAD_STATUS.copy()
def guest_choices() -> list[str]:
    """Return the sorted, de-duplicated dropdown labels.

    Labels are the ``guest_name`` values of the feature table, or the
    ``inchikey`` values when the table has no ``guest_name`` column. Missing
    values are dropped and everything is rendered as ``str``.
    """
    table = load_features()
    label_column = "guest_name" if "guest_name" in table.columns else "inchikey"
    labels = table[label_column].dropna().astype(str).unique().tolist()
    labels.sort()
    return labels
def get_record(guest_name: str) -> dict:
    """Return the feature row for one guest as a plain dict.

    The lookup compares string forms: first against ``guest_name`` (or
    ``inchikey`` when that column is absent), then, if nothing matched,
    against ``inchikey``. The first matching row wins.

    Raises:
        KeyError: no row matches *guest_name* under either column.
    """
    table = load_features()
    wanted = str(guest_name)

    primary = "guest_name" if "guest_name" in table.columns else "inchikey"
    matches = table[table[primary].astype(str) == wanted]

    # Second chance: the caller may have passed an inchikey instead of a name.
    if matches.empty and "inchikey" in table.columns:
        matches = table[table["inchikey"].astype(str) == wanted]

    if matches.empty:
        raise KeyError(f"no feature row for {guest_name!r}")
    return matches.iloc[0].to_dict()
# The host is fixed (CB[7]) for this benchmark. PubChem CID 6096207 resolves the
# canonical 2D/3D depiction; the connectivity SMILES is the RDKit fallback.
# The SMILES is split across three fragments purely to keep lines short.
_HOST_SMILES = "".join(
    [
        "C1N2C3C4N(C2=O)CN5C6C7N(C5=O)CN8C9C2N(C8=O)CN5C8C%10N(C5=O)CN5C%11C%12",
        "N(C5=O)CN5C%13C%14N(C5=O)CN5C%15C(N1C5=O)N1CN3C(=O)N4CN6C(=O)N7CN9C(=O)",
        "N2CN8C(=O)N%10CN%11C(=O)N%12CN%13C(=O)N%14CN%15C1=O",
    ]
)

_HOST = dict(
    inchikey="ZDOBFUIMGBWEAB-UHFFFAOYSA-N",
    smiles=_HOST_SMILES,
    guest_name="Cucurbit[7]uril (CB[7])",
)


def host_record() -> dict:
    """Return a fresh copy of the fixed host (CB[7]) identifiers.

    The dict has the keys ``inchikey``, ``smiles`` and ``guest_name``; a copy
    is returned so callers cannot mutate the module-level constant.
    """
    return _HOST.copy()