"""Load CB[7] host-guest features by joining four private HF datasets.

The dashboard does NOT recompute docking/xtb features live. It reads the GEOM
guest table as the left-side base and joins ligand, pose, and cavity feature
tables on ``inchikey``. Missing feature datasets degrade to empty/NaN columns so
the UI can still list guests; a missing GEOM base is fatal.

    HF_TOKEN        read token with access to the private datasets
    HF_DS_GEOM      dataset repo id (default SupraBench/SupraDB-GEOM)
    HF_DS_LIGAND    dataset repo id (default SupraBench/SupraDB-LigandScore)
    HF_DS_POSE      dataset repo id (default SupraBench/SupraDB-PoseFeat)
    HF_DS_CAVITY    dataset repo id (default SupraBench/SupraDB-CavityScore)

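Each dataset also supports LOCAL_GEOM / LOCAL_LIGAND / LOCAL_POSE /
LOCAL_CAVITY CSV overrides for offline development; when one is set, that local
file is read instead of downloading from the Hub. The file fetched from each
repo can be changed with HF_DS_GEOM_FILE / HF_DS_LIGAND_FILE / HF_DS_POSE_FILE /
HF_DS_CAVITY_FILE (defaults: guests.csv for GEOM, features.csv for the others).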
"""
from __future__ import annotations

import functools
import logging
import os

# pandas is imported lazily inside load_features() so that importing data_loader
# (and by extension app/prompts) works without pandas — e.g. in the smoke-test
# environment or any context that only needs FEATURES / build_prompt logic.

# Module-level logger; used to warn when an optional feature dataset cannot be
# loaded and the loader falls back to empty columns.
_LOG = logging.getLogger(__name__)

# Static description of the four source tables, keyed by short dataset name.
# Per-entry keys, as consumed by load_features():
#   repo_env  - environment variable that overrides the Hub dataset repo id
#   file_env  - environment variable that overrides the file name inside the repo
#   local_env - environment variable holding a local CSV path; when set, the
#               local file is read instead of downloading from the Hub
#   repo/file - defaults used when the corresponding env var is not set
#   columns   - for the feature tables (ligand/pose/cavity): the columns carried
#               into the merged output, absent ones being added as NA. For GEOM
#               the list is informational within this module, because
#               load_features() selects its GEOM columns explicitly.
#   required  - True: a load failure raises; False: it degrades to empty columns
# Note that GEOM lists the display name as "name"; load_features() renames it to
# "guest_name" when no "guest_name" column is present.
_DATASETS = {
    "geom": {
        "repo_env": "HF_DS_GEOM",
        "file_env": "HF_DS_GEOM_FILE",
        "local_env": "LOCAL_GEOM",
        "repo": "SupraBench/SupraDB-GEOM",
        "file": "guests.csv",
        "columns": ["inchikey", "name", "smiles", "logka",
                    "scaffold_family", "known_scaffold", "tmax_known", "is_novel"],
        "required": True,
    },
    "ligand": {
        "repo_env": "HF_DS_LIGAND",
        "file_env": "HF_DS_LIGAND_FILE",
        "local_env": "LOCAL_LIGAND",
        "repo": "SupraBench/SupraDB-LigandScore",
        "file": "features.csv",
        "columns": [
            "inchikey",
            "S_charge",
            "S_hydrophobic",
            "S_rigidity",
            "S_desolvation",
            "S_packing",
            "S_shape",
            "S_conformer_diversity",
            "S_boltzmann_concentration",
            "S_bad",
        ],
        "required": False,
    },
    "pose": {
        "repo_env": "HF_DS_POSE",
        "file_env": "HF_DS_POSE_FILE",
        "local_env": "LOCAL_POSE",
        "repo": "SupraBench/SupraDB-PoseFeat",
        "file": "features.csv",
        "columns": [
            "inchikey",
            "DockingScore",
            "Pose_Energy",
            "Distance_to_Cavity_Center",
            "Distance_to_Portal",
            "Insertion_Depth",
            "Packing_Coefficient",
            "Occupancy",
            "Hydrophobic_Occupancy",
            "Shape_Complementarity",
            "Steric_Clash",
            "Guest_CB7_Min_Distance",
            "Pose_RMSD_to_Template",
            "Portal_Compatibility",
            "Positive_Center_to_Portal_Distance",
            "Positive_Center_Orientation",
            "Charge_Accessibility",
            "Portal_Facing_Accessibility",
            "HBond_Count",
            "HBond_Geometry",
            "Carbonyl_Oxygen_Contact_Count",
            "Hydrophobic_Contact",
            "Polar_Contact_Penalty",
            "Bad_Group_Portal_Exposure",
            "Desolvation_Penalty",
            "boltzmann_weight",
            "delta_e",
        ],
        "required": False,
    },
    "cavity": {
        "repo_env": "HF_DS_CAVITY",
        "file_env": "HF_DS_CAVITY_FILE",
        "local_env": "LOCAL_CAVITY",
        "repo": "SupraBench/SupraDB-CavityScore",
        "file": "features.csv",
        "columns": [
            "inchikey",
            "S_occupancy",
            "S_portal",
            "S_accessibility",
            "S_orientation",
        ],
        "required": False,
    },
}

# Per-dataset "loaded successfully" flags, one entry per key of _DATASETS.
# load_features() resets and updates them; load_status() hands out a copy.
_LOAD_STATUS = dict.fromkeys(_DATASETS, False)

# 22 surfaced features as (human-readable label, DataFrame column) pairs, in
# prompt order (mirrors gen_label_studio.py). Every column named here appears in
# the "columns" list of one of the feature tables in _DATASETS.
FEATURES = [
    # Columns 1-10 come from the pose feature table.
    ("Binding energy ΔE_bind", "DockingScore"),
    ("Packing coefficient", "Packing_Coefficient"),
    ("Cavity occupancy", "Occupancy"),
    ("Hydrophobic occupancy", "Hydrophobic_Occupancy"),
    ("Shape complementarity", "Shape_Complementarity"),
    ("Insertion depth", "Insertion_Depth"),
    ("Steric clashes", "Steric_Clash"),
    ("Positive-center-to-portal distance", "Positive_Center_to_Portal_Distance"),
    ("H-bond count", "HBond_Count"),
    ("Carbonyl-oxygen contacts", "Carbonyl_Oxygen_Contact_Count"),
    # Columns 11-22 are the S_* scores, interleaved from the ligand table and
    # the cavity table (S_occupancy, S_portal, S_accessibility, S_orientation).
    ("Charge", "S_charge"),
    ("Hydrophobicity", "S_hydrophobic"),
    ("Rigidity", "S_rigidity"),
    ("Desolvation ease", "S_desolvation"),
    ("Packing quality", "S_packing"),
    ("Hydrophobic cavity filling", "S_occupancy"),
    ("Shape compactness", "S_shape"),
    ("Preorganization", "S_conformer_diversity"),
    ("Portal engagement", "S_portal"),
    ("Positive-center exposure", "S_accessibility"),
    ("Positive-center orientation score", "S_orientation"),
    ("Unfavorable-feature penalty", "S_bad"),
]


@functools.lru_cache(maxsize=1)
def load_features():
    """Return the per-guest feature table (pandas DataFrame), indexed by inchikey.

    GEOM is the left-side base. Only its ``inchikey`` column is mandatory:
    ``guest_name`` is taken from a ``guest_name`` or ``name`` column and falls
    back to the inchikey (for a missing column and for individual missing
    values); ``smiles``, ``logka`` and the novelty columns are NA-filled when
    absent.

    Ligand, pose, and cavity datasets are optional feature groups. One that
    cannot be read, or has no ``inchikey`` column, logs a warning and
    contributes all-NA columns. Feature rows with a null ``inchikey`` are
    discarded before joining. Pose rows are defensively collapsed to the row
    with the largest ``boltzmann_weight`` per ``inchikey`` (ties keep the row
    that appears first in the file).

    Each dataset is read from its ``LOCAL_*`` CSV when that variable is set,
    otherwise downloaded from the Hugging Face Hub. Environment variables set
    to an empty string are treated as unset.

    Returns:
        DataFrame with one row per distinct GEOM ``inchikey``; the index is
        ``inchikey`` and the column of the same name is kept as well.

    Raises:
        RuntimeError: the GEOM dataset could not be read.
        ValueError: the GEOM dataset has no ``inchikey`` column.

    Cached for the process lifetime (a call that raises is not cached). As a
    side effect the module-level ``_LOAD_STATUS`` flags are refreshed.
    """
    import pandas as pd  # lazy: keeps module importable without pandas installed

    # Start from "nothing loaded"; _read_dataset flips a flag only on success.
    for name in _LOAD_STATUS:
        _LOAD_STATUS[name] = False

    def _empty(columns: list[str]):
        """Zero-row frame with the expected columns, so the merge still adds them."""
        return pd.DataFrame(columns=columns)

    def _env(var: str, default=None):
        """Read an environment variable, treating an empty value as unset."""
        return os.environ.get(var) or default

    def _read_dataset(name: str):
        """Read one source table; raise if it is required, else degrade to empty."""
        cfg = _DATASETS[name]
        local = _env(cfg["local_env"])
        try:
            if local:
                if not os.path.exists(local):
                    raise FileNotFoundError(local)
                df = pd.read_csv(local)
            else:
                from huggingface_hub import hf_hub_download

                # Empty overrides fall back to the defaults, and an empty
                # HF_TOKEN is sent as "no explicit token" rather than "".
                path = hf_hub_download(
                    repo_id=_env(cfg["repo_env"], cfg["repo"]),
                    filename=_env(cfg["file_env"], cfg["file"]),
                    repo_type="dataset",
                    token=_env("HF_TOKEN"),
                )
                df = pd.read_csv(path)
        except Exception as exc:
            # Deliberately broad: a missing file, missing huggingface_hub, auth
            # or network failure, or an unparsable CSV all count as "not loaded".
            _LOAD_STATUS[name] = False
            if cfg["required"]:
                raise RuntimeError(f"failed to load required GEOM dataset: {exc}") from exc
            _LOG.warning("failed to load optional %s dataset; continuing with NaN columns: %s", name, exc)
            return _empty(cfg["columns"])

        if "inchikey" not in df.columns:
            # Without the join key the table is unusable.
            _LOAD_STATUS[name] = False
            if cfg["required"]:
                raise ValueError("required GEOM dataset is missing 'inchikey'")
            _LOG.warning("optional %s dataset is missing 'inchikey'; continuing with NaN columns", name)
            return _empty(cfg["columns"])

        _LOAD_STATUS[name] = True
        return df

    geom = _read_dataset("geom")
    if "guest_name" not in geom.columns and "name" in geom.columns:
        geom = geom.rename(columns={"name": "guest_name"})
    if "guest_name" not in geom.columns:
        geom["guest_name"] = geom["inchikey"]
    else:
        # Row-level fallback: a guest without a name stays listable under its
        # inchikey instead of being dropped from the dropdown.
        geom["guest_name"] = geom["guest_name"].fillna(geom["inchikey"])
    for col in ("smiles", "logka"):
        if col not in geom.columns:
            geom[col] = pd.NA
    geom = geom.drop_duplicates(subset="inchikey", keep="first").reset_index(drop=True)

    # Novelty annotation (computed offline by engineering/annotate_geom_novelty.py
    # and shipped in guests.csv). Missing on older dataset snapshots -> NaN columns
    # so the board degrades to a blank Novelty cell instead of crashing.
    _novelty = ["scaffold_family", "known_scaffold", "tmax_known", "is_novel"]
    for col in _novelty:
        if col not in geom.columns:
            geom[col] = pd.NA

    merged = geom[["inchikey", "guest_name", "smiles", "logka", *_novelty]].copy()
    for name in ("ligand", "pose", "cavity"):
        cfg = _DATASETS[name]
        df = _read_dataset(name)

        # A null key identifies no guest, and pandas would match it against a
        # null GEOM key during the merge.
        df = df[df["inchikey"].notna()]

        if name == "pose" and "boltzmann_weight" in df.columns:
            # Stable sort: among equal weights the first row in the file wins.
            df = df.sort_values("boltzmann_weight", ascending=False, kind="mergesort")
        df = df.drop_duplicates(subset="inchikey", keep="first").reset_index(drop=True)

        for col in cfg["columns"]:
            if col not in df.columns:
                df[col] = pd.NA
        # The right side is unique on inchikey, so the left join keeps exactly
        # one output row per GEOM guest.
        merged = merged.merge(df[cfg["columns"]], on="inchikey", how="left")

    return merged.set_index("inchikey", drop=False)


def load_status() -> dict[str, bool]:
    """Return a snapshot of the per-dataset load flags.

    If ``load_features`` has no cached result yet, it is called first so the
    flags describe an actual load attempt; any exception it raises propagates.
    The returned dict is a copy, so callers cannot alter the module state.
    """
    cache_is_cold = load_features.cache_info().currsize == 0
    if cache_is_cold:
        load_features()
    return _LOAD_STATUS.copy()


def guest_choices() -> list[str]:
    """Return the sorted, de-duplicated dropdown labels.

    Labels are the ``guest_name`` values of the feature table as strings; the
    ``inchikey`` column is used instead if ``guest_name`` is absent. Missing
    values are skipped.
    """
    table = load_features()
    label_col = "inchikey"
    if "guest_name" in table.columns:
        label_col = "guest_name"
    labels = table[label_col].dropna().astype(str)
    return sorted(set(labels.tolist()))


def get_record(guest_name: str) -> dict:
    """Return the first feature row matching *guest_name* as a dict.

    The value is compared as a string against the ``guest_name`` column first
    and, if nothing matches, against ``inchikey``.

    Raises:
        KeyError: no row matches under either column.
    """
    table = load_features()
    wanted = str(guest_name)

    # Columns to try, in priority order.
    lookup_columns = ["guest_name" if "guest_name" in table.columns else "inchikey"]
    if "inchikey" in table.columns:
        lookup_columns.append("inchikey")

    for column in lookup_columns:
        rows = table[table[column].astype(str) == wanted]
        if not rows.empty:
            return rows.iloc[0].to_dict()
    raise KeyError(f"no feature row for {guest_name!r}")


# This benchmark always uses the same host, CB[7]. PubChem CID 6096207 resolves
# the canonical 2D/3D depiction; the connectivity SMILES below is the RDKit
# fallback.
_CB7_SMILES = (
    "C1N2C3C4N(C2=O)CN5C6C7N(C5=O)CN8C9C2N(C8=O)CN5C8C%10N(C5=O)CN5C%11C%12"
    "N(C5=O)CN5C%13C%14N(C5=O)CN5C%15C(N1C5=O)N1CN3C(=O)N4CN6C(=O)N7CN9C(=O)"
    "N2CN8C(=O)N%10CN%11C(=O)N%12CN%13C(=O)N%14CN%15C1=O"
)

# Uses the same identifier keys as a guest row (inchikey / smiles / guest_name).
_HOST = {
    "inchikey": "ZDOBFUIMGBWEAB-UHFFFAOYSA-N",
    "smiles": _CB7_SMILES,
    "guest_name": "Cucurbit[7]uril (CB[7])",
}


def host_record() -> dict:
    """Return a fresh copy of the fixed CB[7] host identifiers for rendering.

    A copy is returned so that callers mutating the result do not affect the
    module-level constant.
    """
    return _HOST.copy()