sobinalosious92 commited on
Commit
a22718f
·
verified ·
1 Parent(s): 83601ad

Delete src

Browse files
src/conv.py DELETED
@@ -1,258 +0,0 @@
1
- # conv.py
2
- # Clean, dependency-light graph encoder blocks for molecular GNNs.
3
- # - Single source of truth for convolution choices: "gine", "gin", "gcn"
4
- # - Edge attributes are supported for "gine" (recommended for chemistry)
5
- # - No duplication with PyG built-ins; everything wraps torch_geometric.nn
6
- # - Consistent encoder API: GNNEncoder(...).forward(x, edge_index, edge_attr, batch) -> graph embedding [B, emb_dim]
7
-
8
- from __future__ import annotations
9
- from typing import Literal, Optional
10
-
11
- import torch
12
- import torch.nn as nn
13
- import torch.nn.functional as F
14
- from torch_geometric.nn import (
15
- GINEConv,
16
- GINConv,
17
- GCNConv,
18
- global_mean_pool,
19
- global_add_pool,
20
- global_max_pool,
21
- )
22
-
23
-
24
def get_activation(name: str) -> nn.Module:
    """Return a fresh activation module for *name* (case-insensitive).

    Supported: "relu", "gelu", "silu", "leaky_relu"/"lrelu" (negative slope 0.1).

    Raises:
        ValueError: for any unrecognized activation name.
    """
    factories = {
        "relu": nn.ReLU,
        "gelu": nn.GELU,
        "silu": nn.SiLU,
        "leaky_relu": lambda: nn.LeakyReLU(0.1),
        "lrelu": lambda: nn.LeakyReLU(0.1),
    }
    key = name.lower()
    try:
        return factories[key]()
    except KeyError:
        # Matches the original message, which reports the lowercased name.
        raise ValueError(f"Unknown activation: {key}") from None
35
-
36
-
37
class MLP(nn.Module):
    """Small feed-forward network used inside GNN layers and projections.

    Stacks `num_layers` Linear layers; the chosen activation (and optional
    dropout) sits between hidden layers only — the final Linear output is
    returned raw.
    """

    def __init__(
        self,
        in_dim: int,
        hidden_dim: int,
        out_dim: int,
        num_layers: int = 2,
        act: str = "relu",
        dropout: float = 0.0,
        bias: bool = True,
    ):
        super().__init__()
        assert num_layers >= 1
        widths = [in_dim, *([hidden_dim] * (num_layers - 1)), out_dim]
        modules: list[nn.Module] = []
        last_linear = len(widths) - 2
        for i, (d_in, d_out) in enumerate(zip(widths[:-1], widths[1:])):
            modules.append(nn.Linear(d_in, d_out, bias=bias))
            if i < last_linear:
                # Nonlinearity (and dropout) only between layers, never after
                # the output layer.
                modules.append(get_activation(act))
                if dropout > 0:
                    modules.append(nn.Dropout(dropout))
        self.net = nn.Sequential(*modules)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the MLP to a [..., in_dim] tensor, returning [..., out_dim]."""
        return self.net(x)
63
-
64
-
65
class NodeProjector(nn.Module):
    """Maps raw node features to the model embedding size.

    When the input width already equals ``emb_dim`` this is the identity;
    otherwise a single Linear followed by the chosen activation.
    """

    def __init__(self, in_dim_node: int, emb_dim: int, act: str = "relu"):
        super().__init__()
        if in_dim_node != emb_dim:
            self.proj = nn.Sequential(
                nn.Linear(in_dim_node, emb_dim),
                get_activation(act),
            )
        else:
            # No projection needed — pass features through untouched.
            self.proj = nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project [N, in_dim_node] node features to [N, emb_dim]."""
        return self.proj(x)
79
-
80
-
81
class EdgeProjector(nn.Module):
    """Maps raw edge attributes to the model embedding size (used by GINE).

    Raises:
        ValueError: if ``in_dim_edge`` is not positive — GINE needs real
            edge attributes to project.
    """

    def __init__(self, in_dim_edge: int, emb_dim: int, act: str = "relu"):
        super().__init__()
        if in_dim_edge <= 0:
            raise ValueError("in_dim_edge must be > 0 when using edge attributes")
        self.proj = nn.Sequential(nn.Linear(in_dim_edge, emb_dim), get_activation(act))

    def forward(self, e: torch.Tensor) -> torch.Tensor:
        """Project [E, in_dim_edge] edge attributes to [E, emb_dim]."""
        return self.proj(e)
94
-
95
-
96
class GNNEncoder(nn.Module):
    """
    Backbone GNN with selectable conv type.

    gnn_type:
      - "gine": chemistry-ready, uses edge_attr (recommended)
      - "gin" : ignores edge_attr, strong node MPNN
      - "gcn" : ignores edge_attr, fast spectral conv
    norm: "batch" | "layer" | "none"
    readout: "mean" | "sum" | "max"

    All conv layers map emb_dim -> emb_dim; raw node (and, for GINE, edge)
    features are projected to emb_dim once up front.
    """

    def __init__(
        self,
        in_dim_node: int,
        emb_dim: int,
        num_layers: int = 5,
        gnn_type: Literal["gine", "gin", "gcn"] = "gine",
        in_dim_edge: int = 0,
        act: str = "relu",
        dropout: float = 0.0,
        residual: bool = True,
        norm: Literal["batch", "layer", "none"] = "batch",
        readout: Literal["mean", "sum", "max"] = "mean",
    ):
        super().__init__()
        assert num_layers >= 1

        self.gnn_type = gnn_type.lower()
        self.emb_dim = emb_dim
        self.num_layers = num_layers
        self.residual = residual
        self.dropout_p = float(dropout)
        self.readout = readout.lower()

        # Raw node features -> emb_dim; edge projector exists only for GINE.
        self.node_proj = NodeProjector(in_dim_node, emb_dim, act=act)
        self.edge_proj: Optional[EdgeProjector] = None

        if self.gnn_type == "gine":
            # GINE consumes edge features during message passing, so they
            # are mandatory for this conv type.
            if in_dim_edge <= 0:
                raise ValueError(
                    "gine selected but in_dim_edge <= 0. Provide edge attributes or switch gnn_type."
                )
            self.edge_proj = EdgeProjector(in_dim_edge, emb_dim, act=act)

        # Build conv stack
        self.convs = nn.ModuleList()
        self.norms = nn.ModuleList()

        for _ in range(num_layers):
            if self.gnn_type == "gine":
                # edge_attr must be projected to emb_dim
                nn_mlp = MLP(emb_dim, emb_dim, emb_dim, num_layers=2, act=act, dropout=0.0)
                conv = GINEConv(nn_mlp)
            elif self.gnn_type == "gin":
                nn_mlp = MLP(emb_dim, emb_dim, emb_dim, num_layers=2, act=act, dropout=0.0)
                conv = GINConv(nn_mlp)
            elif self.gnn_type == "gcn":
                conv = GCNConv(emb_dim, emb_dim, add_self_loops=True, normalize=True)
            else:
                raise ValueError(f"Unknown gnn_type: {gnn_type}")
            self.convs.append(conv)

            # One normalization module per conv layer, applied before the
            # shared activation in forward().
            if norm == "batch":
                self.norms.append(nn.BatchNorm1d(emb_dim))
            elif norm == "layer":
                self.norms.append(nn.LayerNorm(emb_dim))
            elif norm == "none":
                self.norms.append(nn.Identity())
            else:
                raise ValueError(f"Unknown norm: {norm}")

        self.act = get_activation(act)

    def _readout(self, x: torch.Tensor, batch: torch.Tensor) -> torch.Tensor:
        """Pool node embeddings [N, emb_dim] into graph embeddings [B, emb_dim]."""
        if self.readout == "mean":
            return global_mean_pool(x, batch)
        if self.readout == "sum":
            return global_add_pool(x, batch)
        if self.readout == "max":
            return global_max_pool(x, batch)
        raise ValueError(f"Unknown readout: {self.readout}")

    def forward(
        self,
        x: torch.Tensor,
        edge_index: torch.Tensor,
        edge_attr: Optional[torch.Tensor],
        batch: Optional[torch.Tensor],
    ) -> torch.Tensor:
        """
        Returns a graph-level embedding of shape [B, emb_dim].
        If batch is None, assumes a single graph and creates a zero batch vector.
        """
        if batch is None:
            batch = x.new_zeros(x.size(0), dtype=torch.long)

        # Project features (ensure float dtype)
        x = x.float()
        x = self.node_proj(x)

        e = None
        if self.gnn_type == "gine":
            if edge_attr is None:
                raise ValueError("GINE requires edge_attr, but got None.")
            e = self.edge_proj(edge_attr.float())

        # Message passing
        h = x
        for conv, norm in zip(self.convs, self.norms):
            if self.gnn_type == "gcn":
                h_next = conv(h, edge_index)  # GCNConv ignores edge_attr
            elif self.gnn_type == "gin":
                h_next = conv(h, edge_index)  # GINConv ignores edge_attr
            else:  # gine
                h_next = conv(h, edge_index, e)

            h_next = norm(h_next)
            h_next = self.act(h_next)

            # Residual add: h and h_next are both [N, emb_dim] here, so the
            # shape check always passes when residual is enabled.
            if self.residual and h_next.shape == h.shape:
                h = h + h_next
            else:
                h = h_next

            if self.dropout_p > 0:
                h = F.dropout(h, p=self.dropout_p, training=self.training)

        g = self._readout(h, batch)
        return g  # [B, emb_dim]
226
-
227
-
228
def build_gnn_encoder(
    in_dim_node: int,
    emb_dim: int,
    num_layers: int = 5,
    gnn_type: Literal["gine", "gin", "gcn"] = "gine",
    in_dim_edge: int = 0,
    act: str = "relu",
    dropout: float = 0.0,
    residual: bool = True,
    norm: Literal["batch", "layer", "none"] = "batch",
    readout: Literal["mean", "sum", "max"] = "mean",
) -> GNNEncoder:
    """
    Factory to create a GNNEncoder with a consistent, minimal API.
    Prefer calling this from model.py so encoder construction is centralized.
    """
    encoder_kwargs = {
        "in_dim_node": in_dim_node,
        "emb_dim": emb_dim,
        "num_layers": num_layers,
        "gnn_type": gnn_type,
        "in_dim_edge": in_dim_edge,
        "act": act,
        "dropout": dropout,
        "residual": residual,
        "norm": norm,
        "readout": readout,
    }
    return GNNEncoder(**encoder_kwargs)


__all__ = ["GNNEncoder", "build_gnn_encoder"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/data_builder.py DELETED
@@ -1,818 +0,0 @@
1
- # data_builder.py
2
- from __future__ import annotations
3
-
4
- from pathlib import Path
5
- from typing import Dict, List, Optional, Tuple, Sequence
6
- import json
7
- import warnings
8
-
9
- import numpy as np
10
- import pandas as pd
11
- import torch
12
- from torch.utils.data import Dataset
13
- from torch_geometric.data import Data
14
-
15
- # RDKit is required
16
- from rdkit import Chem
17
- from rdkit.Chem.rdchem import HybridizationType, BondType, BondStereo
18
-
19
# ---------------------------------------------------------
# Fidelity handling
# ---------------------------------------------------------

# Canonical fidelity ordering (lower-case): experimental data first, then
# DFT, molecular dynamics, and group-contribution estimates. Positions in
# this list define the global `fid_idx` attached in build_long_table.
FID_PRIORITY = ["exp", "dft", "md", "gc"]  # internal lower-case canonical order
24
-
25
-
26
- def _norm_fid(fid: str) -> str:
27
- return fid.strip().lower()
28
-
29
-
30
- def _ensure_targets_order(requested: Sequence[str]) -> List[str]:
31
- seen = set()
32
- ordered = []
33
- for t in requested:
34
- key = t.strip()
35
- if key in seen:
36
- continue
37
- seen.add(key)
38
- ordered.append(key)
39
- return ordered
40
-
41
-
42
# ---------------------------------------------------------
# RDKit featurization
# ---------------------------------------------------------

# Known element vocabulary; anything else maps to an extra "other" bucket
# at encode time (see atom_features).
_ATOMS = ["H", "C", "N", "O", "F", "P", "S", "Cl", "Br", "I"]
_ATOM2IDX = {s: i for i, s in enumerate(_ATOMS)}
# Hybridization states we one-hot encode (plus an "other" bucket at encode time).
_HYBS = [HybridizationType.SP, HybridizationType.SP2, HybridizationType.SP3, HybridizationType.SP3D, HybridizationType.SP3D2]
_HYB2IDX = {h: i for i, h in enumerate(_HYBS)}
# Bond stereo descriptors in a fixed order for one-hot encoding.
_BOND_STEREOS = [
    BondStereo.STEREONONE,
    BondStereo.STEREOANY,
    BondStereo.STEREOZ,
    BondStereo.STEREOE,
    BondStereo.STEREOCIS,
    BondStereo.STEREOTRANS,
]
_STEREO2IDX = {s: i for i, s in enumerate(_BOND_STEREOS)}
59
-
60
-
61
- def _one_hot(index: int, size: int) -> List[float]:
62
- v = [0.0] * size
63
- if 0 <= index < size:
64
- v[index] = 1.0
65
- return v
66
-
67
-
68
def atom_features(atom: Chem.Atom) -> List[float]:
    """
    Featurize one RDKit atom into a fixed-length float vector (length 35).

    Blocks, in order: element one-hot (10 known + "other" = 11), degree
    one-hot bucketed at 5+ (6), formal charge one-hot clamped to [-2, +2]
    (5), aromatic flag (1), in-ring flag (1), hybridization one-hot with
    "other" (6), total-H count capped at 4 (5).
    """
    # Element one-hot with "other"
    elem_idx = _ATOM2IDX.get(atom.GetSymbol(), None)
    elem_oh = _one_hot(elem_idx if elem_idx is not None else len(_ATOMS), len(_ATOMS) + 1)

    # Degree one-hot up to 5 (bucket 5+)
    deg = min(int(atom.GetDegree()), 5)
    deg_oh = _one_hot(deg, 6)

    # Formal charge one-hot in [-2,-1,0,+1,+2]
    fc = max(-2, min(2, int(atom.GetFormalCharge())))
    fc_oh = _one_hot(fc + 2, 5)

    # Aromatic, in ring flags
    aromatic = [1.0 if atom.GetIsAromatic() else 0.0]
    in_ring = [1.0 if atom.IsInRing() else 0.0]

    # Hybridization one-hot with "other"
    hyb_idx = _HYB2IDX.get(atom.GetHybridization(), None)
    hyb_oh = _one_hot(hyb_idx if hyb_idx is not None else len(_HYBS), len(_HYBS) + 1)

    # Implicit H count capped at 4
    imp_h = min(int(atom.GetTotalNumHs(includeNeighbors=True)), 4)
    imp_h_oh = _one_hot(imp_h, 5)

    # length: 11+6+5+1+1+6+5 = 35 (element has 11 buckets incl. "other")
    feats = elem_oh + deg_oh + fc_oh + aromatic + in_ring + hyb_oh + imp_h_oh
    return feats
96
-
97
-
98
def bond_features(bond: Chem.Bond) -> List[float]:
    """
    Featurize one RDKit bond into a 12-dim float vector: four bond-type
    flags (single/double/triple/aromatic), a conjugation flag, a ring flag,
    then a 6-way stereo one-hot (unknown stereo maps to STEREONONE).
    """
    bt = bond.GetBondType()
    single = 1.0 if bt == BondType.SINGLE else 0.0
    double = 1.0 if bt == BondType.DOUBLE else 0.0
    triple = 1.0 if bt == BondType.TRIPLE else 0.0
    aromatic = 1.0 if bt == BondType.AROMATIC else 0.0
    conj = 1.0 if bond.GetIsConjugated() else 0.0
    in_ring = 1.0 if bond.IsInRing() else 0.0
    stereo_oh = _one_hot(_STEREO2IDX.get(bond.GetStereo(), 0), len(_BOND_STEREOS))
    # length: 4 + 1 + 1 + 6 = 12
    return [single, double, triple, aromatic, conj, in_ring] + stereo_oh
109
-
110
-
111
def featurize_smiles(smiles: str) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Convert a SMILES string into graph tensors.

    Returns:
        x: [N_atoms, 35] float node features (see atom_features)
        edge_index: [2, 2*N_bonds] long indices, both directions per bond
        edge_attr: [2*N_bonds, 12] float bond features (see bond_features)

    Raises:
        ValueError: if RDKit cannot parse the SMILES.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"RDKit failed to parse SMILES: {smiles}")

    # Nodes
    x = torch.tensor([atom_features(a) for a in mol.GetAtoms()], dtype=torch.float32)

    # Edges (bidirectional)
    rows, cols, eattr = [], [], []
    for b in mol.GetBonds():
        i, j = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
        bf = bond_features(b)
        rows.extend([i, j])
        cols.extend([j, i])
        eattr.extend([bf, bf])

    if not rows:
        # single-atom molecules, add a dummy self-loop edge
        # NOTE(review): the hard-coded 12 must stay in sync with the length
        # of bond_features(); consider deriving it instead.
        rows, cols = [0], [0]
        eattr = [[0.0] * 12]

    edge_index = torch.tensor([rows, cols], dtype=torch.long)
    edge_attr = torch.tensor(eattr, dtype=torch.float32)
    return x, edge_index, edge_attr
136
-
137
-
138
- # ---------------------------------------------------------
139
- # CSV discovery and reading
140
- # ---------------------------------------------------------
141
-
142
def discover_target_fid_csvs(
    root: Path,
    targets: Sequence[str],
    fidelities: Sequence[str],
) -> Dict[tuple[str, str], Path]:
    """
    Discover CSV files for (target, fidelity) pairs.

    Supported layouts (case-insensitive):

    1) {root}/{fid}/{target}.csv
       e.g. datafull/MD/SHEAR.csv, datafull/exp/cp.csv

    2) {root}/{target}_{fid}.csv
       e.g. datafull/SHEAR_MD.csv, datafull/cp_exp.csv

    Matching is STRICT:
      - target and fid must appear as full '_' tokens in the stem
      - no substring matching, so 'he' will NOT match 'shear_md.csv'

    Returns a mapping keyed by (target, fid) with the first matching path;
    pairs with no matching CSV are simply absent from the result.
    """
    root = Path(root)
    targets = _ensure_targets_order(targets)
    fids_lc = [_norm_fid(f) for f in fidelities]

    # Collect all CSVs under root
    all_paths = list(root.rglob("*.csv"))

    # Pre-index: (parent_name_lower, stem_lower, tokens_lower)
    indexed = []
    for p in all_paths:
        parent = p.parent.name.lower()
        stem = p.stem.lower()  # filename without extension
        tokens = stem.split("_")
        # stem is already lower-cased; the extra lower() here is a harmless no-op.
        tokens_l = [t.lower() for t in tokens]
        indexed.append((p, parent, stem, tokens_l))

    mapping: Dict[tuple[str, str], Path] = {}

    for fid in fids_lc:
        fid_l = fid.strip().lower()

        for tgt in targets:
            tgt_l = tgt.strip().lower()

            # ---- 1) Prefer explicit folder layout: {root}/{fid}/{target}.csv ----
            # parent == fid AND stem == target (case-insensitive)
            folder_matches = [
                p for (p, parent, stem, tokens_l) in indexed
                if parent == fid_l and stem == tgt_l
            ]
            if folder_matches:
                # If you ever get more than one, it's a config problem
                if len(folder_matches) > 1:
                    warnings.warn(
                        f"[discover_target_fid_csvs] Multiple matches for "
                        f"target='{tgt}' fid='{fid}' under folder layout: "
                        + ", ".join(str(p) for p in folder_matches)
                    )
                mapping[(tgt, fid)] = folder_matches[0]
                continue

            # ---- 2) Fallback: {target}_{fid}.csv anywhere under root ----
            # require BOTH tgt and fid as full '_' tokens
            token_matches = [
                p for (p, parent, stem, tokens_l) in indexed
                if (tgt_l in tokens_l) and (fid_l in tokens_l)
            ]

            if token_matches:
                if len(token_matches) > 1:
                    warnings.warn(
                        f"[discover_target_fid_csvs] Multiple token matches for "
                        f"target='{tgt}' fid='{fid}': "
                        + ", ".join(str(p) for p in token_matches)
                    )
                mapping[(tgt, fid)] = token_matches[0]
                continue

            # If neither layout exists, we simply do not add (tgt, fid) to mapping.
            # build_long_table will just skip that combination.
            # You can enable a warning if you want:
            # warnings.warn(f"[discover_target_fid_csvs] No CSV for target='{tgt}', fid='{fid}'")

    return mapping
226
-
227
-
228
def read_target_csv(path: Path, target: str) -> pd.DataFrame:
    """
    Load one labeled-SMILES CSV and return a frame with columns ["smiles", target].

    Accepts:
      - a case-insensitive 'smiles' column
      - a value column named '{target}' exactly, or (case-insensitively)
        'value', 'y', or the lower-cased target name
    Non-numeric and missing values are dropped; duplicate SMILES are
    averaged (with a warning).
    """
    df = pd.read_csv(path)

    # Locate and canonicalize the SMILES column.
    smiles_col = next((c for c in df.columns if c.lower() == "smiles"), None)
    if smiles_col is None:
        raise ValueError(f"{path} must contain a 'smiles' column.")
    df = df.rename(columns={smiles_col: "smiles"})

    # Locate the value column: exact target name wins, then the first
    # case-insensitive match among the accepted aliases (column order).
    if target in df.columns:
        val_col = target
    else:
        accepted = ("value", "y", target.lower())
        val_col = next((c for c in df.columns if c.lower() in accepted), None)
    if val_col is None:
        raise ValueError(f"{path} must contain a '{target}' column or one of ['value','y'].")

    # Keep only what we need, coerce to numeric, and drop unusable rows.
    df = df[["smiles", val_col]].copy()
    df = df.dropna(subset=[val_col])
    df[val_col] = pd.to_numeric(df[val_col], errors="coerce")
    df = df.dropna(subset=[val_col])

    # Collapse repeated SMILES to their mean label.
    if df.duplicated(subset=["smiles"]).any():
        warnings.warn(f"[data_builder] Duplicates by SMILES in {path}. Averaging duplicates.")
        df = df.groupby("smiles", as_index=False)[val_col].mean()

    return df.rename(columns={val_col: target})
266
-
267
-
268
def build_long_table(root: Path, targets: Sequence[str], fidelities: Sequence[str]) -> pd.DataFrame:
    """
    Build the long-form label table with columns [smiles, fid, fid_idx, target, value].

    Discovers one CSV per (target, fidelity) pair under `root`, stacks them,
    and attaches a numeric fidelity index following FID_PRIORITY. Unknown
    fidelities trigger a warning and are indexed after the known ones.

    Raises:
        FileNotFoundError: if no CSV was found for any requested pair.
    """
    targets = _ensure_targets_order(targets)
    fids_lc = [_norm_fid(f) for f in fidelities]

    # FIX: pass the normalized fidelity list to discovery. The original
    # computed fids_lc and then ignored it; behavior is identical because
    # discovery normalizes internally, but this removes the dead local.
    mapping = discover_target_fid_csvs(root, targets, fids_lc)
    if not mapping:
        raise FileNotFoundError(f"No CSVs found under {root} for the given targets and fidelities.")

    long_rows = []
    for (tgt, fid), path in mapping.items():
        df = read_target_csv(path, tgt)
        df["fid"] = _norm_fid(fid)
        df["target"] = tgt
        df = df.rename(columns={tgt: "value"})
        long_rows.append(df[["smiles", "fid", "target", "value"]])

    long = pd.concat(long_rows, axis=0, ignore_index=True)

    # attach fid index by priority; unknown fidelities go after the known ones
    fid2idx = {f: i for i, f in enumerate(FID_PRIORITY)}
    long["fid"] = long["fid"].str.lower()
    unknown = sorted(set(long["fid"]) - set(fid2idx.keys()))
    if unknown:
        warnings.warn(f"[data_builder] Unknown fidelities found: {unknown}. Appending after known ones.")
        start = len(fid2idx)
        for i, f in enumerate(unknown):
            fid2idx[f] = start + i

    long["fid_idx"] = long["fid"].map(fid2idx)
    return long
301
-
302
-
303
def pivot_to_rows_by_smiles_fid(long: pd.DataFrame, targets: Sequence[str]) -> pd.DataFrame:
    """
    Pivot the long table [smiles, fid, fid_idx, target, value] into one row
    per (smiles, fid) with one wide column per target. Targets with no data
    become all-NaN columns so every requested target is always present.
    """
    targets = _ensure_targets_order(targets)
    wide = (
        long.pivot_table(index=["smiles", "fid", "fid_idx"], columns="target", values="value", aggfunc="mean")
        .reset_index()
    )

    # Guarantee a column for every requested target, even if no CSV had it.
    for t in (t for t in targets if t not in wide.columns):
        wide[t] = np.nan

    return wide[["smiles", "fid", "fid_idx", *targets]]
318
-
319
-
320
- # ---------------------------------------------------------
321
- # Grouped split by SMILES and transforms/normalization
322
- # ---------------------------------------------------------
323
-
324
def grouped_split_by_smiles(
    df_rows: pd.DataFrame,
    val_ratio: float = 0.1,
    test_ratio: float = 0.1,
    seed: int = 42,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Split row indices into (train, val, test) grouped by unique SMILES, so a
    molecule never appears in more than one split. Returns index arrays into
    df_rows. The split is deterministic for a given seed.
    """
    # Shuffle the unique molecules once (same rng sequence as before).
    rng = np.random.default_rng(seed)
    shuffled = rng.permutation(df_rows["smiles"].drop_duplicates().values)

    n = len(shuffled)
    n_test = int(round(n * test_ratio))
    n_val = int(round(n * val_ratio))

    # Test takes the front, then val, then everything else is train.
    test_smiles = set(shuffled[:n_test])
    val_smiles = set(shuffled[n_test:n_test + n_val])
    train_smiles = set(shuffled[n_test + n_val:])

    def _rows_for(smiles_set: set) -> np.ndarray:
        return df_rows.index[df_rows["smiles"].isin(smiles_set)].to_numpy()

    return _rows_for(train_smiles), _rows_for(val_smiles), _rows_for(test_smiles)
346
-
347
-
348
- # ---------------- Enhanced TargetScaler with per-task transforms ----------------
349
-
350
- class TargetScaler:
351
- """
352
- Per-task transform + standardization fitted on the training split only.
353
-
354
- - transforms[t] in {"identity","log10"}
355
- - eps[t] is added before log for numerical safety (only used if transforms[t]=="log10")
356
- - mean/std are computed in the *transformed* domain
357
- """
358
- def __init__(self, transforms: Optional[Sequence[str]] = None, eps: Optional[Sequence[float] | torch.Tensor] = None):
359
- self.mean: Optional[torch.Tensor] = None # [T] (transformed domain)
360
- self.std: Optional[torch.Tensor] = None # [T] (transformed domain)
361
- self.transforms: List[str] = [str(t).lower() for t in transforms] if transforms is not None else []
362
- if eps is None:
363
- self.eps: Optional[torch.Tensor] = None
364
- else:
365
- self.eps = torch.as_tensor(eps, dtype=torch.float32)
366
- self._tiny = 1e-12
367
-
368
- def _ensure_cfg(self, T: int):
369
- if not self.transforms or len(self.transforms) != T:
370
- self.transforms = ["identity"] * T
371
- if self.eps is None or self.eps.numel() != T:
372
- self.eps = torch.zeros(T, dtype=torch.float32)
373
-
374
- def _forward_transform_only(self, y: torch.Tensor) -> torch.Tensor:
375
- """
376
- Apply per-task transforms *before* standardization.
377
- y: [N, T] in original units. Returns transformed y_tf in same shape.
378
- """
379
- out = y.clone()
380
- T = out.size(1)
381
- self._ensure_cfg(T)
382
- for t in range(T):
383
- if self.transforms[t] == "log10":
384
- out[:, t] = torch.log10(torch.clamp(out[:, t] + self.eps[t], min=self._tiny))
385
- return out
386
-
387
- def _inverse_transform_only(self, y_tf: torch.Tensor) -> torch.Tensor:
388
- """
389
- Inverse the per-task transform (no standardization here).
390
- y_tf: [N, T] in transformed units.
391
- """
392
- out = y_tf.clone()
393
- T = out.size(1)
394
- self._ensure_cfg(T)
395
- for t in range(T):
396
- if self.transforms[t] == "log10":
397
- out[:, t] = (10.0 ** out[:, t]) - self.eps[t]
398
- return out
399
-
400
- def fit(self, y: torch.Tensor, mask: torch.Tensor):
401
- """
402
- y: [N, T] original units; mask: [N, T] bool
403
- Chooses eps automatically if not provided; mean/std computed in transformed space.
404
- """
405
- T = y.size(1)
406
- self._ensure_cfg(T)
407
-
408
- if self.eps is None or self.eps.numel() != T:
409
- # Auto epsilon: 0.1 * min positive per task (robust)
410
- eps_vals: List[float] = []
411
- y_np = y.detach().cpu().numpy()
412
- m_np = mask.detach().cpu().numpy().astype(bool)
413
- for t in range(T):
414
- if self.transforms[t] != "log10":
415
- eps_vals.append(0.0)
416
- continue
417
- vals = y_np[m_np[:, t], t]
418
- pos = vals[vals > 0]
419
- if pos.size == 0:
420
- eps_vals.append(1e-8)
421
- else:
422
- eps_vals.append(0.1 * float(max(np.min(pos), 1e-8)))
423
- self.eps = torch.tensor(eps_vals, dtype=torch.float32)
424
-
425
- y_tf = self._forward_transform_only(y)
426
- eps = 1e-8
427
- y_masked = torch.where(mask, y_tf, torch.zeros_like(y_tf))
428
- counts = mask.sum(dim=0).clamp_min(1)
429
- mean = y_masked.sum(dim=0) / counts
430
- var = ((torch.where(mask, y_tf - mean, torch.zeros_like(y_tf))) ** 2).sum(dim=0) / counts
431
- std = torch.sqrt(var + eps)
432
- self.mean, self.std = mean, std
433
-
434
- def transform(self, y: torch.Tensor) -> torch.Tensor:
435
- y_tf = self._forward_transform_only(y)
436
- return (y_tf - self.mean) / self.std
437
-
438
- def inverse(self, y_std: torch.Tensor) -> torch.Tensor:
439
- """
440
- Inverse standardization + inverse transform → original units.
441
- y_std: [N, T] in standardized-transformed space
442
- """
443
- y_tf = y_std * self.std + self.mean
444
- return self._inverse_transform_only(y_tf)
445
-
446
- def state_dict(self) -> Dict[str, torch.Tensor | List[str]]:
447
- return {
448
- "mean": self.mean,
449
- "std": self.std,
450
- "transforms": self.transforms,
451
- "eps": self.eps,
452
- }
453
-
454
- def load_state_dict(self, state: Dict[str, torch.Tensor | List[str]]):
455
- self.mean = state["mean"]
456
- self.std = state["std"]
457
- self.transforms = [str(t) for t in state.get("transforms", [])]
458
- eps = state.get("eps", None)
459
- self.eps = torch.as_tensor(eps, dtype=torch.float32) if eps is not None else None
460
-
461
-
462
def auto_select_task_transforms(
    y_train: torch.Tensor,      # [N, T] original units (train split only)
    mask_train: torch.Tensor,   # [N, T] bool
    task_names: Sequence[str],
    *,
    min_pos_frac: float = 0.95,     # ≥95% of labels positive
    orders_threshold: float = 2.0,  # ≥2 orders of magnitude between p95 and p5
    tiny: float = 1e-12,
) -> tuple[List[str], torch.Tensor]:
    """
    Decide per-task transform: "log10" if (mostly-positive AND large dynamic
    range), else "identity". Returns (transforms, eps_vector); eps is only
    meaningful for log tasks (0.1 * smallest positive label, floored at 1e-8).
    """
    Y = y_train.detach().cpu().numpy()
    M = mask_train.detach().cpu().numpy().astype(bool)

    transforms: List[str] = []
    eps_vals: List[float] = []

    for t in range(Y.shape[1]):
        labeled = Y[M[:, t], t]
        if labeled.size == 0:
            # No training labels for this task — leave it untransformed.
            transforms.append("identity")
            eps_vals.append(0.0)
            continue

        mostly_positive = (labeled > 0).mean() >= min_pos_frac
        p5 = float(np.percentile(labeled, 5))
        p95 = float(np.percentile(labeled, 95))
        spread_orders = float(np.log10(max(p95 / max(p5, tiny), 1.0)))

        if mostly_positive and spread_orders >= orders_threshold:
            transforms.append("log10")
            positives = labeled[labeled > 0]
            if positives.size == 0:
                eps_vals.append(1e-8)
            else:
                eps_vals.append(0.1 * float(max(np.min(positives), 1e-8)))
        else:
            transforms.append("identity")
            eps_vals.append(0.0)

    return transforms, torch.tensor(eps_vals, dtype=torch.float32)
507
-
508
-
509
- # ---------------------------------------------------------
510
- # Dataset
511
- # ---------------------------------------------------------
512
-
513
class MultiFidelityMoleculeDataset(Dataset):
    """
    Each item is a PyG Data with:
      - x: [N_nodes, F_node]
      - edge_index: [2, N_edges]
      - edge_attr: [N_edges, F_edge]
      - y: [1, T] normalized targets (zeros where missing)
      - y_mask: [1, T] bool mask of present targets
      - fid_idx: [1] long (dataset-local fidelity index)
      - .smiles and .fid_str added for debugging

    Targets are kept in the exact order provided by the user.
    """
    def __init__(
        self,
        rows: pd.DataFrame,
        targets: Sequence[str],
        scaler: Optional[TargetScaler],
        smiles_graph_cache: Dict[str, tuple[torch.Tensor, torch.Tensor, torch.Tensor]],
    ):
        super().__init__()
        # rows: wide table, one row per (smiles, fid), one column per target.
        self.rows = rows.reset_index(drop=True).copy()
        self.targets = _ensure_targets_order(targets)
        self.scaler = scaler
        # Shared cache of pre-featurized graphs keyed by SMILES (built upstream).
        self.smiles_graph_cache = smiles_graph_cache

        # Build y and mask tensors
        ys, masks = [], []
        for _, r in self.rows.iterrows():
            yv, mv = [], []
            for t in self.targets:
                v = r[t]
                if pd.isna(v):
                    # Missing label: NaN placeholder; the mask marks it absent.
                    yv.append(np.nan)
                    mv.append(False)
                else:
                    yv.append(float(v))
                    mv.append(True)
            ys.append(yv)
            masks.append(mv)

        y = torch.tensor(np.array(ys, dtype=np.float32))  # [N, T]
        mask = torch.tensor(np.array(masks, dtype=np.bool_))

        # Normalize only when a fitted scaler is available; masked-out
        # entries are zeroed so NaNs never reach downstream losses.
        if scaler is not None and scaler.mean is not None:
            y_norm = torch.where(mask, scaler.transform(y), torch.zeros_like(y))
        else:
            y_norm = y

        self.y = y_norm
        self.mask = mask

        # Input dims
        # Read feature widths off the first cached graph (all graphs share them).
        any_smiles = self.rows.iloc[0]["smiles"]
        x0, _, e0 = smiles_graph_cache[any_smiles]
        self.in_dim_node = x0.shape[1]
        self.in_dim_edge = e0.shape[1]

        # Fidelity metadata for reference (local indexing in this dataset).
        # Known fidelities sort by their FID_PRIORITY position; all unknown
        # ones share the same key and therefore fall after the known ones.
        self.fids = sorted(
            self.rows["fid"].str.lower().unique().tolist(),
            key=lambda f: (FID_PRIORITY + [f]).index(f) if f in FID_PRIORITY else len(FID_PRIORITY),
        )
        self.fid2idx = {f: i for i, f in enumerate(self.fids)}
        self.rows["fid_idx_local"] = self.rows["fid"].str.lower().map(self.fid2idx)

    def __len__(self) -> int:
        """Number of (smiles, fidelity) rows."""
        return len(self.rows)

    def __getitem__(self, idx: int) -> Data:
        """Assemble one PyG Data item; tensors are cloned so callers may mutate."""
        idx = int(idx)
        r = self.rows.iloc[idx]
        smi = r["smiles"]

        x, edge_index, edge_attr = self.smiles_graph_cache[smi]
        # Ensure [1, T] so batches become [B, T]
        y_i = self.y[idx].clone().unsqueeze(0)  # [1, T]
        m_i = self.mask[idx].clone().unsqueeze(0)  # [1, T]
        fid_idx = int(r["fid_idx_local"])

        d = Data(
            x=x.clone(),
            edge_index=edge_index.clone(),
            edge_attr=edge_attr.clone(),
            y=y_i,
            y_mask=m_i,
            fid_idx=torch.tensor([fid_idx], dtype=torch.long),
        )
        d.smiles = smi
        d.fid_str = r["fid"]
        return d
604
-
605
-
606
def subsample_train_indices(
    rows: pd.DataFrame,
    train_idx: np.ndarray,
    *,
    target: Optional[str],
    fidelity: Optional[str],
    pct: float = 1.0,
    seed: int = 137,
) -> np.ndarray:
    """
    Return a filtered train_idx that keeps only a 'pct' fraction (0<pct<=1)
    of TRAIN rows for the specified (target, fidelity) block. Selection is
    deterministic by unique SMILES. Rows outside the block are untouched.

    rows: wide table with columns ["smiles","fid","fid_idx", <targets...>]
    """
    # No-op conditions: nothing specified, (nearly) full fraction requested,
    # or an unknown target column.
    if target is None or fidelity is None or pct >= 0.999:
        return train_idx
    if target not in rows.columns:
        return train_idx

    fid_lc = fidelity.strip().lower()

    # The "block" = TRAIN rows with the matching fidelity AND a label for target.
    train_rows = rows.iloc[train_idx]
    block_mask = (train_rows["fid"].str.lower() == fid_lc) & (~train_rows[target].isna())
    if not bool(block_mask.any()):
        return train_idx  # nothing to subsample

    # Sample whole molecules (unique SMILES) so the selection stays grouped.
    smiles_all = pd.Index(train_rows.loc[block_mask, "smiles"].unique())
    n_all = len(smiles_all)
    if n_all == 0:
        return train_idx

    effective_pct = pct if pct > 0.0 else 0.0001
    n_keep = max(1, int(round(effective_pct * n_all)))

    # Deterministic draw: sort the SMILES first, then sample without replacement.
    rng = np.random.RandomState(int(seed))
    smiles_sorted = np.array(sorted(smiles_all.tolist()))
    keep_smiles = set(rng.choice(smiles_sorted, size=n_keep, replace=False).tolist())

    # Keep every non-block row; within the block, keep only selected SMILES.
    keep_mask_local = (~block_mask) | (train_rows["smiles"].isin(keep_smiles))
    return train_rows.index[keep_mask_local].to_numpy()
654
-
655
-
656
- # ---------------------------------------------------------
657
- # High-level builder
658
- # ---------------------------------------------------------
659
-
660
- def build_dataset_from_dir(
661
- root_dir: str | Path,
662
- targets: Sequence[str],
663
- fidelities: Sequence[str] = ("exp", "dft", "md", "gc"),
664
- val_ratio: float = 0.1,
665
- test_ratio: float = 0.1,
666
- seed: int = 42,
667
- save_splits_path: Optional[str | Path] = None,
668
- # Optional subsampling of a (target, fidelity) block in TRAIN
669
- subsample_target: Optional[str] = None,
670
- subsample_fidelity: Optional[str] = None,
671
- subsample_pct: float = 1.0,
672
- subsample_seed: int = 137,
673
- # -------- NEW: auto/explicit log transforms --------
674
- auto_log: bool = True,
675
- log_orders_threshold: float = 2.0,
676
- log_min_pos_frac: float = 0.95,
677
- explicit_log_targets: Optional[Sequence[str]] = None, # e.g. ["permeability"]
678
- ) -> tuple[MultiFidelityMoleculeDataset, MultiFidelityMoleculeDataset, MultiFidelityMoleculeDataset, TargetScaler]:
679
- """
680
- Returns train_ds, val_ds, test_ds, scaler.
681
-
682
- - Discovers CSVs for requested targets and fidelities
683
- - Builds a row-per-(smiles,fid) table with columns for each target
684
- - Splits by unique SMILES to avoid leakage across fidelity or targets
685
- - Fits transform+normalization on the training split only, applies to val/test
686
- - Builds RDKit graphs once per unique SMILES and reuses them
687
-
688
- NEW:
689
- - Auto per-task transform selection ("log10" vs "identity") by criteria
690
- - Optional explicit override via explicit_log_targets
691
- """
692
- root = Path(root_dir)
693
- targets = _ensure_targets_order(targets)
694
- fids_lc = [_norm_fid(f) for f in fidelities]
695
-
696
- # Build long and pivot to rows
697
- long = build_long_table(root, targets, fids_lc)
698
- rows = pivot_to_rows_by_smiles_fid(long, targets)
699
-
700
- # Deterministic grouped split by SMILES
701
- if save_splits_path is not None and Path(save_splits_path).exists():
702
- with open(save_splits_path, "r") as f:
703
- split_obj = json.load(f)
704
- train_smiles = set(split_obj["train_smiles"])
705
- val_smiles = set(split_obj["val_smiles"])
706
- test_smiles = set(split_obj["test_smiles"])
707
- train_idx = rows.index[rows["smiles"].isin(train_smiles)].to_numpy()
708
- val_idx = rows.index[rows["smiles"].isin(val_smiles)].to_numpy()
709
- test_idx = rows.index[rows["smiles"].isin(test_smiles)].to_numpy()
710
- else:
711
- train_idx, val_idx, test_idx = grouped_split_by_smiles(rows, val_ratio=val_ratio, test_ratio=test_ratio, seed=seed)
712
- if save_splits_path is not None:
713
- split_obj = {
714
- "train_smiles": rows.iloc[train_idx]["smiles"].drop_duplicates().tolist(),
715
- "val_smiles": rows.iloc[val_idx]["smiles"].drop_duplicates().tolist(),
716
- "test_smiles": rows.iloc[test_idx]["smiles"].drop_duplicates().tolist(),
717
- "seed": seed,
718
- "val_ratio": val_ratio,
719
- "test_ratio": test_ratio,
720
- }
721
- Path(save_splits_path).parent.mkdir(parents=True, exist_ok=True)
722
- with open(save_splits_path, "w") as f:
723
- json.dump(split_obj, f, indent=2)
724
-
725
- # Build RDKit graphs once per unique SMILES
726
- uniq_smiles = rows["smiles"].drop_duplicates().tolist()
727
- smiles_graph_cache: Dict[str, tuple[torch.Tensor, torch.Tensor, torch.Tensor]] = {}
728
- for smi in uniq_smiles:
729
- try:
730
- x, edge_index, edge_attr = featurize_smiles(smi)
731
- smiles_graph_cache[smi] = (x, edge_index, edge_attr)
732
- except Exception as e:
733
- warnings.warn(f"[data_builder] Dropping SMILES due to RDKit parse error: {smi} ({e})")
734
-
735
- # Filter rows to those that featurized successfully
736
- rows = rows[rows["smiles"].isin(smiles_graph_cache.keys())].reset_index(drop=True)
737
-
738
- # Re-map indices after filtering using smiles membership
739
- train_idx = rows.index[rows["smiles"].isin(set(rows.iloc[train_idx]["smiles"]))].to_numpy()
740
- val_idx = rows.index[rows["smiles"].isin(set(rows.iloc[val_idx]["smiles"]))].to_numpy()
741
- test_idx = rows.index[rows["smiles"].isin(set(rows.iloc[test_idx]["smiles"]))].to_numpy()
742
-
743
- # Optional subsampling (train only) for a specific (target, fidelity) block
744
- train_idx = subsample_train_indices(
745
- rows,
746
- train_idx,
747
- target=subsample_target,
748
- fidelity=subsample_fidelity,
749
- pct=float(subsample_pct),
750
- seed=int(subsample_seed),
751
- )
752
-
753
- # Fit scaler on training split only
754
- def build_y_mask(df_slice: pd.DataFrame) -> tuple[torch.Tensor, torch.Tensor]:
755
- ys, ms = [], []
756
- for _, r in df_slice.iterrows():
757
- yv, mv = [], []
758
- for t in targets:
759
- v = r[t]
760
- if pd.isna(v):
761
- yv.append(np.nan)
762
- mv.append(False)
763
- else:
764
- yv.append(float(v))
765
- mv.append(True)
766
- ys.append(yv)
767
- ms.append(mv)
768
- y = torch.tensor(np.array(ys, dtype=np.float32))
769
- mask = torch.tensor(np.array(ms, dtype=np.bool_))
770
- return y, mask
771
-
772
- y_train, mask_train = build_y_mask(rows.iloc[train_idx])
773
-
774
- # Decide transforms per task
775
- if explicit_log_targets:
776
- explicit_set = set(explicit_log_targets)
777
- transforms = [("log10" if t in explicit_set else "identity") for t in targets]
778
- eps_vec = None # will be auto-chosen in scaler.fit if not provided
779
- elif auto_log:
780
- transforms, eps_vec = auto_select_task_transforms(
781
- y_train,
782
- mask_train,
783
- targets,
784
- min_pos_frac=float(log_min_pos_frac),
785
- orders_threshold=float(log_orders_threshold),
786
- )
787
- else:
788
- transforms, eps_vec = (["identity"] * len(targets), None)
789
-
790
- scaler = TargetScaler(transforms=transforms, eps=eps_vec)
791
- scaler.fit(y_train, mask_train)
792
-
793
- # Build datasets
794
- train_rows = rows.iloc[train_idx].reset_index(drop=True)
795
- val_rows = rows.iloc[val_idx].reset_index(drop=True)
796
- test_rows = rows.iloc[test_idx].reset_index(drop=True)
797
-
798
- train_ds = MultiFidelityMoleculeDataset(train_rows, targets, scaler, smiles_graph_cache)
799
- val_ds = MultiFidelityMoleculeDataset(val_rows, targets, scaler, smiles_graph_cache)
800
- test_ds = MultiFidelityMoleculeDataset(test_rows, targets, scaler, smiles_graph_cache)
801
-
802
- return train_ds, val_ds, test_ds, scaler
803
-
804
-
805
- __all__ = [
806
- "build_dataset_from_dir",
807
- "discover_target_fid_csvs",
808
- "read_target_csv",
809
- "build_long_table",
810
- "pivot_to_rows_by_smiles_fid",
811
- "grouped_split_by_smiles",
812
- "TargetScaler",
813
- "MultiFidelityMoleculeDataset",
814
- "atom_features",
815
- "bond_features",
816
- "featurize_smiles",
817
- "auto_select_task_transforms",
818
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/discover_llm.py DELETED
@@ -1,829 +0,0 @@
1
- # src/discovery.py
2
- from __future__ import annotations
3
-
4
- import json
5
- from dataclasses import dataclass
6
- from pathlib import Path
7
- from typing import Callable, Dict, List, Optional, Tuple
8
-
9
- import numpy as np
10
- import pandas as pd
11
- from rdkit import Chem, DataStructs
12
- from rdkit.Chem import AllChem
13
- from . import sascorer
14
-
15
- # Reuse your canonicalizer if you want; otherwise keep local
16
- def canonicalize_smiles(smiles: str) -> Optional[str]:
17
- s = (smiles or "").strip()
18
- if not s:
19
- return None
20
- m = Chem.MolFromSmiles(s)
21
- if m is None:
22
- return None
23
- return Chem.MolToSmiles(m, canonical=True)
24
-
25
-
26
- # -------------------------
27
- # Spec schema (minimal v0)
28
- # -------------------------
29
- @dataclass
30
- class DiscoverySpec:
31
- dataset: List[str] # ["PI1M_PROPERTY.parquet", "POLYINFO_PROPERTY.parquet"]
32
- polyinfo: str # "POLYINFO_PROPERTY.parquet"
33
- polyinfo_csv: str # "POLYINFO.csv"
34
-
35
- hard_constraints: Dict[str, Dict[str, float]] # { "tg": {"min": 400}, "tc": {"max": 0.3} }
36
- objectives: List[Dict[str, str]] # [{"property":"cp","goal":"maximize"}, ...]
37
-
38
- max_pool: int = 200000 # legacy (kept for compatibility; aligned to pareto_max)
39
- pareto_max: int = 50000 # cap points used for Pareto + diversity fingerprinting
40
- max_candidates: int = 30 # final output size
41
- max_pareto_fronts: int = 5 # how many Pareto layers to keep for candidate pool
42
- min_distance: float = 0.30 # diversity threshold in Tanimoto distance
43
- fingerprint: str = "morgan" # morgan only for now
44
- random_seed: int = 7
45
- use_canonical_smiles: bool = True
46
- use_full_data: bool = False
47
- trust_weights: Dict[str, float] | None = None
48
- selection_weights: Dict[str, float] | None = None
49
-
50
-
51
- # -------------------------
52
- # Property metadata (local to discovery_llm)
53
- # -------------------------
54
- PROPERTY_META: Dict[str, Dict[str, str]] = {
55
- # Thermal
56
- "tm": {"name": "Melting temperature", "unit": "K"},
57
- "tg": {"name": "Glass transition temperature", "unit": "K"},
58
- "td": {"name": "Thermal diffusivity", "unit": "m^2/s"},
59
- "tc": {"name": "Thermal conductivity", "unit": "W/m-K"},
60
- "cp": {"name": "Specific heat capacity", "unit": "J/kg-K"},
61
- # Mechanical
62
- "young": {"name": "Young's modulus", "unit": "GPa"},
63
- "shear": {"name": "Shear modulus", "unit": "GPa"},
64
- "bulk": {"name": "Bulk modulus", "unit": "GPa"},
65
- "poisson": {"name": "Poisson ratio", "unit": "-"},
66
- # Transport
67
- "visc": {"name": "Viscosity", "unit": "Pa-s"},
68
- "dif": {"name": "Diffusivity", "unit": "cm^2/s"},
69
- # Gas permeability
70
- "phe": {"name": "He permeability", "unit": "Barrer"},
71
- "ph2": {"name": "H2 permeability", "unit": "Barrer"},
72
- "pco2": {"name": "CO2 permeability", "unit": "Barrer"},
73
- "pn2": {"name": "N2 permeability", "unit": "Barrer"},
74
- "po2": {"name": "O2 permeability", "unit": "Barrer"},
75
- "pch4": {"name": "CH4 permeability", "unit": "Barrer"},
76
- # Electronic / Optical
77
- "alpha": {"name": "Polarizability", "unit": "a.u."},
78
- "homo": {"name": "HOMO energy", "unit": "eV"},
79
- "lumo": {"name": "LUMO energy", "unit": "eV"},
80
- "bandgap": {"name": "Band gap", "unit": "eV"},
81
- "mu": {"name": "Dipole moment", "unit": "Debye"},
82
- "etotal": {"name": "Total electronic energy", "unit": "eV"},
83
- "ri": {"name": "Refractive index", "unit": "-"},
84
- "dc": {"name": "Dielectric constant", "unit": "-"},
85
- "pe": {"name": "Permittivity", "unit": "-"},
86
- # Structural / Physical
87
- "rg": {"name": "Radius of gyration", "unit": "A"},
88
- "rho": {"name": "Density", "unit": "g/cm^3"},
89
- }
90
-
91
-
92
- # -------------------------
93
- # Column mapping
94
- # -------------------------
95
- def mean_col(prop_key: str) -> str:
96
- return f"mean_{prop_key.lower()}"
97
-
98
- def std_col(prop_key: str) -> str:
99
- return f"std_{prop_key.lower()}"
100
-
101
-
102
- def normalize_weights(weights: Dict[str, float], defaults: Dict[str, float]) -> Dict[str, float]:
103
- out: Dict[str, float] = {}
104
- for k, v in defaults.items():
105
- try:
106
- vv = float(weights.get(k, v))
107
- except Exception:
108
- vv = float(v)
109
- out[k] = max(0.0, vv)
110
- s = float(sum(out.values()))
111
- if s <= 0.0:
112
- return defaults.copy()
113
- return {k: float(v / s) for k, v in out.items()}
114
-
115
- def spec_from_dict(obj: dict, dataset_path: List[str], polyinfo_path: str, polyinfo_csv_path: str) -> DiscoverySpec:
116
- pareto_max = int(obj.get("pareto_max", 50000))
117
- return DiscoverySpec(
118
- dataset=list(dataset_path),
119
- polyinfo=polyinfo_path,
120
- polyinfo_csv=polyinfo_csv_path,
121
- hard_constraints=obj.get("hard_constraints", {}),
122
- objectives=obj.get("objectives", []),
123
- # Legacy field kept for compatibility; effectively collapsed to pareto_max.
124
- max_pool=pareto_max,
125
- pareto_max=pareto_max,
126
- max_candidates=int(obj.get("max_candidates", 30)),
127
- max_pareto_fronts=int(obj.get("max_pareto_fronts", 5)),
128
- min_distance=float(obj.get("min_distance", 0.30)),
129
- fingerprint=str(obj.get("fingerprint", "morgan")),
130
- random_seed=int(obj.get("random_seed", 7)),
131
- use_canonical_smiles=not bool(obj.get("skip_smiles_canonicalization", True)),
132
- use_full_data=bool(obj.get("use_full_data", False)),
133
- trust_weights=obj.get("trust_weights"),
134
- selection_weights=obj.get("selection_weights"),
135
- )
136
-
137
- # -------------------------
138
- # Parquet loading (safe)
139
- # -------------------------
140
- def load_parquet_columns(path: str | List[str], columns: List[str]) -> pd.DataFrame:
141
- """
142
- Load only requested columns from Parquet (critical for 1M rows).
143
- Accepts a single path or a list of paths and concatenates rows.
144
- """
145
- def _load_one(fp: str, req_cols: List[str]) -> pd.DataFrame:
146
- available: list[str]
147
- try:
148
- import pyarrow.parquet as pq
149
-
150
- pf = pq.ParquetFile(fp)
151
- available = [str(c) for c in pf.schema.names]
152
- except Exception:
153
- # If schema probing fails, fall back to direct read with requested columns.
154
- return pd.read_parquet(fp, columns=req_cols)
155
-
156
- available_set = set(available)
157
- lower_to_actual = {c.lower(): c for c in available}
158
-
159
- # Resolve requested names against actual parquet schema.
160
- resolved: dict[str, str] = {}
161
- for req in req_cols:
162
- if req in available_set:
163
- resolved[req] = req
164
- continue
165
- alt = lower_to_actual.get(str(req).lower())
166
- if alt is not None:
167
- resolved[req] = alt
168
-
169
- use_cols = sorted(set(resolved.values()))
170
- if not use_cols:
171
- return pd.DataFrame(columns=req_cols)
172
-
173
- out = pd.read_parquet(fp, columns=use_cols)
174
- for req in req_cols:
175
- src = resolved.get(req)
176
- if src is None:
177
- out[req] = np.nan
178
- elif src != req:
179
- out[req] = out[src]
180
- return out[req_cols]
181
-
182
- if isinstance(path, (list, tuple)):
183
- frames = [_load_one(p, columns) for p in path]
184
- if not frames:
185
- return pd.DataFrame(columns=columns)
186
- return pd.concat(frames, ignore_index=True)
187
- return _load_one(path, columns)
188
-
189
-
190
- def normalize_smiles(smiles: str, use_canonical_smiles: bool) -> Optional[str]:
191
- s = (smiles or "").strip()
192
- if not s:
193
- return None
194
- if not use_canonical_smiles:
195
- # Skip RDKit parsing entirely in fast mode.
196
- return s
197
- m = Chem.MolFromSmiles(s)
198
- if m is None:
199
- return None
200
- if use_canonical_smiles:
201
- return Chem.MolToSmiles(m, canonical=True)
202
- return s
203
-
204
-
205
- def load_polyinfo_index(polyinfo_csv_path: str, use_canonical_smiles: bool = True) -> pd.DataFrame:
206
- """
207
- Expected CSV columns: SMILES, Polymer_Class, polymer_name (or common variants).
208
- Returns dataframe with index on smiles_key and columns polymer_name/polymer_class.
209
- """
210
- df = pd.read_csv(polyinfo_csv_path)
211
-
212
- # normalize column names
213
- cols = {c: c for c in df.columns}
214
- # map typical names
215
- if "SMILES" in cols:
216
- df = df.rename(columns={"SMILES": "smiles"})
217
- elif "smiles" not in df.columns:
218
- raise ValueError(f"{polyinfo_csv_path} missing SMILES/smiles column")
219
-
220
- if "Polymer_Name" in df.columns:
221
- df = df.rename(columns={"Polymer_Name": "polymer_name"})
222
- if "polymer_Name" in df.columns:
223
- df = df.rename(columns={"polymer_Name": "polymer_name"})
224
- if "Polymer_Class" in df.columns:
225
- df = df.rename(columns={"Polymer_Class": "polymer_class"})
226
-
227
- if "polymer_name" not in df.columns:
228
- df["polymer_name"] = pd.NA
229
- if "polymer_class" not in df.columns:
230
- df["polymer_class"] = pd.NA
231
-
232
- df["smiles_key"] = df["smiles"].astype(str).map(lambda s: normalize_smiles(s, use_canonical_smiles))
233
- df = df.dropna(subset=["smiles_key"]).drop_duplicates("smiles_key")
234
- df = df.set_index("smiles_key", drop=True)
235
- return df[["polymer_name", "polymer_class"]]
236
-
237
-
238
- # -------------------------
239
- # Pareto (2–3 objectives)
240
- # -------------------------
241
- def pareto_front_mask(X: np.ndarray) -> np.ndarray:
242
- """
243
- Returns mask for nondominated points.
244
- X: (N, M), all objectives assumed to be minimized.
245
- For maximize objectives, we invert before calling this.
246
- """
247
- N = X.shape[0]
248
- is_efficient = np.ones(N, dtype=bool)
249
- for i in range(N):
250
- if not is_efficient[i]:
251
- continue
252
- # any point that is <= in all dims and < in at least one dominates
253
- dominates = np.all(X <= X[i], axis=1) & np.any(X < X[i], axis=1)
254
- # if a point dominates i, mark i inefficient
255
- if np.any(dominates):
256
- is_efficient[i] = False
257
- continue
258
- # otherwise, i may dominate others
259
- dominated_by_i = np.all(X[i] <= X, axis=1) & np.any(X[i] < X, axis=1)
260
- is_efficient[dominated_by_i] = False
261
- is_efficient[i] = True
262
- return is_efficient
263
-
264
-
265
- def pareto_layers(X: np.ndarray, max_layers: int = 10) -> np.ndarray:
266
- """
267
- Returns layer index per point: 1 = Pareto front, 2 = second layer, ...
268
- Unassigned points beyond max_layers get 0.
269
- """
270
- N = X.shape[0]
271
- layers = np.zeros(N, dtype=int)
272
- remaining = np.arange(N)
273
-
274
- layer = 1
275
- while remaining.size > 0 and layer <= max_layers:
276
- mask = pareto_front_mask(X[remaining])
277
- front_idx = remaining[mask]
278
- layers[front_idx] = layer
279
- remaining = remaining[~mask]
280
- layer += 1
281
- return layers
282
-
283
-
284
- def pareto_front_mask_chunked(
285
- X: np.ndarray,
286
- chunk_size: int = 100000,
287
- progress_callback: Optional[Callable[[int, int], None]] = None,
288
- ) -> np.ndarray:
289
- """
290
- Exact global Pareto front mask via chunk-local front reduction + global reconcile.
291
- This is exact for front-1:
292
- 1) compute exact local front within each chunk
293
- 2) union local fronts
294
- 3) compute exact front on the union
295
- """
296
- N = X.shape[0]
297
- if N <= chunk_size:
298
- if progress_callback is not None:
299
- progress_callback(1, 1)
300
- return pareto_front_mask(X)
301
-
302
- local_front_idx = []
303
- total_chunks = (N + chunk_size - 1) // chunk_size
304
- done_chunks = 0
305
- for start in range(0, N, chunk_size):
306
- end = min(start + chunk_size, N)
307
- idx = np.arange(start, end)
308
- mask_local = pareto_front_mask(X[idx])
309
- local_front_idx.append(idx[mask_local])
310
- done_chunks += 1
311
- if progress_callback is not None:
312
- progress_callback(done_chunks, total_chunks)
313
-
314
- if not local_front_idx:
315
- return np.zeros(N, dtype=bool)
316
-
317
- reduced_idx = np.concatenate(local_front_idx)
318
- reduced_mask = pareto_front_mask(X[reduced_idx])
319
- front_idx = reduced_idx[reduced_mask]
320
-
321
- out = np.zeros(N, dtype=bool)
322
- out[front_idx] = True
323
- return out
324
-
325
-
326
- def pareto_layers_chunked(
327
- X: np.ndarray,
328
- max_layers: int = 10,
329
- chunk_size: int = 100000,
330
- progress_callback: Optional[Callable[[int, int, int], None]] = None,
331
- ) -> np.ndarray:
332
- """
333
- Exact Pareto layers using repeated exact chunked front extraction.
334
- """
335
- N = X.shape[0]
336
- layers = np.zeros(N, dtype=int)
337
- remaining = np.arange(N)
338
- layer = 1
339
-
340
- while remaining.size > 0 and layer <= max_layers:
341
- def on_chunk(done: int, total: int) -> None:
342
- if progress_callback is not None:
343
- progress_callback(layer, done, total)
344
-
345
- mask = pareto_front_mask_chunked(X[remaining], chunk_size=chunk_size, progress_callback=on_chunk)
346
- front_idx = remaining[mask]
347
- layers[front_idx] = layer
348
- remaining = remaining[~mask]
349
- layer += 1
350
-
351
- return layers
352
-
353
-
354
- # -------------------------
355
- # Fingerprints & diversity
356
- # -------------------------
357
- def morgan_fp(smiles: str, radius: int = 2, nbits: int = 2048):
358
- m = Chem.MolFromSmiles(smiles)
359
- if m is None:
360
- return None
361
- return AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=nbits)
362
-
363
- def tanimoto_distance(fp1, fp2) -> float:
364
- return 1.0 - DataStructs.TanimotoSimilarity(fp1, fp2)
365
-
366
- def greedy_diverse_select(
367
- smiles_list: List[str],
368
- scores: np.ndarray,
369
- max_k: int,
370
- min_dist: float,
371
- ) -> List[int]:
372
- """
373
- Greedy selection by descending score, enforcing min Tanimoto distance.
374
- Returns indices into smiles_list.
375
- """
376
- fps = []
377
- valid_idx = []
378
- for i, s in enumerate(smiles_list):
379
- fp = morgan_fp(s)
380
- if fp is not None:
381
- fps.append(fp)
382
- valid_idx.append(i)
383
-
384
- if not valid_idx:
385
- return []
386
-
387
- # rank candidates (higher score first)
388
- order = np.argsort(-scores[valid_idx])
389
- selected_global = []
390
- selected_fps = []
391
-
392
- for oi in order:
393
- i = valid_idx[oi]
394
- fp_i = fps[oi] # aligned with valid_idx
395
- ok = True
396
- for fp_j in selected_fps:
397
- if tanimoto_distance(fp_i, fp_j) < min_dist:
398
- ok = False
399
- break
400
- if ok:
401
- selected_global.append(i)
402
- selected_fps.append(fp_i)
403
- if len(selected_global) >= max_k:
404
- break
405
-
406
- return selected_global
407
-
408
-
409
- # -------------------------
410
- # Trust score (lightweight, robust)
411
- # -------------------------
412
- def internal_consistency_penalty(row: pd.Series) -> float:
413
- """
414
- Very simple physics/validity checks. Penalty in [0,1].
415
- Adjust/add rules later.
416
- """
417
- viol = 0
418
- total = 0
419
-
420
- def chk(cond: bool):
421
- nonlocal viol, total
422
- total += 1
423
- if not cond:
424
- viol += 1
425
-
426
- # positivity checks if present
427
- for p in ["cp", "tc", "rho", "dif", "visc", "tg", "tm", "bandgap"]:
428
- c = mean_col(p)
429
- if c in row.index and pd.notna(row[c]):
430
- if p in ["bandgap", "tg", "tm"]:
431
- chk(float(row[c]) >= 0.0)
432
- else:
433
- chk(float(row[c]) > 0.0)
434
-
435
- # Poisson ratio bounds if present
436
- if mean_col("poisson") in row.index and pd.notna(row[mean_col("poisson")]):
437
- v = float(row[mean_col("poisson")])
438
- chk(0.0 <= v <= 0.5)
439
-
440
- # Tg <= Tm if both present
441
- if mean_col("tg") in row.index and mean_col("tm") in row.index:
442
- if pd.notna(row[mean_col("tg")]) and pd.notna(row[mean_col("tm")]):
443
- chk(float(row[mean_col("tg")]) <= float(row[mean_col("tm")]))
444
-
445
- if total == 0:
446
- return 0.0
447
- return viol / total
448
-
449
-
450
- def synthesizability_score(smiles: str) -> float:
451
- """
452
- RDKit SA-score based synthesizability proxy in [0,1].
453
- SA-score is ~[1 (easy), 10 (hard)].
454
- We map: 1 -> 1.0, 10 -> 0.0
455
- """
456
- m = Chem.MolFromSmiles(smiles)
457
- if m is None:
458
- return 0.0
459
-
460
- # Guard against unexpected scorer failures / None for edge-case molecules.
461
- try:
462
- sa_raw = sascorer.calculateScore(m)
463
- except Exception:
464
- return 0.0
465
- if sa_raw is None:
466
- return 0.0
467
-
468
- sa = float(sa_raw) # ~ 1..10
469
- s_syn = 1.0 - (sa - 1.0) / 9.0 # linear map to [0,1]
470
- return float(np.clip(s_syn, 0.0, 1.0))
471
-
472
-
473
- def compute_trust_scores(
474
- df: pd.DataFrame,
475
- real_fps: List,
476
- real_smiles: List[str],
477
- trust_weights: Dict[str, float] | None = None,
478
- ) -> np.ndarray:
479
- """
480
- Trust score in [0,1] (higher = more trustworthy / lower risk).
481
- Components:
482
- - distance to nearest real polymer (fingerprint distance)
483
- - internal consistency penalty
484
- - uncertainty penalty (if std columns exist)
485
- - synthesizability
486
- """
487
- N = len(df)
488
- trust = np.zeros(N, dtype=float)
489
- tw_defaults = {"real": 0.45, "consistency": 0.25, "uncertainty": 0.10, "synth": 0.20}
490
- tw = normalize_weights(trust_weights or {}, tw_defaults)
491
-
492
- # nearest-real distance (expensive if done naively)
493
- # We do it only for the (small) post-filter set, which is safe.
494
- smiles_col = "smiles_key" if "smiles_key" in df.columns else "smiles_canon"
495
- for i in range(N):
496
- s = df.iloc[i][smiles_col]
497
- fp = morgan_fp(s)
498
- if fp is None or not real_fps:
499
- d_real = 1.0
500
- else:
501
- sims = DataStructs.BulkTanimotoSimilarity(fp, real_fps)
502
- d_real = 1.0 - float(max(sims)) # distance to nearest
503
-
504
- # internal consistency
505
- pen_cons = internal_consistency_penalty(df.iloc[i])
506
-
507
- # uncertainty: average normalized std for any std_* columns present
508
- std_cols = [c for c in df.columns if c.startswith("std_")]
509
- if std_cols:
510
- std_vals = df.iloc[i][std_cols].astype(float)
511
- std_vals = std_vals.replace([np.inf, -np.inf], np.nan).dropna()
512
- pen_unc = float(np.clip(std_vals.mean() / (std_vals.mean() + 1.0), 0.0, 1.0)) if len(std_vals) else 0.0
513
- else:
514
- pen_unc = 0.0
515
-
516
- # synthesizability heuristic
517
- s_syn = synthesizability_score(s)
518
-
519
- # Combine (tunable weights)
520
- # lower distance to real is better -> convert to score
521
- s_real = 1.0 - np.clip(d_real, 0.0, 1.0)
522
-
523
- trust[i] = (
524
- tw["real"] * s_real +
525
- tw["consistency"] * (1.0 - pen_cons) +
526
- tw["uncertainty"] * (1.0 - pen_unc) +
527
- tw["synth"] * s_syn
528
- )
529
-
530
- trust = np.clip(trust, 0.0, 1.0)
531
- return trust
532
-
533
-
534
- # -------------------------
535
- # Main pipeline
536
- # -------------------------
537
- def run_discovery(
538
- spec: DiscoverySpec,
539
- progress_callback: Optional[Callable[[str, float], None]] = None,
540
- ) -> Tuple[pd.DataFrame, Dict[str, float], pd.DataFrame]:
541
- def report(step: str, pct: float) -> None:
542
- if progress_callback is not None:
543
- progress_callback(step, pct)
544
-
545
- rng = np.random.default_rng(spec.random_seed)
546
-
547
- # 1) Determine required columns
548
- report("Preparing columns…", 0.02)
549
- obj_props = [o["property"].lower() for o in spec.objectives]
550
- cons_props = [p.lower() for p in spec.hard_constraints.keys()]
551
-
552
- needed_props = sorted(set(obj_props + cons_props))
553
- cols = ["SMILES"] + [mean_col(p) for p in needed_props]
554
-
555
- # include std columns if available (not required, but used for trust)
556
- std_cols = [std_col(p) for p in needed_props]
557
- cols += std_cols
558
-
559
- # 2) Load only needed columns
560
- report("Loading data from parquet…", 0.05)
561
- df = load_parquet_columns(spec.dataset, columns=[c for c in cols if c != "SMILES"] + ["SMILES"])
562
- # normalize
563
- if "SMILES" not in df.columns and "smiles" in df.columns:
564
- df = df.rename(columns={"smiles": "SMILES"})
565
- normalize_step = "Canonicalizing SMILES…" if spec.use_canonical_smiles else "Skipping SMILES normalization…"
566
- report(normalize_step, 0.10)
567
- df["smiles_key"] = df["SMILES"].astype(str).map(lambda s: normalize_smiles(s, spec.use_canonical_smiles))
568
- df = df.dropna(subset=["smiles_key"]).reset_index(drop=True)
569
-
570
- # 3) Hard constraints
571
- report("Applying constraints…", 0.22)
572
- for p, rule in spec.hard_constraints.items():
573
- p = p.lower()
574
- c = mean_col(p)
575
- if c not in df.columns:
576
- # if missing, nothing can satisfy
577
- df = df.iloc[0:0]
578
- break
579
- if "min" in rule:
580
- df = df[df[c] >= float(rule["min"])]
581
- if "max" in rule:
582
- df = df[df[c] <= float(rule["max"])]
583
-
584
- n_after = len(df)
585
- if n_after == 0:
586
- empty_stats = {"n_total": 0, "n_after_constraints": 0, "n_pool": 0, "n_pareto_pool": 0, "n_selected": 0}
587
- return df, empty_stats, pd.DataFrame()
588
-
589
- n_pool = len(df)
590
-
591
- # 5) Prepare objective matrix for Pareto
592
- report("Building objective matrix…", 0.30)
593
- # convert to minimization: maximize => negate
594
- X = []
595
- resolved_objectives = []
596
- for o in spec.objectives:
597
- prop = o["property"].lower()
598
- goal = o["goal"].lower()
599
- c = mean_col(prop)
600
- if c not in df.columns:
601
- continue
602
- v = df[c].to_numpy(dtype=float)
603
- if goal == "maximize":
604
- v = -v
605
- X.append(v)
606
- resolved_objectives.append({"property": prop, "goal": goal})
607
- if not X:
608
- # Fallback to first available mean_* column to keep pipeline runnable.
609
- fallback_col = next((c for c in df.columns if str(c).startswith("mean_")), None)
610
- if fallback_col is None:
611
- empty_stats = {"n_total": 0, "n_after_constraints": 0, "n_pool": 0, "n_pareto_pool": 0, "n_selected": 0}
612
- return df.iloc[0:0], empty_stats, pd.DataFrame()
613
- X = [df[fallback_col].to_numpy(dtype=float) * -1.0]
614
- resolved_objectives = [{"property": fallback_col.replace("mean_", ""), "goal": "maximize"}]
615
- X = np.stack(X, axis=1) # (N, M)
616
- obj_props = [o["property"] for o in resolved_objectives]
617
-
618
- # Pareto cap before computing layers (optional safety)
619
- if spec.use_full_data:
620
- report("Using full dataset (no Pareto cap)…", 0.35)
621
- elif len(df) > spec.pareto_max:
622
- idx = rng.choice(len(df), size=spec.pareto_max, replace=False)
623
- df = df.iloc[idx].reset_index(drop=True)
624
- X = X[idx]
625
-
626
- # 6) Pareto layers (only 5 layers needed for candidate pool)
627
- report("Computing Pareto layers…", 0.40)
628
- pareto_start = 0.40
629
- pareto_end = 0.54
630
- max_layers_for_pool = max(1, int(spec.max_pareto_fronts))
631
- pareto_chunk_ref = {"chunks_per_layer": None}
632
-
633
- def on_pareto_chunk(layer_i: int, done_chunks: int, total_chunks: int) -> None:
634
- if pareto_chunk_ref["chunks_per_layer"] is None:
635
- pareto_chunk_ref["chunks_per_layer"] = max(1, int(total_chunks))
636
- ref_chunks = pareto_chunk_ref["chunks_per_layer"]
637
- total_units = max_layers_for_pool * ref_chunks
638
- done_units = min(total_units, ((layer_i - 1) * ref_chunks) + done_chunks)
639
- pareto_pct = int(round(100.0 * done_units / max(1, total_units)))
640
-
641
- layer_progress = done_chunks / max(1, total_chunks)
642
- overall = ((layer_i - 1) + layer_progress) / max_layers_for_pool
643
- pct = pareto_start + (pareto_end - pareto_start) * min(1.0, max(0.0, overall))
644
- report(
645
- f"Computing Pareto layers… {pareto_pct}% (Layer {layer_i}/{max_layers_for_pool}, chunk {done_chunks}/{total_chunks})",
646
- pct,
647
- )
648
-
649
- layers = pareto_layers_chunked(
650
- X,
651
- max_layers=max_layers_for_pool,
652
- chunk_size=100000,
653
- progress_callback=on_pareto_chunk,
654
- )
655
- report("Computing Pareto layers…", pareto_end)
656
- df["pareto_layer"] = layers
657
- plot_df = df[["smiles_key"] + [mean_col(p) for p in obj_props] + ["pareto_layer"]].copy()
658
- plot_df = plot_df.rename(columns={"smiles_key": "SMILES"})
659
-
660
- # Keep first few layers as candidate pool (avoid huge set)
661
- cand = df[df["pareto_layer"].between(1, max_layers_for_pool)].copy()
662
- if cand.empty:
663
- cand = df[df["pareto_layer"] == 1].copy()
664
- cand = cand.reset_index(drop=True)
665
- n_pareto = len(cand)
666
-
667
- # 7) Load real polymer metadata and fingerprints (from POLYINFO.csv)
668
- report("Loading POLYINFO index…", 0.55)
669
- polyinfo = load_polyinfo_index(spec.polyinfo_csv, use_canonical_smiles=spec.use_canonical_smiles)
670
- real_smiles = polyinfo.index.to_list()
671
-
672
- report("Building real-polymer fingerprints…", 0.60)
673
- real_fps = []
674
- for s in real_smiles:
675
- fp = morgan_fp(s)
676
- if fp is not None:
677
- real_fps.append(fp)
678
-
679
- # 8) Trust score on candidate pool (safe size)
680
- report("Computing trust scores…", 0.70)
681
- trust = compute_trust_scores(
682
- cand,
683
- real_fps=real_fps,
684
- real_smiles=real_smiles,
685
- trust_weights=spec.trust_weights,
686
- )
687
- cand["trust_score"] = trust
688
-
689
- # 9) Diversity selection on candidate pool
690
- report("Diversity selection…", 0.88)
691
- # score for selection: prioritize Pareto layer 1 then trust
692
- # higher is better
693
- sw_defaults = {"pareto": 0.60, "trust": 0.40}
694
- sw = normalize_weights(spec.selection_weights or {}, sw_defaults)
695
- pareto_bonus = (
696
- (max_layers_for_pool + 1) - np.clip(cand["pareto_layer"].to_numpy(dtype=int), 1, max_layers_for_pool)
697
- ) / float(max_layers_for_pool)
698
- sel_score = sw["pareto"] * pareto_bonus + sw["trust"] * cand["trust_score"].to_numpy(dtype=float)
699
-
700
- chosen_idx = greedy_diverse_select(
701
- smiles_list=cand["smiles_key"].tolist(),
702
- scores=sel_score,
703
- max_k=spec.max_candidates,
704
- min_dist=spec.min_distance,
705
- )
706
- out = cand.iloc[chosen_idx].copy().reset_index(drop=True)
707
-
708
- # 10) Attach Polymer_Name/Class if available (only for matches)
709
- report("Finalizing results…", 0.96)
710
- out = out.set_index("smiles_key", drop=False)
711
- out = out.join(polyinfo, how="left")
712
- out = out.reset_index(drop=True)
713
-
714
- # 11) Make a clean output bundle with requested columns
715
- # Keep SMILES (canonical), name/class, pareto layer, trust score, properties used
716
- keep = ["smiles_key", "polymer_name", "polymer_class", "pareto_layer", "trust_score"]
717
- for p in needed_props:
718
- mc = mean_col(p)
719
- sc = std_col(p)
720
- if mc in out.columns:
721
- keep.append(mc)
722
- if sc in out.columns:
723
- keep.append(sc)
724
-
725
- out = out[keep].rename(columns={"smiles_key": "SMILES"})
726
-
727
- stats = {
728
- "n_total": float(len(df)),
729
- "n_after_constraints": float(n_after),
730
- "n_pool": float(n_pool),
731
- "n_pareto_pool": float(n_pareto),
732
- "n_selected": float(len(out)),
733
- }
734
- report("Done.", 1.0)
735
- return out, stats, plot_df
736
-
737
-
738
def build_pareto_plot_df(spec: DiscoverySpec, max_plot_points: int = 30000) -> pd.DataFrame:
    """
    Returns a small dataframe for plotting (sampled), with objective columns and pareto_layer.

    Does NOT compute trust/diversity. Safe for live plotting.

    Mirrors the filtering portion of run_discovery: load only needed columns,
    normalize SMILES, apply hard constraints, cap the row count, then label
    up to 5 Pareto layers. Returns an empty frame when a constraint column is
    missing or everything is filtered out.
    """
    rng = np.random.default_rng(spec.random_seed)

    obj_props = [o["property"].lower() for o in spec.objectives]
    cons_props = [p.lower() for p in spec.hard_constraints.keys()]
    needed_props = sorted(set(obj_props + cons_props))

    cols = ["SMILES"] + [mean_col(p) for p in needed_props]
    df = load_parquet_columns(spec.dataset, columns=cols)

    # Some datasets use lowercase "smiles"; normalize to "SMILES".
    if "SMILES" not in df.columns and "smiles" in df.columns:
        df = df.rename(columns={"smiles": "SMILES"})

    df["smiles_key"] = df["SMILES"].astype(str).map(lambda s: normalize_smiles(s, spec.use_canonical_smiles))
    df = df.dropna(subset=["smiles_key"]).reset_index(drop=True)

    # Hard constraints: a missing column means nothing can satisfy the rule,
    # so an empty (but correctly-typed) frame is returned.
    for p, rule in spec.hard_constraints.items():
        p = p.lower()
        c = mean_col(p)
        if c not in df.columns:
            return df.iloc[0:0]
        if "min" in rule:
            df = df[df[c] >= float(rule["min"])]
        if "max" in rule:
            df = df[df[c] <= float(rule["max"])]

    if len(df) == 0:
        return df

    # Pareto cap for plotting: never plot more than max_plot_points, and
    # never more than the run's own pareto_max.
    plot_cap = min(int(max_plot_points), int(spec.pareto_max))
    if len(df) > plot_cap:
        idx = rng.choice(len(df), size=plot_cap, replace=False)
        df = df.iloc[idx].reset_index(drop=True)

    # Build objective matrix (minimization; "maximize" goals are negated).
    # Objectives whose column is absent are silently skipped.
    X = []
    resolved_obj_props = []
    for o in spec.objectives:
        prop = o["property"].lower()
        goal = o["goal"].lower()
        c = mean_col(prop)
        if c not in df.columns:
            continue
        v = df[c].to_numpy(dtype=float)
        if goal == "maximize":
            v = -v
        X.append(v)
        resolved_obj_props.append(prop)
    if not X:
        # No requested objective resolved: fall back to the first available
        # mean_* column, treated as a maximize objective (hence * -1.0).
        fallback_col = next((c for c in df.columns if str(c).startswith("mean_")), None)
        if fallback_col is None:
            return df.iloc[0:0]
        X = [df[fallback_col].to_numpy(dtype=float) * -1.0]
        resolved_obj_props = [fallback_col.replace("mean_", "")]
    X = np.stack(X, axis=1)

    df["pareto_layer"] = pareto_layers(X, max_layers=5)

    # Return only what plotting needs
    keep = ["smiles_key", "pareto_layer"] + [mean_col(p) for p in resolved_obj_props]
    out = df[keep].rename(columns={"smiles_key": "SMILES"})
    return out
806
-
807
-
808
def parse_spec(text: str, dataset_path: List[str], polyinfo_path: str, polyinfo_csv_path: str) -> DiscoverySpec:
    """
    Parse a JSON spec string into a DiscoverySpec.

    Thin wrapper around spec_from_dict so the string and dict entry points
    cannot drift apart (the previous body duplicated every field mapping of
    spec_from_dict verbatim).

    Raises:
        json.JSONDecodeError: if *text* is not valid JSON.
    """
    return spec_from_dict(json.loads(text), dataset_path, polyinfo_path, polyinfo_csv_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/discovery.py DELETED
@@ -1,767 +0,0 @@
1
- # src/discovery.py
2
- from __future__ import annotations
3
-
4
- import json
5
- from dataclasses import dataclass
6
- from pathlib import Path
7
- from typing import Callable, Dict, List, Optional, Tuple
8
-
9
- import numpy as np
10
- import pandas as pd
11
- from rdkit import Chem, DataStructs
12
- from rdkit.Chem import AllChem
13
- from . import sascorer
14
-
15
# Reuse your canonicalizer if you want; otherwise keep local
def canonicalize_smiles(smiles: str) -> Optional[str]:
    """Return the RDKit-canonical form of *smiles*, or None for empty/unparseable input."""
    text = (smiles or "").strip()
    if not text:
        return None
    mol = Chem.MolFromSmiles(text)
    return None if mol is None else Chem.MolToSmiles(mol, canonical=True)
24
-
25
-
26
- # -------------------------
27
- # Spec schema (minimal v0)
28
- # -------------------------
29
@dataclass
class DiscoverySpec:
    """Configuration for one discovery run (typically parsed from a JSON spec)."""

    dataset: List[str]  # parquet paths, e.g. ["PI1M_PROPERTY.parquet", "POLYINFO_PROPERTY.parquet"]
    polyinfo: str  # "POLYINFO_PROPERTY.parquet"
    polyinfo_csv: str  # "POLYINFO.csv" (name/class metadata keyed by SMILES)

    hard_constraints: Dict[str, Dict[str, float]]  # { "tg": {"min": 400}, "tc": {"max": 0.3} }
    objectives: List[Dict[str, str]]  # [{"property":"cp","goal":"maximize"}, ...]

    max_pool: int = 200000  # legacy (kept for compatibility; aligned to pareto_max)
    pareto_max: int = 50000  # cap points used for Pareto + diversity fingerprinting
    max_candidates: int = 30  # final output size
    max_pareto_fronts: int = 5  # how many Pareto layers to keep for candidate pool
    min_distance: float = 0.30  # diversity threshold in Tanimoto distance
    fingerprint: str = "morgan"  # morgan only for now
    random_seed: int = 7
    use_canonical_smiles: bool = True  # False skips RDKit parsing (fast mode)
    use_full_data: bool = False  # True disables the pareto_max sampling cap
    trust_weights: Dict[str, float] | None = None  # overrides for trust-score component weights
    selection_weights: Dict[str, float] | None = None  # overrides for pareto/trust selection mix
49
-
50
-
51
- # -------------------------
52
- # Column mapping
53
- # -------------------------
54
def mean_col(prop_key: str) -> str:
    """Column name holding the mean value of a property (e.g. "Tg" -> "mean_tg")."""
    return "mean_" + prop_key.lower()
56
-
57
def std_col(prop_key: str) -> str:
    """Column name holding the std-dev of a property (e.g. "Tg" -> "std_tg")."""
    return "std_" + prop_key.lower()
59
-
60
-
61
def normalize_weights(weights: Dict[str, float], defaults: Dict[str, float]) -> Dict[str, float]:
    """
    Merge *weights* over *defaults*, clamp negatives to zero, and rescale to sum 1.

    Only keys present in *defaults* are considered; values that cannot be
    coerced to float fall back to the default. If everything clamps to zero,
    a copy of *defaults* is returned unchanged.
    """
    clamped: Dict[str, float] = {}
    for key, default_val in defaults.items():
        raw = weights.get(key, default_val)
        try:
            value = float(raw)
        except Exception:
            value = float(default_val)
        clamped[key] = value if value > 0.0 else 0.0
    total = float(sum(clamped.values()))
    if total <= 0.0:
        return defaults.copy()
    return {key: float(val / total) for key, val in clamped.items()}
73
-
74
def spec_from_dict(obj: dict, dataset_path: List[str], polyinfo_path: str, polyinfo_csv_path: str) -> DiscoverySpec:
    """
    Build a DiscoverySpec from an already-parsed JSON dict.

    Missing keys fall back to the defaults shown below; pareto_max is also
    mirrored into the legacy max_pool field.
    """
    pareto_max = int(obj.get("pareto_max", 50000))
    return DiscoverySpec(
        dataset=list(dataset_path),
        polyinfo=polyinfo_path,
        polyinfo_csv=polyinfo_csv_path,
        hard_constraints=obj.get("hard_constraints", {}),
        objectives=obj.get("objectives", []),
        # Legacy field kept for compatibility; effectively collapsed to pareto_max.
        max_pool=pareto_max,
        pareto_max=pareto_max,
        max_candidates=int(obj.get("max_candidates", 30)),
        max_pareto_fronts=int(obj.get("max_pareto_fronts", 5)),
        min_distance=float(obj.get("min_distance", 0.30)),
        fingerprint=str(obj.get("fingerprint", "morgan")),
        random_seed=int(obj.get("random_seed", 7)),
        # Canonicalization is opt-in: the JSON flag defaults to skipping it.
        use_canonical_smiles=not bool(obj.get("skip_smiles_canonicalization", True)),
        use_full_data=bool(obj.get("use_full_data", False)),
        trust_weights=obj.get("trust_weights"),
        selection_weights=obj.get("selection_weights"),
    )
95
-
96
- # -------------------------
97
- # Parquet loading (safe)
98
- # -------------------------
99
def load_parquet_columns(path: str | List[str], columns: List[str]) -> pd.DataFrame:
    """
    Load only requested columns from Parquet (critical for 1M rows).

    Accepts a single path or a list of paths and concatenates rows.
    Column names are resolved case-insensitively against each file's schema;
    requested columns absent from a file come back filled with NaN so every
    returned frame has exactly the requested columns, in order.
    """
    def _load_one(fp: str, req_cols: List[str]) -> pd.DataFrame:
        # Probe the schema first so we never ask pandas for a missing column.
        available: list[str]
        try:
            import pyarrow.parquet as pq

            pf = pq.ParquetFile(fp)
            available = [str(c) for c in pf.schema.names]
        except Exception:
            # If schema probing fails, fall back to direct read with requested columns.
            return pd.read_parquet(fp, columns=req_cols)

        available_set = set(available)
        lower_to_actual = {c.lower(): c for c in available}

        # Resolve requested names against actual parquet schema
        # (exact match first, then case-insensitive).
        resolved: dict[str, str] = {}
        for req in req_cols:
            if req in available_set:
                resolved[req] = req
                continue
            alt = lower_to_actual.get(str(req).lower())
            if alt is not None:
                resolved[req] = alt

        use_cols = sorted(set(resolved.values()))
        if not use_cols:
            # Nothing resolved: return an empty frame with the requested schema.
            return pd.DataFrame(columns=req_cols)

        out = pd.read_parquet(fp, columns=use_cols)
        # Backfill unresolved columns with NaN and alias case-mismatched ones.
        for req in req_cols:
            src = resolved.get(req)
            if src is None:
                out[req] = np.nan
            elif src != req:
                out[req] = out[src]
        return out[req_cols]

    if isinstance(path, (list, tuple)):
        frames = [_load_one(p, columns) for p in path]
        if not frames:
            return pd.DataFrame(columns=columns)
        return pd.concat(frames, ignore_index=True)
    return _load_one(path, columns)
147
-
148
-
149
def normalize_smiles(smiles: str, use_canonical_smiles: bool) -> Optional[str]:
    """
    Normalize a SMILES string for use as a join key.

    Returns the stripped input when use_canonical_smiles is False (fast mode,
    no RDKit parsing at all), the RDKit-canonical SMILES when True, or None
    for empty/unparseable input.
    """
    s = (smiles or "").strip()
    if not s:
        return None
    if not use_canonical_smiles:
        # Skip RDKit parsing entirely in fast mode.
        return s
    m = Chem.MolFromSmiles(s)
    if m is None:
        return None
    # use_canonical_smiles is necessarily True here; the previous re-check
    # and trailing `return s` were unreachable dead code.
    return Chem.MolToSmiles(m, canonical=True)
162
-
163
-
164
def load_polyinfo_index(polyinfo_csv_path: str, use_canonical_smiles: bool = True) -> pd.DataFrame:
    """
    Load the POLYINFO metadata CSV and index it by normalized SMILES.

    Expected CSV columns: SMILES, Polymer_Class, polymer_name (or common variants).
    Returns dataframe with index on smiles_key and columns polymer_name/polymer_class.
    Rows whose SMILES fail normalization are dropped; duplicate keys keep the
    first occurrence.

    Raises:
        ValueError: if the CSV has neither a SMILES nor a smiles column.
    """
    df = pd.read_csv(polyinfo_csv_path)

    # Normalize column names directly against df.columns.
    if "SMILES" in df.columns:
        df = df.rename(columns={"SMILES": "smiles"})
    elif "smiles" not in df.columns:
        raise ValueError(f"{polyinfo_csv_path} missing SMILES/smiles column")

    # Map the known header-case variants onto canonical lowercase names.
    if "Polymer_Name" in df.columns:
        df = df.rename(columns={"Polymer_Name": "polymer_name"})
    if "polymer_Name" in df.columns:
        df = df.rename(columns={"polymer_Name": "polymer_name"})
    if "Polymer_Class" in df.columns:
        df = df.rename(columns={"Polymer_Class": "polymer_class"})

    # Guarantee both metadata columns exist even if the CSV lacks them.
    if "polymer_name" not in df.columns:
        df["polymer_name"] = pd.NA
    if "polymer_class" not in df.columns:
        df["polymer_class"] = pd.NA

    df["smiles_key"] = df["smiles"].astype(str).map(lambda s: normalize_smiles(s, use_canonical_smiles))
    df = df.dropna(subset=["smiles_key"]).drop_duplicates("smiles_key")
    df = df.set_index("smiles_key", drop=True)
    return df[["polymer_name", "polymer_class"]]
195
-
196
-
197
- # -------------------------
198
- # Pareto (2–3 objectives)
199
- # -------------------------
200
def pareto_front_mask(X: np.ndarray) -> np.ndarray:
    """
    Boolean mask of the nondominated rows of X.

    X: (N, M) with every objective to be MINIMIZED (callers negate
    maximize-goals before calling). A row is dropped when some other row is
    <= in every column and strictly < in at least one. Exact duplicates all
    stay on the front.
    """
    n = X.shape[0]
    keep = np.ones(n, dtype=bool)
    for i in range(n):
        if not keep[i]:
            continue
        row = X[i]
        # Anyone (even an already-dropped point) weakly better everywhere
        # and strictly better somewhere dominates row i.
        weakly_le = np.all(X <= row, axis=1)
        strictly_lt = np.any(X < row, axis=1)
        if bool(np.any(weakly_le & strictly_lt)):
            keep[i] = False
            continue
        # Row i survives; drop everything it dominates.
        weakly_ge = np.all(row <= X, axis=1)
        strictly_gt = np.any(row < X, axis=1)
        keep[weakly_ge & strictly_gt] = False
        keep[i] = True
    return keep
222
-
223
-
224
def pareto_layers(X: np.ndarray, max_layers: int = 10) -> np.ndarray:
    """
    Assign a Pareto layer to every row of X (objectives minimized).

    Layer 1 is the nondominated front; peeling it away exposes layer 2, and
    so on. Points not reached within max_layers keep layer 0.
    """
    out = np.zeros(X.shape[0], dtype=int)
    pending = np.arange(X.shape[0])
    for depth in range(1, max_layers + 1):
        if pending.size == 0:
            break
        on_front = pareto_front_mask(X[pending])
        out[pending[on_front]] = depth
        pending = pending[~on_front]
    return out
241
-
242
-
243
def pareto_front_mask_chunked(
    X: np.ndarray,
    chunk_size: int = 100000,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> np.ndarray:
    """
    Exact global Pareto front mask via chunk-local front reduction + global reconcile.

    This is exact for front-1:
        1) compute exact local front within each chunk
        2) union local fronts
        3) compute exact front on the union
    (A global front point is nondominated within its own chunk, so it always
    survives step 1 and is found in step 3.)

    progress_callback, if given, is called as (done_chunks, total_chunks)
    after each chunk of step 1.
    """
    N = X.shape[0]
    if N <= chunk_size:
        # Small enough: single exact pass, reported as one chunk.
        if progress_callback is not None:
            progress_callback(1, 1)
        return pareto_front_mask(X)

    local_front_idx = []
    total_chunks = (N + chunk_size - 1) // chunk_size
    done_chunks = 0
    for start in range(0, N, chunk_size):
        end = min(start + chunk_size, N)
        idx = np.arange(start, end)
        mask_local = pareto_front_mask(X[idx])
        local_front_idx.append(idx[mask_local])
        done_chunks += 1
        if progress_callback is not None:
            progress_callback(done_chunks, total_chunks)

    if not local_front_idx:
        return np.zeros(N, dtype=bool)

    # Reconcile: exact front over the union of the per-chunk survivors.
    reduced_idx = np.concatenate(local_front_idx)
    reduced_mask = pareto_front_mask(X[reduced_idx])
    front_idx = reduced_idx[reduced_mask]

    out = np.zeros(N, dtype=bool)
    out[front_idx] = True
    return out
283
-
284
-
285
def pareto_layers_chunked(
    X: np.ndarray,
    max_layers: int = 10,
    chunk_size: int = 100000,
    progress_callback: Optional[Callable[[int, int, int], None]] = None,
) -> np.ndarray:
    """
    Exact Pareto layers using repeated exact chunked front extraction.

    Same layer semantics as pareto_layers (1 = front, 0 = beyond max_layers),
    but each front is extracted with pareto_front_mask_chunked so very large
    inputs stay memory-bounded. progress_callback, if given, is called as
    (layer, done_chunks, total_chunks).
    """
    N = X.shape[0]
    layers = np.zeros(N, dtype=int)
    remaining = np.arange(N)
    layer = 1

    while remaining.size > 0 and layer <= max_layers:
        # NOTE: on_chunk closes over the loop variable `layer`; this is safe
        # because the callback only fires synchronously inside this iteration.
        def on_chunk(done: int, total: int) -> None:
            if progress_callback is not None:
                progress_callback(layer, done, total)

        mask = pareto_front_mask_chunked(X[remaining], chunk_size=chunk_size, progress_callback=on_chunk)
        front_idx = remaining[mask]
        layers[front_idx] = layer
        remaining = remaining[~mask]
        layer += 1

    return layers
311
-
312
-
313
- # -------------------------
314
- # Fingerprints & diversity
315
- # -------------------------
316
def morgan_fp(smiles: str, radius: int = 2, nbits: int = 2048):
    """Morgan bit-vector fingerprint for a SMILES, or None if it fails to parse."""
    m = Chem.MolFromSmiles(smiles)
    if m is None:
        return None
    return AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=nbits)
321
-
322
def tanimoto_distance(fp1, fp2) -> float:
    """Tanimoto distance (1 - similarity) between two RDKit bit vectors; 0 = identical."""
    return 1.0 - DataStructs.TanimotoSimilarity(fp1, fp2)
324
-
325
def greedy_diverse_select(
    smiles_list: List[str],
    scores: np.ndarray,
    max_k: int,
    min_dist: float,
) -> List[int]:
    """
    Pick up to max_k indices of smiles_list, greedily by descending score,
    rejecting any candidate closer than min_dist (Tanimoto distance) to an
    already-picked one. SMILES that fail fingerprinting are skipped entirely.
    Returns indices into smiles_list.
    """
    fingerprints = []
    keepable: List[int] = []
    for pos, smi in enumerate(smiles_list):
        bv = morgan_fp(smi)
        if bv is None:
            continue
        fingerprints.append(bv)
        keepable.append(pos)

    if not keepable:
        return []

    # Highest score first; fingerprints stays index-aligned with keepable.
    ranking = np.argsort(-scores[keepable])
    picked: List[int] = []
    picked_fps = []

    for rank_pos in ranking:
        cand_fp = fingerprints[rank_pos]
        if any(tanimoto_distance(cand_fp, prev) < min_dist for prev in picked_fps):
            continue
        picked.append(keepable[rank_pos])
        picked_fps.append(cand_fp)
        if len(picked) >= max_k:
            break

    return picked
366
-
367
-
368
- # -------------------------
369
- # Trust score (lightweight, robust)
370
- # -------------------------
371
def internal_consistency_penalty(row: pd.Series) -> float:
    """
    Very simple physics/validity checks. Penalty in [0,1].

    Each applicable rule contributes one check; the penalty is the fraction
    of checks violated (0.0 when no rule applies). Adjust/add rules later.
    """
    viol = 0
    total = 0

    def chk(cond: bool):
        # Tally one check; count it as a violation when cond is False.
        nonlocal viol, total
        total += 1
        if not cond:
            viol += 1

    # Positivity checks if present: temperatures/bandgap may be zero,
    # the other transport/thermo properties must be strictly positive.
    for p in ["cp", "tc", "rho", "dif", "visc", "tg", "tm", "bandgap"]:
        c = mean_col(p)
        if c in row.index and pd.notna(row[c]):
            if p in ["bandgap", "tg", "tm"]:
                chk(float(row[c]) >= 0.0)
            else:
                chk(float(row[c]) > 0.0)

    # Poisson ratio bounds if present (isotropic-material range).
    if mean_col("poisson") in row.index and pd.notna(row[mean_col("poisson")]):
        v = float(row[mean_col("poisson")])
        chk(0.0 <= v <= 0.5)

    # Tg <= Tm if both present
    if mean_col("tg") in row.index and mean_col("tm") in row.index:
        if pd.notna(row[mean_col("tg")]) and pd.notna(row[mean_col("tm")]):
            chk(float(row[mean_col("tg")]) <= float(row[mean_col("tm")]))

    if total == 0:
        return 0.0
    return viol / total
407
-
408
-
409
def synthesizability_score(smiles: str) -> float:
    """
    RDKit SA-score based synthesizability proxy in [0,1] (higher = easier).

    SA-score is ~[1 (easy), 10 (hard)]; we map 1 -> 1.0 and 10 -> 0.0 linearly.
    Unparseable SMILES and scorer failures both score 0.0.
    """
    m = Chem.MolFromSmiles(smiles)
    if m is None:
        return 0.0

    # Guard against unexpected scorer failures / None for edge-case molecules.
    try:
        sa_raw = sascorer.calculateScore(m)
    except Exception:
        return 0.0
    if sa_raw is None:
        return 0.0

    sa = float(sa_raw)  # ~ 1..10
    s_syn = 1.0 - (sa - 1.0) / 9.0  # linear map to [0,1]
    return float(np.clip(s_syn, 0.0, 1.0))
430
-
431
-
432
def compute_trust_scores(
    df: pd.DataFrame,
    real_fps: List,
    real_smiles: List[str],
    trust_weights: Dict[str, float] | None = None,
) -> np.ndarray:
    """
    Trust score in [0,1] per row (higher = more trustworthy / lower risk).

    Components (weights normalized via normalize_weights; defaults below):
      - real:        fingerprint similarity to the nearest real polymer
      - consistency: 1 - internal_consistency_penalty
      - uncertainty: 1 - penalty derived from the mean of any std_* columns
      - synth:       SA-score based synthesizability proxy

    real_smiles is currently unused but kept for interface compatibility.
    O(N * len(real_fps)); intended for the small post-filter candidate set.
    """
    N = len(df)
    trust = np.zeros(N, dtype=float)
    tw_defaults = {"real": 0.45, "consistency": 0.25, "uncertainty": 0.10, "synth": 0.20}
    tw = normalize_weights(trust_weights or {}, tw_defaults)

    smiles_col = "smiles_key" if "smiles_key" in df.columns else "smiles_canon"
    # Hoisted out of the row loop: the std_* column set is loop-invariant
    # and was previously recomputed for every row.
    std_cols = [c for c in df.columns if c.startswith("std_")]

    for i in range(N):
        row = df.iloc[i]
        s = row[smiles_col]

        # Distance to the nearest real polymer; 1.0 when there is no
        # fingerprint or no reference set.
        fp = morgan_fp(s)
        if fp is None or not real_fps:
            d_real = 1.0
        else:
            sims = DataStructs.BulkTanimotoSimilarity(fp, real_fps)
            d_real = 1.0 - float(max(sims))

        # internal consistency
        pen_cons = internal_consistency_penalty(row)

        # Uncertainty: mean of available std_* values squashed into [0,1).
        if std_cols:
            std_vals = row[std_cols].astype(float)
            std_vals = std_vals.replace([np.inf, -np.inf], np.nan).dropna()
            pen_unc = float(np.clip(std_vals.mean() / (std_vals.mean() + 1.0), 0.0, 1.0)) if len(std_vals) else 0.0
        else:
            pen_unc = 0.0

        # synthesizability heuristic
        s_syn = synthesizability_score(s)

        # Combine (tunable weights).
        # Lower distance to real is better -> convert to score.
        s_real = 1.0 - np.clip(d_real, 0.0, 1.0)

        trust[i] = (
            tw["real"] * s_real +
            tw["consistency"] * (1.0 - pen_cons) +
            tw["uncertainty"] * (1.0 - pen_unc) +
            tw["synth"] * s_syn
        )

    return np.clip(trust, 0.0, 1.0)
491
-
492
-
493
- # -------------------------
494
- # Main pipeline
495
- # -------------------------
496
def run_discovery(
    spec: DiscoverySpec,
    progress_callback: Optional[Callable[[str, float], None]] = None,
) -> Tuple[pd.DataFrame, Dict[str, float], pd.DataFrame]:
    """
    Full discovery pipeline: load -> constrain -> Pareto -> trust -> diversity.

    Returns (selected, stats, plot_df):
      selected - up to spec.max_candidates rows with SMILES, polymer
                 name/class, pareto_layer, trust_score and property columns
      stats    - float-valued counters for each pipeline stage
      plot_df  - all Pareto-labelled points for plotting
    progress_callback, if given, receives (step_label, fraction_done).
    """
    def report(step: str, pct: float) -> None:
        if progress_callback is not None:
            progress_callback(step, pct)

    rng = np.random.default_rng(spec.random_seed)

    # 1) Determine required columns
    report("Preparing columns…", 0.02)
    obj_props = [o["property"].lower() for o in spec.objectives]
    cons_props = [p.lower() for p in spec.hard_constraints.keys()]

    needed_props = sorted(set(obj_props + cons_props))
    cols = ["SMILES"] + [mean_col(p) for p in needed_props]

    # include std columns if available (not required, but used for trust)
    std_cols = [std_col(p) for p in needed_props]
    cols += std_cols

    # 2) Load only needed columns
    report("Loading data from parquet…", 0.05)
    df = load_parquet_columns(spec.dataset, columns=[c for c in cols if c != "SMILES"] + ["SMILES"])
    # normalize the SMILES column name, then build the join key
    if "SMILES" not in df.columns and "smiles" in df.columns:
        df = df.rename(columns={"smiles": "SMILES"})
    normalize_step = "Canonicalizing SMILES…" if spec.use_canonical_smiles else "Skipping SMILES normalization…"
    report(normalize_step, 0.10)
    df["smiles_key"] = df["SMILES"].astype(str).map(lambda s: normalize_smiles(s, spec.use_canonical_smiles))
    df = df.dropna(subset=["smiles_key"]).reset_index(drop=True)

    # 3) Hard constraints
    report("Applying constraints…", 0.22)
    for p, rule in spec.hard_constraints.items():
        p = p.lower()
        c = mean_col(p)
        if c not in df.columns:
            # if missing, nothing can satisfy
            df = df.iloc[0:0]
            break
        if "min" in rule:
            df = df[df[c] >= float(rule["min"])]
        if "max" in rule:
            df = df[df[c] <= float(rule["max"])]

    n_after = len(df)
    if n_after == 0:
        empty_stats = {"n_total": 0, "n_after_constraints": 0, "n_pool": 0, "n_pareto_pool": 0, "n_selected": 0}
        return df, empty_stats, pd.DataFrame()

    n_pool = len(df)

    # 5) Prepare objective matrix for Pareto
    report("Building objective matrix…", 0.30)
    # convert to minimization: maximize => negate
    X = []
    for o in spec.objectives:
        prop = o["property"].lower()
        goal = o["goal"].lower()
        c = mean_col(prop)
        if c not in df.columns:
            raise ValueError(f"Objective column missing: {c}")
        v = df[c].to_numpy(dtype=float)
        if goal == "maximize":
            v = -v
        X.append(v)
    X = np.stack(X, axis=1)  # (N, M)

    # Pareto cap before computing layers (optional safety);
    # df and X are subsampled together so rows stay aligned.
    if spec.use_full_data:
        report("Using full dataset (no Pareto cap)…", 0.35)
    elif len(df) > spec.pareto_max:
        idx = rng.choice(len(df), size=spec.pareto_max, replace=False)
        df = df.iloc[idx].reset_index(drop=True)
        X = X[idx]

    # 6) Pareto layers (only 5 layers needed for candidate pool)
    report("Computing Pareto layers…", 0.40)
    pareto_start = 0.40
    pareto_end = 0.54
    max_layers_for_pool = max(1, int(spec.max_pareto_fronts))
    pareto_chunk_ref = {"chunks_per_layer": None}

    def on_pareto_chunk(layer_i: int, done_chunks: int, total_chunks: int) -> None:
        # Translate (layer, chunk) progress into one overall fraction within
        # [pareto_start, pareto_end]. The first layer's chunk count is used
        # as the per-layer reference for the percentage text.
        if pareto_chunk_ref["chunks_per_layer"] is None:
            pareto_chunk_ref["chunks_per_layer"] = max(1, int(total_chunks))
        ref_chunks = pareto_chunk_ref["chunks_per_layer"]
        total_units = max_layers_for_pool * ref_chunks
        done_units = min(total_units, ((layer_i - 1) * ref_chunks) + done_chunks)
        pareto_pct = int(round(100.0 * done_units / max(1, total_units)))

        layer_progress = done_chunks / max(1, total_chunks)
        overall = ((layer_i - 1) + layer_progress) / max_layers_for_pool
        pct = pareto_start + (pareto_end - pareto_start) * min(1.0, max(0.0, overall))
        report(
            f"Computing Pareto layers… {pareto_pct}% (Layer {layer_i}/{max_layers_for_pool}, chunk {done_chunks}/{total_chunks})",
            pct,
        )

    layers = pareto_layers_chunked(
        X,
        max_layers=max_layers_for_pool,
        chunk_size=100000,
        progress_callback=on_pareto_chunk,
    )
    report("Computing Pareto layers…", pareto_end)
    df["pareto_layer"] = layers
    plot_df = df[["smiles_key"] + [mean_col(p) for p in obj_props] + ["pareto_layer"]].copy()
    plot_df = plot_df.rename(columns={"smiles_key": "SMILES"})

    # Keep first few layers as candidate pool (avoid huge set)
    cand = df[df["pareto_layer"].between(1, max_layers_for_pool)].copy()
    if cand.empty:
        cand = df[df["pareto_layer"] == 1].copy()
    cand = cand.reset_index(drop=True)
    n_pareto = len(cand)

    # 7) Load real polymer metadata and fingerprints (from POLYINFO.csv)
    report("Loading POLYINFO index…", 0.55)
    polyinfo = load_polyinfo_index(spec.polyinfo_csv, use_canonical_smiles=spec.use_canonical_smiles)
    real_smiles = polyinfo.index.to_list()

    report("Building real-polymer fingerprints…", 0.60)
    real_fps = []
    for s in real_smiles:
        fp = morgan_fp(s)
        if fp is not None:
            real_fps.append(fp)

    # 8) Trust score on candidate pool (safe size)
    report("Computing trust scores…", 0.70)
    trust = compute_trust_scores(
        cand,
        real_fps=real_fps,
        real_smiles=real_smiles,
        trust_weights=spec.trust_weights,
    )
    cand["trust_score"] = trust

    # 9) Diversity selection on candidate pool
    report("Diversity selection…", 0.88)
    # score for selection: prioritize Pareto layer 1 then trust
    # higher is better; layer 1 -> bonus 1.0, deepest kept layer -> 1/max_layers
    sw_defaults = {"pareto": 0.60, "trust": 0.40}
    sw = normalize_weights(spec.selection_weights or {}, sw_defaults)
    pareto_bonus = (
        (max_layers_for_pool + 1) - np.clip(cand["pareto_layer"].to_numpy(dtype=int), 1, max_layers_for_pool)
    ) / float(max_layers_for_pool)
    sel_score = sw["pareto"] * pareto_bonus + sw["trust"] * cand["trust_score"].to_numpy(dtype=float)

    chosen_idx = greedy_diverse_select(
        smiles_list=cand["smiles_key"].tolist(),
        scores=sel_score,
        max_k=spec.max_candidates,
        min_dist=spec.min_distance,
    )
    out = cand.iloc[chosen_idx].copy().reset_index(drop=True)

    # 10) Attach Polymer_Name/Class if available (only for matches)
    report("Finalizing results…", 0.96)
    out = out.set_index("smiles_key", drop=False)
    out = out.join(polyinfo, how="left")
    out = out.reset_index(drop=True)

    # 11) Make a clean output bundle with requested columns
    # Keep SMILES (canonical), name/class, pareto layer, trust score, properties used
    keep = ["smiles_key", "polymer_name", "polymer_class", "pareto_layer", "trust_score"]
    for p in needed_props:
        mc = mean_col(p)
        sc = std_col(p)
        if mc in out.columns:
            keep.append(mc)
        if sc in out.columns:
            keep.append(sc)

    out = out[keep].rename(columns={"smiles_key": "SMILES"})

    # NOTE(review): "n_total" reflects df AFTER constraint filtering and the
    # optional pareto_max subsample, not the row count originally loaded —
    # confirm this is the intended meaning before relying on it.
    stats = {
        "n_total": float(len(df)),
        "n_after_constraints": float(n_after),
        "n_pool": float(n_pool),
        "n_pareto_pool": float(n_pareto),
        "n_selected": float(len(out)),
    }
    report("Done.", 1.0)
    return out, stats, plot_df
684
-
685
-
686
- def build_pareto_plot_df(spec: DiscoverySpec, max_plot_points: int = 30000) -> pd.DataFrame:
687
- """
688
- Returns a small dataframe for plotting (sampled), with objective columns and pareto_layer.
689
- Does NOT compute trust/diversity. Safe for live plotting.
690
- """
691
- rng = np.random.default_rng(spec.random_seed)
692
-
693
- obj_props = [o["property"].lower() for o in spec.objectives]
694
- cons_props = [p.lower() for p in spec.hard_constraints.keys()]
695
- needed_props = sorted(set(obj_props + cons_props))
696
-
697
- cols = ["SMILES"] + [mean_col(p) for p in needed_props]
698
- df = load_parquet_columns(spec.dataset, columns=cols)
699
-
700
- if "SMILES" not in df.columns and "smiles" in df.columns:
701
- df = df.rename(columns={"smiles": "SMILES"})
702
-
703
- df["smiles_key"] = df["SMILES"].astype(str).map(lambda s: normalize_smiles(s, spec.use_canonical_smiles))
704
- df = df.dropna(subset=["smiles_key"]).reset_index(drop=True)
705
-
706
- # Hard constraints
707
- for p, rule in spec.hard_constraints.items():
708
- p = p.lower()
709
- c = mean_col(p)
710
- if c not in df.columns:
711
- return df.iloc[0:0]
712
- if "min" in rule:
713
- df = df[df[c] >= float(rule["min"])]
714
- if "max" in rule:
715
- df = df[df[c] <= float(rule["max"])]
716
-
717
- if len(df) == 0:
718
- return df
719
-
720
- # Pareto cap for plotting
721
- plot_cap = min(int(max_plot_points), int(spec.pareto_max))
722
- if len(df) > plot_cap:
723
- idx = rng.choice(len(df), size=plot_cap, replace=False)
724
- df = df.iloc[idx].reset_index(drop=True)
725
-
726
- # Build objective matrix (minimization)
727
- X = []
728
- for o in spec.objectives:
729
- prop = o["property"].lower()
730
- goal = o["goal"].lower()
731
- c = mean_col(prop)
732
- v = df[c].to_numpy(dtype=float)
733
- if goal == "maximize":
734
- v = -v
735
- X.append(v)
736
- X = np.stack(X, axis=1)
737
-
738
- df["pareto_layer"] = pareto_layers(X, max_layers=5)
739
-
740
- # Return only what plotting needs
741
- keep = ["smiles_key", "pareto_layer"] + [mean_col(p) for p in obj_props]
742
- out = df[keep].rename(columns={"smiles_key": "SMILES"})
743
- return out
744
-
745
-
746
- def parse_spec(text: str, dataset_path: List[str], polyinfo_path: str, polyinfo_csv_path: str) -> DiscoverySpec:
747
- obj = json.loads(text)
748
- pareto_max = int(obj.get("pareto_max", 50000))
749
-
750
- return DiscoverySpec(
751
- dataset=list(dataset_path),
752
- polyinfo=polyinfo_path,
753
- polyinfo_csv=polyinfo_csv_path,
754
- hard_constraints=obj.get("hard_constraints", {}),
755
- objectives=obj.get("objectives", []),
756
- max_pool=pareto_max,
757
- pareto_max=pareto_max,
758
- max_candidates=int(obj.get("max_candidates", 30)),
759
- max_pareto_fronts=int(obj.get("max_pareto_fronts", 5)),
760
- min_distance=float(obj.get("min_distance", 0.30)),
761
- fingerprint=str(obj.get("fingerprint", "morgan")),
762
- random_seed=int(obj.get("random_seed", 7)),
763
- use_canonical_smiles=not bool(obj.get("skip_smiles_canonicalization", True)),
764
- use_full_data=bool(obj.get("use_full_data", False)),
765
- trust_weights=obj.get("trust_weights"),
766
- selection_weights=obj.get("selection_weights"),
767
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/fpscores.pkl.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9abb6f4c322d27fa05c8a1115a463bcef312d2bed0b447c347de33bfefa83316
3
- size 132
 
 
 
 
src/lookup.py DELETED
@@ -1,222 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import pandas as pd
4
- import streamlit as st
5
- from rdkit import Chem
6
- from rdkit import RDLogger
7
-
8
- RDLogger.DisableLog("rdApp.*")
9
-
10
- # ----------------------------
11
- # Sources (property value files)
12
- # ----------------------------
13
- SOURCES = ["EXP", "MD", "DFT", "GC"]
14
-
15
- SOURCE_LABELS = {
16
- "EXP": "Experimental",
17
- "MD": "Molecular Dynamics",
18
- "DFT": "Density Functional Theory",
19
- "GC": "Group Contribution",
20
- }
21
-
22
- # ----------------------------
23
- # PolyInfo metadata file (name/class)
24
- # ----------------------------
25
- POLYINFO_FILE = "data/POLYINFO.csv" # contains: SMILES, Polymer_Class, Polymer_Name
26
-
27
-
28
- def canonicalize_smiles(smiles: str) -> str | None:
29
- smiles = (smiles or "").strip()
30
- if not smiles:
31
- return None
32
- mol = Chem.MolFromSmiles(smiles)
33
- if mol is None:
34
- return None
35
- return Chem.MolToSmiles(mol, canonical=True)
36
-
37
-
38
- # --- Property meta (full name + unit) ---
39
- PROPERTY_META = {
40
- # Thermal
41
- "tm": {"name": "Melting temperature", "unit": "K"},
42
- "tg": {"name": "Glass transition temperature", "unit": "K"},
43
- "td": {"name": "Thermal diffusivity", "unit": "m^2/s"},
44
- "tc": {"name": "Thermal conductivity", "unit": "W/m·K"},
45
- "cp": {"name": "Specific heat capacity", "unit": "J/kg·K"},
46
- # Mechanical
47
- "young": {"name": "Young's modulus", "unit": "GPa"},
48
- "shear": {"name": "Shear modulus", "unit": "GPa"},
49
- "bulk": {"name": "Bulk modulus", "unit": "GPa"},
50
- "poisson": {"name": "Poisson ratio", "unit": "-"},
51
- # Transport
52
- "visc": {"name": "Viscosity", "unit": "Pa·s"},
53
- "dif": {"name": "Diffusivity", "unit": "cm^2/s"},
54
- # Gas permeability
55
- "phe": {"name": "He permeability", "unit": "Barrer"},
56
- "ph2": {"name": "H2 permeability", "unit": "Barrer"},
57
- "pco2": {"name": "CO2 permeability", "unit": "Barrer"},
58
- "pn2": {"name": "N2 permeability", "unit": "Barrer"},
59
- "po2": {"name": "O2 permeability", "unit": "Barrer"},
60
- "pch4": {"name": "CH4 permeability", "unit": "Barrer"},
61
- # Electronic / Optical
62
- "alpha": {"name": "Polarizability", "unit": "a.u."},
63
- "homo": {"name": "HOMO energy", "unit": "eV"},
64
- "lumo": {"name": "LUMO energy", "unit": "eV"},
65
- "bandgap": {"name": "Band gap", "unit": "eV"},
66
- "mu": {"name": "Dipole moment", "unit": "Debye"},
67
- "etotal": {"name": "Total electronic energy", "unit": "eV"},
68
- "ri": {"name": "Refractive index", "unit": "-"},
69
- "dc": {"name": "Dielectric constant", "unit": "-"},
70
- "pe": {"name": "Permittivity", "unit": "-"},
71
- # Structural / Physical
72
- "rg": {"name": "Radius of gyration", "unit": "Å"},
73
- "rho": {"name": "Density", "unit": "g/cm^3"},
74
- }
75
-
76
-
77
- @st.cache_data
78
- def load_source_csv(source: str) -> pd.DataFrame:
79
- """
80
- Loads data/{SOURCE}.csv, normalizes:
81
- - SMILES column -> 'smiles'
82
- - property columns -> lowercase
83
- - adds 'smiles_canon'
84
- """
85
- path = f"data/{source}.csv"
86
- df = pd.read_csv(path)
87
-
88
- # Normalize SMILES column name
89
- if "SMILES" in df.columns:
90
- df = df.rename(columns={"SMILES": "smiles"})
91
- elif "smiles" not in df.columns:
92
- raise ValueError(f"{path} missing SMILES column")
93
-
94
- # Normalize property column names to lowercase
95
- rename_map = {c: c.lower() for c in df.columns if c != "smiles"}
96
- df = df.rename(columns=rename_map)
97
-
98
- # Canonicalize SMILES
99
- df["smiles_canon"] = df["smiles"].astype(str).apply(canonicalize_smiles)
100
- df = df.dropna(subset=["smiles_canon"]).reset_index(drop=True)
101
-
102
- return df
103
-
104
-
105
- @st.cache_data
106
- def build_index(df: pd.DataFrame) -> dict[str, int]:
107
- """canonical smiles -> row index (first occurrence)"""
108
- idx: dict[str, int] = {}
109
- for i, s in enumerate(df["smiles_canon"].tolist()):
110
- if s and s not in idx:
111
- idx[s] = i
112
- return idx
113
-
114
-
115
- @st.cache_data
116
- def load_polyinfo_csv() -> pd.DataFrame:
117
- """
118
- Loads data/POLYINFO.csv with columns:
119
- SMILES, Polymer_Class, Polymer_Name
120
- Adds canonical smiles column 'smiles_canon'.
121
- Returns empty df if file missing.
122
- """
123
- try:
124
- df = pd.read_csv(POLYINFO_FILE)
125
- except Exception:
126
- return pd.DataFrame(columns=["smiles", "polymer_class", "polymer_name", "smiles_canon"])
127
-
128
- # Normalize columns
129
- if "SMILES" in df.columns:
130
- df = df.rename(columns={"SMILES": "smiles"})
131
- elif "smiles" not in df.columns:
132
- # If the file doesn't have a SMILES column as expected, return empty gracefully
133
- return pd.DataFrame(columns=["smiles", "polymer_class", "polymer_name", "smiles_canon"])
134
-
135
- # Normalize expected meta columns
136
- ren = {}
137
- if "Polymer_Class" in df.columns:
138
- ren["Polymer_Class"] = "polymer_class"
139
- if "Polymer_Name" in df.columns:
140
- ren["Polymer_Name"] = "polymer_name"
141
- df = df.rename(columns=ren)
142
-
143
- # Ensure the columns exist (even if missing in the file)
144
- if "polymer_class" not in df.columns:
145
- df["polymer_class"] = pd.NA
146
- if "polymer_name" not in df.columns:
147
- df["polymer_name"] = pd.NA
148
-
149
- # Canonicalize smiles
150
- df["smiles_canon"] = df["smiles"].astype(str).apply(canonicalize_smiles)
151
- df = df.dropna(subset=["smiles_canon"]).reset_index(drop=True)
152
-
153
- return df
154
-
155
-
156
- @st.cache_data
157
- def load_all_sources():
158
- """
159
- Returns dict:
160
- db["EXP"/"MD"/"DFT"/"GC"] = {"df": df, "idx": idx}
161
- db["POLYINFO"] = {"df": df, "idx": idx}
162
- """
163
- db = {}
164
- for src in SOURCES:
165
- df = load_source_csv(src)
166
- idx = build_index(df)
167
- db[src] = {"df": df, "idx": idx}
168
-
169
- # PolyInfo metadata
170
- pi_df = load_polyinfo_csv()
171
- pi_idx = build_index(pi_df) if not pi_df.empty else {}
172
- db["POLYINFO"] = {"df": pi_df, "idx": pi_idx}
173
-
174
- return db
175
-
176
-
177
- def get_value(db, source: str, smiles_canon: str, prop_key: str):
178
- pack = db[source]
179
- df, idx = pack["df"], pack["idx"]
180
- row_i = idx.get(smiles_canon, None)
181
- if row_i is None:
182
- return None
183
- if prop_key not in df.columns:
184
- return None
185
- val = df.iloc[row_i][prop_key]
186
- if pd.isna(val):
187
- return None
188
- return float(val)
189
-
190
-
191
- def get_polyinfo(db, smiles_canon: str) -> tuple[str | None, str | None]:
192
- """
193
- Returns (polymer_name, polymer_class) if available, else (None, None).
194
- No 'not available' text here.
195
- """
196
- pack = db.get("POLYINFO", None)
197
- if pack is None:
198
- return None, None
199
-
200
- df, idx = pack["df"], pack["idx"]
201
- if df is None or df.empty:
202
- return None, None
203
-
204
- row_i = idx.get(smiles_canon, None)
205
- if row_i is None:
206
- return None, None
207
-
208
- name = df.iloc[row_i].get("polymer_name", None)
209
- cls = df.iloc[row_i].get("polymer_class", None)
210
-
211
- # Clean up NA / empty
212
- if pd.isna(name) or str(name).strip() == "":
213
- name = None
214
- else:
215
- name = str(name).strip()
216
-
217
- if pd.isna(cls) or str(cls).strip() == "":
218
- cls = None
219
- else:
220
- cls = str(cls).strip()
221
-
222
- return name, cls
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/model.py DELETED
@@ -1,312 +0,0 @@
1
- # model.py
2
- from __future__ import annotations
3
-
4
- from typing import List, Optional, Literal
5
-
6
- import torch
7
- import torch.nn as nn
8
- import torch.nn.functional as F
9
- from torch_geometric.data import Batch
10
-
11
- from src.conv import build_gnn_encoder, GNNEncoder
12
-
13
-
14
- def get_activation(name: str) -> nn.Module:
15
- name = name.lower()
16
- if name == "relu":
17
- return nn.ReLU()
18
- if name == "gelu":
19
- return nn.GELU()
20
- if name == "silu":
21
- return nn.SiLU()
22
- if name in ("leaky_relu", "lrelu"):
23
- return nn.LeakyReLU(0.1)
24
- raise ValueError(f"Unknown activation: {name}")
25
-
26
-
27
- class FiLM(nn.Module):
28
- """
29
- Simple FiLM: gamma, beta from condition vector; apply to features as (1+gamma)*h + beta
30
- """
31
- def __init__(self, feat_dim: int, cond_dim: int):
32
- super().__init__()
33
- self.gamma = nn.Linear(cond_dim, feat_dim)
34
- self.beta = nn.Linear(cond_dim, feat_dim)
35
-
36
- def forward(self, h: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
37
- g = self.gamma(cond)
38
- b = self.beta(cond)
39
- return (1.0 + g) * h + b
40
-
41
-
42
- class TaskHead(nn.Module):
43
- """
44
- Per-task MLP head. Input is concatenation of [graph_embed, optional task_embed].
45
- Outputs either a mean only (scalar) or mean+logvar (heteroscedastic).
46
- """
47
- def __init__(
48
- self,
49
- in_dim: int,
50
- hidden_dim: int = 512,
51
- depth: int = 2,
52
- act: str = "relu",
53
- dropout: float = 0.0,
54
- heteroscedastic: bool = False,
55
- ):
56
- super().__init__()
57
- layers: List[nn.Module] = []
58
- d = in_dim
59
- for _ in range(depth):
60
- layers.append(nn.Linear(d, hidden_dim))
61
- layers.append(get_activation(act))
62
- if dropout > 0:
63
- layers.append(nn.Dropout(dropout))
64
- d = hidden_dim
65
- out_dim = 2 if heteroscedastic else 1
66
- layers.append(nn.Linear(d, out_dim))
67
- self.net = nn.Sequential(*layers)
68
- self.hetero = heteroscedastic
69
-
70
- def forward(self, z: torch.Tensor) -> torch.Tensor:
71
- # returns [B, 1] or [B, 2] where [...,0] is mean and [...,1] is logvar if heteroscedastic
72
- return self.net(z)
73
-
74
-
75
- class MultiTaskMultiFidelityModel(nn.Module):
76
- """
77
- General multi-task, multi-fidelity GNN.
78
-
79
- - Any number of tasks (properties) via T = len(task_names)
80
- - Any number of fidelities via num_fids
81
- - Fidelity conditioning with an embedding and FiLM on the graph embedding
82
- - Optional task embeddings concatenated into each task head input
83
- - Single forward returning predictions [B, T] (means); if heteroscedastic, also returns log-variances
84
-
85
- Expected input Batch fields (PyG):
86
- - x : [N_nodes, F_node]
87
- - edge_index : [2, N_edges]
88
- - edge_attr : [N_edges, F_edge] (required if gnn_type="gine")
89
- - batch : [N_nodes]
90
- - fid_idx : [B] or [B, 1] long; integer fidelity per graph
91
-
92
- Notes:
93
- - Targets should already be normalized outside the model; apply inverse transform for plots.
94
- - Loss weighting/equal-importance and curriculum happen in the trainer, not here.
95
- """
96
-
97
- def __init__(
98
- self,
99
- in_dim_node: int,
100
- in_dim_edge: int,
101
- task_names: List[str],
102
- num_fids: int,
103
- gnn_type: Literal["gine", "gin", "gcn"] = "gine",
104
- gnn_emb_dim: int = 256,
105
- gnn_layers: int = 5,
106
- gnn_norm: Literal["batch", "layer", "none"] = "batch",
107
- gnn_readout: Literal["mean", "sum", "max"] = "mean",
108
- gnn_act: str = "relu",
109
- gnn_dropout: float = 0.0,
110
- gnn_residual: bool = True,
111
- # Fidelity conditioning
112
- fid_emb_dim: int = 64,
113
- use_film: bool = True,
114
- # Task conditioning
115
- use_task_embed: bool = True,
116
- task_emb_dim: int = 32,
117
- # Heads
118
- head_hidden: int = 512,
119
- head_depth: int = 2,
120
- head_act: str = "relu",
121
- head_dropout: float = 0.0,
122
- heteroscedastic: bool = False,
123
- # Optional homoscedastic task uncertainty (used in loss, kept here for checkpoint parity)
124
- use_task_uncertainty: bool = False,
125
- # Embedding regularization (used via regularization_loss)
126
- fid_emb_l2: float = 0.0,
127
- task_emb_l2: float = 0.0,
128
- ):
129
- super().__init__()
130
- self.task_names = list(task_names)
131
- self.num_tasks = len(task_names)
132
- self.num_fids = int(num_fids)
133
- self.hetero = heteroscedastic
134
- self.fid_emb_l2 = float(fid_emb_l2)
135
- self.task_emb_l2 = float(task_emb_l2)
136
- self.use_film = use_film
137
- self.use_task_embed = use_task_embed
138
-
139
- # Optional learned homoscedastic uncertainty per task (trainer may use it)
140
- self.use_task_uncertainty = bool(use_task_uncertainty)
141
- if self.use_task_uncertainty:
142
- self.task_log_sigma2 = nn.Parameter(torch.zeros(self.num_tasks))
143
- else:
144
- self.task_log_sigma2 = None
145
-
146
- # Encoder
147
- self.encoder: GNNEncoder = build_gnn_encoder(
148
- in_dim_node=in_dim_node,
149
- emb_dim=gnn_emb_dim,
150
- num_layers=gnn_layers,
151
- gnn_type=gnn_type,
152
- in_dim_edge=in_dim_edge,
153
- act=gnn_act,
154
- dropout=gnn_dropout,
155
- residual=gnn_residual,
156
- norm=gnn_norm,
157
- readout=gnn_readout,
158
- )
159
-
160
- # Fidelity embedding + FiLM
161
- self.fid_embed = nn.Embedding(self.num_fids, fid_emb_dim) if fid_emb_dim > 0 else None
162
- self.film = FiLM(gnn_emb_dim, fid_emb_dim) if (use_film and fid_emb_dim > 0) else None
163
-
164
- # --- Compute the true feature dim sent to heads ---
165
- # If FiLM is ON: g stays [B, gnn_emb_dim]
166
- # If FiLM is OFF but fid_embed exists: we CONCAT c → g becomes [B, gnn_emb_dim + fid_emb_dim]
167
- self.gnn_out_dim = gnn_emb_dim + (fid_emb_dim if (self.fid_embed is not None and self.film is None) else 0)
168
-
169
- # Task embeddings
170
- self.task_embed = nn.Embedding(self.num_tasks, task_emb_dim) if (use_task_embed and task_emb_dim > 0) else None
171
-
172
- # Per-task heads
173
- head_in_dim = self.gnn_out_dim + (task_emb_dim if self.task_embed is not None else 0)
174
- self.heads = nn.ModuleList([
175
- TaskHead(
176
- in_dim=head_in_dim,
177
- hidden_dim=head_hidden,
178
- depth=head_depth,
179
- act=head_act,
180
- dropout=head_dropout,
181
- heteroscedastic=heteroscedastic,
182
- ) for _ in range(self.num_tasks)
183
- ])
184
-
185
-
186
- def reset_parameters(self):
187
- if self.fid_embed is not None:
188
- nn.init.normal_(self.fid_embed.weight, mean=0.0, std=0.02)
189
- if self.task_embed is not None:
190
- nn.init.normal_(self.task_embed.weight, mean=0.0, std=0.02)
191
- # Encoder/heads rely on their internal initializations.
192
-
193
- def forward(self, data: Batch) -> dict:
194
- """
195
- Returns:
196
- {
197
- "pred": [B, T] means,
198
- "logvar": [B, T] optional if heteroscedastic,
199
- "h": [B, D] graph embedding after FiLM (useful for diagnostics).
200
- }
201
- """
202
- x, edge_index = data.x, data.edge_index
203
- edge_attr = getattr(data, "edge_attr", None)
204
- batch = data.batch
205
- if edge_attr is None and hasattr(self.encoder, "gnn_type") and self.encoder.gnn_type == "gine":
206
- raise ValueError("GINE encoder requires edge_attr, but Batch.edge_attr is None.")
207
-
208
- # Graph embedding
209
- g = self.encoder(x, edge_index, edge_attr, batch) # [B, D]
210
-
211
- # Fidelity conditioning
212
- fid_idx = data.fid_idx.view(-1).long() # [B]
213
- if self.fid_embed is not None:
214
- c = self.fid_embed(fid_idx) # [B, C]
215
- if self.film is not None:
216
- g = self.film(g, c) # [B, D]
217
- else:
218
- g = torch.cat([g, c], dim=-1)
219
-
220
- # Per-task heads
221
- preds: List[torch.Tensor] = []
222
- logvars: Optional[List[torch.Tensor]] = [] if self.hetero else None
223
- for t_idx, head in enumerate(self.heads):
224
- if self.task_embed is not None:
225
- tvec = self.task_embed.weight[t_idx].unsqueeze(0).expand(g.size(0), -1)
226
- z = torch.cat([g, tvec], dim=-1)
227
- else:
228
- z = g
229
- out = head(z) # [B, 1] or [B, 2]
230
- if self.hetero:
231
- mu = out[..., 0:1]
232
- lv = out[..., 1:2]
233
- preds.append(mu)
234
- logvars.append(lv) # type: ignore[arg-type]
235
- else:
236
- preds.append(out)
237
-
238
- pred = torch.cat(preds, dim=-1) # [B, T]
239
- result = {"pred": pred, "h": g}
240
- if self.hetero and logvars is not None:
241
- result["logvar"] = torch.cat(logvars, dim=-1) # [B, T]
242
- return result
243
-
244
- def regularization_loss(self) -> torch.Tensor:
245
- """
246
- Optional small L2 on embeddings to keep them bounded.
247
- """
248
- device = next(self.parameters()).device
249
- reg = torch.zeros([], device=device)
250
- if self.fid_embed is not None and self.fid_emb_l2 > 0:
251
- reg = reg + self.fid_emb_l2 * (self.fid_embed.weight.pow(2).mean())
252
- if self.task_embed is not None and self.task_emb_l2 > 0:
253
- reg = reg + self.task_emb_l2 * (self.task_embed.weight.pow(2).mean())
254
- return reg
255
-
256
-
257
- def build_model(
258
- *,
259
- in_dim_node: int,
260
- in_dim_edge: int,
261
- task_names: List[str],
262
- num_fids: int,
263
- gnn_type: Literal["gine", "gin", "gcn"] = "gine",
264
- gnn_emb_dim: int = 256,
265
- gnn_layers: int = 5,
266
- gnn_norm: Literal["batch", "layer", "none"] = "batch",
267
- gnn_readout: Literal["mean", "sum", "max"] = "mean",
268
- gnn_act: str = "relu",
269
- gnn_dropout: float = 0.0,
270
- gnn_residual: bool = True,
271
- fid_emb_dim: int = 64,
272
- use_film: bool = True,
273
- use_task_embed: bool = True,
274
- task_emb_dim: int = 32,
275
- head_hidden: int = 512,
276
- use_task_uncertainty: bool = False,
277
- head_depth: int = 2,
278
- head_act: str = "relu",
279
- head_dropout: float = 0.0,
280
- heteroscedastic: bool = False,
281
- fid_emb_l2: float = 0.0,
282
- task_emb_l2: float = 0.0,
283
- ) -> MultiTaskMultiFidelityModel:
284
- """
285
- Factory to construct the multi-task, multi-fidelity model with a consistent API.
286
- """
287
- return MultiTaskMultiFidelityModel(
288
- in_dim_node=in_dim_node,
289
- in_dim_edge=in_dim_edge,
290
- task_names=task_names,
291
- num_fids=num_fids,
292
- gnn_type=gnn_type,
293
- gnn_emb_dim=gnn_emb_dim,
294
- gnn_layers=gnn_layers,
295
- gnn_norm=gnn_norm,
296
- gnn_readout=gnn_readout,
297
- gnn_act=gnn_act,
298
- gnn_dropout=gnn_dropout,
299
- gnn_residual=gnn_residual,
300
- fid_emb_dim=fid_emb_dim,
301
- use_film=use_film,
302
- use_task_embed=use_task_embed,
303
- task_emb_dim=task_emb_dim,
304
- head_hidden=head_hidden,
305
- head_depth=head_depth,
306
- head_act=head_act,
307
- head_dropout=head_dropout,
308
- heteroscedastic=heteroscedastic,
309
- fid_emb_l2=fid_emb_l2,
310
- task_emb_l2=task_emb_l2,
311
- use_task_uncertainty=use_task_uncertainty,
312
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/predictor.py DELETED
@@ -1,193 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import re
4
- from pathlib import Path
5
- from typing import Dict, List, Optional, Tuple
6
-
7
- import numpy as np
8
- import torch
9
- from torch_geometric.data import Data
10
-
11
- from src.data_builder import featurize_smiles, TargetScaler
12
- from src.model import build_model
13
- from src.utils import to_device, apply_inverse_transform
14
-
15
-
16
- # -------------------------
17
- # Unit correction (ML only)
18
- # -------------------------
19
- POST_SCALE = {
20
- "td": 1e-7,
21
- "dif": 1e-5,
22
- "visc": 1e-3,
23
- }
24
-
25
-
26
- def _load_scaler_compat(path: Path) -> TargetScaler:
27
- blob = torch.load(path, map_location="cpu")
28
- if "mean" not in blob or "std" not in blob:
29
- raise RuntimeError(f"Unrecognized target_scaler format: {path}")
30
-
31
- ts = TargetScaler(
32
- transforms=blob.get("transforms", None),
33
- eps=blob.get("eps", None),
34
- )
35
- ts.load_state_dict({
36
- "mean": blob["mean"].float(),
37
- "std": blob["std"].float(),
38
- "transforms": blob.get("transforms", ts.transforms),
39
- "eps": blob.get("eps", ts.eps),
40
- })
41
- ts.targets = [str(t).lower() for t in blob.get("targets", [])]
42
- return ts
43
-
44
-
45
- def _infer_seed_from_name(path: Path) -> Optional[int]:
46
- m = re.search(r"_([0-9]+)\.pt$", path.name)
47
- return int(m.group(1)) if m else None
48
-
49
-
50
- def _make_one_graph(smiles: str) -> Data:
51
- x, edge_index, edge_attr = featurize_smiles(smiles)
52
- d = Data(
53
- x=x,
54
- edge_index=edge_index,
55
- edge_attr=edge_attr,
56
- y=torch.zeros(1, 1),
57
- y_mask=torch.zeros(1, 1, dtype=torch.bool),
58
- fid_idx=torch.tensor([0], dtype=torch.long),
59
- )
60
- d.smiles = smiles
61
- return d
62
-
63
-
64
- class SingleTaskEnsemblePredictor:
65
- """
66
- Single-task ensemble:
67
- models/single_models/{prop}_single_model_{seed}.pt
68
- models/single_models/{prop}_single_scalar_{seed}.pt
69
- """
70
-
71
- def __init__(self, models_dir: str = "models/single_models", device: str = "cpu"):
72
- self.models_dir = Path(models_dir)
73
- self.device = torch.device(device if device == "cuda" and torch.cuda.is_available() else "cpu")
74
- self._cache: Dict[Tuple[str, int], Tuple[Optional[torch.nn.Module], TargetScaler, dict]] = {}
75
-
76
- def available_seeds(self, prop: str) -> List[int]:
77
- prop = prop.lower()
78
- seeds = []
79
- for p in self.models_dir.glob(f"{prop}_single_model_*.pt"):
80
- s = _infer_seed_from_name(p)
81
- if s is not None:
82
- seeds.append(s)
83
- return sorted(set(seeds))
84
-
85
- def _load_one(self, prop: str, seed: int):
86
- prop = prop.lower()
87
- key = (prop, seed)
88
- if key in self._cache:
89
- return self._cache[key]
90
-
91
- ckpt_path = self.models_dir / f"{prop}_single_model_{seed}.pt"
92
- scaler_path = self.models_dir / f"{prop}_single_scalar_{seed}.pt"
93
- if not ckpt_path.exists() or not scaler_path.exists():
94
- raise FileNotFoundError(f"Missing model/scaler for {prop} seed {seed}")
95
-
96
- ckpt = torch.load(ckpt_path, map_location=self.device)
97
- train_args = ckpt.get("args", {})
98
-
99
- scaler = _load_scaler_compat(scaler_path)
100
- task_names = list(getattr(scaler, "targets", [])) or [prop]
101
-
102
- meta = {"train_args": train_args, "task_names": task_names}
103
- self._cache[key] = (None, scaler, meta)
104
- return self._cache[key]
105
-
106
- def _build_model_if_needed(self, prop: str, seed: int, in_dim_node: int, in_dim_edge: int):
107
- prop = prop.lower()
108
- key = (prop, seed)
109
- model, scaler, meta = self._cache[key]
110
- if model is not None:
111
- return model, scaler, meta
112
-
113
- train_args = meta["train_args"]
114
- task_names = meta["task_names"]
115
-
116
- ckpt_path = self.models_dir / f"{prop}_single_model_{seed}.pt"
117
- ckpt = torch.load(ckpt_path, map_location=self.device)
118
- state_dict = ckpt["model"]
119
-
120
- # infer num_fids from checkpoint
121
- if "fid_embed.weight" in state_dict:
122
- num_fids = state_dict["fid_embed.weight"].shape[0]
123
- else:
124
- num_fids = 1
125
-
126
- model = build_model(
127
- in_dim_node=in_dim_node,
128
- in_dim_edge=in_dim_edge,
129
- task_names=task_names,
130
- num_fids=num_fids,
131
- gnn_type=train_args.get("gnn_type", "gine"),
132
- gnn_emb_dim=train_args.get("gnn_emb_dim", 256),
133
- gnn_layers=train_args.get("gnn_layers", 5),
134
- gnn_norm=train_args.get("gnn_norm", "batch"),
135
- gnn_readout=train_args.get("gnn_readout", "mean"),
136
- gnn_act=train_args.get("gnn_act", "relu"),
137
- gnn_dropout=train_args.get("gnn_dropout", 0.0),
138
- gnn_residual=train_args.get("gnn_residual", True),
139
- fid_emb_dim=train_args.get("fid_emb_dim", 64),
140
- use_film=train_args.get("use_film", True),
141
- use_task_embed=train_args.get("use_task_embed", True),
142
- task_emb_dim=train_args.get("task_emb_dim", 32),
143
- head_hidden=train_args.get("head_hidden", 512),
144
- head_depth=train_args.get("head_depth", 2),
145
- head_act=train_args.get("head_act", "relu"),
146
- head_dropout=train_args.get("head_dropout", 0.0),
147
- heteroscedastic=train_args.get("heteroscedastic", False),
148
- fid_emb_l2=0.0,
149
- task_emb_l2=0.0,
150
- use_task_uncertainty=train_args.get("task_uncertainty", False),
151
- ).to(self.device)
152
-
153
- model.load_state_dict(state_dict, strict=True)
154
- model.eval()
155
-
156
- self._cache[key] = (model, scaler, meta)
157
- return model, scaler, meta
158
-
159
- def predict_mean_std(self, smiles: str, prop: str) -> Tuple[Optional[float], Optional[float], Dict[int, float]]:
160
- prop = prop.lower()
161
- seeds = self.available_seeds(prop)
162
- if not seeds:
163
- return None, None, {}
164
-
165
- try:
166
- g = _make_one_graph(smiles)
167
- except Exception:
168
- return None, None, {}
169
-
170
- in_dim_node = g.x.shape[1]
171
- in_dim_edge = g.edge_attr.shape[1]
172
-
173
- per_seed: Dict[int, float] = {}
174
- with torch.no_grad():
175
- for seed in seeds:
176
- self._load_one(prop, seed)
177
- model, scaler, meta = self._build_model_if_needed(prop, seed, in_dim_node, in_dim_edge)
178
-
179
- batch = to_device(g, self.device)
180
- out = model(batch)
181
- pred_n = out["pred"] # [1, 1]
182
- pred = apply_inverse_transform(pred_n, scaler).cpu().numpy().reshape(-1)
183
- val = float(pred[0])
184
-
185
- # unit correction
186
- val *= POST_SCALE.get(prop, 1.0)
187
-
188
- per_seed[seed] = val
189
-
190
- vals = np.array(list(per_seed.values()), dtype=float)
191
- mean = float(vals.mean())
192
- std = float(vals.std(ddof=1)) if len(vals) > 1 else 0.0
193
- return mean, std, per_seed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/predictor_multitask.py DELETED
@@ -1,209 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import re
4
- from pathlib import Path
5
- from typing import Dict, List, Optional, Tuple
6
-
7
- import numpy as np
8
- import torch
9
- from torch_geometric.data import Data
10
-
11
- from src.data_builder import featurize_smiles, TargetScaler
12
- from src.model import build_model
13
- from src.utils import to_device, apply_inverse_transform
14
-
15
-
16
- # -------------------------
17
- # Unit correction (ML only)
18
- # -------------------------
19
- POST_SCALE = {
20
- "td": 1e-7,
21
- "dif": 1e-5,
22
- "visc": 1e-3,
23
- }
24
-
25
-
26
- def _load_scaler_compat(path: Path) -> TargetScaler:
27
- blob = torch.load(path, map_location="cpu")
28
- if "mean" not in blob or "std" not in blob:
29
- raise RuntimeError(f"Unrecognized target_scaler format: {path}")
30
-
31
- ts = TargetScaler(
32
- transforms=blob.get("transforms", None),
33
- eps=blob.get("eps", None),
34
- )
35
- ts.load_state_dict({
36
- "mean": blob["mean"].float(),
37
- "std": blob["std"].float(),
38
- "transforms": blob.get("transforms", ts.transforms),
39
- "eps": blob.get("eps", ts.eps),
40
- })
41
- ts.targets = [str(t).lower() for t in blob.get("targets", [])]
42
- return ts
43
-
44
-
45
- def _infer_seed(path: Path) -> Optional[int]:
46
- m = re.search(r"_([0-9]+)\.pt$", path.name)
47
- return int(m.group(1)) if m else None
48
-
49
-
50
- def _make_one_graph(smiles: str, T: int, fid_idx: int = 0) -> Data:
51
- x, edge_index, edge_attr = featurize_smiles(smiles)
52
- d = Data(
53
- x=x,
54
- edge_index=edge_index,
55
- edge_attr=edge_attr,
56
- y=torch.zeros(1, T),
57
- y_mask=torch.zeros(1, T, dtype=torch.bool),
58
- fid_idx=torch.tensor([fid_idx], dtype=torch.long),
59
- )
60
- d.smiles = smiles
61
- return d
62
-
63
-
64
- class MultiTaskEnsemblePredictor:
65
- """
66
- Multi-task ensemble:
67
- models/multitask_models/{task}_model_{seed}.pt
68
- models/multitask_models/{task}_scalar_{seed}.pt
69
- """
70
-
71
- def __init__(self, models_dir: str = "models/multitask_models", device: str = "cpu"):
72
- self.models_dir = Path(models_dir)
73
- self.device = torch.device(device if device == "cuda" and torch.cuda.is_available() else "cpu")
74
- self._cache: Dict[Tuple[str, int], Tuple[Optional[torch.nn.Module], TargetScaler, dict]] = {}
75
-
76
- def available_seeds(self, task: str) -> List[int]:
77
- task = task.strip().lower()
78
- seeds = []
79
- for p in self.models_dir.glob(f"{task}_model_*.pt"):
80
- s = _infer_seed(p)
81
- if s is not None:
82
- seeds.append(s)
83
- return sorted(set(seeds))
84
-
85
- def _load_one_meta(self, task: str, seed: int):
86
- task = task.strip().lower()
87
- key = (task, seed)
88
- if key in self._cache:
89
- return self._cache[key]
90
-
91
- ckpt_path = self.models_dir / f"{task}_model_{seed}.pt"
92
- scaler_path = self.models_dir / f"{task}_scalar_{seed}.pt"
93
- if not ckpt_path.exists() or not scaler_path.exists():
94
- raise FileNotFoundError(f"Missing model/scaler for task={task} seed={seed}")
95
-
96
- ckpt = torch.load(ckpt_path, map_location=self.device)
97
- state_dict = ckpt["model"]
98
- train_args = ckpt.get("args", {})
99
-
100
- scaler = _load_scaler_compat(scaler_path)
101
- task_names = list(getattr(scaler, "targets", []))
102
- if not task_names:
103
- raise RuntimeError(f"No targets found in scaler: {scaler_path}")
104
-
105
- if "fid_embed.weight" in state_dict:
106
- num_fids = state_dict["fid_embed.weight"].shape[0]
107
- else:
108
- num_fids = 1
109
-
110
- meta = {
111
- "train_args": train_args,
112
- "task_names": task_names,
113
- "num_fids": num_fids,
114
- }
115
- self._cache[key] = (None, scaler, meta)
116
- return self._cache[key]
117
-
118
- def _build_if_needed(self, task: str, seed: int, in_dim_node: int, in_dim_edge: int):
119
- task = task.strip().lower()
120
- key = (task, seed)
121
- model, scaler, meta = self._cache[key]
122
- if model is not None:
123
- return model, scaler, meta
124
-
125
- train_args = meta["train_args"]
126
- task_names = meta["task_names"]
127
- num_fids = meta["num_fids"]
128
-
129
- model = build_model(
130
- in_dim_node=in_dim_node,
131
- in_dim_edge=in_dim_edge,
132
- task_names=task_names,
133
- num_fids=num_fids,
134
- gnn_type=train_args.get("gnn_type", "gine"),
135
- gnn_emb_dim=train_args.get("gnn_emb_dim", 256),
136
- gnn_layers=train_args.get("gnn_layers", 5),
137
- gnn_norm=train_args.get("gnn_norm", "batch"),
138
- gnn_readout=train_args.get("gnn_readout", "mean"),
139
- gnn_act=train_args.get("gnn_act", "relu"),
140
- gnn_dropout=train_args.get("gnn_dropout", 0.0),
141
- gnn_residual=train_args.get("gnn_residual", True),
142
- fid_emb_dim=train_args.get("fid_emb_dim", 64),
143
- use_film=train_args.get("use_film", True),
144
- use_task_embed=train_args.get("use_task_embed", True),
145
- task_emb_dim=train_args.get("task_emb_dim", 32),
146
- head_hidden=train_args.get("head_hidden", 512),
147
- head_depth=train_args.get("head_depth", 2),
148
- head_act=train_args.get("head_act", "relu"),
149
- head_dropout=train_args.get("head_dropout", 0.0),
150
- heteroscedastic=train_args.get("heteroscedastic", False),
151
- fid_emb_l2=0.0,
152
- task_emb_l2=0.0,
153
- use_task_uncertainty=train_args.get("task_uncertainty", False),
154
- ).to(self.device)
155
-
156
- ckpt_path = self.models_dir / f"{task}_model_{seed}.pt"
157
- ckpt = torch.load(ckpt_path, map_location=self.device)
158
- model.load_state_dict(ckpt["model"], strict=True)
159
- model.eval()
160
-
161
- self._cache[key] = (model, scaler, meta)
162
- return model, scaler, meta
163
-
164
- def predict_mean_std(self, smiles: str, prop_key: str, task: str) -> Tuple[Optional[float], Optional[float], Dict[int, float]]:
165
- task = task.strip().lower()
166
- prop_key = prop_key.lower()
167
-
168
- seeds = self.available_seeds(task)
169
- if not seeds:
170
- return None, None, {}
171
-
172
- self._load_one_meta(task, seeds[0])
173
- _, scaler0, meta0 = self._cache[(task, seeds[0])]
174
- targets = list(meta0["task_names"]) # already lower()
175
- if prop_key not in targets:
176
- return None, None, {}
177
-
178
- t_idx = targets.index(prop_key)
179
- T = len(targets)
180
-
181
- try:
182
- g = _make_one_graph(smiles, T=T, fid_idx=0)
183
- except Exception:
184
- return None, None, {}
185
-
186
- in_dim_node = g.x.shape[1]
187
- in_dim_edge = g.edge_attr.shape[1]
188
-
189
- per_seed: Dict[int, float] = {}
190
- with torch.no_grad():
191
- for seed in seeds:
192
- self._load_one_meta(task, seed)
193
- model, scaler, meta = self._build_if_needed(task, seed, in_dim_node, in_dim_edge)
194
-
195
- batch = to_device(g, self.device)
196
- out = model(batch)
197
- pred_n = out["pred"] # [1, T]
198
- pred = apply_inverse_transform(pred_n, scaler).cpu().numpy().reshape(-1)
199
- val = float(pred[t_idx])
200
-
201
- # unit correction
202
- val *= POST_SCALE.get(prop_key, 1.0)
203
-
204
- per_seed[seed] = val
205
-
206
- vals = np.array(list(per_seed.values()), dtype=float)
207
- mean = float(vals.mean())
208
- std = float(vals.std(ddof=1)) if len(vals) > 1 else 0.0
209
- return mean, std, per_seed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/predictor_router.py DELETED
@@ -1,45 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import json
4
- from pathlib import Path
5
- from typing import Dict, Optional, Tuple
6
-
7
- from src.predictor import SingleTaskEnsemblePredictor
8
- from src.predictor_multitask import MultiTaskEnsemblePredictor
9
-
10
-
11
- class RouterPredictor:
12
- """
13
- Routes each property to either:
14
- - single-task ensemble (models/single_models)
15
- - multitask ensemble (models/multitask_models/{task}_*)
16
- based on models/best_model_map.json
17
- """
18
-
19
- def __init__(
20
- self,
21
- map_path: str = "models/best_model_map.json",
22
- single_dir: str = "models/single_models",
23
- multitask_dir: str = "models/multitask_models",
24
- device: str = "cpu",
25
- ):
26
- self.map_path = Path(map_path)
27
- self.map: Dict[str, dict] = json.load(open(self.map_path))
28
- self.single = SingleTaskEnsemblePredictor(models_dir=single_dir, device=device)
29
- self.multi = MultiTaskEnsemblePredictor(models_dir=multitask_dir, device=device)
30
-
31
- def predict_mean_std(self, smiles: str, prop: str) -> Tuple[Optional[float], Optional[float], dict, str]:
32
- prop = prop.lower()
33
- cfg = self.map.get(prop, {"family": "single"})
34
-
35
- fam = cfg.get("family", "single").lower()
36
- if fam == "multitask":
37
- task = str(cfg.get("task", "all")).lower()
38
- mean, std, per_seed = self.multi.predict_mean_std(smiles, prop_key=prop, task=task)
39
- label = f"multitask:{task}"
40
- return mean, std, per_seed, label
41
-
42
- # default: single
43
- mean, std, per_seed = self.single.predict_mean_std(smiles, prop)
44
- label = "single"
45
- return mean, std, per_seed, label
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/rnn_smiles/__init__.py DELETED
@@ -1,22 +0,0 @@
1
- """RNN-based SMILES generation helpers for Streamlit pages."""
2
-
3
- from .generator import (
4
- canonicalize_smiles,
5
- filter_novel_smiles,
6
- generate_smiles,
7
- load_existing_smiles_set,
8
- load_rnn_model,
9
- )
10
- from .rnn import MultiGRU, RNN
11
- from .vocabulary import Vocabulary
12
-
13
- __all__ = [
14
- "canonicalize_smiles",
15
- "filter_novel_smiles",
16
- "generate_smiles",
17
- "load_existing_smiles_set",
18
- "load_rnn_model",
19
- "MultiGRU",
20
- "RNN",
21
- "Vocabulary",
22
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/rnn_smiles/generator.py DELETED
@@ -1,175 +0,0 @@
1
- """Streamlit integration helpers for RNN SMILES generation."""
2
-
3
- from __future__ import annotations
4
-
5
- from pathlib import Path
6
- from typing import Iterable, Sequence
7
-
8
- import pandas as pd
9
- import streamlit as st
10
- import torch
11
- from rdkit import Chem, RDLogger
12
-
13
- from .rnn import RNN
14
- from .vocabulary import Vocabulary
15
-
16
- RDLogger.DisableLog("rdApp.*")
17
-
18
-
19
- def canonicalize_smiles(smiles: str) -> str | None:
20
- s = (smiles or "").strip()
21
- if not s:
22
- return None
23
- mol = Chem.MolFromSmiles(s)
24
- if mol is None:
25
- return None
26
- return Chem.MolToSmiles(mol, canonical=True)
27
-
28
-
29
- def _find_smiles_column(path: Path) -> str:
30
- header = pd.read_csv(path, nrows=0)
31
- for col in header.columns:
32
- if str(col).strip().lower() == "smiles":
33
- return col
34
- raise ValueError(f"No SMILES column found in {path}")
35
-
36
-
37
- def _load_checkpoint(path: Path, device: torch.device) -> dict:
38
- # Support both new/old torch signatures while preferring secure load mode.
39
- try:
40
- state = torch.load(path, map_location=device, weights_only=True)
41
- except TypeError:
42
- state = torch.load(path, map_location=device)
43
- if isinstance(state, dict) and isinstance(state.get("state_dict"), dict):
44
- state = state["state_dict"]
45
- if not isinstance(state, dict):
46
- raise RuntimeError(f"Checkpoint does not contain a state dict: {path}")
47
- return state
48
-
49
-
50
- @st.cache_resource(show_spinner=False)
51
- def load_rnn_model(ckpt_path: str | Path, voc_path: str | Path) -> tuple[RNN, Vocabulary]:
52
- ckpt_path = Path(ckpt_path).expanduser().resolve()
53
- voc_path = Path(voc_path).expanduser().resolve()
54
-
55
- if not ckpt_path.exists():
56
- raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")
57
- if not voc_path.exists():
58
- raise FileNotFoundError(f"Vocabulary not found: {voc_path}")
59
-
60
- voc = Vocabulary(init_from_file=str(voc_path))
61
- model = RNN(voc)
62
- model_device = next(model.rnn.parameters()).device
63
- state = _load_checkpoint(ckpt_path, model_device)
64
-
65
- ckpt_vocab_size = None
66
- if "embedding.weight" in state:
67
- ckpt_vocab_size = int(state["embedding.weight"].shape[0])
68
- if ckpt_vocab_size is not None and ckpt_vocab_size != voc.vocab_size:
69
- raise RuntimeError(
70
- f"Vocabulary size mismatch: voc has {voc.vocab_size} tokens, "
71
- f"checkpoint expects {ckpt_vocab_size}. "
72
- "Use the matching vocab file for this checkpoint."
73
- )
74
-
75
- model.rnn.load_state_dict(state)
76
- model.rnn.eval()
77
- return model, voc
78
-
79
-
80
- def _sample_with_temperature(
81
- model: RNN, voc: Vocabulary, batch_size: int, max_length: int, temperature: float
82
- ) -> torch.Tensor:
83
- temp = max(float(temperature), 1e-6)
84
- device = next(model.rnn.parameters()).device
85
- start_token = torch.full((batch_size,), voc.vocab["GO"], dtype=torch.long, device=device)
86
- h = model.rnn.init_h(batch_size)
87
- x = start_token
88
-
89
- sequences: list[torch.Tensor] = []
90
- finished = torch.zeros(batch_size, dtype=torch.bool, device=device)
91
-
92
- for _ in range(max_length):
93
- logits, h = model.rnn(x, h)
94
- logits = logits / temp
95
- prob = torch.softmax(logits, dim=1)
96
- x = torch.multinomial(prob, 1).view(-1)
97
- sequences.append(x.view(-1, 1))
98
- finished = finished | (x == voc.vocab["EOS"])
99
- if torch.all(finished):
100
- break
101
-
102
- if not sequences:
103
- return torch.empty((batch_size, 0), dtype=torch.long, device=device)
104
- return torch.cat(sequences, dim=1)
105
-
106
-
107
- def generate_smiles(
108
- model: RNN,
109
- voc: Vocabulary,
110
- n: int,
111
- max_length: int,
112
- temperature: float = 1.0,
113
- ) -> list[str]:
114
- if n <= 0:
115
- return []
116
- max_length = max(int(max_length), 1)
117
-
118
- with torch.no_grad():
119
- if abs(float(temperature) - 1.0) < 1e-8:
120
- seqs, _, _ = model.sample(int(n), max_length=max_length)
121
- else:
122
- seqs = _sample_with_temperature(
123
- model,
124
- voc,
125
- int(n),
126
- max_length,
127
- float(temperature),
128
- )
129
- arr = seqs.detach().cpu().numpy()
130
-
131
- output: list[str] = []
132
- for seq in arr:
133
- output.append(voc.decode(seq))
134
- return output
135
-
136
-
137
- def filter_novel_smiles(smiles: Iterable[str], existing: set[str]) -> list[str]:
138
- novel: list[str] = []
139
- seen: set[str] = set()
140
- for smi in smiles:
141
- canonical = canonicalize_smiles(smi)
142
- if canonical is None:
143
- continue
144
- if canonical in seen:
145
- continue
146
- seen.add(canonical)
147
- if canonical in existing:
148
- continue
149
- novel.append(canonical)
150
- return novel
151
-
152
-
153
- @st.cache_resource(show_spinner=False)
154
- def load_existing_smiles_set(csv_paths: Sequence[str | Path], chunksize: int = 200_000) -> set[str]:
155
- existing: set[str] = set()
156
- for p in csv_paths:
157
- path = Path(p)
158
- if not path.exists():
159
- continue
160
- col = _find_smiles_column(path)
161
- for chunk in pd.read_csv(path, usecols=[col], chunksize=int(chunksize)):
162
- for smiles in chunk[col].astype(str):
163
- canonical = canonicalize_smiles(smiles)
164
- if canonical:
165
- existing.add(canonical)
166
- return existing
167
-
168
-
169
- __all__ = [
170
- "canonicalize_smiles",
171
- "load_rnn_model",
172
- "generate_smiles",
173
- "filter_novel_smiles",
174
- "load_existing_smiles_set",
175
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/rnn_smiles/rnn.py DELETED
@@ -1,89 +0,0 @@
1
- """Core GRU model used for polymer SMILES generation."""
2
-
3
- from __future__ import annotations
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
-
9
-
10
- class MultiGRU(nn.Module):
11
- def __init__(self, vocab_size: int):
12
- super().__init__()
13
- self.embedding = nn.Embedding(vocab_size, 128)
14
- self.gru_1 = nn.GRUCell(128, 512)
15
- self.gru_2 = nn.GRUCell(512, 512)
16
- self.gru_3 = nn.GRUCell(512, 512)
17
- self.linear = nn.Linear(512, vocab_size)
18
-
19
- def forward(self, x: torch.Tensor, h: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
20
- x = self.embedding(x)
21
- h_out = torch.zeros_like(h)
22
- x = h_out[0] = self.gru_1(x, h[0])
23
- x = h_out[1] = self.gru_2(x, h[1])
24
- x = h_out[2] = self.gru_3(x, h[2])
25
- x = self.linear(x)
26
- return x, h_out
27
-
28
- def init_h(self, batch_size: int) -> torch.Tensor:
29
- device = next(self.parameters()).device
30
- return torch.zeros(3, batch_size, 512, device=device)
31
-
32
-
33
- def nll_loss(log_probs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
34
- # Gather selected token log-probability for each sample in batch.
35
- return log_probs.gather(1, targets.contiguous().view(-1, 1)).squeeze(1)
36
-
37
-
38
- class RNN:
39
- def __init__(self, voc):
40
- self.rnn = MultiGRU(voc.vocab_size)
41
- if torch.cuda.is_available():
42
- self.rnn.cuda()
43
- self.voc = voc
44
-
45
- def likelihood(self, target: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
46
- batch_size, seq_length = target.size()
47
- device = target.device
48
- start_token = torch.full((batch_size, 1), self.voc.vocab["GO"], dtype=torch.long, device=device)
49
- x = torch.cat((start_token, target[:, :-1]), 1)
50
- h = self.rnn.init_h(batch_size)
51
-
52
- log_probs = torch.zeros(batch_size, device=device)
53
- entropy = torch.zeros(batch_size, device=device)
54
- for step in range(seq_length):
55
- logits, h = self.rnn(x[:, step], h)
56
- log_prob = F.log_softmax(logits, dim=1)
57
- prob = F.softmax(logits, dim=1)
58
- log_probs += nll_loss(log_prob, target[:, step])
59
- entropy += -torch.sum((log_prob * prob), 1)
60
- return log_probs, entropy
61
-
62
- def sample(self, batch_size: int, max_length: int = 140) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
63
- device = next(self.rnn.parameters()).device
64
- start_token = torch.full((batch_size,), self.voc.vocab["GO"], dtype=torch.long, device=device)
65
- h = self.rnn.init_h(batch_size)
66
- x = start_token
67
-
68
- sequences: list[torch.Tensor] = []
69
- log_probs = torch.zeros(batch_size, device=device)
70
- finished = torch.zeros(batch_size, dtype=torch.bool, device=device)
71
- entropy = torch.zeros(batch_size, device=device)
72
-
73
- for _ in range(max_length):
74
- logits, h = self.rnn(x, h)
75
- prob = F.softmax(logits, dim=1)
76
- log_prob = F.log_softmax(logits, dim=1)
77
- x = torch.multinomial(prob, 1).view(-1)
78
- sequences.append(x.view(-1, 1))
79
- log_probs += nll_loss(log_prob, x)
80
- entropy += -torch.sum((log_prob * prob), 1)
81
- finished = finished | (x == self.voc.vocab["EOS"])
82
- if torch.all(finished):
83
- break
84
-
85
- if sequences:
86
- stacked = torch.cat(sequences, 1)
87
- else:
88
- stacked = torch.empty((batch_size, 0), dtype=torch.long, device=device)
89
- return stacked, log_probs, entropy
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/rnn_smiles/utils.py DELETED
@@ -1,15 +0,0 @@
1
- """Utility helpers used by the legacy-style RNN generator."""
2
-
3
- from __future__ import annotations
4
-
5
- import numpy as np
6
- import torch
7
-
8
-
9
- def variable(tensor: torch.Tensor | np.ndarray) -> torch.Tensor:
10
- """Return a tensor on GPU when available."""
11
- if isinstance(tensor, np.ndarray):
12
- tensor = torch.from_numpy(tensor)
13
- if torch.cuda.is_available():
14
- return tensor.cuda()
15
- return tensor
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/rnn_smiles/vocabulary.py DELETED
@@ -1,69 +0,0 @@
1
- """Token vocabulary used by the SMILES RNN."""
2
-
3
- from __future__ import annotations
4
-
5
- import re
6
-
7
- import numpy as np
8
-
9
-
10
- class Vocabulary:
11
- def __init__(self, init_from_file: str | None = None, max_length: int | None = None):
12
- self.special_tokens = ["EOS", "GO"]
13
- self.additional_chars: set[str] = set()
14
- self.chars = self.special_tokens
15
- self.vocab_size = len(self.chars)
16
- self.vocab = dict(zip(self.chars, range(len(self.chars))))
17
- self.reversed_vocab = {v: k for k, v in self.vocab.items()}
18
- self.max_length = max_length
19
- if init_from_file:
20
- self.init_from_file(init_from_file)
21
-
22
- def encode(self, char_list: list[str]) -> np.ndarray:
23
- smiles_matrix = np.zeros(len(char_list), dtype=np.float32)
24
- for i, char in enumerate(char_list):
25
- smiles_matrix[i] = self.vocab[char]
26
- return smiles_matrix
27
-
28
- def decode(self, matrix: np.ndarray) -> str:
29
- chars: list[str] = []
30
- eos_id = self.vocab["EOS"]
31
- for i in matrix:
32
- if int(i) == eos_id:
33
- break
34
- chars.append(self.reversed_vocab[int(i)])
35
- return "".join(chars)
36
-
37
- def tokenize(self, smiles: str) -> list[str]:
38
- regex = r"(\[[^\[\]]{1,6}\])"
39
- char_list = re.split(regex, smiles)
40
- tokenized: list[str] = []
41
- for char in char_list:
42
- if not char:
43
- continue
44
- if char.startswith("["):
45
- tokenized.append(char)
46
- else:
47
- tokenized.extend(list(char))
48
- tokenized.append("EOS")
49
- return tokenized
50
-
51
- def add_characters(self, chars: list[str]) -> None:
52
- for char in chars:
53
- self.additional_chars.add(char)
54
- char_list = sorted(list(self.additional_chars))
55
- self.chars = char_list + self.special_tokens
56
- self.vocab_size = len(self.chars)
57
- self.vocab = dict(zip(self.chars, range(len(self.chars))))
58
- self.reversed_vocab = {v: k for k, v in self.vocab.items()}
59
-
60
- def init_from_file(self, file_path: str) -> None:
61
- with open(file_path, "r", encoding="utf-8") as f:
62
- chars = f.read().split()
63
- self.add_characters(chars)
64
-
65
- def __len__(self) -> int:
66
- return len(self.chars)
67
-
68
- def __str__(self) -> str:
69
- return f"Vocabulary containing {len(self)} tokens: {self.chars}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/sascorer.py DELETED
@@ -1,192 +0,0 @@
1
- #
2
- # calculation of synthetic accessibility score as described in:
3
- #
4
- # Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions
5
- # Peter Ertl and Ansgar Schuffenhauer
6
- # Journal of Cheminformatics 1:8 (2009)
7
- # http://www.jcheminf.com/content/1/1/8
8
- #
9
- # several small modifications to the original paper are included
10
- # particularly slightly different formula for marocyclic penalty
11
- # and taking into account also molecule symmetry (fingerprint density)
12
- #
13
- # for a set of 10k diverse molecules the agreement between the original method
14
- # as implemented in PipelinePilot and this implementation is r2 = 0.97
15
- #
16
- # peter ertl & greg landrum, september 2013
17
- #
18
-
19
- from rdkit import Chem
20
- from rdkit.Chem import rdFingerprintGenerator, rdMolDescriptors
21
-
22
- import math
23
- import pickle
24
-
25
- import os.path as op
26
-
27
- _fscores = None
28
- mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2)
29
-
30
-
31
- def readFragmentScores(name="fpscores.pkl.gz"):
32
- import gzip
33
- global _fscores
34
- # generate the full path filename:
35
- if name == "fpscores.pkl.gz":
36
- name = op.join(op.dirname(__file__), name)
37
- data = pickle.load(gzip.open(name))
38
- outDict = {}
39
- for i in data:
40
- for j in range(1, len(i)):
41
- outDict[i[j]] = float(i[0])
42
- _fscores = outDict
43
-
44
-
45
- def numBridgeheadsAndSpiro(mol, ri=None):
46
- nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
47
- nBridgehead = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
48
- return nBridgehead, nSpiro
49
-
50
-
51
- def calculateScore(m):
52
-
53
- if not m.GetNumAtoms():
54
- return None
55
-
56
- if _fscores is None:
57
- readFragmentScores()
58
-
59
- # fragment score
60
- sfp = mfpgen.GetSparseCountFingerprint(m)
61
-
62
- score1 = 0.
63
- nf = 0
64
- nze = sfp.GetNonzeroElements()
65
- for id, count in nze.items():
66
- nf += count
67
- score1 += _fscores.get(id, -4) * count
68
-
69
- score1 /= nf
70
-
71
- # features score
72
- nAtoms = m.GetNumAtoms()
73
- nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
74
- ri = m.GetRingInfo()
75
- nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
76
- nMacrocycles = 0
77
- for x in ri.AtomRings():
78
- if len(x) > 8:
79
- nMacrocycles += 1
80
-
81
- sizePenalty = nAtoms**1.005 - nAtoms
82
- stereoPenalty = math.log10(nChiralCenters + 1)
83
- spiroPenalty = math.log10(nSpiro + 1)
84
- bridgePenalty = math.log10(nBridgeheads + 1)
85
- macrocyclePenalty = 0.
86
- # ---------------------------------------
87
- # This differs from the paper, which defines:
88
- # macrocyclePenalty = math.log10(nMacrocycles+1)
89
- # This form generates better results when 2 or more macrocycles are present
90
- if nMacrocycles > 0:
91
- macrocyclePenalty = math.log10(2)
92
-
93
- score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty
94
-
95
- # correction for the fingerprint density
96
- # not in the original publication, added in version 1.1
97
- # to make highly symmetrical molecules easier to synthetise
98
- score3 = 0.
99
- numBits = len(nze)
100
- if nAtoms > numBits:
101
- score3 = math.log(float(nAtoms) / numBits) * .5
102
-
103
- sascore = score1 + score2 + score3
104
-
105
- # need to transform "raw" value into scale between 1 and 10
106
- min = -4.0
107
- max = 2.5
108
- sascore = 11. - (sascore - min + 1) / (max - min) * 9.
109
-
110
- # smooth the 10-end
111
- if sascore > 8.:
112
- sascore = 8. + math.log(sascore + 1. - 9.)
113
- if sascore > 10.:
114
- sascore = 10.0
115
- elif sascore < 1.:
116
- sascore = 1.0
117
-
118
- return sascore
119
-
120
-
121
- def processMols(mols):
122
- print('smiles\tName\tsa_score')
123
- for i, m in enumerate(mols):
124
- if m is None:
125
- continue
126
-
127
- s = calculateScore(m)
128
-
129
- smiles = Chem.MolToSmiles(m)
130
- if s is None:
131
- print(f"{smiles}\t{m.GetProp('_Name')}\t{s}")
132
- else:
133
- print(f"{smiles}\t{m.GetProp('_Name')}\t{s:3f}")
134
-
135
-
136
- if __name__ == '__main__':
137
- import sys
138
- import time
139
-
140
- t1 = time.time()
141
- if len(sys.argv) == 2:
142
- readFragmentScores()
143
- else:
144
- readFragmentScores(sys.argv[2])
145
- t2 = time.time()
146
-
147
- molFile = sys.argv[1]
148
- if molFile.endswith("smi"):
149
- suppl = Chem.SmilesMolSupplier(molFile)
150
- elif molFile.endswith("sdf"):
151
- suppl = Chem.SDMolSupplier(molFile)
152
- else:
153
- print(f"Unrecognized file extension for {molFile}")
154
- sys.exit()
155
-
156
- t3 = time.time()
157
- processMols(suppl)
158
- t4 = time.time()
159
-
160
- print('Reading took %.2f seconds. Calculating took %.2f seconds' % ((t2 - t1), (t4 - t3)),
161
- file=sys.stderr)
162
-
163
- #
164
- # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc.
165
- # All rights reserved.
166
- #
167
- # Redistribution and use in source and binary forms, with or without
168
- # modification, are permitted provided that the following conditions are
169
- # met:
170
- #
171
- # * Redistributions of source code must retain the above copyright
172
- # notice, this list of conditions and the following disclaimer.
173
- # * Redistributions in binary form must reproduce the above
174
- # copyright notice, this list of conditions and the following
175
- # disclaimer in the documentation and/or other materials provided
176
- # with the distribution.
177
- # * Neither the name of Novartis Institutes for BioMedical Research Inc.
178
- # nor the names of its contributors may be used to endorse or promote
179
- # products derived from this software without specific prior written permission.
180
- #
181
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
182
- # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
183
- # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
184
- # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
185
- # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
186
- # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
187
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
188
- # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
189
- # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
190
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
191
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
192
- #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ui_style.py DELETED
@@ -1,1003 +0,0 @@
1
- import base64
2
- import html
3
- import os
4
- from pathlib import Path
5
- from urllib import request
6
-
7
- import streamlit as st
8
-
9
-
10
- def _icon_data_uri(filename: str) -> str:
11
- icon_path = Path(__file__).resolve().parent.parent / "icons" / filename
12
- if not icon_path.exists():
13
- return ""
14
- try:
15
- encoded = base64.b64encode(icon_path.read_bytes()).decode("ascii")
16
- except Exception:
17
- return ""
18
- return f"data:image/png;base64,{encoded}"
19
-
20
-
21
- def _config_value(name: str, default: str = "") -> str:
22
- try:
23
- if name in st.secrets:
24
- return str(st.secrets[name]).strip()
25
- except Exception:
26
- pass
27
- return str(os.getenv(name, default)).strip()
28
-
29
-
30
- def _build_sidebar_icon_css() -> str:
31
- fallback = {
32
- 1: "🏠",
33
- 2: "🔎",
34
- 3: "📦",
35
- 4: "🧬",
36
- 5: "⚙️",
37
- 6: "🧠",
38
- 7: "✨",
39
- 8: "💬",
40
- 9: "📚",
41
- }
42
- icon_name = {
43
- 1: "home1.png",
44
- 2: "probe1.png",
45
- 3: "batch1.png",
46
- 4: "molecule1.png",
47
- 5: "manual1.png",
48
- 6: "ai1.png",
49
- 7: "rnn1.png",
50
- 8: "literature.png",
51
- 9: "feedback.png",
52
- }
53
- rules = [
54
- '[data-testid="stSidebarNav"] ul li a { position: relative; padding-left: 3.25rem !important; }',
55
- '[data-testid="stSidebarNav"] ul li a::before { content: ""; position: absolute; left: 12px; top: 50%; transform: translateY(-50%); width: 32px; height: 32px; background-size: contain; background-repeat: no-repeat; background-position: center; }',
56
- ]
57
- for idx in range(1, 10):
58
- uri = _icon_data_uri(icon_name[idx])
59
- if uri:
60
- rules.append(
61
- '[data-testid="stSidebarNav"] ul li:nth-of-type(%d) a::before { content: ""; background-image: url("%s"); }'
62
- % (idx, uri)
63
- )
64
- else:
65
- emoji = fallback[idx]
66
- rules.append(
67
- '[data-testid="stSidebarNav"] ul li:nth-of-type(%d) a::before { content: "%s"; background-image: none; width: auto; height: auto; font-size: 1.4rem; }'
68
- % (idx, emoji)
69
- )
70
- return "\n".join(rules)
71
-
72
-
73
- def _log_visit_once_per_session() -> None:
74
- if st.session_state.get("_visit_logged"):
75
- return
76
- webhook_url = _config_value("FEEDBACK_WEBHOOK_URL", "")
77
- webhook_token = _config_value("FEEDBACK_WEBHOOK_TOKEN", "")
78
- if not webhook_url:
79
- return
80
- endpoint = webhook_url
81
- sep = "&" if "?" in webhook_url else "?"
82
- endpoint = f"{webhook_url}{sep}event=visit"
83
- if webhook_token:
84
- endpoint = f"{endpoint}&token={webhook_token}"
85
- try:
86
- with request.urlopen(endpoint, timeout=3):
87
- pass
88
- except Exception:
89
- pass
90
- st.session_state["_visit_logged"] = True
91
-
92
-
93
- def render_page_header(title: str, subtitle: str = "", badge: str = "") -> None:
94
- title_html = html.escape(title)
95
- subtitle_html = html.escape(subtitle) if subtitle else ""
96
- badge_html = html.escape(badge) if badge else ""
97
-
98
- st.markdown(
99
- f"""
100
- <section class="pp-page-header">
101
- {"<span class='pp-badge'>" + badge_html + "</span>" if badge_html else ""}
102
- <h1 class="pp-page-title">{title_html}</h1>
103
- {"<p class='pp-page-subtitle'>" + subtitle_html + "</p>" if subtitle_html else ""}
104
- </section>
105
- """,
106
- unsafe_allow_html=True,
107
- )
108
-
109
-
110
- def apply_global_style() -> None:
111
- _log_visit_once_per_session()
112
- icon_css = _build_sidebar_icon_css()
113
- css = """
114
- <style>
115
- @import url('https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;600;700;800&display=swap');
116
-
117
- :root {
118
- --pp-text: #133a2a;
119
- --pp-muted: #4e6f5d;
120
- --pp-primary: #1f8f52;
121
- --pp-primary-2: #2cab67;
122
- --pp-border: rgba(255, 255, 255, 0.96);
123
- --pp-surface: rgba(251, 247, 255, 0.90);
124
- --pp-shadow: 0 10px 22px rgba(70, 66, 110, 0.12);
125
- --pp-panel-shadow:
126
- 0 14px 28px rgba(83, 73, 124, 0.16),
127
- 0 1px 0 rgba(255, 255, 255, 0.58) inset;
128
- }
129
-
130
- html, body, [class*="css"], [data-testid="stMarkdownContainer"] * {
131
- font-family: "Manrope", "Avenir Next", "Segoe UI", sans-serif;
132
- }
133
-
134
- [data-testid="stAppViewContainer"] {
135
- min-height: 100vh;
136
- overflow: visible !important;
137
- background:
138
- radial-gradient(1300px 700px at -10% 105%, rgba(188, 113, 202, 0.62), transparent 62%),
139
- radial-gradient(1200px 700px at 108% 104%, rgba(130, 170, 235, 0.50), transparent 62%),
140
- linear-gradient(120deg, #d5c4e3 0%, #cdd4e9 45%, #c4e1ea 100%) !important;
141
- }
142
-
143
- .stApp,
144
- [data-testid="stAppViewContainer"] > .main,
145
- section[data-testid="stMain"] {
146
- color: var(--pp-text);
147
- background: transparent !important;
148
- }
149
-
150
- [data-testid="stAppViewContainer"] > .main {
151
- margin: 0 !important;
152
- border: none !important;
153
- border-radius: 0 !important;
154
- box-shadow: none !important;
155
- background: transparent !important;
156
- overflow: visible !important;
157
- }
158
-
159
- section[data-testid="stMain"] {
160
- position: relative;
161
- margin: 12px 14px 12px 18px;
162
- min-height: calc(100vh - 24px) !important;
163
- border-radius: 30px;
164
- border: 1px solid rgba(255, 255, 255, 0.98);
165
- border-right: 2px solid rgba(233, 223, 245, 0.98);
166
- background: rgba(244, 239, 250, 0.96) !important;
167
- box-shadow:
168
- var(--pp-panel-shadow),
169
- inset -1px 0 0 rgba(233, 223, 245, 0.95);
170
- overflow: visible;
171
- isolation: isolate;
172
- }
173
-
174
- section[data-testid="stMain"] {
175
- overflow-y: visible !important;
176
- overflow-x: hidden !important;
177
- }
178
-
179
- section[data-testid="stMain"]::before {
180
- content: "";
181
- position: absolute;
182
- inset: 0;
183
- border-radius: inherit;
184
- pointer-events: none;
185
- box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.58);
186
- z-index: 0;
187
- }
188
-
189
- section[data-testid="stMain"]::after {
190
- display: none;
191
- }
192
-
193
- section[data-testid="stMain"] > div,
194
- [data-testid="stMainBlockContainer"] {
195
- position: relative;
196
- z-index: 1;
197
- background: transparent !important;
198
- }
199
-
200
- [data-testid="stHeader"] {
201
- background: transparent !important;
202
- }
203
-
204
- .block-container {
205
- max-width: 1220px;
206
- padding-top: 1.3rem;
207
- padding-bottom: 2.4rem;
208
- }
209
-
210
- h1, h2, h3, h4, h5 {
211
- letter-spacing: -0.02em;
212
- }
213
-
214
- p, li, label, [data-testid="stCaptionContainer"] {
215
- color: var(--pp-muted);
216
- }
217
-
218
- a, a:visited {
219
- color: #1f8f52 !important;
220
- }
221
-
222
- section[data-testid="stSidebar"],
223
- [data-testid="stSidebar"] {
224
- background: transparent !important;
225
- border-right: none !important;
226
- --pp-sidebar-width: 300px;
227
- height: calc(100vh - 24px) !important;
228
- min-width: var(--pp-sidebar-width) !important;
229
- width: var(--pp-sidebar-width) !important;
230
- max-width: var(--pp-sidebar-width) !important;
231
- flex: 0 0 var(--pp-sidebar-width) !important;
232
- flex-basis: var(--pp-sidebar-width) !important;
233
- }
234
-
235
- section[data-testid="stSidebar"][aria-expanded="true"] {
236
- min-width: var(--pp-sidebar-width) !important;
237
- width: var(--pp-sidebar-width) !important;
238
- max-width: var(--pp-sidebar-width) !important;
239
- flex: 0 0 var(--pp-sidebar-width) !important;
240
- flex-basis: var(--pp-sidebar-width) !important;
241
- }
242
-
243
- [data-testid="stSidebar"] > div:first-child {
244
- margin: 12px 0 12px 12px;
245
- height: calc(100vh - 24px);
246
- border-radius: 30px;
247
- border: 1px solid rgba(255, 255, 255, 0.98);
248
- background: rgba(241, 226, 248, 0.96) !important;
249
- box-shadow: var(--pp-panel-shadow);
250
- backdrop-filter: blur(6px);
251
- overflow-y: auto;
252
- overflow-x: hidden;
253
- }
254
-
255
- [data-testid="stSidebarUserContent"] {
256
- padding-top: 0.25rem;
257
- }
258
-
259
- [data-testid="stSidebarNav"] ul li,
260
- [data-testid="stSidebarNav"] ul li a,
261
- [data-testid="stSidebarNav"] ul li button {
262
- margin: 0 !important;
263
- padding: 0 !important;
264
- }
265
-
266
- [data-testid="stSidebarNav"] ul li + li {
267
- margin-top: 0.44rem !important;
268
- }
269
-
270
- [data-testid="stSidebarNav"] ul li a,
271
- [data-testid="stSidebarNav"] ul li button {
272
- font-size: 1.02rem !important;
273
- font-family: "Inter", "Manrope", "Avenir Next", "Segoe UI", sans-serif !important;
274
- font-weight: 600 !important;
275
- color: #1f6b4a !important;
276
- border-radius: 12px !important;
277
- display: flex !important;
278
- align-items: center !important;
279
- justify-content: flex-start !important;
280
- height: 44px !important;
281
- min-height: 44px !important;
282
- max-height: 44px !important;
283
- line-height: 1 !important;
284
- padding: 0 0.78rem 0 3.1rem !important;
285
- border: 1px solid rgba(255, 255, 255, 0.98);
286
- background: rgba(255, 255, 255, 0.78) !important;
287
- box-shadow:
288
- 0 1px 0 rgba(255, 255, 255, 0.42) inset,
289
- 0 2px 7px rgba(80, 86, 131, 0.07);
290
- transition: all 140ms ease;
291
- box-sizing: border-box !important;
292
- overflow: hidden !important;
293
- }
294
-
295
- [data-testid="stSidebarNav"] ul li a > div,
296
- [data-testid="stSidebarNav"] ul li button > div {
297
- margin: 0 !important;
298
- padding: 0 !important;
299
- display: flex !important;
300
- align-items: center !important;
301
- min-height: 0 !important;
302
- line-height: 1 !important;
303
- }
304
-
305
- [data-testid="stSidebarNav"] ul li a span,
306
- [data-testid="stSidebarNav"] ul li button span {
307
- font-size: 0.95rem !important;
308
- font-family: "Inter", "Manrope", "Avenir Next", "Segoe UI", sans-serif !important;
309
- font-weight: 600 !important;
310
- color: #1f6b4a !important;
311
- line-height: 1.05 !important;
312
- white-space: nowrap !important;
313
- margin: 0 !important;
314
- padding: 0 !important;
315
- }
316
-
317
- [data-testid="stSidebarNav"] ul li a:hover,
318
- [data-testid="stSidebarNav"] ul li button:hover {
319
- transform: translateY(-1px);
320
- background: rgba(255, 255, 255, 0.90) !important;
321
- box-shadow:
322
- 0 1px 0 rgba(255, 255, 255, 0.44) inset,
323
- 0 4px 10px rgba(85, 94, 142, 0.10);
324
- }
325
-
326
- [data-testid="stSidebarNav"] ul li a[aria-current="page"],
327
- [data-testid="stSidebarNav"] ul li button[aria-current="page"] {
328
- border: 1px solid rgba(34, 163, 93, 0.34) !important;
329
- background: linear-gradient(100deg, #21a35e, #34c67a) !important;
330
- box-shadow:
331
- 0 1px 0 rgba(255, 255, 255, 0.22) inset,
332
- 0 8px 16px rgba(34, 163, 93, 0.28);
333
- }
334
-
335
- [data-testid="stSidebarNav"] ul li a[aria-current="page"],
336
- [data-testid="stSidebarNav"] ul li a[aria-current="page"] *,
337
- [data-testid="stSidebarNav"] ul li button[aria-current="page"],
338
- [data-testid="stSidebarNav"] ul li button[aria-current="page"] * {
339
- color: #ffffff !important;
340
- fill: #ffffff !important;
341
- font-weight: 700 !important;
342
- }
343
-
344
- [data-testid="stSidebarNav"] ul li a[aria-current="page"]::before {
345
- filter: brightness(0) invert(1);
346
- opacity: 0.96;
347
- }
348
-
349
- __ICON_CSS__
350
-
351
- .stTextInput > div > div > input,
352
- .stTextArea textarea,
353
- .stSelectbox [data-baseweb="select"] > div,
354
- .stMultiSelect [data-baseweb="select"] > div,
355
- .stNumberInput input {
356
- border-radius: 12px !important;
357
- border: 1px solid #d7deeb !important;
358
- background: rgba(255, 255, 255, 0.87) !important;
359
- box-shadow: none !important;
360
- color: #173b2b !important;
361
- }
362
-
363
- .stTextInput > div > div > input:focus,
364
- .stTextArea textarea:focus,
365
- .stSelectbox [data-baseweb="select"] > div:focus-within,
366
- .stMultiSelect [data-baseweb="select"] > div:focus-within,
367
- .stNumberInput input:focus {
368
- border-color: #21a35e !important;
369
- box-shadow: 0 0 0 3px rgba(34, 163, 93, 0.18) !important;
370
- }
371
-
372
- /* Force dropdown/expanded menus to green-white accents */
373
- [data-baseweb="popover"] [role="listbox"],
374
- [data-baseweb="popover"] [data-baseweb="menu"],
375
- div[role="listbox"] {
376
- background: rgba(248, 252, 249, 0.98) !important;
377
- border: 1px solid rgba(185, 214, 198, 0.95) !important;
378
- border-radius: 12px !important;
379
- box-shadow: 0 10px 24px rgba(44, 95, 67, 0.14) !important;
380
- }
381
-
382
- [data-baseweb="popover"] [role="option"],
383
- [data-baseweb="popover"] li,
384
- [data-baseweb="popover"] [data-baseweb="menu"] > div,
385
- div[role="listbox"] [role="option"] {
386
- background: transparent !important;
387
- color: #173b2b !important;
388
- }
389
-
390
- [data-baseweb="popover"] [role="option"]:hover,
391
- [data-baseweb="popover"] li:hover,
392
- [data-baseweb="popover"] [data-highlighted="true"],
393
- div[role="listbox"] [role="option"]:hover {
394
- background: rgba(34, 163, 93, 0.10) !important;
395
- }
396
-
397
- [data-baseweb="popover"] [role="option"][aria-selected="true"],
398
- [data-baseweb="popover"] li[aria-selected="true"],
399
- [data-baseweb="popover"] [aria-selected="true"],
400
- div[role="listbox"] [role="option"][aria-selected="true"] {
401
- background: rgba(34, 163, 93, 0.18) !important;
402
- color: #173b2b !important;
403
- }
404
-
405
- [data-baseweb="tag"] {
406
- background: #2f9d62 !important;
407
- border: 1px solid #288653 !important;
408
- color: #ffffff !important;
409
- }
410
-
411
- [data-baseweb="tag"] *,
412
- [data-baseweb="tag"] svg {
413
- color: #ffffff !important;
414
- fill: #ffffff !important;
415
- }
416
-
417
- /* Keep sliders/toggles green while page background stays blue */
418
- .stSlider [data-baseweb="slider"] > div > div > div:first-child,
419
- [data-baseweb="slider"] > div > div > div:first-child {
420
- background-color: #1f8f52 !important;
421
- }
422
-
423
- .stSlider [data-baseweb="slider"] > div > div > div:last-child,
424
- [data-baseweb="slider"] > div > div > div:last-child {
425
- background-color: rgba(34, 163, 93, 0.30) !important;
426
- }
427
-
428
- [data-baseweb="slider"] [style*="rgb(79, 70, 229)"],
429
- [data-baseweb="slider"] [style*="rgb(91, 80, 255)"],
430
- [data-baseweb="slider"] [style*="rgb(67, 56, 202)"],
431
- [data-baseweb="slider"] [style*="rgb("] {
432
- background-color: #1f8f52 !important;
433
- border-color: #1f8f52 !important;
434
- }
435
-
436
- [data-baseweb="slider"] [role="slider"] {
437
- background-color: #1f8f52 !important;
438
- border: 2px solid #ffffff !important;
439
- box-shadow: 0 0 0 1px rgba(34, 163, 93, 0.35), 0 2px 6px rgba(34, 163, 93, 0.28) !important;
440
- }
441
-
442
- [data-baseweb="checkbox"] [aria-checked="true"] {
443
- color: #1f8f52 !important;
444
- }
445
-
446
- [data-baseweb="checkbox"] [aria-checked="true"] > div {
447
- background-color: #1f8f52 !important;
448
- border-color: #1f8f52 !important;
449
- }
450
-
451
- input[type="checkbox"],
452
- input[type="radio"] {
453
- accent-color: #1f8f52 !important;
454
- }
455
-
456
- [data-baseweb="radio"] [aria-checked="true"] {
457
- color: #1f8f52 !important;
458
- }
459
-
460
- .stButton > button,
461
- .stDownloadButton > button,
462
- [data-testid="baseButton-secondary"] {
463
- border-radius: 999px !important;
464
- border: 1px solid #d7dff2 !important;
465
- font-weight: 500 !important;
466
- min-height: 2.65rem;
467
- padding: 0.3rem 1.08rem !important;
468
- background: rgba(255, 255, 255, 0.94) !important;
469
- transition: all 140ms ease;
470
- }
471
-
472
- .stButton > button[kind="primary"],
473
- [data-testid="baseButton-primary"] {
474
- background: linear-gradient(100deg, var(--pp-primary), var(--pp-primary-2)) !important;
475
- color: #fff !important;
476
- border: none !important;
477
- box-shadow: 0 10px 22px rgba(31, 157, 85, 0.34);
478
- }
479
-
480
- .stButton > button[kind="primary"] *,
481
- [data-testid="baseButton-primary"] * {
482
- color: #fff !important;
483
- fill: #fff !important;
484
- }
485
-
486
- [data-testid="stFormSubmitButton"] button,
487
- [data-testid="stFormSubmitButton"] button * {
488
- color: #fff !important;
489
- fill: #fff !important;
490
- }
491
-
492
- .stButton > button:hover,
493
- .stDownloadButton > button:hover {
494
- transform: translateY(-1px);
495
- box-shadow: 0 10px 20px rgba(44, 95, 67, 0.2);
496
- }
497
-
498
- div[data-testid="stVerticalBlockBorderWrapper"] {
499
- border-radius: 18px !important;
500
- border: 1px solid var(--pp-border) !important;
501
- background: var(--pp-surface) !important;
502
- box-shadow: var(--pp-shadow);
503
- }
504
-
505
- div[data-testid="stMetric"] {
506
- background: rgba(255, 255, 255, 0.72);
507
- border-radius: 14px;
508
- border: 1px solid rgba(255, 255, 255, 0.84);
509
- padding: 0.45rem 0.7rem;
510
- }
511
-
512
- div[data-testid="stMetric"] label {
513
- color: #4d705d !important;
514
- font-weight: 600 !important;
515
- letter-spacing: 0.01em;
516
- }
517
-
518
- div[data-testid="stMetricValue"] {
519
- color: #1f8f52 !important;
520
- font-weight: 800 !important;
521
- }
522
-
523
- [data-testid="stDataFrame"],
524
- [data-testid="stTable"] {
525
- background: rgba(255, 255, 255, 0.78);
526
- border-radius: 14px;
527
- border: 1px solid rgba(255, 255, 255, 0.88);
528
- overflow: hidden;
529
- }
530
-
531
- [data-testid="stDataFrame"] [role="grid"],
532
- [data-testid="stDataFrame"] [role="rowgroup"],
533
- [data-testid="stDataFrame"] [role="row"],
534
- [data-testid="stDataFrame"] [role="gridcell"],
535
- [data-testid="stDataFrame"] [role="columnheader"] {
536
- background-color: rgba(247, 252, 248, 0.90) !important;
537
- border-color: rgba(188, 213, 196, 0.55) !important;
538
- }
539
-
540
- [data-testid="stTable"] table,
541
- [data-testid="stTable"] th,
542
- [data-testid="stTable"] td {
543
- background-color: rgba(248, 252, 249, 0.94) !important;
544
- border-color: rgba(188, 213, 196, 0.62) !important;
545
- }
546
-
547
- .pp-page-header {
548
- margin: 0.1rem 0 1.0rem 0;
549
- }
550
-
551
- .pp-page-title {
552
- margin: 0.2rem 0 0.45rem 0;
553
- font-size: clamp(1.95rem, 2.65vw, 3.0rem);
554
- line-height: 1.1;
555
- font-weight: 800;
556
- color: #123726;
557
- }
558
-
559
- .pp-page-subtitle {
560
- margin: 0;
561
- max-width: 880px;
562
- color: #4a6e5b;
563
- font-size: 1.02rem;
564
- line-height: 1.62;
565
- }
566
-
567
- .pp-badge {
568
- display: inline-flex;
569
- align-items: center;
570
- gap: 0.38rem;
571
- padding: 0.3rem 0.72rem;
572
- border-radius: 999px;
573
- border: 1px solid rgba(255, 255, 255, 0.92);
574
- background: rgba(255, 255, 255, 0.72);
575
- color: #3f6856;
576
- font-size: 0.76rem;
577
- font-weight: 700;
578
- text-transform: uppercase;
579
- letter-spacing: 0.04em;
580
- }
581
-
582
- .pp-hero {
583
- border-radius: 22px;
584
- border: 1px solid rgba(255, 255, 255, 0.84);
585
- background:
586
- linear-gradient(95deg, rgba(255, 255, 255, 0.74), rgba(255, 255, 255, 0.60)),
587
- radial-gradient(800px 300px at 100% 0%, rgba(34, 163, 93, 0.16), transparent 72%);
588
- box-shadow: 0 16px 32px rgba(56, 70, 121, 0.11);
589
- padding: 1.45rem 1.5rem;
590
- margin: 0.3rem 0 1.2rem;
591
- }
592
-
593
- .pp-hero-grid {
594
- display: grid;
595
- grid-template-columns: minmax(0, 4fr) minmax(130px, 1fr);
596
- gap: 1.4rem;
597
- align-items: center;
598
- }
599
-
600
- .pp-hero-title {
601
- margin: 0.58rem 0 0.5rem;
602
- color: #123726;
603
- font-size: clamp(1.55rem, 2.12vw, 2.35rem);
604
- font-weight: 800;
605
- letter-spacing: -0.014em;
606
- line-height: 1.2;
607
- }
608
-
609
- .pp-hero-copy {
610
- margin: 0;
611
- color: #4c6f5d;
612
- max-width: 780px;
613
- line-height: 1.66;
614
- }
615
-
616
- .pp-hero-logo {
617
- display: flex;
618
- justify-content: center;
619
- }
620
-
621
- .pp-hero-logo img {
622
- width: 112px;
623
- height: 112px;
624
- border-radius: 18px;
625
- border: 1px solid rgba(255, 255, 255, 0.88);
626
- box-shadow: 0 12px 26px rgba(30, 49, 103, 0.2);
627
- object-fit: contain;
628
- background: rgba(255, 255, 255, 0.96);
629
- padding: 8px;
630
- }
631
-
632
- .pp-stat-card {
633
- border-radius: 16px;
634
- border: 1px solid rgba(255, 255, 255, 0.9);
635
- background: rgba(255, 255, 255, 0.78);
636
- padding: 0.85rem 0.92rem;
637
- box-shadow: 0 8px 22px rgba(56, 78, 145, 0.12);
638
- }
639
-
640
- .pp-stat-value {
641
- margin: 0;
642
- color: #2f9d62;
643
- font-size: clamp(1.2rem, 1.8vw, 1.95rem);
644
- font-weight: 800;
645
- letter-spacing: -0.018em;
646
- }
647
-
648
- .pp-stat-label {
649
- margin: 0.2rem 0 0;
650
- color: #678a76;
651
- font-size: 0.8rem;
652
- text-transform: uppercase;
653
- letter-spacing: 0.045em;
654
- font-weight: 700;
655
- }
656
-
657
- .pp-kpi-strip {
658
- border-radius: 28px;
659
- border: 1px solid rgba(255, 255, 255, 0.9);
660
- background: rgba(247, 251, 248, 0.86);
661
- box-shadow: 0 12px 26px rgba(56, 78, 145, 0.08);
662
- display: grid;
663
- grid-template-columns: repeat(3, minmax(0, 1fr));
664
- gap: 0;
665
- padding: 1.45rem 1.15rem 1.25rem;
666
- margin: 0.62rem 0 0.9rem 0;
667
- }
668
-
669
- .pp-kpi-item {
670
- padding: 0.1rem 0.45rem 0.2rem;
671
- text-align: center;
672
- }
673
-
674
- .pp-kpi-strip .pp-kpi-value {
675
- margin: 0;
676
- color: #2f9d62 !important;
677
- font-size: 3.5rem !important;
678
- font-weight: 800 !important;
679
- line-height: 0.98 !important;
680
- letter-spacing: -0.03em !important;
681
- }
682
-
683
- .pp-kpi-strip .pp-kpi-label {
684
- margin: 0.75rem 0 0;
685
- color: #6f8d7c !important;
686
- font-size: 0.99rem !important;
687
- font-weight: 700 !important;
688
- letter-spacing: 0.12em !important;
689
- text-transform: uppercase !important;
690
- }
691
-
692
- .pp-step-card {
693
- border-radius: 16px;
694
- border: 1px solid rgba(255, 255, 255, 0.9);
695
- background: rgba(255, 255, 255, 0.77);
696
- padding: 0.8rem 0.92rem;
697
- min-height: 120px;
698
- }
699
-
700
- .pp-step-title {
701
- margin: 0 0 0.26rem;
702
- color: #1a4b36;
703
- font-weight: 700;
704
- font-size: 0.98rem;
705
- }
706
-
707
- .pp-step-copy {
708
- margin: 0;
709
- color: #557562;
710
- font-size: 0.89rem;
711
- line-height: 1.45;
712
- }
713
-
714
- .pp-module-card {
715
- border-radius: 16px;
716
- border: 1px solid rgba(255, 255, 255, 0.9);
717
- background: rgba(255, 255, 255, 0.77);
718
- box-shadow: 0 8px 22px rgba(56, 78, 145, 0.10);
719
- padding: 1.02rem 1.05rem;
720
- margin-bottom: 0.74rem;
721
- min-height: 136px;
722
- }
723
-
724
- .pp-module-title {
725
- margin: 0 0 0.32rem;
726
- color: #1a4b36;
727
- font-size: 1.02rem;
728
- font-weight: 800;
729
- line-height: 1.32;
730
- }
731
-
732
- .pp-module-copy {
733
- margin: 0;
734
- color: #557461;
735
- font-size: 0.9rem;
736
- line-height: 1.5;
737
- }
738
-
739
- .pp-api-card {
740
- border-radius: 18px;
741
- border: 1px solid rgba(255, 255, 255, 0.9);
742
- background:
743
- linear-gradient(120deg, rgba(255, 255, 255, 0.84), rgba(246, 251, 248, 0.76));
744
- box-shadow: 0 10px 24px rgba(56, 78, 145, 0.10);
745
- padding: 1rem 1.05rem 0.95rem;
746
- margin: 0 0 0.8rem 0;
747
- }
748
-
749
- .pp-api-kicker {
750
- display: inline-flex;
751
- align-items: center;
752
- padding: 0.28rem 0.72rem;
753
- border-radius: 999px;
754
- border: 1px solid rgba(188, 213, 196, 0.95);
755
- background: rgba(255, 255, 255, 0.84);
756
- color: #4a6e5b;
757
- text-transform: uppercase;
758
- letter-spacing: 0.08em;
759
- font-size: 0.72rem;
760
- font-weight: 700;
761
- }
762
-
763
- .pp-api-title {
764
- margin: 0.72rem 0 0.26rem;
765
- color: #123726;
766
- font-size: clamp(1.2rem, 1.6vw, 1.55rem);
767
- font-weight: 800;
768
- letter-spacing: -0.02em;
769
- }
770
-
771
- .pp-api-copy {
772
- margin: 0;
773
- color: #557461;
774
- font-size: 0.94rem;
775
- line-height: 1.58;
776
- max-width: 900px;
777
- }
778
-
779
- .pp-api-meta {
780
- display: flex;
781
- flex-wrap: wrap;
782
- gap: 0.5rem;
783
- margin: 0.25rem 0 0.15rem;
784
- }
785
-
786
- .pp-api-pill {
787
- display: inline-flex;
788
- align-items: center;
789
- gap: 0.32rem;
790
- padding: 0.34rem 0.72rem;
791
- border-radius: 999px;
792
- border: 1px solid rgba(188, 213, 196, 0.95);
793
- background: rgba(248, 252, 249, 0.95);
794
- color: #3f6856;
795
- font-size: 0.8rem;
796
- font-weight: 600;
797
- line-height: 1.2;
798
- }
799
-
800
- .pp-api-pill strong {
801
- color: #1f8f52;
802
- font-weight: 800;
803
- }
804
-
805
- .pp-api-inline-head {
806
- margin: 0.05rem 0 0.7rem 0;
807
- }
808
-
809
- .pp-main-card {
810
- border-radius: 16px;
811
- border: 1px solid rgba(255, 255, 255, 0.9);
812
- background: rgba(255, 255, 255, 0.77);
813
- box-shadow: 0 8px 22px rgba(56, 78, 145, 0.10);
814
- padding: 1.1rem 1.08rem;
815
- margin: 3.25rem 0 0.9rem 0;
816
- }
817
-
818
- .pp-main-grid {
819
- display: grid;
820
- grid-template-columns: minmax(0, 4fr) minmax(120px, 1fr);
821
- gap: 1.1rem;
822
- align-items: center;
823
- }
824
-
825
- .pp-main-card .pp-main-title {
826
- margin: 0;
827
- color: #000000 !important;
828
- font-size: clamp(2.8rem, 4.2vw, 3.5rem) !important;
829
- font-weight: 800 !important;
830
- line-height: 0.98 !important;
831
- letter-spacing: -0.03em !important;
832
- }
833
-
834
- .pp-main-copy {
835
- margin: 0.5rem 0 0;
836
- color: #557461;
837
- font-size: 1.0rem;
838
- line-height: 1.56;
839
- max-width: 900px;
840
- }
841
-
842
- .pp-main-logo {
843
- display: flex;
844
- justify-content: center;
845
- }
846
-
847
- .pp-main-logo img {
848
- width: 118px;
849
- height: 118px;
850
- border-radius: 16px;
851
- border: 1px solid rgba(255, 255, 255, 0.62);
852
- box-shadow: 0 10px 22px rgba(32, 92, 63, 0.20);
853
- object-fit: contain;
854
- background: linear-gradient(145deg, #2f8059, #1f9d55);
855
- padding: 8px;
856
- filter: drop-shadow(0 0 5px rgba(255, 255, 255, 0.25));
857
- }
858
-
859
- .pp-lab-card {
860
- margin: 0.35rem 0 0.5rem 0;
861
- border-radius: 20px;
862
- border: 1px solid rgba(255, 255, 255, 0.92);
863
- background:
864
- linear-gradient(125deg, rgba(255, 255, 255, 0.82), rgba(242, 250, 245, 0.75));
865
- box-shadow: 0 12px 26px rgba(44, 95, 67, 0.12);
866
- padding: 1.3rem 1.35rem 1.25rem;
867
- }
868
-
869
- .pp-lab-kicker {
870
- display: inline-flex;
871
- align-items: center;
872
- padding: 0.28rem 0.72rem;
873
- border-radius: 999px;
874
- background: rgba(255, 255, 255, 0.82);
875
- border: 1px solid rgba(188, 213, 196, 0.95);
876
- color: #4a6e5b;
877
- text-transform: uppercase;
878
- letter-spacing: 0.08em;
879
- font-size: 0.74rem;
880
- font-weight: 700;
881
- }
882
-
883
- .pp-lab-title {
884
- margin: 0.72rem 0 0.26rem;
885
- font-size: clamp(1.52rem, 2.1vw, 2.05rem);
886
- font-weight: 800;
887
- letter-spacing: -0.02em;
888
- color: #123726;
889
- }
890
-
891
- .pp-lab-subtitle {
892
- margin: 0;
893
- color: #678a76;
894
- font-size: 1.0rem;
895
- font-weight: 600;
896
- line-height: 1.45;
897
- }
898
-
899
- .pp-lab-copy {
900
- margin: 1rem 0 1.15rem;
901
- color: #4c6f5d;
902
- font-size: 1.02rem;
903
- line-height: 1.68;
904
- max-width: 1080px;
905
- }
906
-
907
- .pp-lab-link {
908
- display: inline-flex;
909
- align-items: center;
910
- justify-content: center;
911
- text-decoration: none;
912
- border-radius: 999px;
913
- border: 1px solid rgba(34, 163, 93, 0.36);
914
- background: linear-gradient(100deg, #21a35e, #34c67a);
915
- color: #ffffff !important;
916
- font-size: 0.95rem;
917
- font-weight: 700;
918
- letter-spacing: 0.01em;
919
- padding: 0.56rem 1rem;
920
- box-shadow: 0 8px 16px rgba(34, 163, 93, 0.25);
921
- transition: transform 140ms ease, box-shadow 140ms ease, filter 140ms ease;
922
- }
923
-
924
- .pp-lab-link:hover {
925
- color: #ffffff !important;
926
- transform: translateY(-1px);
927
- box-shadow: 0 12px 20px rgba(34, 163, 93, 0.3);
928
- filter: saturate(1.02);
929
- }
930
-
931
- @media (max-width: 980px) {
932
- [data-testid="stAppViewContainer"] {
933
- height: auto;
934
- overflow: visible !important;
935
- }
936
-
937
- section[data-testid="stMain"] {
938
- margin: 8px;
939
- height: auto !important;
940
- border-radius: 16px;
941
- }
942
-
943
- [data-testid="stSidebar"] > div:first-child {
944
- margin: 8px;
945
- height: auto;
946
- border-radius: 16px;
947
- }
948
-
949
- .pp-main-grid {
950
- grid-template-columns: 1fr;
951
- gap: 0.75rem;
952
- }
953
-
954
- .pp-kpi-strip {
955
- grid-template-columns: 1fr;
956
- gap: 0.42rem;
957
- padding: 0.95rem 1rem 0.85rem;
958
- }
959
-
960
- .pp-kpi-item {
961
- padding: 0.18rem 0.45rem;
962
- }
963
-
964
- .pp-kpi-strip .pp-kpi-value {
965
- font-size: 3.05rem !important;
966
- }
967
-
968
- .pp-kpi-strip .pp-kpi-label {
969
- font-size: 0.7rem !important;
970
- }
971
-
972
- .pp-main-logo {
973
- justify-content: flex-start;
974
- }
975
-
976
- .pp-main-card {
977
- margin-top: 0.7rem;
978
- }
979
-
980
- .pp-hero-grid {
981
- grid-template-columns: 1fr;
982
- gap: 0.85rem;
983
- }
984
- .pp-hero-logo {
985
- justify-content: flex-start;
986
- }
987
-
988
- .pp-lab-card {
989
- padding: 1.0rem;
990
- }
991
-
992
- .pp-lab-title {
993
- margin-top: 0.58rem;
994
- }
995
-
996
- .pp-lab-copy {
997
- margin-top: 0.78rem;
998
- font-size: 0.96rem;
999
- }
1000
- }
1001
- </style>
1002
- """
1003
- st.markdown(css.replace("__ICON_CSS__", icon_css), unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/utils.py DELETED
@@ -1,338 +0,0 @@
1
- # utils.py
2
- from __future__ import annotations
3
-
4
- from typing import Dict, List, Optional, Sequence, Literal
5
-
6
- import math
7
- import numpy as np
8
- import torch
9
- import torch.nn as nn
10
-
11
- # Re-exported conveniences from data_builder
12
- from src.data_builder import TargetScaler, grouped_split_by_smiles # noqa: F401
13
-
14
-
15
- # ---------------------------------------------------------
16
- # Seeding and device helpers
17
- # ---------------------------------------------------------
18
-
19
def seed_everything(seed: int) -> None:
    """Deterministically seed Python's `random`, NumPy, and PyTorch (CPU + all CUDA devices)."""
    import random

    # Apply the same seed to every RNG source, in a fixed order.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
26
-
27
-
28
def to_device(batch, device: torch.device):
    """Move a PyG Batch (anything exposing `.to`) or a plain dict of tensors onto `device`.

    Non-tensor dict values are passed through untouched; unknown types are returned as-is.
    """
    if hasattr(batch, "to"):
        return batch.to(device)
    if isinstance(batch, dict):
        moved = {}
        for key, val in batch.items():
            moved[key] = val.to(device) if torch.is_tensor(val) else val
        return moved
    return batch
35
-
36
-
37
- # ---------------------------------------------------------
38
- # Masked metrics (canonical)
39
- # ---------------------------------------------------------
40
-
41
- def _safe_div(num: torch.Tensor, den: torch.Tensor) -> torch.Tensor:
42
- den = torch.clamp(den, min=1e-12)
43
- return num / den
44
-
45
-
46
def masked_mse(pred: torch.Tensor, target: torch.Tensor, mask: torch.Tensor,
               reduction: Literal["mean", "sum"] = "mean") -> torch.Tensor:
    """
    Masked mean-squared error.

    pred/target: [B, T]; mask: [B, T] bool
    Computed in fp32; "mean" divides by the number of unmasked elements.
    """
    mask = mask.bool()
    sq_err = (pred.float() - target.float()) ** 2
    total = (sq_err * mask).sum()
    if reduction == "sum":
        return total
    return _safe_div(total, mask.sum().float())
57
-
58
-
59
def masked_mae(pred: torch.Tensor, target: torch.Tensor, mask: torch.Tensor,
               reduction: Literal["mean", "sum"] = "mean") -> torch.Tensor:
    """
    Masked mean-absolute error.

    pred/target: [B, T]; mask: [B, T] bool
    "mean" divides by the number of unmasked elements.
    """
    # Cast for consistency with masked_mse: compute in fp32 and coerce the mask
    # to bool so integer/float masks behave identically across the two metrics.
    pred, target = pred.float(), target.float()
    mask = mask.bool()
    ae = (pred - target).abs() * mask
    if reduction == "sum":
        return ae.sum()
    return _safe_div(ae.sum(), mask.sum().float())
65
-
66
-
67
def masked_rmse(pred: torch.Tensor, target: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    """Root of the masked MSE (mean reduction)."""
    mse = masked_mse(pred, target, mask, reduction="mean")
    return mse.sqrt()
69
-
70
-
71
def masked_r2(pred: torch.Tensor, target: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    """
    Masked coefficient of determination, computed jointly over every unmasked element.
    """
    pred = pred.float()
    target = target.float()
    mask = mask.bool()
    n = mask.sum().float().clamp(min=1.0)
    # Mean of the labelled targets only.
    y_bar = _safe_div((target * mask).sum(), n)
    ss_tot = (((target - y_bar) ** 2) * mask).sum()
    ss_res = (((target - pred) ** 2) * mask).sum()
    return 1.0 - _safe_div(ss_res, ss_tot.clamp(min=1e-12))
82
-
83
-
84
def masked_metrics_overall(pred: torch.Tensor, target: torch.Tensor, mask: torch.Tensor) -> Dict[str, float]:
    """Scalar rmse/mae/r2 over all masked elements, returned as plain Python floats."""
    metrics: Dict[str, float] = {}
    for key, fn in (("rmse", masked_rmse), ("mae", masked_mae), ("r2", masked_r2)):
        metrics[key] = float(fn(pred, target, mask).detach().cpu())
    return metrics
90
-
91
-
92
def masked_metrics_per_task(
    pred: torch.Tensor,
    target: torch.Tensor,
    mask: torch.Tensor,
    task_names: Sequence[str],
) -> Dict[str, Dict[str, float]]:
    """
    rmse/mae/r2 for each task column separately, keyed by task name.

    Tasks with no labelled rows report NaN for all three metrics.
    """
    results: Dict[str, Dict[str, float]] = {}
    for col, name in enumerate(task_names):
        col_mask = mask[:, col]
        if not col_mask.any():
            results[name] = {"rmse": float("nan"), "mae": float("nan"), "r2": float("nan")}
            continue
        # Keep 2-D [B, 1] shapes so the masked metric helpers apply unchanged.
        p = pred[:, col:col + 1]
        y = target[:, col:col + 1]
        m = col_mask.unsqueeze(1)
        results[name] = {
            "rmse": float(masked_rmse(p, y, m).detach().cpu()),
            "mae": float(masked_mae(p, y, m).detach().cpu()),
            "r2": float(masked_r2(p, y, m).detach().cpu()),
        }
    return results
112
-
113
-
114
def masked_metrics_by_fidelity(
    pred: torch.Tensor,
    target: torch.Tensor,
    mask: torch.Tensor,
    fid_idx: torch.Tensor,
    fid_names: Sequence[str],
    task_names: Sequence[str],  # kept for API parity; not used in overall-by-fid
) -> Dict[str, Dict[str, float]]:
    """
    Overall (task-aggregated) rmse/mae/r2 for each fidelity level.

    Fidelities with no selected rows report NaN throughout.
    """
    fid_idx = fid_idx.view(-1).long()
    results: Dict[str, Dict[str, float]] = {}
    for level, fid_name in enumerate(fid_names):
        rows = fid_idx == level
        if rows.any():
            results[fid_name] = masked_metrics_overall(pred[rows], target[rows], mask[rows])
        else:
            results[fid_name] = {"rmse": float("nan"), "mae": float("nan"), "r2": float("nan")}
    return results
137
-
138
-
139
- # ---------------------------------------------------------
140
- # Multitask, multi-fidelity loss (canonical)
141
- # ---------------------------------------------------------
142
-
143
def gaussian_nll(mu: torch.Tensor, logvar: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """
    Element-wise Gaussian negative log-likelihood, no reduction.

    Shapes: mu, logvar, target -> [B, T] (or broadcastable).
    """
    # Coerce logvar onto mu's device/dtype and clamp for numerical stability.
    lv = torch.as_tensor(logvar, device=mu.device, dtype=mu.dtype).clamp(min=-20.0, max=20.0)
    var = lv.exp()
    return 0.5 * ((target - mu).pow(2) / var + lv + math.log(2.0 * math.pi))
154
-
155
-
156
def loss_multitask_fidelity(
    *,
    pred: torch.Tensor,   # [B, T] (or means if heteroscedastic)
    target: torch.Tensor,   # [B, T]
    mask: torch.Tensor,   # [B, T] bool
    fid_idx: torch.Tensor,   # [B] long (per-row fidelity index)
    fid_loss_w: Sequence[float] | torch.Tensor | None,   # [F] weights per fidelity
    task_weights: Optional[Sequence[float] | torch.Tensor] = None,   # [T]
    hetero_logvar: Optional[torch.Tensor] = None,   # [B, T] if heteroscedastic head
    reduction: Literal["mean", "sum"] = "mean",
    task_log_sigma2: Optional[torch.Tensor] = None,   # [T] learned homoscedastic uncertainty
    balanced: bool = True,
) -> torch.Tensor:
    """
    Multi-task, multi-fidelity loss with *balanced per-task reduction* by default.

    - If `hetero_logvar` is given: uses Gaussian NLL per element.
    - Applies per-fidelity weights via `fid_idx`.
    - Balanced reduction: compute mean loss per task first, then average across tasks
      (optionally weight by `task_weights` or learned uncertainty `task_log_sigma2`).
    - If `balanced=False`, uses legacy global reduction.

    Returns a scalar tensor (sum or mean depending on `reduction`).
    """
    B, T = pred.shape
    pred = pred.float()
    target = target.float()
    mask = mask.bool()
    fid_idx = fid_idx.view(-1).long()

    # Task weights (optional)
    if task_weights is None:
        tw = pred.new_ones(T)  # [T] — uniform when no explicit weights given
    else:
        tw = torch.as_tensor(task_weights, dtype=pred.dtype, device=pred.device)
        assert tw.numel() == T, f"task_weights len {tw.numel()} != T {T}"
        s = tw.sum().clamp(min=1e-12)
        tw = tw * (T / s)  # normalize to sum=T for stable scale

    # Fidelity weights — indexed per row via fid_idx, broadcast across tasks.
    if fid_loss_w is None:
        # NOTE(review): fid_idx.max() fails on an empty batch — presumably B >= 1 always; confirm at call sites.
        fw = pred.new_ones(int(fid_idx.max().item()) + 1)
    else:
        fw = torch.as_tensor(fid_loss_w, dtype=pred.dtype, device=pred.device)
    w_fid = fw[fid_idx].unsqueeze(1).expand(-1, T)  # [B, T]

    # Elementwise loss: Gaussian NLL when a heteroscedastic head provides logvars,
    # plain squared error otherwise.
    if hetero_logvar is not None:
        elem_loss = gaussian_nll(pred, hetero_logvar.float(), target)  # [B, T]
    else:
        elem_loss = (pred - target) ** 2  # [B, T]

    if not balanced:
        # Legacy global reduction (label-count biased): tasks with more labels
        # dominate, because the division is by the total weighted label count.
        w_task = tw.view(1, T).expand(B, -1)
        weighted = elem_loss * mask * w_task * w_fid
        if reduction == "sum":
            return weighted.sum()
        denom = (mask * w_task * w_fid).sum().float().clamp(min=1e-12)
        return weighted.sum() / denom

    # -------- Balanced per-task reduction --------
    # First compute a per-task average (exclude tw here; it is applied afterwards
    # so the average itself is unbiased by manual task weighting).
    num = (elem_loss * mask * w_fid).sum(dim=0)  # [T]
    den = (mask * w_fid).sum(dim=0).float().clamp(min=1e-12)  # [T]
    per_task_loss = num / den  # [T]

    # Optional manual task weights AFTER per-task averaging
    if task_weights is not None:
        per_task_loss = per_task_loss * tw

    # Optional homoscedastic task-uncertainty weighting (Kendall & Gal):
    # loss_t / (2*sigma_t^2) + 0.5*log(sigma_t^2), with sigma^2 learned per task.
    if task_log_sigma2 is not None:
        assert task_log_sigma2.numel() == T, f"task_log_sigma2 must be [T], got {task_log_sigma2.shape}"
        sigma2 = torch.exp(task_log_sigma2)  # [T]
        per_task_loss = per_task_loss / (2.0 * sigma2) + 0.5 * torch.log(sigma2)

    if reduction == "sum":
        return per_task_loss.sum()
    return per_task_loss.mean()
234
-
235
-
236
- # ---------------------------------------------------------
237
- # Curriculum scheduler for EXP fidelity
238
- # ---------------------------------------------------------
239
-
240
def exp_weight_at_epoch(
    epoch: int,
    total_epochs: int,
    schedule: Literal["none", "linear", "cosine"] = "none",
    start: float = 0.6,
    end: float = 1.0,
) -> float:
    """
    EXP-fidelity loss weight at `epoch` under the chosen ramp schedule.

    "none" always returns `end`; "linear"/"cosine" ramp from `start` to `end`
    over `total_epochs`.
    """
    if schedule == "none":
        return float(end)
    clamped = min(max(epoch, 0), total_epochs)
    if total_epochs <= 0:
        return float(end)
    frac = clamped / float(total_epochs)
    if schedule == "linear":
        ramp = frac
    elif schedule == "cosine":
        ramp = 0.5 - 0.5 * math.cos(math.pi * frac)  # smooth 0 -> 1
    else:
        raise ValueError(f"Unknown schedule: {schedule}")
    return float(start + (end - start) * ramp)
262
-
263
-
264
def make_fid_loss_weights(
    fids: Sequence[str],
    base_weights: Optional[Sequence[float]] = None,
    exp_weight: Optional[float] = None,
) -> List[float]:
    """
    Build a per-fidelity weight vector aligned with `fids` order.

    `base_weights` (if given) must match len(fids) and seeds the vector;
    otherwise all weights default to 1.0. `exp_weight` (if given) overrides
    the entry for the 'exp' fidelity (case-insensitive match).
    """
    lowered = [name.lower() for name in fids]
    n_fids = len(lowered)
    if base_weights is None:
        weights = [1.0] * n_fids
    else:
        assert len(base_weights) == n_fids, f"base_weights len {len(base_weights)} != {n_fids}"
        weights = [float(w) for w in base_weights]
    if exp_weight is not None and "exp" in lowered:
        weights[lowered.index("exp")] = float(exp_weight)
    return weights
285
-
286
-
287
- # ---------------------------------------------------------
288
- # Inference utilities
289
- # ---------------------------------------------------------
290
-
291
def apply_inverse_transform(pred: torch.Tensor, scaler):
    """
    Undo target scaling on `pred`, on whatever device `pred` lives on.

    Migrates the scaler's tensors (mean/std, and eps when present and non-None)
    onto pred's device first so CPU/GPU and legacy scalers all work.
    """
    device = pred.device

    for attr in ("mean", "std"):
        if hasattr(scaler, attr):
            value = getattr(scaler, attr)
            if value.device != device:
                setattr(scaler, attr, value.to(device))
    if hasattr(scaler, "eps"):
        eps = scaler.eps
        if eps is not None and eps.device != device:
            scaler.eps = eps.to(device)

    return scaler.inverse(pred)
307
-
308
-
309
-
310
def ensure_2d(x: torch.Tensor) -> torch.Tensor:
    """Promote a 1-D tensor to [B, 1]; pass tensors of other ranks through unchanged."""
    return x.unsqueeze(1) if x.dim() == 1 else x
315
-
316
-
317
- # ---------------------------------------------------------
318
- # Simple test harness (optional)
319
- # ---------------------------------------------------------
320
-
321
if __name__ == "__main__":
    # Minimal smoke test: exercise the loss (with/without task weights)
    # and the overall masked metrics on a small random batch.
    torch.manual_seed(0)
    n_rows, n_tasks = 5, 3
    predictions = torch.randn(n_rows, n_tasks)
    targets = torch.randn(n_rows, n_tasks)
    label_mask = torch.rand(n_rows, n_tasks) > 0.3
    fidelity_idx = torch.randint(0, 4, (n_rows,))
    fidelity_weights = [1.0, 0.8, 0.6, 0.5]
    per_task_weights = [1.0, 2.0, 1.0]

    loss_weighted = loss_multitask_fidelity(
        pred=predictions, target=targets, mask=label_mask,
        fid_idx=fidelity_idx, fid_loss_w=fidelity_weights, task_weights=per_task_weights,
    )
    loss_plain = loss_multitask_fidelity(
        pred=predictions, target=targets, mask=label_mask,
        fid_idx=fidelity_idx, fid_loss_w=fidelity_weights, task_weights=None,
    )
    print("Loss with task weights:", float(loss_weighted))
    print("Loss without task weights:", float(loss_plain))

    overall = masked_metrics_overall(predictions, targets, label_mask)
    print("Overall metrics:", overall)