Spaces:

sobinalosious92
/

POLYMER-PROPERTY

Running

File size: 26,843 Bytes

930ea3d

# src/discovery.py
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from . import sascorer

# Reuse your canonicalizer if you want; otherwise keep local
def canonicalize_smiles(smiles: str) -> Optional[str]:
    s = (smiles or "").strip()
    if not s:
        return None
    m = Chem.MolFromSmiles(s)
    if m is None:
        return None
    return Chem.MolToSmiles(m, canonical=True)


# -------------------------
# Spec schema (minimal v0)
# -------------------------
@dataclass
class DiscoverySpec:
    dataset: List[str]  # ["PI1M_PROPERTY.parquet", "POLYINFO_PROPERTY.parquet"]
    polyinfo: str  # "POLYINFO_PROPERTY.parquet"
    polyinfo_csv: str  # "POLYINFO.csv"

    hard_constraints: Dict[str, Dict[str, float]]  # { "tg": {"min": 400}, "tc": {"max": 0.3} }
    objectives: List[Dict[str, str]]  # [{"property":"cp","goal":"maximize"}, ...]

    max_pool: int = 200000         # legacy (kept for compatibility; aligned to pareto_max)
    pareto_max: int = 50000        # cap points used for Pareto + diversity fingerprinting
    max_candidates: int = 30       # final output size
    max_pareto_fronts: int = 5     # how many Pareto layers to keep for candidate pool
    min_distance: float = 0.30     # diversity threshold in Tanimoto distance
    fingerprint: str = "morgan"    # morgan only for now
    random_seed: int = 7
    use_canonical_smiles: bool = True
    use_full_data: bool = False
    trust_weights: Dict[str, float] | None = None
    selection_weights: Dict[str, float] | None = None


# -------------------------
# Column mapping
# -------------------------
def mean_col(prop_key: str) -> str:
    return f"mean_{prop_key.lower()}"

def std_col(prop_key: str) -> str:
    return f"std_{prop_key.lower()}"


def normalize_weights(weights: Dict[str, float], defaults: Dict[str, float]) -> Dict[str, float]:
    out: Dict[str, float] = {}
    for k, v in defaults.items():
        try:
            vv = float(weights.get(k, v))
        except Exception:
            vv = float(v)
        out[k] = max(0.0, vv)
    s = float(sum(out.values()))
    if s <= 0.0:
        return defaults.copy()
    return {k: float(v / s) for k, v in out.items()}

def spec_from_dict(obj: dict, dataset_path: List[str], polyinfo_path: str, polyinfo_csv_path: str) -> DiscoverySpec:
    pareto_max = int(obj.get("pareto_max", 50000))
    return DiscoverySpec(
        dataset=list(dataset_path),
        polyinfo=polyinfo_path,
        polyinfo_csv=polyinfo_csv_path,
        hard_constraints=obj.get("hard_constraints", {}),
        objectives=obj.get("objectives", []),
        # Legacy field kept for compatibility; effectively collapsed to pareto_max.
        max_pool=pareto_max,
        pareto_max=pareto_max,
        max_candidates=int(obj.get("max_candidates", 30)),
        max_pareto_fronts=int(obj.get("max_pareto_fronts", 5)),
        min_distance=float(obj.get("min_distance", 0.30)),
        fingerprint=str(obj.get("fingerprint", "morgan")),
        random_seed=int(obj.get("random_seed", 7)),
        use_canonical_smiles=not bool(obj.get("skip_smiles_canonicalization", True)),
        use_full_data=bool(obj.get("use_full_data", False)),
        trust_weights=obj.get("trust_weights"),
        selection_weights=obj.get("selection_weights"),
    )

# -------------------------
# Parquet loading (safe)
# -------------------------
def load_parquet_columns(path: str | List[str], columns: List[str]) -> pd.DataFrame:
    """
    Load only requested columns from Parquet (critical for 1M rows).
    Accepts a single path or a list of paths and concatenates rows.
    """
    def _load_one(fp: str, req_cols: List[str]) -> pd.DataFrame:
        available: list[str]
        try:
            import pyarrow.parquet as pq

            pf = pq.ParquetFile(fp)
            available = [str(c) for c in pf.schema.names]
        except Exception:
            # If schema probing fails, fall back to direct read with requested columns.
            return pd.read_parquet(fp, columns=req_cols)

        available_set = set(available)
        lower_to_actual = {c.lower(): c for c in available}

        # Resolve requested names against actual parquet schema.
        resolved: dict[str, str] = {}
        for req in req_cols:
            if req in available_set:
                resolved[req] = req
                continue
            alt = lower_to_actual.get(str(req).lower())
            if alt is not None:
                resolved[req] = alt

        use_cols = sorted(set(resolved.values()))
        if not use_cols:
            return pd.DataFrame(columns=req_cols)

        out = pd.read_parquet(fp, columns=use_cols)
        for req in req_cols:
            src = resolved.get(req)
            if src is None:
                out[req] = np.nan
            elif src != req:
                out[req] = out[src]
        return out[req_cols]

    if isinstance(path, (list, tuple)):
        frames = [_load_one(p, columns) for p in path]
        if not frames:
            return pd.DataFrame(columns=columns)
        return pd.concat(frames, ignore_index=True)
    return _load_one(path, columns)


def normalize_smiles(smiles: str, use_canonical_smiles: bool) -> Optional[str]:
    s = (smiles or "").strip()
    if not s:
        return None
    if not use_canonical_smiles:
        # Skip RDKit parsing entirely in fast mode.
        return s
    m = Chem.MolFromSmiles(s)
    if m is None:
        return None
    if use_canonical_smiles:
        return Chem.MolToSmiles(m, canonical=True)
    return s


def load_polyinfo_index(polyinfo_csv_path: str, use_canonical_smiles: bool = True) -> pd.DataFrame:
    """
    Expected CSV columns: SMILES, Polymer_Class, polymer_name (or common variants).
    Returns dataframe with index on smiles_key and columns polymer_name/polymer_class.
    """
    df = pd.read_csv(polyinfo_csv_path)

    # normalize column names
    cols = {c: c for c in df.columns}
    # map typical names
    if "SMILES" in cols:
        df = df.rename(columns={"SMILES": "smiles"})
    elif "smiles" not in df.columns:
        raise ValueError(f"{polyinfo_csv_path} missing SMILES/smiles column")

    if "Polymer_Name" in df.columns:
        df = df.rename(columns={"Polymer_Name": "polymer_name"})
    if "polymer_Name" in df.columns:
        df = df.rename(columns={"polymer_Name": "polymer_name"})
    if "Polymer_Class" in df.columns:
        df = df.rename(columns={"Polymer_Class": "polymer_class"})

    if "polymer_name" not in df.columns:
        df["polymer_name"] = pd.NA
    if "polymer_class" not in df.columns:
        df["polymer_class"] = pd.NA

    df["smiles_key"] = df["smiles"].astype(str).map(lambda s: normalize_smiles(s, use_canonical_smiles))
    df = df.dropna(subset=["smiles_key"]).drop_duplicates("smiles_key")
    df = df.set_index("smiles_key", drop=True)
    return df[["polymer_name", "polymer_class"]]


# -------------------------
# Pareto (2–3 objectives)
# -------------------------
def pareto_front_mask(X: np.ndarray) -> np.ndarray:
    """
    Returns mask for nondominated points.
    X: (N, M), all objectives assumed to be minimized.
    For maximize objectives, we invert before calling this.
    """
    N = X.shape[0]
    is_efficient = np.ones(N, dtype=bool)
    for i in range(N):
        if not is_efficient[i]:
            continue
        # any point that is <= in all dims and < in at least one dominates
        dominates = np.all(X <= X[i], axis=1) & np.any(X < X[i], axis=1)
        # if a point dominates i, mark i inefficient
        if np.any(dominates):
            is_efficient[i] = False
            continue
        # otherwise, i may dominate others
        dominated_by_i = np.all(X[i] <= X, axis=1) & np.any(X[i] < X, axis=1)
        is_efficient[dominated_by_i] = False
        is_efficient[i] = True
    return is_efficient


def pareto_layers(X: np.ndarray, max_layers: int = 10) -> np.ndarray:
    """
    Returns layer index per point: 1 = Pareto front, 2 = second layer, ...
    Unassigned points beyond max_layers get 0.
    """
    N = X.shape[0]
    layers = np.zeros(N, dtype=int)
    remaining = np.arange(N)

    layer = 1
    while remaining.size > 0 and layer <= max_layers:
        mask = pareto_front_mask(X[remaining])
        front_idx = remaining[mask]
        layers[front_idx] = layer
        remaining = remaining[~mask]
        layer += 1
    return layers


def pareto_front_mask_chunked(
    X: np.ndarray,
    chunk_size: int = 100000,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> np.ndarray:
    """
    Exact global Pareto front mask via chunk-local front reduction + global reconcile.
    This is exact for front-1:
      1) compute exact local front within each chunk
      2) union local fronts
      3) compute exact front on the union
    """
    N = X.shape[0]
    if N <= chunk_size:
        if progress_callback is not None:
            progress_callback(1, 1)
        return pareto_front_mask(X)

    local_front_idx = []
    total_chunks = (N + chunk_size - 1) // chunk_size
    done_chunks = 0
    for start in range(0, N, chunk_size):
        end = min(start + chunk_size, N)
        idx = np.arange(start, end)
        mask_local = pareto_front_mask(X[idx])
        local_front_idx.append(idx[mask_local])
        done_chunks += 1
        if progress_callback is not None:
            progress_callback(done_chunks, total_chunks)

    if not local_front_idx:
        return np.zeros(N, dtype=bool)

    reduced_idx = np.concatenate(local_front_idx)
    reduced_mask = pareto_front_mask(X[reduced_idx])
    front_idx = reduced_idx[reduced_mask]

    out = np.zeros(N, dtype=bool)
    out[front_idx] = True
    return out


def pareto_layers_chunked(
    X: np.ndarray,
    max_layers: int = 10,
    chunk_size: int = 100000,
    progress_callback: Optional[Callable[[int, int, int], None]] = None,
) -> np.ndarray:
    """
    Exact Pareto layers using repeated exact chunked front extraction.
    """
    N = X.shape[0]
    layers = np.zeros(N, dtype=int)
    remaining = np.arange(N)
    layer = 1

    while remaining.size > 0 and layer <= max_layers:
        def on_chunk(done: int, total: int) -> None:
            if progress_callback is not None:
                progress_callback(layer, done, total)

        mask = pareto_front_mask_chunked(X[remaining], chunk_size=chunk_size, progress_callback=on_chunk)
        front_idx = remaining[mask]
        layers[front_idx] = layer
        remaining = remaining[~mask]
        layer += 1

    return layers


# -------------------------
# Fingerprints & diversity
# -------------------------
def morgan_fp(smiles: str, radius: int = 2, nbits: int = 2048):
    m = Chem.MolFromSmiles(smiles)
    if m is None:
        return None
    return AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=nbits)

def tanimoto_distance(fp1, fp2) -> float:
    return 1.0 - DataStructs.TanimotoSimilarity(fp1, fp2)

def greedy_diverse_select(
    smiles_list: List[str],
    scores: np.ndarray,
    max_k: int,
    min_dist: float,
) -> List[int]:
    """
    Greedy selection by descending score, enforcing min Tanimoto distance.
    Returns indices into smiles_list.
    """
    fps = []
    valid_idx = []
    for i, s in enumerate(smiles_list):
        fp = morgan_fp(s)
        if fp is not None:
            fps.append(fp)
            valid_idx.append(i)

    if not valid_idx:
        return []

    # rank candidates (higher score first)
    order = np.argsort(-scores[valid_idx])
    selected_global = []
    selected_fps = []

    for oi in order:
        i = valid_idx[oi]
        fp_i = fps[oi]  # aligned with valid_idx
        ok = True
        for fp_j in selected_fps:
            if tanimoto_distance(fp_i, fp_j) < min_dist:
                ok = False
                break
        if ok:
            selected_global.append(i)
            selected_fps.append(fp_i)
        if len(selected_global) >= max_k:
            break

    return selected_global


# -------------------------
# Trust score (lightweight, robust)
# -------------------------
def internal_consistency_penalty(row: pd.Series) -> float:
    """
    Very simple physics/validity checks. Penalty in [0,1].
    Adjust/add rules later.
    """
    viol = 0
    total = 0

    def chk(cond: bool):
        nonlocal viol, total
        total += 1
        if not cond:
            viol += 1

    # positivity checks if present
    for p in ["cp", "tc", "rho", "dif", "visc", "tg", "tm", "bandgap"]:
        c = mean_col(p)
        if c in row.index and pd.notna(row[c]):
            if p in ["bandgap", "tg", "tm"]:
                chk(float(row[c]) >= 0.0)
            else:
                chk(float(row[c]) > 0.0)

    # Poisson ratio bounds if present
    if mean_col("poisson") in row.index and pd.notna(row[mean_col("poisson")]):
        v = float(row[mean_col("poisson")])
        chk(0.0 <= v <= 0.5)

    # Tg <= Tm if both present
    if mean_col("tg") in row.index and mean_col("tm") in row.index:
        if pd.notna(row[mean_col("tg")]) and pd.notna(row[mean_col("tm")]):
            chk(float(row[mean_col("tg")]) <= float(row[mean_col("tm")]))

    if total == 0:
        return 0.0
    return viol / total


def synthesizability_score(smiles: str) -> float:
    """
    RDKit SA-score based synthesizability proxy in [0,1].
    SA-score is ~[1 (easy), 10 (hard)].
    We map: 1 -> 1.0, 10 -> 0.0
    """
    m = Chem.MolFromSmiles(smiles)
    if m is None:
        return 0.0

    # Guard against unexpected scorer failures / None for edge-case molecules.
    try:
        sa_raw = sascorer.calculateScore(m)
    except Exception:
        return 0.0
    if sa_raw is None:
        return 0.0

    sa = float(sa_raw)  # ~ 1..10
    s_syn = 1.0 - (sa - 1.0) / 9.0          # linear map to [0,1]
    return float(np.clip(s_syn, 0.0, 1.0))


def compute_trust_scores(
    df: pd.DataFrame,
    real_fps: List,
    real_smiles: List[str],
    trust_weights: Dict[str, float] | None = None,
) -> np.ndarray:
    """
    Trust score in [0,1] (higher = more trustworthy / lower risk).
    Components:
      - distance to nearest real polymer (fingerprint distance)
      - internal consistency penalty
      - uncertainty penalty (if std columns exist)
      - synthesizability
    """
    N = len(df)
    trust = np.zeros(N, dtype=float)
    tw_defaults = {"real": 0.45, "consistency": 0.25, "uncertainty": 0.10, "synth": 0.20}
    tw = normalize_weights(trust_weights or {}, tw_defaults)

    # nearest-real distance (expensive if done naively)
    # We do it only for the (small) post-filter set, which is safe.
    smiles_col = "smiles_key" if "smiles_key" in df.columns else "smiles_canon"
    for i in range(N):
        s = df.iloc[i][smiles_col]
        fp = morgan_fp(s)
        if fp is None or not real_fps:
            d_real = 1.0
        else:
            sims = DataStructs.BulkTanimotoSimilarity(fp, real_fps)
            d_real = 1.0 - float(max(sims))  # distance to nearest

        # internal consistency
        pen_cons = internal_consistency_penalty(df.iloc[i])

        # uncertainty: average normalized std for any std_* columns present
        std_cols = [c for c in df.columns if c.startswith("std_")]
        if std_cols:
            std_vals = df.iloc[i][std_cols].astype(float)
            std_vals = std_vals.replace([np.inf, -np.inf], np.nan).dropna()
            pen_unc = float(np.clip(std_vals.mean() / (std_vals.mean() + 1.0), 0.0, 1.0)) if len(std_vals) else 0.0
        else:
            pen_unc = 0.0

        # synthesizability heuristic
        s_syn = synthesizability_score(s)

        # Combine (tunable weights)
        # lower distance to real is better -> convert to score
        s_real = 1.0 - np.clip(d_real, 0.0, 1.0)

        trust[i] = (
            tw["real"] * s_real +
            tw["consistency"] * (1.0 - pen_cons) +
            tw["uncertainty"] * (1.0 - pen_unc) +
            tw["synth"] * s_syn
        )

    trust = np.clip(trust, 0.0, 1.0)
    return trust


# -------------------------
# Main pipeline
# -------------------------
def run_discovery(
    spec: DiscoverySpec,
    progress_callback: Optional[Callable[[str, float], None]] = None,
) -> Tuple[pd.DataFrame, Dict[str, float], pd.DataFrame]:
    def report(step: str, pct: float) -> None:
        if progress_callback is not None:
            progress_callback(step, pct)

    rng = np.random.default_rng(spec.random_seed)

    # 1) Determine required columns
    report("Preparing columns…", 0.02)
    obj_props = [o["property"].lower() for o in spec.objectives]
    cons_props = [p.lower() for p in spec.hard_constraints.keys()]

    needed_props = sorted(set(obj_props + cons_props))
    cols = ["SMILES"] + [mean_col(p) for p in needed_props]

    # include std columns if available (not required, but used for trust)
    std_cols = [std_col(p) for p in needed_props]
    cols += std_cols

    # 2) Load only needed columns
    report("Loading data from parquet…", 0.05)
    df = load_parquet_columns(spec.dataset, columns=[c for c in cols if c != "SMILES"] + ["SMILES"])
    # normalize
    if "SMILES" not in df.columns and "smiles" in df.columns:
        df = df.rename(columns={"smiles": "SMILES"})
    normalize_step = "Canonicalizing SMILES…" if spec.use_canonical_smiles else "Skipping SMILES normalization…"
    report(normalize_step, 0.10)
    df["smiles_key"] = df["SMILES"].astype(str).map(lambda s: normalize_smiles(s, spec.use_canonical_smiles))
    df = df.dropna(subset=["smiles_key"]).reset_index(drop=True)

    # 3) Hard constraints
    report("Applying constraints…", 0.22)
    for p, rule in spec.hard_constraints.items():
        p = p.lower()
        c = mean_col(p)
        if c not in df.columns:
            # if missing, nothing can satisfy
            df = df.iloc[0:0]
            break
        if "min" in rule:
            df = df[df[c] >= float(rule["min"])]
        if "max" in rule:
            df = df[df[c] <= float(rule["max"])]

    n_after = len(df)
    if n_after == 0:
        empty_stats = {"n_total": 0, "n_after_constraints": 0, "n_pool": 0, "n_pareto_pool": 0, "n_selected": 0}
        return df, empty_stats, pd.DataFrame()

    n_pool = len(df)

    # 5) Prepare objective matrix for Pareto
    report("Building objective matrix…", 0.30)
    # convert to minimization: maximize => negate
    X = []
    for o in spec.objectives:
        prop = o["property"].lower()
        goal = o["goal"].lower()
        c = mean_col(prop)
        if c not in df.columns:
            raise ValueError(f"Objective column missing: {c}")
        v = df[c].to_numpy(dtype=float)
        if goal == "maximize":
            v = -v
        X.append(v)
    X = np.stack(X, axis=1)  # (N, M)

    # Pareto cap before computing layers (optional safety)
    if spec.use_full_data:
        report("Using full dataset (no Pareto cap)…", 0.35)
    elif len(df) > spec.pareto_max:
        idx = rng.choice(len(df), size=spec.pareto_max, replace=False)
        df = df.iloc[idx].reset_index(drop=True)
        X = X[idx]

    # 6) Pareto layers (only 5 layers needed for candidate pool)
    report("Computing Pareto layers…", 0.40)
    pareto_start = 0.40
    pareto_end = 0.54
    max_layers_for_pool = max(1, int(spec.max_pareto_fronts))
    pareto_chunk_ref = {"chunks_per_layer": None}

    def on_pareto_chunk(layer_i: int, done_chunks: int, total_chunks: int) -> None:
        if pareto_chunk_ref["chunks_per_layer"] is None:
            pareto_chunk_ref["chunks_per_layer"] = max(1, int(total_chunks))
        ref_chunks = pareto_chunk_ref["chunks_per_layer"]
        total_units = max_layers_for_pool * ref_chunks
        done_units = min(total_units, ((layer_i - 1) * ref_chunks) + done_chunks)
        pareto_pct = int(round(100.0 * done_units / max(1, total_units)))

        layer_progress = done_chunks / max(1, total_chunks)
        overall = ((layer_i - 1) + layer_progress) / max_layers_for_pool
        pct = pareto_start + (pareto_end - pareto_start) * min(1.0, max(0.0, overall))
        report(
            f"Computing Pareto layers… {pareto_pct}% (Layer {layer_i}/{max_layers_for_pool}, chunk {done_chunks}/{total_chunks})",
            pct,
        )

    layers = pareto_layers_chunked(
        X,
        max_layers=max_layers_for_pool,
        chunk_size=100000,
        progress_callback=on_pareto_chunk,
    )
    report("Computing Pareto layers…", pareto_end)
    df["pareto_layer"] = layers
    plot_df = df[["smiles_key"] + [mean_col(p) for p in obj_props] + ["pareto_layer"]].copy()
    plot_df = plot_df.rename(columns={"smiles_key": "SMILES"})

    # Keep first few layers as candidate pool (avoid huge set)
    cand = df[df["pareto_layer"].between(1, max_layers_for_pool)].copy()
    if cand.empty:
        cand = df[df["pareto_layer"] == 1].copy()
    cand = cand.reset_index(drop=True)
    n_pareto = len(cand)

    # 7) Load real polymer metadata and fingerprints (from POLYINFO.csv)
    report("Loading POLYINFO index…", 0.55)
    polyinfo = load_polyinfo_index(spec.polyinfo_csv, use_canonical_smiles=spec.use_canonical_smiles)
    real_smiles = polyinfo.index.to_list()

    report("Building real-polymer fingerprints…", 0.60)
    real_fps = []
    for s in real_smiles:
        fp = morgan_fp(s)
        if fp is not None:
            real_fps.append(fp)

    # 8) Trust score on candidate pool (safe size)
    report("Computing trust scores…", 0.70)
    trust = compute_trust_scores(
        cand,
        real_fps=real_fps,
        real_smiles=real_smiles,
        trust_weights=spec.trust_weights,
    )
    cand["trust_score"] = trust

    # 9) Diversity selection on candidate pool
    report("Diversity selection…", 0.88)
    # score for selection: prioritize Pareto layer 1 then trust
    # higher is better
    sw_defaults = {"pareto": 0.60, "trust": 0.40}
    sw = normalize_weights(spec.selection_weights or {}, sw_defaults)
    pareto_bonus = (
        (max_layers_for_pool + 1) - np.clip(cand["pareto_layer"].to_numpy(dtype=int), 1, max_layers_for_pool)
    ) / float(max_layers_for_pool)
    sel_score = sw["pareto"] * pareto_bonus + sw["trust"] * cand["trust_score"].to_numpy(dtype=float)

    chosen_idx = greedy_diverse_select(
        smiles_list=cand["smiles_key"].tolist(),
        scores=sel_score,
        max_k=spec.max_candidates,
        min_dist=spec.min_distance,
    )
    out = cand.iloc[chosen_idx].copy().reset_index(drop=True)

    # 10) Attach Polymer_Name/Class if available (only for matches)
    report("Finalizing results…", 0.96)
    out = out.set_index("smiles_key", drop=False)
    out = out.join(polyinfo, how="left")
    out = out.reset_index(drop=True)

    # 11) Make a clean output bundle with requested columns
    # Keep SMILES (canonical), name/class, pareto layer, trust score, properties used
    keep = ["smiles_key", "polymer_name", "polymer_class", "pareto_layer", "trust_score"]
    for p in needed_props:
        mc = mean_col(p)
        sc = std_col(p)
        if mc in out.columns:
            keep.append(mc)
        if sc in out.columns:
            keep.append(sc)

    out = out[keep].rename(columns={"smiles_key": "SMILES"})

    stats = {
        "n_total": float(len(df)),
        "n_after_constraints": float(n_after),
        "n_pool": float(n_pool),
        "n_pareto_pool": float(n_pareto),
        "n_selected": float(len(out)),
    }
    report("Done.", 1.0)
    return out, stats, plot_df


def build_pareto_plot_df(spec: DiscoverySpec, max_plot_points: int = 30000) -> pd.DataFrame:
    """
    Returns a small dataframe for plotting (sampled), with objective columns and pareto_layer.
    Does NOT compute trust/diversity. Safe for live plotting.
    """
    rng = np.random.default_rng(spec.random_seed)

    obj_props = [o["property"].lower() for o in spec.objectives]
    cons_props = [p.lower() for p in spec.hard_constraints.keys()]
    needed_props = sorted(set(obj_props + cons_props))

    cols = ["SMILES"] + [mean_col(p) for p in needed_props]
    df = load_parquet_columns(spec.dataset, columns=cols)

    if "SMILES" not in df.columns and "smiles" in df.columns:
        df = df.rename(columns={"smiles": "SMILES"})

    df["smiles_key"] = df["SMILES"].astype(str).map(lambda s: normalize_smiles(s, spec.use_canonical_smiles))
    df = df.dropna(subset=["smiles_key"]).reset_index(drop=True)

    # Hard constraints
    for p, rule in spec.hard_constraints.items():
        p = p.lower()
        c = mean_col(p)
        if c not in df.columns:
            return df.iloc[0:0]
        if "min" in rule:
            df = df[df[c] >= float(rule["min"])]
        if "max" in rule:
            df = df[df[c] <= float(rule["max"])]

    if len(df) == 0:
        return df

    # Pareto cap for plotting
    plot_cap = min(int(max_plot_points), int(spec.pareto_max))
    if len(df) > plot_cap:
        idx = rng.choice(len(df), size=plot_cap, replace=False)
        df = df.iloc[idx].reset_index(drop=True)

    # Build objective matrix (minimization)
    X = []
    for o in spec.objectives:
        prop = o["property"].lower()
        goal = o["goal"].lower()
        c = mean_col(prop)
        v = df[c].to_numpy(dtype=float)
        if goal == "maximize":
            v = -v
        X.append(v)
    X = np.stack(X, axis=1)

    df["pareto_layer"] = pareto_layers(X, max_layers=5)

    # Return only what plotting needs
    keep = ["smiles_key", "pareto_layer"] + [mean_col(p) for p in obj_props]
    out = df[keep].rename(columns={"smiles_key": "SMILES"})
    return out


def parse_spec(text: str, dataset_path: List[str], polyinfo_path: str, polyinfo_csv_path: str) -> DiscoverySpec:
    obj = json.loads(text)
    pareto_max = int(obj.get("pareto_max", 50000))

    return DiscoverySpec(
        dataset=list(dataset_path),
        polyinfo=polyinfo_path,
        polyinfo_csv=polyinfo_csv_path,
        hard_constraints=obj.get("hard_constraints", {}),
        objectives=obj.get("objectives", []),
        max_pool=pareto_max,
        pareto_max=pareto_max,
        max_candidates=int(obj.get("max_candidates", 30)),
        max_pareto_fronts=int(obj.get("max_pareto_fronts", 5)),
        min_distance=float(obj.get("min_distance", 0.30)),
        fingerprint=str(obj.get("fingerprint", "morgan")),
        random_seed=int(obj.get("random_seed", 7)),
        use_canonical_smiles=not bool(obj.get("skip_smiles_canonicalization", True)),
        use_full_data=bool(obj.get("use_full_data", False)),
        trust_weights=obj.get("trust_weights"),
        selection_weights=obj.get("selection_weights"),
    )