from __future__ import annotations

import pandas as pd
import streamlit as st
from rdkit import Chem
from rdkit import RDLogger

RDLogger.DisableLog("rdApp.*")

# ----------------------------
# Sources (property value files)
# ----------------------------
SOURCES = ["EXP", "MD", "DFT", "GC"]

SOURCE_LABELS = {
    "EXP": "Experimental",
    "MD": "Molecular Dynamics",
    "DFT": "Density Functional Theory",
    "GC": "Group Contribution",
}

# ----------------------------
# PolyInfo metadata file (name/class)
# ----------------------------
POLYINFO_FILE = "data/POLYINFO.csv"  # contains: SMILES, Polymer_Class, Polymer_Name

# Column layout of the (possibly empty) PolyInfo frame returned by load_polyinfo_csv.
_POLYINFO_COLUMNS = ["smiles", "polymer_class", "polymer_name", "smiles_canon"]


def canonicalize_smiles(smiles: str) -> str | None:
    """Return the RDKit canonical SMILES for *smiles*.

    Returns None when the input is not a string, is empty/whitespace-only,
    or cannot be parsed by RDKit.
    """
    # Non-string input (e.g. None or a float NaN from pandas) has no SMILES.
    if not isinstance(smiles, str):
        return None
    smiles = smiles.strip()
    if not smiles:
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return Chem.MolToSmiles(mol, canonical=True)


# --- Property meta (full name + unit) ---
PROPERTY_META = {
    # Thermal
    "tm": {"name": "Melting temperature", "unit": "K"},
    "tg": {"name": "Glass transition temperature", "unit": "K"},
    "td": {"name": "Thermal diffusivity", "unit": "m^2/s"},
    "tc": {"name": "Thermal conductivity", "unit": "W/m·K"},
    "cp": {"name": "Specific heat capacity", "unit": "J/kg·K"},
    # Mechanical
    "young": {"name": "Young's modulus", "unit": "GPa"},
    "shear": {"name": "Shear modulus", "unit": "GPa"},
    "bulk": {"name": "Bulk modulus", "unit": "GPa"},
    "poisson": {"name": "Poisson ratio", "unit": "-"},
    # Transport
    "visc": {"name": "Viscosity", "unit": "Pa·s"},
    "dif": {"name": "Diffusivity", "unit": "cm^2/s"},
    # Gas permeability
    "phe": {"name": "He permeability", "unit": "Barrer"},
    "ph2": {"name": "H2 permeability", "unit": "Barrer"},
    "pco2": {"name": "CO2 permeability", "unit": "Barrer"},
    "pn2": {"name": "N2 permeability", "unit": "Barrer"},
    "po2": {"name": "O2 permeability", "unit": "Barrer"},
    "pch4": {"name": "CH4 permeability", "unit": "Barrer"},
    # Electronic / Optical
    "alpha": {"name": "Polarizability", "unit": "a.u."},
    "homo": {"name": "HOMO energy", "unit": "eV"},
    "lumo": {"name": "LUMO energy", "unit": "eV"},
    "bandgap": {"name": "Band gap", "unit": "eV"},
    "mu": {"name": "Dipole moment", "unit": "Debye"},
    "etotal": {"name": "Total electronic energy", "unit": "eV"},
    "ri": {"name": "Refractive index", "unit": "-"},
    "dc": {"name": "Dielectric constant", "unit": "-"},
    "pe": {"name": "Permittivity", "unit": "-"},
    # Structural / Physical
    "rg": {"name": "Radius of gyration", "unit": "Å"},
    "rho": {"name": "Density", "unit": "g/cm^3"},
}


def _with_canonical_smiles(df: pd.DataFrame) -> pd.DataFrame:
    """Add a 'smiles_canon' column and drop rows whose SMILES did not canonicalize.

    The index is reset so that row positions and index labels coincide.
    """
    df["smiles_canon"] = df["smiles"].astype(str).apply(canonicalize_smiles)
    return df.dropna(subset=["smiles_canon"]).reset_index(drop=True)


def _empty_polyinfo() -> pd.DataFrame:
    """Return an empty PolyInfo frame with the expected columns."""
    return pd.DataFrame(columns=_POLYINFO_COLUMNS)


@st.cache_data
def load_source_csv(source: str) -> pd.DataFrame:
    """
    Loads data/{SOURCE}.csv, normalizes:
      - SMILES column -> 'smiles'
      - property columns -> lowercase
      - adds 'smiles_canon'

    Raises ValueError if the file has neither a 'SMILES' nor a 'smiles' column.
    Rows whose SMILES cannot be canonicalized are dropped.
    """
    path = f"data/{source}.csv"
    df = pd.read_csv(path)

    # Normalize SMILES column name
    if "SMILES" in df.columns:
        df = df.rename(columns={"SMILES": "smiles"})
    elif "smiles" not in df.columns:
        raise ValueError(f"{path} missing SMILES column")

    # Normalize property column names to lowercase
    rename_map = {c: c.lower() for c in df.columns if c != "smiles"}
    df = df.rename(columns=rename_map)

    # Canonicalize SMILES
    return _with_canonical_smiles(df)


@st.cache_data
def build_index(df: pd.DataFrame) -> dict[str, int]:
    """canonical smiles -> row index (first occurrence)"""
    idx: dict[str, int] = {}
    for i, s in enumerate(df["smiles_canon"].tolist()):
        # setdefault keeps the first position seen for a duplicated SMILES.
        if s:
            idx.setdefault(s, i)
    return idx


@st.cache_data
def load_polyinfo_csv() -> pd.DataFrame:
    """
    Loads data/POLYINFO.csv with columns:
      SMILES, Polymer_Class, Polymer_Name
    Adds canonical smiles column 'smiles_canon'.

    Returns an empty df (with the expected columns) if the file is missing,
    unreadable, unparsable, or has no SMILES column.
    """
    try:
        df = pd.read_csv(POLYINFO_FILE)
    except (
        OSError,
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
        UnicodeDecodeError,
    ):
        # The metadata file is optional: any read/parse failure degrades to
        # "no metadata" instead of breaking the caller.
        return _empty_polyinfo()

    # Normalize columns
    if "SMILES" in df.columns:
        df = df.rename(columns={"SMILES": "smiles"})
    elif "smiles" not in df.columns:
        # If the file doesn't have a SMILES column as expected, return empty gracefully
        return _empty_polyinfo()

    # Normalize expected meta columns
    ren = {
        old: new
        for old, new in (
            ("Polymer_Class", "polymer_class"),
            ("Polymer_Name", "polymer_name"),
        )
        if old in df.columns
    }
    df = df.rename(columns=ren)

    # Ensure the columns exist (even if missing in the file)
    for col in ("polymer_class", "polymer_name"):
        if col not in df.columns:
            df[col] = pd.NA

    # Canonicalize smiles
    return _with_canonical_smiles(df)


@st.cache_data
def load_all_sources() -> dict[str, dict]:
    """
    Returns dict:
      db["EXP"/"MD"/"DFT"/"GC"] = {"df": df, "idx": idx}
      db["POLYINFO"] = {"df": df, "idx": idx}
    """
    db = {}
    for src in SOURCES:
        df = load_source_csv(src)
        db[src] = {"df": df, "idx": build_index(df)}

    # PolyInfo metadata
    pi_df = load_polyinfo_csv()
    pi_idx = build_index(pi_df) if not pi_df.empty else {}
    db["POLYINFO"] = {"df": pi_df, "idx": pi_idx}

    return db


def get_value(db, source: str, smiles_canon: str, prop_key: str) -> float | None:
    """Look up one property value for a canonical SMILES in one source.

    Returns the value as a float, or None when the SMILES is not indexed in
    that source, the property column does not exist, the cell is NA, or the
    cell cannot be converted to a float.

    Raises KeyError if *source* is not a key of *db*.
    """
    pack = db[source]
    df, idx = pack["df"], pack["idx"]

    row_i = idx.get(smiles_canon)
    if row_i is None or prop_key not in df.columns:
        return None

    # Read the single cell by position rather than building a whole row Series.
    val = df[prop_key].iat[row_i]
    if pd.isna(val):
        return None
    try:
        return float(val)
    except (TypeError, ValueError):
        # A non-numeric cell is treated as "no value" rather than an error.
        return None


def _clean_text(value) -> str | None:
    """Return *value* as a stripped string, or None if it is NA or blank."""
    if pd.isna(value):
        return None
    text = str(value).strip()
    return text or None


def get_polyinfo(db, smiles_canon: str) -> tuple[str | None, str | None]:
    """
    Returns (polymer_name, polymer_class) if available, else (None, None).
    No 'not available' text here.
    """
    pack = db.get("POLYINFO", None)
    if pack is None:
        return None, None

    df, idx = pack["df"], pack["idx"]
    if df is None or df.empty:
        return None, None

    row_i = idx.get(smiles_canon, None)
    if row_i is None:
        return None, None

    # Fetch the row once and read both metadata fields from it.
    row = df.iloc[row_i]
    name = _clean_text(row.get("polymer_name", None))
    cls = _clean_text(row.get("polymer_class", None))
    return name, cls