POLYMER-PROPERTY / src /lookup.py
sobinalosious92's picture
Upload 297 files
930ea3d verified
from __future__ import annotations
import pandas as pd
import streamlit as st
from rdkit import Chem
from rdkit import RDLogger
RDLogger.DisableLog("rdApp.*")
# ----------------------------
# Sources (property value files)
# ----------------------------
# Source key -> human-readable label. Each key also names a CSV that
# load_source_csv() reads from data/{KEY}.csv.
SOURCE_LABELS = {
    "EXP": "Experimental",
    "MD": "Molecular Dynamics",
    "DFT": "Density Functional Theory",
    "GC": "Group Contribution",
}
# Source keys in declaration order, derived from SOURCE_LABELS so the two
# collections cannot drift apart.
SOURCES = list(SOURCE_LABELS)

# ----------------------------
# PolyInfo metadata file (name/class)
# ----------------------------
POLYINFO_FILE = "data/POLYINFO.csv"  # contains: SMILES, Polymer_Class, Polymer_Name
def canonicalize_smiles(smiles: str) -> str | None:
    """Return the RDKit canonical SMILES for *smiles*.

    Args:
        smiles: Input SMILES string; falsy values are treated as empty.

    Returns:
        The canonical SMILES string, or None when the input is blank
        (after stripping whitespace) or RDKit cannot parse it.
    """
    text = (smiles or "").strip()
    # Only hand non-empty text to RDKit; blank input short-circuits to None.
    molecule = Chem.MolFromSmiles(text) if text else None
    if molecule is None:
        return None
    return Chem.MolToSmiles(molecule, canonical=True)
# --- Property meta (full name + unit) ---
# Maps each lowercase property key (the column name after load_source_csv()
# lowercases it) to its display name and unit string.
# Unit strings use the middle dot "·" (U+00B7) and the angstrom letter "Å"
# (U+00C5); keep this file saved as UTF-8 so they are not re-garbled.
PROPERTY_META = {
    # Thermal
    "tm": {"name": "Melting temperature", "unit": "K"},
    "tg": {"name": "Glass transition temperature", "unit": "K"},
    "td": {"name": "Thermal diffusivity", "unit": "m^2/s"},
    "tc": {"name": "Thermal conductivity", "unit": "W/m·K"},
    "cp": {"name": "Specific heat capacity", "unit": "J/kg·K"},
    # Mechanical
    "young": {"name": "Young's modulus", "unit": "GPa"},
    "shear": {"name": "Shear modulus", "unit": "GPa"},
    "bulk": {"name": "Bulk modulus", "unit": "GPa"},
    "poisson": {"name": "Poisson ratio", "unit": "-"},
    # Transport
    "visc": {"name": "Viscosity", "unit": "Pa·s"},
    "dif": {"name": "Diffusivity", "unit": "cm^2/s"},
    # Gas permeability
    "phe": {"name": "He permeability", "unit": "Barrer"},
    "ph2": {"name": "H2 permeability", "unit": "Barrer"},
    "pco2": {"name": "CO2 permeability", "unit": "Barrer"},
    "pn2": {"name": "N2 permeability", "unit": "Barrer"},
    "po2": {"name": "O2 permeability", "unit": "Barrer"},
    "pch4": {"name": "CH4 permeability", "unit": "Barrer"},
    # Electronic / Optical
    "alpha": {"name": "Polarizability", "unit": "a.u."},
    "homo": {"name": "HOMO energy", "unit": "eV"},
    "lumo": {"name": "LUMO energy", "unit": "eV"},
    "bandgap": {"name": "Band gap", "unit": "eV"},
    "mu": {"name": "Dipole moment", "unit": "Debye"},
    "etotal": {"name": "Total electronic energy", "unit": "eV"},
    "ri": {"name": "Refractive index", "unit": "-"},
    "dc": {"name": "Dielectric constant", "unit": "-"},
    "pe": {"name": "Permittivity", "unit": "-"},
    # Structural / Physical
    "rg": {"name": "Radius of gyration", "unit": "Å"},
    "rho": {"name": "Density", "unit": "g/cm^3"},
}
@st.cache_data
def load_source_csv(source: str) -> pd.DataFrame:
    """Read data/{source}.csv and normalize it for lookup.

    Normalization steps:
      - the SMILES column is renamed to 'smiles'
      - every other column name is lowercased
      - a 'smiles_canon' column is added via canonicalize_smiles()
      - rows whose SMILES could not be canonicalized are dropped

    Args:
        source: Source key used to build the CSV path.

    Returns:
        The normalized DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: If the file has neither a 'SMILES' nor a 'smiles' column.
    """
    csv_path = f"data/{source}.csv"
    frame = pd.read_csv(csv_path)

    # Fail early when no SMILES column is present under either spelling.
    has_upper = "SMILES" in frame.columns
    if not has_upper and "smiles" not in frame.columns:
        raise ValueError(f"{csv_path} missing SMILES column")
    if has_upper:
        frame = frame.rename(columns={"SMILES": "smiles"})

    # Lowercase every property column; 'smiles' is already in final form.
    lowered = {}
    for column in frame.columns:
        if column != "smiles":
            lowered[column] = column.lower()
    frame = frame.rename(columns=lowered)

    # Canonicalize, then keep only rows that produced a canonical SMILES.
    canonical = frame["smiles"].astype(str).apply(canonicalize_smiles)
    frame["smiles_canon"] = canonical
    frame = frame.dropna(subset=["smiles_canon"])
    return frame.reset_index(drop=True)
@st.cache_data
def build_index(df: pd.DataFrame) -> dict[str, int]:
    """Map each canonical SMILES to the position of its first row.

    Args:
        df: DataFrame with a 'smiles_canon' column.

    Returns:
        Dict from canonical SMILES to the positional row index of its first
        occurrence; falsy entries (e.g. None or "") are skipped.
    """
    first_row: dict[str, int] = {}
    for position, canon in enumerate(df["smiles_canon"].tolist()):
        if not canon:
            continue
        # setdefault keeps the earliest position when a SMILES repeats.
        first_row.setdefault(canon, position)
    return first_row
@st.cache_data
def load_polyinfo_csv() -> pd.DataFrame:
    """Load the PolyInfo metadata table from POLYINFO_FILE.

    The file is expected to provide the columns SMILES, Polymer_Class and
    Polymer_Name. These are renamed to 'smiles', 'polymer_class' and
    'polymer_name', and a 'smiles_canon' column is added. Rows whose SMILES
    cannot be canonicalized are dropped.

    Returns:
        The normalized DataFrame. An empty DataFrame with the four expected
        columns is returned when the file cannot be read or has no SMILES
        column.
    """
    fallback_columns = ["smiles", "polymer_class", "polymer_name", "smiles_canon"]

    try:
        meta = pd.read_csv(POLYINFO_FILE)
    except Exception:
        # Best-effort by design: any read failure yields an empty table.
        return pd.DataFrame(columns=fallback_columns)

    # Normalize the SMILES column; without one there is nothing to index.
    if "SMILES" in meta.columns:
        meta = meta.rename(columns={"SMILES": "smiles"})
    elif "smiles" not in meta.columns:
        return pd.DataFrame(columns=fallback_columns)

    # Rename whichever of the expected metadata columns are present.
    expected = {"Polymer_Class": "polymer_class", "Polymer_Name": "polymer_name"}
    present = {old: new for old, new in expected.items() if old in meta.columns}
    meta = meta.rename(columns=present)

    # Guarantee both metadata columns exist, even if absent from the file.
    for column in expected.values():
        if column not in meta.columns:
            meta[column] = pd.NA

    # Canonicalize, then keep only rows that produced a canonical SMILES.
    canonical = meta["smiles"].astype(str).apply(canonicalize_smiles)
    meta["smiles_canon"] = canonical
    meta = meta.dropna(subset=["smiles_canon"])
    return meta.reset_index(drop=True)
@st.cache_data
def load_all_sources():
    """Load every property source plus the PolyInfo metadata table.

    Returns:
        Dict with one entry per key in SOURCES and one "POLYINFO" entry.
        Each entry has the form {"df": DataFrame, "idx": dict}, where "idx"
        is the build_index() mapping for that DataFrame. The "POLYINFO"
        index is an empty dict when its DataFrame is empty.
    """
    database = {}
    for source in SOURCES:
        frame = load_source_csv(source)
        database[source] = {"df": frame, "idx": build_index(frame)}

    # PolyInfo metadata: skip index construction for an empty table.
    polyinfo = load_polyinfo_csv()
    if polyinfo.empty:
        polyinfo_index = {}
    else:
        polyinfo_index = build_index(polyinfo)
    database["POLYINFO"] = {"df": polyinfo, "idx": polyinfo_index}
    return database
def get_value(db, source: str, smiles_canon: str, prop_key: str):
    """Look up one property value for a polymer in one source.

    Args:
        db: Mapping of source key to {"df": DataFrame, "idx": dict}.
        source: Key of the source to query; must be present in *db*.
        smiles_canon: Canonical SMILES used as the index key.
        prop_key: Column name of the property.

    Returns:
        The value as a float, or None when the SMILES is not indexed, the
        column does not exist, or the stored value is NA.

    Raises:
        KeyError: If *source* is not a key of *db*.
    """
    entry = db[source]
    frame = entry["df"]
    position = entry["idx"].get(smiles_canon)
    if position is None or prop_key not in frame.columns:
        return None
    raw = frame.iloc[position][prop_key]
    return None if pd.isna(raw) else float(raw)
def get_polyinfo(db, smiles_canon: str) -> tuple[str | None, str | None]:
    """Return (polymer_name, polymer_class) for a canonical SMILES.

    Either element is None when it is unavailable: no "POLYINFO" entry in
    *db*, an empty table, an unknown SMILES, or an NA / blank cell. No
    placeholder text such as 'not available' is produced here.

    Args:
        db: Mapping that may contain a "POLYINFO" entry of the form
            {"df": DataFrame, "idx": dict}.
        smiles_canon: Canonical SMILES used as the index key.

    Returns:
        Tuple of (name, class), each a stripped string or None.
    """

    def _clean(raw):
        # NA and whitespace-only cells both count as "no value".
        if pd.isna(raw):
            return None
        text = str(raw).strip()
        return text or None

    entry = db.get("POLYINFO")
    if entry is None:
        return None, None

    frame, index = entry["df"], entry["idx"]
    if frame is None or frame.empty:
        return None, None

    position = index.get(smiles_canon)
    if position is None:
        return None, None

    record = frame.iloc[position]
    polymer_name = _clean(record.get("polymer_name", None))
    polymer_class = _clean(record.get("polymer_class", None))
    return polymer_name, polymer_class