Spaces:

SalZa2004
/

MoleculeGenerator

Build error

App Files Community

SalZa2004 committed on Jan 4

Commit

da421be

1 Parent(s): 9250f3b

new structure

Browse files

Files changed (25) hide show

applications/3_molecule_generator/__pycache__/__init__.cpython-310.pyc +0 -0
applications/3_molecule_generator/__pycache__/cli.cpython-310.pyc +0 -0
applications/3_molecule_generator/__pycache__/main.cpython-310.pyc +0 -0
applications/3_molecule_generator/__pycache__/results.cpython-310.pyc +0 -0
applications/__pycache__/__init__.cpython-310.pyc +0 -0
core/__pycache__/__init__.cpython-310.pyc +0 -0
core/__pycache__/config.cpython-310.pyc +0 -0
core/__pycache__/data_prep.cpython-310.pyc +0 -0
{src → core}/__pycache__/shared_features.cpython-310.pyc +0 -0
core/evolution/__pycache__/__init__.cpython-310.pyc +0 -0
core/evolution/__pycache__/evolution.cpython-310.pyc +0 -0
core/evolution/__pycache__/molecule.cpython-310.pyc +0 -0
core/evolution/__pycache__/population.cpython-310.pyc +0 -0
core/predictors/__pycache__/__init__.cpython-310.pyc +0 -0
core/predictors/pure_component/__pycache__/generic.cpython-310.pyc +0 -0
core/predictors/pure_component/__pycache__/hf_models.cpython-310.pyc +0 -0
core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc +0 -0
src/__pycache__/data_prep.cpython-310.pyc +0 -0
src/data_prep.py +0 -36
src/database_main.db +0 -3
src/diesel_fragments.db +0 -3
src/main.py +0 -704
src/model_config.py +0 -53
src/shared_features.py +0 -233
src/streamlit_app.py +0 -161

applications/3_molecule_generator/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (143 Bytes). View file

applications/3_molecule_generator/__pycache__/cli.cpython-310.pyc ADDED Viewed

Binary file (1.76 kB). View file

applications/3_molecule_generator/__pycache__/main.cpython-310.pyc ADDED Viewed

Binary file (918 Bytes). View file

applications/3_molecule_generator/__pycache__/results.cpython-310.pyc ADDED Viewed

Binary file (1.85 kB). View file

applications/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (122 Bytes). View file

core/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (138 Bytes). View file

core/__pycache__/config.cpython-310.pyc ADDED Viewed

Binary file (1.25 kB). View file

core/__pycache__/data_prep.cpython-310.pyc ADDED Viewed

Binary file (1 kB). View file

{src → core}/__pycache__/shared_features.cpython-310.pyc RENAMED Viewed

Binary files a/src/__pycache__/shared_features.cpython-310.pyc and b/core/__pycache__/shared_features.cpython-310.pyc differ

core/evolution/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (148 Bytes). View file

core/evolution/__pycache__/evolution.cpython-310.pyc ADDED Viewed

Binary file (7.88 kB). View file

core/evolution/__pycache__/molecule.cpython-310.pyc ADDED Viewed

Binary file (1.57 kB). View file

core/evolution/__pycache__/population.cpython-310.pyc ADDED Viewed

Binary file (3.7 kB). View file

core/predictors/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (149 Bytes). View file

core/predictors/pure_component/__pycache__/generic.cpython-310.pyc ADDED Viewed

Binary file (1.68 kB). View file

core/predictors/pure_component/__pycache__/hf_models.cpython-310.pyc ADDED Viewed

Binary file (866 Bytes). View file

core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc ADDED Viewed

Binary file (3.31 kB). View file

src/__pycache__/data_prep.cpython-310.pyc DELETED Viewed

Binary file (1.14 kB)

src/data_prep.py DELETED Viewed

@@ -1,36 +0,0 @@
-import os
-import sqlite3
-import pandas as pd
-from sklearn.model_selection import train_test_split
-import os
-BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-DB_PATH = os.path.join(BASE_DIR, "database_main.db")
-TARGET_CN = "cn"      # Cetane number
-N_FOLDS = 5
-TOP_K = 5
-print("Connecting to SQLite database...")
-conn = sqlite3.connect(DB_PATH)
-cursor = conn.cursor()
-cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
-print(cursor.fetchall())
-query = """
-SELECT
-    F.Fuel_Name,
-    F.SMILES,
-    T.Standardised_DCN AS cn
-FROM FUEL F
-LEFT JOIN TARGET T ON F.fuel_id = T.fuel_id
-"""
-df = pd.read_sql_query(query, conn)
-conn.close()
-df.dropna(subset=[TARGET_CN, "SMILES"], inplace=True)
-train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
-print(df.head())
-print(df.columns)
-def load_data():
-    return df

src/database_main.db DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b14779692bb401ac9fc714a3aa8919d4e14f75aef9f92c6004195d89102ebcff
-size 344064

src/diesel_fragments.db DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e76b070ca56ecaaf083602224e59dbff6d5f94c43960e139643c52d93472acb
-size 10002432

src/main.py DELETED Viewed

@@ -1,704 +0,0 @@
-import os
-import sys
-from pathlib import Path
-from dataclasses import dataclass, asdict, field
-from typing import List, Dict, Optional, Tuple, Callable
-import joblib
-import numpy as np
-import pandas as pd
-import random
-from rdkit import Chem
-from crem.crem import mutate_mol
-from sklearn.base import BaseEstimator, RegressorMixin
-from joblib import Parallel, delayed
-from tqdm import tqdm
-from huggingface_hub import snapshot_download
-HF_MODEL_REPOS = {
-    "cn": "SalZa2004/Cetane_Number_Predictor",
-    "ysi": "SalZa2004/YSI_Predictor",
-    "density": "SalZa2004/Density_Predictor",
-    "lhv": "SalZa2004/LHV_Predictor",
-    "dynamic_viscosity": "SalZa2004/Dynamic_Viscosity_Predictor",
-    "bp": "SalZa2004/Boiling_Point_Predictor",
-}
-def get_hf_model_dir(repo_id: str) -> Path:
-    """
-    Download a Hugging Face model repo and return local path.
-    Uses HF cache automatically.
-    """
-    local_dir = snapshot_download(
-        repo_id=repo_id,
-        repo_type="model",
-        local_dir=None,          # use HF cache
-        local_dir_use_symlinks=True
-    )
-    return Path(local_dir)
-# === Project Setup ===
-PROJECT_ROOT = Path.cwd()
-SRC_DIR = PROJECT_ROOT / "src"
-sys.path.append(str(PROJECT_ROOT))
-INITIAL_PRED_CACHE = PROJECT_ROOT / "cache" / "initial_predictions.parquet"
-from shared_features import FeatureSelector, featurize_df
-from data_prep import df
-class GenericPredictor:
-    """Generic predictor that works for any property model."""
-    def __init__(self, model_dir: Path, property_name: str):
-        self.property_name = property_name
-        model_path =  model_dir / "model.joblib"
-        selector_path = model_dir / "selector.joblib"
-        if not model_path.exists():
-            raise FileNotFoundError(f"Missing model.joblib in {model_dir}")
-        if not selector_path.exists():
-            raise FileNotFoundError(f"Missing selector.joblib in {model_dir}")
-        self.model = joblib.load(model_path)
-        self.selector = FeatureSelector.load(selector_path)
-        print(f"✓ Loaded {property_name} predictor")
-    def predict(self, smiles_list):
-        """Inference on a list of SMILES strings."""
-        if isinstance(smiles_list, str):
-            smiles_list = [smiles_list]
-        X_full = featurize_df(smiles_list, return_df=False)
-        if X_full is None:
-            print(f"⚠ Warning: No valid molecules found for {self.property_name}!")
-            return []
-        X_selected = self.selector.transform(X_full)
-        predictions = self.model.predict(X_selected)
-        return predictions.tolist()
-    def predict_with_details(self, smiles_list):
-        """Inference with valid/invalid info."""
-        if isinstance(smiles_list, str):
-            smiles_list = [smiles_list]
-        df = pd.DataFrame({"SMILES": smiles_list})
-        X_full, df_valid = featurize_df(df, return_df=True)
-        col_name = f"Predicted_{self.property_name}"
-        if X_full is None:
-            return pd.DataFrame(columns=["SMILES", col_name, "Valid"])
-        X_selected = self.selector.transform(X_full)
-        predictions = self.model.predict(X_selected)
-        df_valid[col_name] = predictions
-        df_valid["Valid"] = True
-        all_results = pd.DataFrame({"SMILES": smiles_list})
-        all_results = all_results.merge(
-            df_valid[["SMILES", col_name, "Valid"]],
-            on="SMILES", how="left"
-        )
-        all_results["Valid"] = all_results["Valid"].fillna(False)
-        return all_results
-@dataclass
-class EvolutionConfig:
-    """Configuration for evolutionary algorithm."""
-    target_cn: float
-    minimize_ysi: bool = True
-    generations: int = 6
-    population_size: int = 50
-    mutations_per_parent: int = 5
-    survivor_fraction: float = 0.5
-    min_bp: float = 60
-    max_bp: float = 250
-    min_dynamic_viscosity: float = 0.0
-    max_dynamic_viscosity: float = 2.0
-    min_density: float = 720
-    min_lhv: float = 30
-    use_bp_filter: bool = True
-    use_density_filter: bool = True
-    use_lhv_filter: bool = True
-    use_dynamic_viscosity_filter: bool = True
-    batch_size: int = 200  # Increased default for better throughput
-    max_offspring_attempts: int = 10
-    n_jobs: int = -1  # Number of parallel jobs for mutation (-1 = all cores)
-    def __post_init__(self):
-        """Validate configuration parameters."""
-        if self.target_cn < 0:
-            raise ValueError("target_cn must be positive")
-        if not 0 < self.survivor_fraction < 1:
-            raise ValueError("survivor_fraction must be between 0 and 1")
-        if self.min_bp >= self.max_bp:
-            raise ValueError("min_bp must be less than max_bp")
-        if self.population_size < 2:
-            raise ValueError("population_size must be at least 2")
-@dataclass
-class Molecule:
-    """Represents a molecule with its properties."""
-    smiles: str
-    cn: float
-    cn_error: float
-    bp: Optional[float] = None
-    ysi: Optional[float] = None
-    density: Optional[float] = None
-    lhv: Optional[float] = None
-    dynamic_viscosity: Optional[float] = None
-    _mol_cache: Optional[Chem.Mol] = field(default=None, repr=False, compare=False)
-    def get_mol(self) -> Optional[Chem.Mol]:
-        """Get cached RDKit Mol object to avoid repeated conversions."""
-        if self._mol_cache is None:
-            self._mol_cache = Chem.MolFromSmiles(self.smiles)
-        return self._mol_cache
-    def dominates(self, other: 'Molecule') -> bool:
-        """Check if this molecule Pareto-dominates another."""
-        better_cn = self.cn_error <= other.cn_error
-        better_ysi = self.ysi <= other.ysi if self.ysi is not None else True
-        strictly_better = (self.cn_error < other.cn_error or
-                          (self.ysi is not None and self.ysi < other.ysi))
-        return better_cn and better_ysi and strictly_better
-    def to_dict(self) -> Dict:
-        """Convert to dictionary for DataFrame creation."""
-        return {k: v for k, v in asdict(self).items()
-                if v is not None and k != '_mol_cache'}
-class PropertyPredictor:
-    """Handles batch prediction for all molecular properties with caching."""
-    def __init__(self, config: EvolutionConfig):
-        self.config = config
-        self.predictors = {}
-        self.prediction_cache = {}
-        # --- Always load CN ---
-        cn_dir = get_hf_model_dir(HF_MODEL_REPOS["cn"])
-        self.predictors["cn"] = GenericPredictor(
-            cn_dir,
-            "Cetane Number"
-        )
-        # --- Optional predictors ---
-        if config.minimize_ysi:
-            ysi_dir = get_hf_model_dir(HF_MODEL_REPOS["ysi"])
-            self.predictors["ysi"] = GenericPredictor(
-                ysi_dir,
-                "YSI"
-            )
-        if config.use_bp_filter:
-            bp_dir = get_hf_model_dir(HF_MODEL_REPOS["bp"])
-            self.predictors["bp"] = GenericPredictor(
-                bp_dir,
-                "Boiling Point"
-            )
-        if config.use_density_filter:
-            density_dir = get_hf_model_dir(HF_MODEL_REPOS["density"])
-            self.predictors["density"] = GenericPredictor(
-                density_dir,
-                "Density"
-            )
-        if config.use_lhv_filter:
-            lhv_dir = get_hf_model_dir(HF_MODEL_REPOS["lhv"])
-            self.predictors["lhv"] = GenericPredictor(
-                lhv_dir,
-                "Lower Heating Value"
-            )
-        if config.use_dynamic_viscosity_filter:
-            dv_dir = get_hf_model_dir(HF_MODEL_REPOS["dynamic_viscosity"])
-            self.predictors["dynamic_viscosity"] = GenericPredictor(
-                dv_dir,
-                "Dynamic Viscosity"
-            )
-    def _safe_predict(self, predictions: List) -> List[Optional[float]]:
-        """Safely convert predictions, handling None/NaN/inf values."""
-        return [
-            float(pred) if pred is not None and np.isfinite(pred) else None
-            for pred in predictions
-        ]
-    def _predict_batch(self, property_name: str, smiles_list: List[str]) -> List[Optional[float]]:
-        """Generic batch prediction method."""
-        predictor = self.predictors.get(property_name)
-        if not smiles_list or predictor is None:
-            return [None] * len(smiles_list)
-        try:
-            predictions = predictor.predict(smiles_list)
-            return self._safe_predict(predictions)
-        except Exception as e:
-            print(f"⚠️  Warning: {property_name.upper()} prediction failed: {e}")
-            return [None] * len(smiles_list)
-    def predict_all_properties(self, smiles_list: List[str]) -> Dict[str, List[Optional[float]]]:
-        if not smiles_list:
-            return {}
-        # --- ONE featurization ---
-        X_full = featurize_df(smiles_list, return_df=False)
-        if X_full is None:
-            return {}
-        results = {}
-        for prop, predictor in self.predictors.items():
-            try:
-                X_sel = predictor.selector.transform(X_full)
-                preds = predictor.model.predict(X_sel)
-                results[prop] = self._safe_predict(preds)
-            except Exception as e:
-                print(f"⚠️ {prop} prediction failed: {e}")
-                results[prop] = [None] * len(smiles_list)
-        return results
-class Population:
-    """Manages the population of molecules."""
-    def __init__(self, config: EvolutionConfig):
-        self.config = config
-        self.molecules: List[Molecule] = []
-        self.seen_smiles: set = set()
-    def add_molecule(self, mol: Molecule) -> bool:
-        """Add a molecule if it's not already in the population."""
-        if mol.smiles in self.seen_smiles:
-            return False
-        self.molecules.append(mol)
-        self.seen_smiles.add(mol.smiles)
-        return True
-    def add_molecules(self, molecules: List[Molecule]) -> int:
-        """Add multiple molecules, return count added."""
-        return sum(self.add_molecule(mol) for mol in molecules)
-    def pareto_front(self) -> List[Molecule]:
-        """Extract Pareto front using optimized vectorized operations."""
-        if not self.config.minimize_ysi:
-            return []
-        n = len(self.molecules)
-        if n == 0:
-            return []
-        # Create numpy arrays for vectorized operations
-        cn_errors = np.array([m.cn_error for m in self.molecules])
-        ysis = np.array([m.ysi for m in self.molecules])
-        # Vectorized dominance check
-        is_pareto = np.ones(n, dtype=bool)
-        for i in range(n):
-            if not is_pareto[i]:
-                continue
-            # Check if any other point dominates point i
-            dominates_i = (
-                (cn_errors <= cn_errors[i]) &
-                (ysis <= ysis[i]) &
-                ((cn_errors < cn_errors[i]) | (ysis < ysis[i]))
-            )
-            dominates_i[i] = False  # Don't compare with itself
-            is_pareto[i] = not np.any(dominates_i)
-        return [self.molecules[i] for i in np.where(is_pareto)[0]]
-    def get_survivors(self) -> List[Molecule]:
-        """Select survivors for the next generation."""
-        target_size = int(self.config.population_size * self.config.survivor_fraction)
-        if self.config.minimize_ysi:
-            survivors = self.pareto_front()
-            # Sort key for combined objectives
-            sort_key = lambda m: m.cn_error + m.ysi
-            if len(survivors) > target_size:
-                survivors = sorted(survivors, key=sort_key)[:target_size]
-            elif len(survivors) < target_size:
-                remainder = [m for m in self.molecules if m not in survivors]
-                remainder = sorted(remainder, key=sort_key)
-                survivors.extend(remainder[:target_size - len(survivors)])
-        else:
-            survivors = sorted(self.molecules, key=lambda m: m.cn_error)[:target_size]
-        return survivors
-    def to_dataframe(self) -> pd.DataFrame:
-        """Convert population to DataFrame."""
-        df = pd.DataFrame([m.to_dict() for m in self.molecules])
-        sort_cols = ["cn_error", "ysi"] if self.config.minimize_ysi else ["cn_error"]
-        df = df.sort_values(sort_cols, ascending=True)
-        df.insert(0, 'rank', range(1, len(df) + 1))
-        return df
-class MolecularEvolution:
-    """Main evolutionary algorithm coordinator with optimized performance."""
-    REP_DB_PATH = "diesel_fragments.db"
-    def __init__(self, config: EvolutionConfig):
-        self.config = config
-        self.predictor = PropertyPredictor(config)
-        self.population = Population(config)
-    def _mutate_molecule(self, mol: Chem.Mol) -> List[str]:
-        """Generate mutations for a molecule using CREM with set-based deduplication."""
-        try:
-            mutants = set(mutate_mol(
-                mol,
-                db_name=str(self.REP_DB_PATH),
-                max_size=2,
-                return_mol=False
-            ))
-            # Single set operation instead of list comprehension
-            return list(mutants - self.population.seen_smiles)
-        except Exception:
-            return []
-    def _create_molecules(self, smiles_list: List[str]) -> List[Molecule]:
-        """Create Molecule objects from SMILES with vectorized validation."""
-        if not smiles_list:
-            return []
-        # Get all predictions at once
-        predictions = self.predictor.predict_all_properties(smiles_list)
-        # Vectorized validation using numpy
-        n = len(smiles_list)
-        cn_vals = np.array(predictions.get('cn', [None] * n))
-        valid_mask = ~np.isnan(cn_vals)  # Start with CN validity
-        if self.config.minimize_ysi:
-            ysi_vals = np.array(predictions.get('ysi', [None] * n))
-            valid_mask &= ~np.isnan(ysi_vals)
-        # Vectorized constraint checking
-        if 'bp' in predictions and self.config.use_bp_filter:
-            bp_vals = np.array([v if v is not None else np.nan for v in predictions['bp']])
-            valid_mask &= (bp_vals >= self.config.min_bp) & (bp_vals <= self.config.max_bp)
-        if 'density' in predictions and self.config.use_density_filter:
-            density_vals = np.array([v if v is not None else np.nan for v in predictions['density']])
-            valid_mask &= (density_vals > self.config.min_density)
-        if 'lhv' in predictions and self.config.use_lhv_filter:
-            lhv_vals = np.array([v if v is not None else np.nan for v in predictions['lhv']])
-            valid_mask &= (lhv_vals > self.config.min_lhv)
-        if 'dynamic_viscosity' in predictions and self.config.use_dynamic_viscosity_filter:
-            dv_vals = np.array([v if v is not None else np.nan for v in predictions['dynamic_viscosity']])
-            valid_mask &= (dv_vals > self.config.min_dynamic_viscosity) & (dv_vals <= self.config.max_dynamic_viscosity)
-        # Create molecules only for valid indices
-        molecules = []
-        for i in np.where(valid_mask)[0]:
-            molecules.append(Molecule(
-                smiles=smiles_list[i],
-                cn=predictions['cn'][i],
-                cn_error=abs(predictions['cn'][i] - self.config.target_cn),
-                bp=predictions.get('bp', [None]*n)[i],
-                ysi=predictions.get('ysi', [None]*n)[i],
-                density=predictions.get('density', [None]*n)[i],
-                lhv=predictions.get('lhv', [None]*n)[i],
-                dynamic_viscosity=predictions.get('dynamic_viscosity', [None]*n)[i]
-            ))
-        return molecules
-    def initialize_population(self, initial_smiles: List[str]) -> int:
-        cache_path = INITIAL_PRED_CACHE
-        cache_path.parent.mkdir(exist_ok=True)
-        if cache_path.exists():
-            print("✓ Loading cached initial predictions")
-            df_pred = pd.read_parquet(cache_path)
-        else:
-            print("Predicting properties for initial population (cached)...")
-            predictions = self.predictor.predict_all_properties(initial_smiles)
-            df_pred = pd.DataFrame({
-                "smiles": initial_smiles,
-                **predictions
-            })
-            df_pred.to_parquet(cache_path)
-        # --- Apply constraints & build Molecules ---
-        molecules = []
-        for _, row in df_pred.iterrows():
-            if row["cn"] is None:
-                continue
-            if self.config.minimize_ysi and pd.isna(row.get("ysi")):
-                continue
-            if self.config.use_bp_filter:
-                if not (self.config.min_bp <= row["bp"] <= self.config.max_bp):
-                    continue
-            if self.config.use_density_filter:
-                if row["density"] <= self.config.min_density:
-                    continue
-            if self.config.use_lhv_filter:
-                if row["lhv"] <= self.config.min_lhv:
-                    continue
-            if self.config.use_dynamic_viscosity_filter:
-                if not (
-                    self.config.min_dynamic_viscosity
-                    < row["dynamic_viscosity"]
-                    <= self.config.max_dynamic_viscosity
-                ):
-                    continue
-            molecules.append(
-                Molecule(
-                    smiles=row["smiles"],
-                    cn=row["cn"],
-                    cn_error=abs(row["cn"] - self.config.target_cn),
-                    bp=row["bp"],
-                    ysi=row.get("ysi"),
-                    density=row["density"],
-                    lhv=row["lhv"],
-                    dynamic_viscosity=row["dynamic_viscosity"]
-                )
-            )
-        return self.population.add_molecules(molecules)
-    def _log_generation_stats(self, generation: int):
-        """Log statistics for the current generation."""
-        mols = self.population.molecules
-        best_cn = min(mols, key=lambda m: m.cn_error)
-        avg_cn_err = np.mean([m.cn_error for m in mols])
-        log_dict = {
-            "generation": generation,
-            "best_cn_error": best_cn.cn_error,
-            "population_size": len(mols),
-            "avg_cn_error": avg_cn_err,
-        }
-        print_msg = (f"Gen {generation}/{self.config.generations} | "
-                    f"Pop {len(mols)} | "
-                    f"Best CN err: {best_cn.cn_error:.3f} | "
-                    f"Avg CN err: {avg_cn_err:.3f}")
-        if self.config.minimize_ysi:
-            front = self.population.pareto_front()
-            best_ysi = min(mols, key=lambda m: m.ysi)
-            avg_ysi = np.mean([m.ysi for m in mols])
-            log_dict.update({
-                "best_ysi": best_ysi.ysi,
-                "pareto_size": len(front),
-                "avg_ysi": avg_ysi,
-            })
-            print_msg += (f" | Best YSI: {best_ysi.ysi:.3f} | "
-                         f"Avg YSI: {avg_ysi:.3f} | "
-                         f"Pareto size: {len(front)}")
-        print(print_msg)
-    def _generate_offspring(self, survivors: List[Molecule]) -> List[Molecule]:
-        """Generate offspring from survivors with parallel mutation."""
-        target_count = self.config.population_size - len(survivors)
-        max_attempts = target_count * self.config.max_offspring_attempts
-        # Generate parent pool
-        parents = [random.choice(survivors) for _ in range(max_attempts)]
-        parent_mols = [p.get_mol() for p in parents]  # Use cached Mol objects
-        parent_mols = [m for m in parent_mols if m is not None]
-        # Parallel mutation generation
-        print(f"  → Generating mutations in parallel ({len(parent_mols)} parents)...")
-        all_children_nested = Parallel(n_jobs=self.config.n_jobs, batch_size=10)(
-            delayed(self._mutate_molecule)(mol) for mol in parent_mols
-        )
-        # Flatten and limit
-        all_children = [child for children in all_children_nested for child in children]
-        all_children = all_children[:target_count * 3]  # Reasonable limit
-        # Batch evaluation
-        if all_children:
-            print(f"  → Evaluating {len(all_children)} offspring...")
-            new_molecules = self._create_molecules(all_children)
-            all_children.clear()  # Explicit memory cleanup
-            return new_molecules
-        return []
-    def _run_evolution_loop(self):
-        """Run the main evolution loop with progress tracking."""
-        for gen in tqdm(range(1, self.config.generations + 1), desc="Evolution"):
-            self._log_generation_stats(gen)
-            survivors = self.population.get_survivors()
-            offspring = self._generate_offspring(survivors)
-            # Create new population
-            new_pop = Population(self.config)
-            new_pop.add_molecules(survivors + offspring)
-            self.population = new_pop
-    def _generate_results(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
-        """Generate final results DataFrames."""
-        final_df = self.population.to_dataframe()
-        if self.config.minimize_ysi and "ysi" in final_df.columns:
-            final_df = final_df[
-                (final_df["cn_error"] < 5) &
-                (final_df["ysi"] < 50)
-            ].sort_values(["cn_error", "ysi"], ascending=True)
-            # overwrite rank safely
-            final_df["rank"] = range(1, len(final_df) + 1)
-        if self.config.minimize_ysi:
-            pareto_mols = self.population.pareto_front()
-            pareto_df = pd.DataFrame([m.to_dict() for m in pareto_mols])
-            if not pareto_df.empty:
-                pareto_df = pareto_df[
-                    (pareto_df['cn_error'] < 5) & (pareto_df['ysi'] < 50)
-                ].sort_values(["cn_error", "ysi"], ascending=True)
-                pareto_df.insert(0, 'rank', range(1, len(pareto_df) + 1))
-        else:
-            pareto_df = pd.DataFrame()
-        return final_df, pareto_df
-    def evolve(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
-        """Run the evolutionary algorithm."""
-        # Initialize
-        init_smiles = df["SMILES"].sample(200, random_state=42).tolist()
-        init_count = self.initialize_population(init_smiles)
-        if init_count == 0:
-            print("❌ No valid initial molecules")
-            return pd.DataFrame(), pd.DataFrame()
-        print(f"✓ Initial population size: {init_count}")
-        # Evolution
-        self._run_evolution_loop()
-        # Results
-        return self._generate_results()
-def get_user_config() -> EvolutionConfig:
-    """Get configuration from user input."""
-    print("\n" + "="*70)
-    print("MOLECULAR EVOLUTION WITH GENETIC ALGORITHM (OPTIMIZED)")
-    print("="*70)
-    while True:
-        target = float(input("Enter target CN: ") or "50")
-        if target > 40:
-            break
-        print("⚠️  Target CN is too low, optimization may be challenging.")
-        print("Consider using a higher target CN for better results.\n")
-    minimize_ysi = input("Minimise YSI (y/n): ").strip().lower() in ['y', 'yes']
-    return EvolutionConfig(target_cn=target, minimize_ysi=minimize_ysi)
-def save_results(final_df: pd.DataFrame, pareto_df: pd.DataFrame, minimize_ysi: bool):
-    """Save results to CSV files."""
-    results_dir = Path("results")
-    results_dir.mkdir(exist_ok=True)
-    final_df.to_csv(results_dir / "final_population.csv", index=False)
-    if minimize_ysi and not pareto_df.empty:
-        pareto_df.to_csv(results_dir / "pareto_front.csv", index=False)
-    print("\n✓ Saved to results/")
-def display_results(final_df: pd.DataFrame, pareto_df: pd.DataFrame, minimize_ysi: bool):
-    """Display results to console."""
-    cols = (["rank", "smiles", "cn", "cn_error", "ysi", "bp", "density", "lhv", "dynamic_viscosity"])
-    print("\n=== TOP 10 (sorted) ===")
-    print(final_df.head(10)[cols].to_string(index=False))
-    if minimize_ysi and not pareto_df.empty:
-        print("\n=== PARETO FRONT (ranked) ===")
-        print(pareto_df[["rank", "smiles", "cn", "cn_error", "ysi", "bp", "density", "lhv", "dynamic_viscosity"]]
-              .head(20).to_string(index=False))
-def main():
-    """Main execution function with optional profiling."""
-    import cProfile
-    import pstats
-    config = get_user_config()
-    # Optional profiling
-    profiler = None
-    if os.environ.get('PROFILE'):
-        profiler = cProfile.Profile()
-        profiler.enable()
-    project_name = "cetane-ysi-pareto" if config.minimize_ysi else "cetane-optimization"
-    evolution = MolecularEvolution(config)
-    final_df, pareto_df = evolution.evolve()
-    # Display and save results
-    display_results(final_df, pareto_df, config.minimize_ysi)
-    save_results(final_df, pareto_df, config.minimize_ysi)
-    # Print profiling stats if enabled
-    if profiler:
-        profiler.disable()
-        stats = pstats.Stats(profiler)
-        stats.sort_stats('cumulative')
-        print("\n=== PROFILING STATS (Top 20) ===")
-        stats.print_stats(20)
-if __name__ == "__main__":
-    main()

src/model_config.py DELETED Viewed

@@ -1,53 +0,0 @@
-"""
-Model configuration for loading from Hugging Face Hub.
-Instructions:
-1. Upload your models to Hugging Face
-2. Update the repo IDs below with your actual repository names
-3. Set USE_LOCAL_MODELS=false in your environment (default)
-"""
-import os
-# Toggle between local files and HF Hub
-# Set to 'true' for local development, 'false' for deployment
-USE_LOCAL_MODELS = os.getenv('USE_LOCAL_MODELS', 'false').lower() == 'true'
-# ============================================================================
-# HUGGING FACE MODEL REPOSITORIES
-# ============================================================================
-# Update these with your actual Hugging Face repository IDs
-# Format: "username/repo-name" or "organization/repo-name"
-HF_MODEL_REPOS = {
-    'cn': "SalZa2004/Cetane_Number_Predictor",           # Example: "john-doe/cetane-predictor"
-    'ysi': "SalZa2004/YSI_Predictor",         # Example: "john-doe/ysi-predictor"
-    'bp': "SalZa2004/BP_Predictor",           # Example: "john-doe/bp-predictor"
-    'density': "SalZa2004/Density_Predictor", # Example: "john-doe/density-predictor"
-    'lhv': "SalZa2004/LHV_Predictor",         # Example: "john-doe/lhv-predictor"
-}
-# ============================================================================
-# VALIDATION
-# ============================================================================
-def validate_config():
-    """Validate that configuration is properly set up."""
-    if not USE_LOCAL_MODELS:
-        # Check if HF repos are configured
-        for prop, repo in HF_MODEL_REPOS.items():
-            if repo == f"SalZa2004/{prop}-predictor":
-                print(f"⚠️  Warning: {prop} model repo not configured!")
-                print(f"    Update HF_MODEL_REPOS['{prop}'] in model_config.py")
-                return False
-    return True
-# Run validation on import
-if __name__ != "__main__":
-    if not validate_config() and not USE_LOCAL_MODELS:
-        print("\n" + "="*70)
-        print("❌ MODEL CONFIGURATION INCOMPLETE")
-        print("="*70)
-        print("\nPlease update model_config.py with your Hugging Face repository IDs.")
-        print("Example: HF_MODEL_REPOS['cn'] = 'john-doe/cetane-predictor'")
-        print("="*70 + "\n")

src/shared_features.py DELETED Viewed

@@ -1,233 +0,0 @@
-import os
-import sqlite3
-import pandas as pd
-import numpy as np
-from sklearn.model_selection import train_test_split
-# Project root is the parent of the directory that contains this file; the
-# SQLite database is looked up at <root>/data/database/database_main.db.
-PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
-DB_PATH = os.path.join(PROJECT_ROOT, "data", "database", "database_main.db")
-from functools import lru_cache
-def load_raw_data():
-    """Load raw data from database."""
-    print("Connecting to SQLite database...")
-    conn = sqlite3.connect(DB_PATH)
-    query = """
-    SELECT
-        F.Fuel_Name,
-        F.SMILES,
-        T.Standardised_DCN AS cn
-    FROM FUEL F
-    LEFT JOIN TARGET T ON F.fuel_id = T.fuel_id
-    """
-    df = pd.read_sql_query(query, conn)
-    conn.close()
-    # Clean data
-    df.dropna(subset=["cn", "SMILES"], inplace=True)
-    return df
-# ============================================================================
-# 2. FEATURIZATION MODULE
-# ============================================================================
-from rdkit import Chem
-from rdkit.Chem import Descriptors, rdFingerprintGenerator
-from tqdm import tqdm
-# Get descriptor names globally
-# Descriptors._descList is iterated as (name, function) pairs: the names go to
-# DESCRIPTOR_NAMES, the callables to desc_functions, in the same order.
-# NOTE(review): _descList is a private RDKit attribute (leading underscore) --
-# confirm it is stable across the RDKit versions this project supports.
-DESCRIPTOR_NAMES = [d[0] for d in Descriptors._descList]
-desc_functions = [d[1] for d in Descriptors._descList]
-def morgan_fp_from_mol(mol, radius=2, n_bits=2048):
-    """Generate Morgan fingerprint."""
-    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
-    fp = fpgen.GetFingerprint(mol)
-    arr = np.array(list(fp.ToBitString()), dtype=int)
-    return arr
-def physchem_desc_from_mol(mol):
-    """Calculate physicochemical descriptors."""
-    try:
-        desc = np.array([fn(mol) for fn in desc_functions], dtype=np.float32)
-        desc = np.nan_to_num(desc, nan=0.0, posinf=0.0, neginf=0.0)
-        return desc
-    except:
-        return None
-def featurize(smiles):
-    """Convert SMILES to feature vector."""
-    mol = Chem.MolFromSmiles(smiles)
-    if mol is None:
-        return None
-    fp = morgan_fp_from_mol(mol)
-    desc = physchem_desc_from_mol(mol)
-    if fp is None or desc is None:
-        return None
-    return np.hstack([fp, desc])
-def featurize_df(df, smiles_col="SMILES", return_df=True):
-    """
-    Featurize a DataFrame or list of SMILES (vectorized for speed).
-    """
-    # Handle different input types
-    if isinstance(df, (list, np.ndarray)):
-        df = pd.DataFrame({smiles_col: df})
-    elif isinstance(df, pd.Series):
-        df = pd.DataFrame({smiles_col: df})
-    # Convert all SMILES to molecules in batch
-    mols = [Chem.MolFromSmiles(smi) for smi in df[smiles_col]]
-    features = []
-    valid_indices = []
-    # Process valid molecules
-    for i, mol in enumerate(tqdm(mols, desc="Featurizing")):
-        if mol is None:
-            continue
-        try:
-            fp = morgan_fp_from_mol(mol)
-            desc = physchem_desc_from_mol(mol)
-            if fp is not None and desc is not None:
-                features.append(np.hstack([fp, desc]))
-                valid_indices.append(i)
-        except:
-            continue
-    if len(features) == 0:
-        return (None, None) if return_df else None
-    X = np.vstack(features)
-    if return_df:
-        df_valid = df.iloc[valid_indices].reset_index(drop=True)
-        return X, df_valid
-    else:
-        return X
-@lru_cache(maxsize=50_000)
-def cached_featurize_smiles(smiles: str):
-    X = featurize_df([smiles], return_df=False)
-    if X is None:
-        return None
-    return X[0]  # single feature vector
-# ============================================================================
-# 3. FEATURE SELECTOR CLASS
-# ============================================================================
-import joblib
-class FeatureSelector:
-    """Feature selection pipeline that can be saved and reused."""
-    def __init__(self, n_morgan=2048, corr_threshold=0.95, top_k=300):
-        self.n_morgan = n_morgan
-        self.corr_threshold = corr_threshold
-        self.top_k = top_k
-        # Filled during fit()
-        self.corr_cols_to_drop = None
-        self.selected_indices = None
-        self.is_fitted = False
-    def fit(self, X, y):
-        """Fit the feature selector on training data."""
-        print("\n" + "="*70)
-        print("FITTING FEATURE SELECTOR")
-        print("="*70)
-        # Step 1: Split Morgan and descriptors
-        X_mfp = X[:, :self.n_morgan]
-        X_desc = X[:, self.n_morgan:]
-        print(f"Morgan fingerprints: {X_mfp.shape[1]}")
-        print(f"Descriptors: {X_desc.shape[1]}")
-        # Step 2: Remove correlated descriptors
-        desc_df = pd.DataFrame(X_desc)
-        corr_matrix = desc_df.corr().abs()
-        upper = corr_matrix.where(
-            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
-        )
-        self.corr_cols_to_drop = [
-            col for col in upper.columns if any(upper[col] > self.corr_threshold)
-        ]
-        print(f"Correlated descriptors removed: {len(self.corr_cols_to_drop)}")
-        desc_filtered = desc_df.drop(columns=self.corr_cols_to_drop, axis=1).values
-        X_corr = np.hstack([X_mfp, desc_filtered])
-        print(f"Features after correlation filter: {X_corr.shape[1]}")
-        # Step 3: Feature importance selection
-        from sklearn.ensemble import ExtraTreesRegressor
-        print("Running feature importance selection...")
-        model = ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1)
-        model.fit(X_corr, y)
-        importances = model.feature_importances_
-        indices = np.argsort(importances)[::-1]
-        self.selected_indices = indices[:self.top_k]
-        print(f"Final selected features: {len(self.selected_indices)}")
-        self.is_fitted = True
-        return self
-    def transform(self, X):
-        """Apply the fitted feature selection to new data."""
-        if not self.is_fitted:
-            raise RuntimeError("FeatureSelector must be fitted before transform!")
-        # Step 1: Split Morgan and descriptors
-        X_mfp = X[:, :self.n_morgan]
-        X_desc = X[:, self.n_morgan:]
-        # Step 2: Remove same correlated descriptors
-        desc_df = pd.DataFrame(X_desc)
-        desc_filtered = desc_df.drop(columns=self.corr_cols_to_drop, axis=1).values
-        X_corr = np.hstack([X_mfp, desc_filtered])
-        # Step 3: Select same important features
-        X_selected = X_corr[:, self.selected_indices]
-        return X_selected
-    def fit_transform(self, X, y):
-        """Fit and transform in one step."""
-        return self.fit(X, y).transform(X)
-    def save(self, filepath='feature_selector.joblib'):
-        """Save the fitted selector."""
-        if not self.is_fitted:
-            raise RuntimeError("Cannot save unfitted selector!")
-        # Create directory if it doesn't exist
-        os.makedirs(os.path.dirname(filepath) if os.path.dirname(filepath) else '.', exist_ok=True)
-        joblib.dump(self, filepath)
-        print(f"✓ Feature selector saved to {filepath}")
-    @staticmethod
-    def load(filepath='feature_selector.joblib'):
-        """Load a fitted selector."""
-        selector = joblib.load(filepath)
-        if not selector.is_fitted:
-            raise RuntimeError("Loaded selector is not fitted!")
-        print(f"✓ Feature selector loaded from {filepath}")
-        return selector

src/streamlit_app.py DELETED Viewed

@@ -1,161 +0,0 @@
-import streamlit as st
-from pathlib import Path
-import pandas as pd
-import os
-from shared_features import FeatureSelector, featurize_df
-# -----------------------------
-# OPTIONAL: Disable wandb on HF
-# -----------------------------
-# NOTE(review): set before the `from main import ...` further down, presumably
-# so any wandb usage pulled in by that import already sees the disabled mode --
-# confirm against main.py.
-os.environ["WANDB_MODE"] = "disabled"
-# -----------------------------
-# Import your existing code
-# -----------------------------
-from dataclasses import asdict
-from main import (
-    EvolutionConfig,
-    MolecularEvolution
-)
-# -----------------------------
-# Page config
-# -----------------------------
-# First Streamlit call in this script: wide layout and browser tab title.
-st.set_page_config(
-    page_title="Molecular Evolution Optimizer",
-    layout="wide"
-)
-# Page heading followed by a short Markdown description of the app.
-st.title("🧬 Molecular Evolution for Cetane Optimization")
-st.markdown(
-    """
-    This app runs a **genetic algorithm** to evolve molecules towards a
-    **target Cetane Number (CN)**, optionally minimizing **YSI** and
-    enforcing physical constraints.
-    """
-)
-# -----------------------------
-# Sidebar: Configuration
-# -----------------------------
-# Each widget's current value is stored in a module-level name that is passed
-# to EvolutionConfig further down. Widgets appear in the order declared here.
-st.sidebar.header("⚙️ Evolution Configuration")
-# Target CN: 40-80 in steps of 1, default 50.
-target_cn = st.sidebar.slider(
-    "Target Cetane Number (CN)",
-    min_value=40.0,
-    max_value=80.0,
-    value=50.0,
-    step=1.0
-)
-# Also gates the Pareto-front section of the results.
-minimize_ysi = st.sidebar.checkbox(
-    "Minimize YSI",
-    value=True
-)
-# Number of generations: 1-20, default 6.
-generations = st.sidebar.slider(
-    "Generations",
-    min_value=1,
-    max_value=20,
-    value=6
-)
-# Population size: 10-200 in steps of 10, default 50.
-population_size = st.sidebar.slider(
-    "Population Size",
-    min_value=10,
-    max_value=200,
-    value=50,
-    step=10
-)
-# Survivor fraction: 0.1-0.9 in steps of 0.05, default 0.5.
-survivor_fraction = st.sidebar.slider(
-    "Survivor Fraction",
-    min_value=0.1,
-    max_value=0.9,
-    value=0.5,
-    step=0.05
-)
-# Property filters, all enabled by default.
-st.sidebar.subheader("🔬 Property Filters")
-use_bp_filter = st.sidebar.checkbox("Use Boiling Point filter", True)
-use_density_filter = st.sidebar.checkbox("Use Density filter", True)
-use_lhv_filter = st.sidebar.checkbox("Use LHV filter", True)
-use_dv_filter = st.sidebar.checkbox("Use Dynamic Viscosity filter", True)
-# -----------------------------
-# Build config
-# -----------------------------
-# Sidebar values are forwarded one-to-one; note that use_dv_filter maps to the
-# use_dynamic_viscosity_filter keyword.
-config = EvolutionConfig(
-    target_cn=target_cn,
-    minimize_ysi=minimize_ysi,
-    generations=generations,
-    population_size=population_size,
-    survivor_fraction=survivor_fraction,
-    use_bp_filter=use_bp_filter,
-    use_density_filter=use_density_filter,
-    use_lhv_filter=use_lhv_filter,
-    use_dynamic_viscosity_filter=use_dv_filter,
-)
-# -----------------------------
-# Run button
-# -----------------------------
-# The evolution and all result sections below run only when the button
-# reports a click for the current script run.
-run = st.button("🚀 Run Evolution")
-if run:
-    with st.spinner("Running molecular evolution... This may take several minutes."):
-        evolution = MolecularEvolution(config)
-        # evolve() is unpacked into two objects that are used as DataFrames below.
-        final_df, pareto_df = evolution.evolve()
-    st.success("Evolution completed!")
-    # -----------------------------
-    # Results: Final population
-    # -----------------------------
-    st.header("📊 Final Population")
-    if final_df.empty:
-        st.warning("No valid molecules found.")
-    else:
-        st.dataframe(final_df, use_container_width=True)
-        # CSV is encoded to UTF-8 bytes for the download button.
-        csv = final_df.to_csv(index=False).encode("utf-8")
-        st.download_button(
-            "⬇️ Download Final Population CSV",
-            csv,
-            "final_population.csv",
-            "text/csv"
-        )
-    # -----------------------------
-    # Results: Pareto front
-    # -----------------------------
-    # Shown only when the "Minimize YSI" checkbox is ticked.
-    if minimize_ysi:
-        st.header("🏆 Pareto Front")
-        # NOTE(review): assumes evolve() always returns a DataFrame (never None)
-        # for the Pareto front -- confirm against MolecularEvolution.evolve.
-        if pareto_df.empty:
-            st.info("No Pareto-optimal molecules found.")
-        else:
-            st.dataframe(pareto_df, use_container_width=True)
-            pareto_csv = pareto_df.to_csv(index=False).encode("utf-8")
-            st.download_button(
-                "⬇️ Download Pareto Front CSV",
-                pareto_csv,
-                "pareto_front.csv",
-                "text/csv"
-            )
-    # -----------------------------
-    # Quick plots
-    # -----------------------------
-    if not final_df.empty:
-        st.header("📈 CN Error vs YSI")
-        # NOTE(review): only the "ysi" column is checked; "cn_error" is assumed
-        # to be present in final_df -- confirm.
-        if "ysi" in final_df.columns:
-            st.scatter_chart(
-                final_df,
-                x="cn_error",
-                y="ysi"
-            )