SalZa2004 committed on
Commit
315d4ad
·
1 Parent(s): b1a6659

updated applications

Browse files
Files changed (38) hide show
  1. applications/3_molecule_generator/__pycache__/__init__.cpython-310.pyc +0 -0
  2. applications/3_molecule_generator/__pycache__/cli.cpython-310.pyc +0 -0
  3. applications/3_molecule_generator/__pycache__/main.cpython-310.pyc +0 -0
  4. applications/3_molecule_generator/__pycache__/results.cpython-310.pyc +0 -0
  5. applications/__init__.py +0 -0
  6. applications/docker/.dockerignore +0 -5
  7. applications/docker/Dockerfile +0 -33
  8. applications/docker/docker-compose.yml +0 -22
  9. applications/mixture_aware_generator/__init__.py +0 -0
  10. applications/mixture_predictor/__init__.py +0 -0
  11. applications/molecule_generator/__init__.py +0 -0
  12. applications/molecule_generator/cli.py +43 -0
  13. applications/molecule_generator/main.py +34 -0
  14. applications/molecule_generator/results.py +37 -0
  15. applications/molecule_generator/results/final_population.csv +6 -0
  16. applications/molecule_generator/results/pareto_front.csv +3 -0
  17. applications/pure_predictor/__init__.py +0 -0
  18. applications/pure_predictor/cli.py +29 -0
  19. applications/pure_predictor/main.py +82 -0
  20. applications/pure_predictor/results.py +11 -0
  21. core/__init__.py +0 -0
  22. core/__pycache__/config.cpython-310.pyc +0 -0
  23. core/blending/__init__.py +0 -0
  24. core/config.py +26 -0
  25. core/data_prep.py +34 -0
  26. core/evolution/__init__.py +0 -0
  27. core/evolution/__pycache__/evolution.cpython-310.pyc +0 -0
  28. core/evolution/evolution.py +234 -0
  29. core/evolution/molecule.py +33 -0
  30. core/evolution/population.py +86 -0
  31. core/predictors/__init__.py +0 -0
  32. core/predictors/mixture/__init__.py +0 -0
  33. core/predictors/pure_component/__pycache__/generic.cpython-310.pyc +0 -0
  34. core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc +0 -0
  35. core/predictors/pure_component/generic.py +51 -0
  36. core/predictors/pure_component/hf_models.py +17 -0
  37. core/predictors/pure_component/property_predictor.py +77 -0
  38. core/shared_features.py +223 -0
applications/3_molecule_generator/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (143 Bytes)
 
applications/3_molecule_generator/__pycache__/cli.cpython-310.pyc DELETED
Binary file (1.76 kB)
 
applications/3_molecule_generator/__pycache__/main.cpython-310.pyc DELETED
Binary file (918 Bytes)
 
applications/3_molecule_generator/__pycache__/results.cpython-310.pyc DELETED
Binary file (1.85 kB)
 
applications/__init__.py ADDED
File without changes
applications/docker/.dockerignore DELETED
@@ -1,5 +0,0 @@
1
- venv*
2
- __pycache__/
3
- *.pyc
4
- .git/
5
- .gitignore
 
 
 
 
 
 
applications/docker/Dockerfile DELETED
@@ -1,33 +0,0 @@
1
- FROM python:3.10-slim
2
-
3
- # Avoid interactive prompts
4
- ENV DEBIAN_FRONTEND=noninteractive
5
-
6
- # System deps (important for RDKit / ML)
7
- RUN apt-get update && apt-get install -y \
8
- git \
9
- git-lfs \
10
- build-essential \
11
- sqlite3 \
12
- && rm -rf /var/lib/apt/lists/*
13
-
14
- # Install git-lfs
15
- RUN git lfs install
16
-
17
- # Set working directory
18
- WORKDIR /app
19
-
20
- # Copy dependency files first (better caching)
21
- COPY requirements.txt .
22
-
23
- RUN pip install --upgrade pip setuptools wheel \
24
- && pip install -r requirements.txt
25
-
26
- # Copy the rest of the project
27
- COPY . .
28
-
29
- # Editable install
30
- RUN pip install -e .
31
-
32
- # Default command (can override)
33
- CMD ["bash"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
applications/docker/docker-compose.yml DELETED
@@ -1,22 +0,0 @@
1
- services:
2
- biofuel-ml:
3
- build:
4
- context: ..
5
- dockerfile: docker/Dockerfile
6
- image: biofuel-ml:latest
7
- container_name: biofuel-ml
8
- tty: true
9
- stdin_open: true
10
-
11
- volumes:
12
- - ..:/app
13
- - ~/.cache/huggingface:/root/.cache/huggingface
14
-
15
- working_dir: /app
16
-
17
- environment:
18
- - PYTHONUNBUFFERED=1
19
- - HF_HOME=/root/.cache/huggingface
20
- - PYTHONHASHSEED=42
21
-
22
- command: bash
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
applications/mixture_aware_generator/__init__.py ADDED
File without changes
applications/mixture_predictor/__init__.py ADDED
File without changes
applications/molecule_generator/__init__.py ADDED
File without changes
applications/molecule_generator/cli.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from core.config import EvolutionConfig


def get_user_config() -> EvolutionConfig:
    """Get configuration from user input.

    Prompts for the optimization mode (target a CN value vs. maximize CN)
    and whether YSI should be minimized, then returns the corresponding
    EvolutionConfig.
    """
    print("\n" + "="*70)
    print("MOLECULAR EVOLUTION WITH GENETIC ALGORITHM")
    print("="*70)

    # Choose optimization mode (re-prompt until valid)
    print("\nOptimization Mode:")
    print("1. Target a specific CN value (minimize error from target)")
    print("2. Maximize CN (find highest possible CN)")
    mode = input("Select mode (1 or 2): ").strip()
    while mode not in ("1", "2"):
        print("Invalid selection. Please choose 1 or 2.")
        mode = input("Select mode (1 or 2): ").strip()
    maximize_cn = (mode == "2")

    if maximize_cn:
        print("\n✓ Mode: Maximize Cetane Number")
        target = 100.0  # Dummy target, not used in maximize mode
    else:
        print("\n✓ Mode: Target Cetane Number")
        while True:
            raw = input("Enter target CN: ").strip() or "50"
            # FIX: a non-numeric entry used to crash the CLI with an
            # uncaught ValueError from float(); re-prompt instead.
            try:
                target = float(raw)
            except ValueError:
                print("Invalid number. Please enter a numeric CN value.")
                continue
            if target > 40:
                break
            print("⚠️ Target CN is too low, optimization may be challenging.")
            print("Consider using a higher target CN for better results.\n")

    # Ask about YSI
    minimize_ysi = input("\nMinimize YSI (y/n): ").strip().lower() in ['y', 'yes']

    # Print configuration summary
    print("\n" + "="*70)
    print("CONFIGURATION SUMMARY:")
    print(f"  • Mode: {'Maximize CN' if maximize_cn else f'Target CN = {target}'}")
    print(f"  • Minimize YSI: {'Yes' if minimize_ysi else 'No'}")
    print(f"  • Optimization: {'Multi-objective (CN + YSI)' if minimize_ysi else 'Single-objective (CN only)'}")
    print("="*70 + "\n")

    return EvolutionConfig(target_cn=target, maximize_cn=maximize_cn, minimize_ysi=minimize_ysi)
applications/molecule_generator/main.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

SEED = 42

import random
import numpy as np

from .cli import get_user_config
from .results import display_results, save_results
from core.evolution.evolution import MolecularEvolution
# FeatureSelector kept importable: the joblib-pickled selector artifacts
# need the class resolvable at unpickle time.
from core.shared_features import FeatureSelector

# Seed every RNG used by the run for reproducibility.
# NOTE(review): setting PYTHONHASHSEED after interpreter start does not
# change hash randomization of the current process (it only affects
# subprocesses) — confirm whether that is the intent.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)


def run(config):
    """Run the evolutionary search; return (final_df, pareto_df)."""
    evolution = MolecularEvolution(config)
    return evolution.evolve()


def main():
    """Main execution function: prompt, evolve, display, save."""
    config = get_user_config()

    # FIX: reuse run() instead of duplicating its body here.
    final_df, pareto_df = run(config)

    # Display and save results
    display_results(final_df, pareto_df, config)
    save_results(final_df, pareto_df, config.minimize_ysi)


if __name__ == "__main__":
    main()
applications/molecule_generator/results.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
from pathlib import Path
import pandas as pd
from core.config import EvolutionConfig


def save_results(final_df: pd.DataFrame, pareto_df: pd.DataFrame, minimize_ysi: bool):
    """Write the final population (and, in multi-objective runs, the
    non-empty Pareto front) as CSV files under ./results."""
    out_dir = Path("results")
    out_dir.mkdir(exist_ok=True)

    final_df.to_csv(out_dir / "final_population.csv", index=False)
    if minimize_ysi and not pareto_df.empty:
        pareto_df.to_csv(out_dir / "pareto_front.csv", index=False)

    print("\n✓ Results saved to results/")


def display_results(final_df: pd.DataFrame, pareto_df: pd.DataFrame, config: EvolutionConfig):
    """Print the top candidates — and the Pareto front when YSI is
    tracked — to stdout."""
    preferred = ["rank", "smiles", "cn", "cn_error", "ysi", "bp", "density", "lhv", "dynamic_viscosity"]

    # cn_error is meaningless when maximizing CN, so hide it.
    if config.maximize_cn:
        preferred.remove("cn_error")

    banner = "=" * 70
    shown = [c for c in preferred if c in final_df.columns]

    print("\n" + banner)
    print("=== BEST CANDIDATES ===")
    print(banner)
    print(final_df.head(10)[shown].to_string(index=False))

    if config.minimize_ysi and not pareto_df.empty:
        pareto_shown = [c for c in preferred if c in pareto_df.columns]
        print("\n" + banner)
        print("=== PARETO FRONT (Non-dominated solutions) ===")
        print(banner)
        print(pareto_df[pareto_shown].head(20).to_string(index=False))
applications/molecule_generator/results/final_population.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ rank,smiles,cn,cn_error,cn_score,ysi
2
+ 1,CCCCCC(C)C(=O)O,55.49838163060194,0.49838163060194063,55.49838163060194,45.2672860656464
3
+ 2,CCCCC=O,55.79958905600905,0.7995890560090473,55.79958905600905,23.523889793928337
4
+ 3,CCCCC(CC)C(=O)O,57.088458252834485,2.088458252834485,57.088458252834485,44.53587682413441
5
+ 4,CCCCC(C)C(=O)O,57.32411875680274,2.32411875680274,57.32411875680274,36.7278812007473
6
+ 5,CCCCC(=O)O,57.959166968253996,2.959166968253996,57.959166968253996,24.625850459125378
applications/molecule_generator/results/pareto_front.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ rank,smiles,cn,cn_error,cn_score,ysi
2
+ 1,CCCCCC(C)C(=O)O,55.49838163060194,0.49838163060194063,55.49838163060194,45.2672860656464
3
+ 2,CCCCC=O,55.79958905600905,0.7995890560090473,55.79958905600905,23.523889793928337
applications/pure_predictor/__init__.py ADDED
File without changes
applications/pure_predictor/cli.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# applications/1_pure_predictor/cli.py

from rdkit import Chem
from rdkit.Chem import rdinchi


def get_user_config():
    """
    Collect user inputs for pure-component property prediction.
    SMILES-only input.

    Returns:
        dict with "mode" ("1" single / "2" batch) and "smiles" — a SMILES
        string in single mode, a file path in batch mode.
    """
    mode = input("Select prediction mode (1: Single, 2: Batch): ").strip()
    while mode not in {"1", "2"}:
        print("Invalid selection. Please choose 1 or 2.")
        mode = input("Select prediction mode (1: Single, 2: Batch): ").strip()

    if mode == "1":
        smiles = input("Enter SMILES string: ").strip()
        # FIX: re-prompt on an invalid SMILES (consistent with the mode
        # prompt above) instead of crashing the CLI with an uncaught
        # ValueError.
        while Chem.MolFromSmiles(smiles) is None:
            print("Invalid SMILES string.")
            smiles = input("Enter SMILES string: ").strip()
    else:
        smiles = input("Enter path to SMILES file: ").strip()

    return {
        "mode": mode,
        "smiles": smiles
    }
29
+
applications/pure_predictor/main.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# applications/1_pure_predictor/main.py
import os
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

# FeatureSelector kept importable: the joblib-pickled selector artifacts
# need the class resolvable at unpickle time.
from core.shared_features import featurize_df, FeatureSelector
from core.predictors.pure_component.generic import GenericPredictor
from core.predictors.pure_component.hf_models import load_models

from .cli import get_user_config
from .results import display_results

# Load model paths (local or HF)
PREDICTOR_PATHS = load_models()

# (result key, PREDICTOR_PATHS key, human-readable predictor name),
# one entry per property, in display order.  The table replaces six
# near-identical hand-written predictor blocks.
PROPERTY_SPECS = [
    ("CN", "cn", "Cetane Number"),
    ("YSI", "ysi", "YSI"),
    ("BOILING POINT", "bp", "Boiling Point"),
    ("DENSITY", "density", "Density"),
    ("LHV", "lhv", "Lower Heating Value"),
    ("DYNAMIC VISCOSITY", "dynamic_viscosity", "Dynamic Viscosity"),
]


def run(config):
    """
    Run pure-component property prediction.

    Args:
        config: dict from get_user_config() with keys "mode" and "smiles".

    Returns:
        dict mapping "SMILES" plus one upper-case key per property to the
        predicted value.

    Raises:
        RuntimeError: if featurization fails for the input SMILES.
    """
    # NOTE(review): in batch mode (mode == "2") config["smiles"] holds a
    # file path but is featurized below as if it were a SMILES string —
    # batch handling still needs to be implemented.
    smiles = config["smiles"]

    # --- Featurize ONCE and reuse the matrix for every predictor ---
    X_full = featurize_df([smiles], return_df=False)
    if X_full is None:
        raise RuntimeError("Featurization failed for input SMILES.")

    # --- Predict each property with its dedicated model ---
    result = {"SMILES": smiles}
    for result_key, path_key, display_name in PROPERTY_SPECS:
        predictor = GenericPredictor(PREDICTOR_PATHS[path_key], display_name)
        result[result_key] = predictor.predict_from_features(X_full)[0]

    return result


def main():
    """CLI entry point: prompt, predict, display."""
    config = get_user_config()
    results = run(config)
    display_results(results)


if __name__ == "__main__":
    main()
applications/pure_predictor/results.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# applications/1_pure_predictor/results.py

import pandas as pd


def display_results(result: dict):
    """
    Display pure-component prediction results.

    Renders the single result dict as a one-row table on stdout.
    """
    table = pd.DataFrame([result])
    print("\n=== PURE COMPONENT PROPERTY PREDICTION ===\n")
    print(table.to_string(index=False))
core/__init__.py ADDED
File without changes
core/__pycache__/config.cpython-310.pyc CHANGED
Binary files a/core/__pycache__/config.cpython-310.pyc and b/core/__pycache__/config.cpython-310.pyc differ
 
core/blending/__init__.py ADDED
File without changes
core/config.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass, field


@dataclass
class EvolutionConfig:
    """Settings for the molecular genetic algorithm.

    Covers the objective (target vs. maximize CN, optional YSI
    minimization), GA sizing parameters, and per-property filter bounds.
    """

    # --- Objective ---
    target_cn: float = 50.0     # desired CN in target mode
    maximize_cn: bool = False   # True: maximize CN instead of targeting
    minimize_ysi: bool = True   # True: multi-objective (CN + YSI)

    # --- GA parameters ---
    generations: int = 6
    population_size: int = 100
    mutations_per_parent: int = 5
    survivor_fraction: float = 0.5

    # --- Batching ---
    batch_size: int = 100
    max_offspring_attempts: int = 10

    # Filters: property -> (lower bound, upper bound); None = unbounded.
    filters: dict = field(default_factory=lambda: {
        "bp": (60.0, 250.0),
        "density": (720.0, None),
        "lhv": (30.0, None),
        "dynamic_viscosity": (0.0, 2.0),
    })

    def cn_objective(self, cn: float) -> float:
        """Score a CN value: raw CN in maximize mode, else -|cn - target|."""
        return cn if self.maximize_cn else -abs(cn - self.target_cn)

    def __getattr__(self, name: str):
        """Expose filter bounds as `min_<prop>` / `max_<prop>` attributes.

        FIX: PropertyPredictor reads `config.min_bp`, `config.max_bp`,
        `config.min_density`, ... which previously did not exist on this
        class and raised AttributeError; derive them from `filters`.
        """
        prefix, _, key = name.partition("_")
        if prefix in ("min", "max"):
            # Fetch defensively: __getattr__ can run before __init__ has
            # populated the instance dict (e.g. during copy/pickle).
            filters = self.__dict__.get("filters")
            if filters and key in filters:
                return filters[key][0 if prefix == "min" else 1]
        raise AttributeError(name)
core/data_prep.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split

# Project root: one level above core/ (this file's directory).
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
DB_PATH = os.path.join(PROJECT_ROOT, "data", "database", "database_main.db")

TARGET_CN = "cn"  # Cetane number target column
N_FOLDS = 5
TOP_K = 5

# NOTE(review): this module queries the database at import time.  The
# side effect is kept deliberately — core.evolution.evolution does
# `from core.data_prep import df` and relies on the module-level frame.
# (FIX: the duplicate `import os` was removed.)
print("Connecting to SQLite database...")
conn = sqlite3.connect(DB_PATH)

query = """
SELECT
    F.Fuel_Name,
    F.SMILES,
    T.Standardised_DCN AS cn
FROM FUEL F
LEFT JOIN TARGET T ON F.fuel_id = T.fuel_id
ORDER BY F.fuel_id ASC;
"""
df = pd.read_sql_query(query, conn)
conn.close()

# Keep only rows with both a SMILES and a CN value.
df.dropna(subset=[TARGET_CN, "SMILES"], inplace=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print(df.head())
print(df.columns)


def load_data():
    """Return the cleaned fuel DataFrame loaded at import time."""
    return df
core/evolution/__init__.py ADDED
File without changes
core/evolution/__pycache__/evolution.cpython-310.pyc CHANGED
Binary files a/core/evolution/__pycache__/evolution.cpython-310.pyc and b/core/evolution/__pycache__/evolution.cpython-310.pyc differ
 
core/evolution/evolution.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .population import Population
2
+ from .molecule import Molecule
3
+ from core.predictors.pure_component.property_predictor import PropertyPredictor
4
+ from core.config import EvolutionConfig
5
+ from crem.crem import mutate_mol
6
+ from rdkit import Chem
7
+ import pandas as pd
8
+ import numpy as np
9
+ import random
10
+ from typing import List, Tuple
11
+ from core.data_prep import df # Initial dataset for sampling
12
+ from pathlib import Path
13
+
14
class MolecularEvolution:
    """Main evolutionary algorithm coordinator.

    Drives a CREM-based genetic algorithm: an initial population is
    stratified-sampled from the reference dataset, then each generation
    selects survivors, mutates them into offspring, and keeps candidates
    whose predicted properties pass the configured filters.
    """

    # Repository root: three levels above core/evolution/evolution.py.
    BASE_DIR = Path(__file__).resolve().parent.parent.parent
    # CREM fragment-replacement database used by mutate_mol.
    REP_DB_PATH = BASE_DIR / "data" / "fragments" / "diesel_fragments.db"

    def __init__(self, config: EvolutionConfig):
        self.config = config
        self.predictor = PropertyPredictor(config)
        self.population = Population(config)

    def _mutate_molecule(self, mol: Chem.Mol) -> List[str]:
        """Generate mutations for a molecule using CREM.

        Returns only novel SMILES (not already seen by the population);
        any CREM failure is treated as "no children".
        """
        try:
            mutants = list(mutate_mol(
                mol,
                db_name=str(self.REP_DB_PATH),
                max_size=2,
                return_mol=False
            ))
            return [m for m in mutants if m and m not in self.population.seen_smiles]
        except Exception:
            # Best-effort: a failing parent simply contributes nothing.
            return []

    def _create_molecules(self, smiles_list: List[str]) -> List[Molecule]:
        """Create Molecule objects from SMILES with predictions (OPTIMIZED).

        Featurizes the whole batch once, then drops candidates missing a
        required prediction (CN, plus YSI in multi-objective mode) or with
        an out-of-range filtered property.
        """
        if not smiles_list:
            return []

        # OPTIMIZATION: Single featurization + all predictions
        predictions = self.predictor.predict_all_properties(smiles_list)

        molecules = []
        for i, smiles in enumerate(smiles_list):
            # Extract predictions for this molecule
            props = {k: v[i] for k, v in predictions.items()}

            # Validate required properties
            if props.get('cn') is None:
                continue
            if self.config.minimize_ysi and props.get('ysi') is None:
                continue

            # Validate filtered properties
            if not all(self.predictor.is_valid(k, props.get(k))
                       for k in ['bp', 'density', 'lhv', 'dynamic_viscosity']):
                continue

            molecules.append(Molecule(
                smiles=smiles,
                cn=props['cn'],
                cn_error=abs(props['cn'] - self.config.target_cn),
                cn_score=props['cn'],  # For maximize mode
                bp=props.get('bp'),
                ysi=props.get('ysi'),
                density=props.get('density'),
                lhv=props.get('lhv'),
                dynamic_viscosity=props.get('dynamic_viscosity')
            ))

        return molecules

    def initialize_population(self, initial_smiles: List[str]) -> int:
        """Initialize the population from initial SMILES; return count added."""
        print("Predicting properties for initial population...")
        molecules = self._create_molecules(initial_smiles)
        return self.population.add_molecules(molecules)

    def _log_generation_stats(self, generation: int):
        """Log statistics for the current generation."""
        mols = self.population.molecules

        if self.config.maximize_cn:
            best_cn = max(mols, key=lambda m: m.cn)
            avg_cn = np.mean([m.cn for m in mols])

            print_msg = (f"Gen {generation}/{self.config.generations} | "
                         f"Pop {len(mols)} | "
                         f"Best CN: {best_cn.cn:.3f} | "
                         f"Avg CN: {avg_cn:.3f}")
        else:
            best_cn = min(mols, key=lambda m: m.cn_error)
            avg_cn_err = np.mean([m.cn_error for m in mols])

            print_msg = (f"Gen {generation}/{self.config.generations} | "
                         f"Pop {len(mols)} | "
                         f"Best CN err: {best_cn.cn_error:.3f} | "
                         f"Avg CN err: {avg_cn_err:.3f}")

        if self.config.minimize_ysi:
            front = self.population.pareto_front()
            best_ysi = min(mols, key=lambda m: m.ysi)
            avg_ysi = np.mean([m.ysi for m in mols])

            print_msg += (f" | Best YSI: {best_ysi.ysi:.3f} | "
                          f"Avg YSI: {avg_ysi:.3f} | "
                          f"Pareto: {len(front)}")

        print(print_msg)

    def _generate_offspring(self, survivors: List[Molecule]) -> List[Molecule]:
        """Generate offspring from survivors until the population is
        refilled or the attempt budget is exhausted."""
        target_count = self.config.population_size - len(survivors)
        max_attempts = target_count * self.config.max_offspring_attempts

        all_children = []
        new_molecules = []

        print(f" → Generating offspring (target: {target_count})...")

        for attempt in range(max_attempts):
            if len(new_molecules) >= target_count:
                break

            # Generate mutations
            parent = random.choice(survivors)
            mol = Chem.MolFromSmiles(parent.smiles)
            if mol is None:
                continue

            children = self._mutate_molecule(mol)
            all_children.extend(children[:self.config.mutations_per_parent])

            # Process in larger batches (single featurization per batch)
            if len(all_children) >= self.config.batch_size:
                print(f" → Evaluating batch of {len(all_children)} (featurizing once)...")
                new_molecules.extend(self._create_molecules(all_children))
                all_children = []

        # Process remaining children
        if all_children:
            print(f" → Evaluating final batch of {len(all_children)}...")
            new_molecules.extend(self._create_molecules(all_children))

        print(f" ✓ Generated {len(new_molecules)} valid offspring")
        return new_molecules

    def _run_evolution_loop(self):
        """Run the main evolution loop for the configured generation count."""
        for gen in range(1, self.config.generations + 1):
            self._log_generation_stats(gen)

            survivors = self.population.get_survivors()
            offspring = self._generate_offspring(survivors)

            # Create new population (re-deduplicated via seen_smiles)
            new_pop = Population(self.config)
            new_pop.add_molecules(survivors + offspring)
            self.population = new_pop

    def _generate_results(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Generate final results DataFrames: (ranked population, Pareto front)."""
        final_df = self.population.to_dataframe()

        # Apply different filtering based on mode
        if self.config.maximize_cn:
            if self.config.minimize_ysi and "ysi" in final_df.columns:
                # Maximize CN + minimize YSI: keep high CN, low YSI
                final_df = final_df[
                    (final_df["cn"] > 50) &
                    (final_df["ysi"] < 50)
                ].sort_values(["cn", "ysi"], ascending=[False, True])
            else:
                # Maximize CN only: just keep high CN
                final_df = final_df[final_df["cn"] > 50].sort_values("cn", ascending=False)
        else:
            if self.config.minimize_ysi and "ysi" in final_df.columns:
                # Target CN + minimize YSI: keep low error, low YSI
                final_df = final_df[
                    (final_df["cn_error"] < 5) &
                    (final_df["ysi"] < 50)
                ].sort_values(["cn_error", "ysi"], ascending=True)
            else:
                # Target CN only: just keep low error
                final_df = final_df[final_df["cn_error"] < 5].sort_values("cn_error", ascending=True)

        # FIX: take an explicit copy of the filtered slice so the rank
        # assignment cannot trigger pandas' SettingWithCopyWarning.
        final_df = final_df.copy()
        final_df["rank"] = range(1, len(final_df) + 1)

        if self.config.minimize_ysi:
            pareto_mols = self.population.pareto_front()
            pareto_df = pd.DataFrame([m.to_dict() for m in pareto_mols])

            if not pareto_df.empty:
                if self.config.maximize_cn:
                    pareto_df = pareto_df[
                        (pareto_df['cn'] > 50) & (pareto_df['ysi'] < 50)
                    ].sort_values(["cn", "ysi"], ascending=[False, True])
                else:
                    pareto_df = pareto_df[
                        (pareto_df['cn_error'] < 5) & (pareto_df['ysi'] < 50)
                    ].sort_values(["cn_error", "ysi"], ascending=True)

                # Filtered slice again: copy before inserting the rank column.
                pareto_df = pareto_df.copy()
                pareto_df.insert(0, 'rank', range(1, len(pareto_df) + 1))
        else:
            pareto_df = pd.DataFrame()

        return final_df, pareto_df

    def evolve(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Run the evolutionary algorithm end-to-end."""
        # Initialize: stratified sample across 30 CN quantile bins so the
        # starting pool covers the whole CN range.
        df_bins = pd.qcut(df["cn"], q=30)
        initial_smiles = (
            df.groupby(df_bins, observed=False)
            # FIX: cap the per-bin sample at the bin size —
            # DataFrame.sample(20) raises ValueError on bins with < 20 rows.
            .apply(lambda x: x.sample(min(len(x), 20), random_state=42))
            .reset_index(drop=True)["SMILES"]
            .tolist()
        )
        init_count = self.initialize_population(initial_smiles)

        if init_count == 0:
            print("No valid initial molecules")
            return pd.DataFrame(), pd.DataFrame()

        print(f"✓ Initial population size: {init_count}\n")

        # Evolution
        self._run_evolution_loop()

        # Results
        return self._generate_results()
core/evolution/molecule.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass, asdict
from typing import Optional, Dict


@dataclass
class Molecule:
    """Represents a molecule with its predicted properties."""
    smiles: str
    cn: float        # predicted cetane number
    cn_error: float  # |cn - target_cn|
    cn_score: float = 0.0  # For maximize mode (higher is better)
    bp: Optional[float] = None
    ysi: Optional[float] = None
    density: Optional[float] = None
    lhv: Optional[float] = None
    dynamic_viscosity: Optional[float] = None

    def dominates(self, other: 'Molecule', maximize_cn: bool = False) -> bool:
        """Check if this molecule Pareto-dominates another (CN vs. YSI).

        Dominance requires being at least as good on both objectives and
        strictly better on at least one.
        """
        if maximize_cn:
            # For maximize mode: higher CN is better
            better_cn = self.cn >= other.cn
            strictly_better_cn = self.cn > other.cn
        else:
            better_cn = self.cn_error <= other.cn_error
            strictly_better_cn = self.cn_error < other.cn_error

        # BUG FIX: when other.ysi was None the old comparison
        # `self.ysi <= other.ysi` raised TypeError; treat a missing YSI on
        # either side as neutral (no better / no strictly-better signal).
        if self.ysi is None or other.ysi is None:
            better_ysi, strictly_better_ysi = True, False
        else:
            better_ysi = self.ysi <= other.ysi
            strictly_better_ysi = self.ysi < other.ysi

        return better_cn and better_ysi and (strictly_better_cn or strictly_better_ysi)

    def to_dict(self) -> Dict:
        """Convert to a dict for DataFrame creation, dropping None fields."""
        return {k: v for k, v in asdict(self).items() if v is not None}
core/evolution/population.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List
from core.config import EvolutionConfig
from .molecule import Molecule
import pandas as pd


class Population:
    """Manages the population of molecules.

    Tracks SMILES uniqueness, extracts the Pareto front (CN vs. YSI),
    and selects survivors for the next generation.
    """

    def __init__(self, config: EvolutionConfig):
        self.config = config
        self.molecules: List[Molecule] = []
        # SMILES already present — O(1) duplicate rejection.
        self.seen_smiles: set = set()

    def add_molecule(self, mol: Molecule) -> bool:
        """Add a molecule if it's not already in the population."""
        if mol.smiles in self.seen_smiles:
            return False
        self.molecules.append(mol)
        self.seen_smiles.add(mol.smiles)
        return True

    def add_molecules(self, molecules: List[Molecule]) -> int:
        """Add multiple molecules, return count added."""
        return sum(self.add_molecule(mol) for mol in molecules)

    def pareto_front(self) -> List[Molecule]:
        """Extract the Pareto front (only meaningful in multi-objective mode)."""
        if not self.config.minimize_ysi:
            return []

        return [
            mol for mol in self.molecules
            if not any(other.dominates(mol, self.config.maximize_cn)
                       for other in self.molecules if other is not mol)
        ]

    def get_survivors(self) -> List[Molecule]:
        """Select survivors for the next generation.

        Multi-objective mode keeps the Pareto front, trimmed or topped up
        to population_size * survivor_fraction; single-objective mode
        keeps the top molecules by the CN objective.
        """
        target_size = int(self.config.population_size * self.config.survivor_fraction)

        if self.config.minimize_ysi:
            survivors = self.pareto_front()

            # Better CN objective first (negated so higher objective sorts
            # earlier), then lower YSI as tiebreaker.
            sort_key = lambda m: (
                -self.config.cn_objective(m.cn),
                m.ysi
            )

            if len(survivors) > target_size:
                survivors = sorted(survivors, key=sort_key)[:target_size]
            elif len(survivors) < target_size:
                # PERF FIX: exclude survivors via an id-set membership test
                # instead of `m not in survivors`, which was O(n^2) and
                # relied on dataclass value equality.
                survivor_ids = {id(m) for m in survivors}
                remainder = [m for m in self.molecules if id(m) not in survivor_ids]
                remainder = sorted(remainder, key=sort_key)
                survivors.extend(remainder[:target_size - len(survivors)])
        else:
            # Single objective mode
            survivors = sorted(
                self.molecules,
                key=lambda m: self.config.cn_objective(m.cn),
                reverse=True
            )[:target_size]

        return survivors

    def to_dataframe(self) -> pd.DataFrame:
        """Convert population to a ranked, sorted DataFrame."""
        # ROBUSTNESS FIX: an empty population used to raise KeyError in
        # sort_values (the empty frame has no columns).
        if not self.molecules:
            return pd.DataFrame()

        df = pd.DataFrame([m.to_dict() for m in self.molecules])

        if self.config.maximize_cn:
            if self.config.minimize_ysi:
                sort_cols = ["cn", "ysi"]
                ascending = [False, True]  # Descending CN, ascending YSI
            else:
                sort_cols = ["cn"]
                ascending = False
        else:
            sort_cols = ["cn_error", "ysi"] if self.config.minimize_ysi else ["cn_error"]
            ascending = True

        df = df.sort_values(sort_cols, ascending=ascending)
        df.insert(0, 'rank', range(1, len(df) + 1))
        return df
core/predictors/__init__.py ADDED
File without changes
core/predictors/mixture/__init__.py ADDED
File without changes
core/predictors/pure_component/__pycache__/generic.cpython-310.pyc CHANGED
Binary files a/core/predictors/pure_component/__pycache__/generic.cpython-310.pyc and b/core/predictors/pure_component/__pycache__/generic.cpython-310.pyc differ
 
core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc CHANGED
Binary files a/core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc and b/core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc differ
 
core/predictors/pure_component/generic.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import joblib
import numpy as np

from pathlib import Path

# Make FeatureSelector discoverable for joblib
import sys
from core.shared_features import FeatureSelector

# --- FIX FOR JOBLIB / PICKLE ---
# The models were pickled while FeatureSelector lived in __main__, so make
# sure unpickling can resolve it in every context (pytest, HF, Docker).
_main_mod = sys.modules.get("__main__")
if _main_mod is not None and not hasattr(_main_mod, "FeatureSelector"):
    setattr(_main_mod, "FeatureSelector", FeatureSelector)


class GenericPredictor:
    """Generic predictor that works for any property model."""

    def __init__(self, model_dir: Path, property_name: str):
        """
        Initialize predictor from a model directory.

        Args:
            model_dir: Path to the model directory containing artifacts/
            property_name: Name of the property (for display purposes)
        """
        print(f"Loading {property_name} Predictor...")

        # Load the regression model, then its matching feature selector.
        self.model = joblib.load(model_dir / "model.joblib")
        self.selector = FeatureSelector.load(model_dir / "selector.joblib")
        self.property_name = property_name

        print(f"✓ {property_name} Predictor ready!")

    def predict_from_features(self, X_full):
        """Predict from pre-computed features.

        Returns one value per input row, or an empty list / None-filled
        list when input is missing or prediction fails.
        """
        if X_full is None or len(X_full) == 0:
            return []

        try:
            selected = self.selector.transform(X_full)
            return self.model.predict(selected).tolist()
        except Exception as e:
            print(f"⚠ Warning: {self.property_name} prediction failed: {e}")
            return [None] * len(X_full)
core/predictors/pure_component/hf_models.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path
from huggingface_hub import snapshot_download

# Property key -> Hugging Face model repository.
HF_MODELS = {
    "cn": "SalZa2004/Cetane_Number_Predictor",
    "ysi": "SalZa2004/YSI_Predictor",
    "bp": "SalZa2004/Boiling_Point_Predictor",
    "density": "SalZa2004/Density_Predictor",
    "lhv": "SalZa2004/LHV_Predictor",
    "dynamic_viscosity": "SalZa2004/Dynamic_Viscosity_Predictor",
}


def load_models():
    """Download (or reuse cached) model snapshots; return key -> local Path."""
    paths = {}
    for key, repo in HF_MODELS.items():
        paths[key] = Path(snapshot_download(repo_id=repo, repo_type="model"))
    return paths
core/predictors/pure_component/property_predictor.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import Callable, Dict, List, Optional, Tuple

import numpy as np

from core.config import EvolutionConfig
from core.shared_features import featurize, featurize_df

from .generic import GenericPredictor
from .hf_models import load_models
7
+
8
# Resolve the local directories of all pretrained models from the HF Hub.
# NOTE(review): this executes at import time, so importing this module can
# trigger network downloads on a cold cache — confirm that is intended.
PREDICTOR_PATHS = load_models()
9
class PropertyPredictor:
    """Handles batch prediction for all molecular properties.

    Features are computed once per batch and the resulting matrix is shared
    by every loaded property model.
    """

    def __init__(self, config: "EvolutionConfig"):
        """
        Args:
            config: evolution settings; decides which optional predictors
                are loaded and which (lo, hi) property filters apply.
        """
        self.config = config

        # Initialize only the predictors we need
        self.predictors = {}

        # Always need CN predictor
        self.predictors['cn'] = GenericPredictor(
            PREDICTOR_PATHS['cn'],
            'Cetane Number'
        )

        # Conditional predictors
        if config.minimize_ysi:
            self.predictors['ysi'] = GenericPredictor(
                PREDICTOR_PATHS['ysi'],
                'YSI'
            )

        # Define validation rules
        # NOTE(review): is_valid() below reads config.filters, not this
        # table — confirm whether any external caller uses it; otherwise
        # it is dead code and should be removed.
        self.validators = {
            'bp': lambda v: self.config.min_bp <= v <= self.config.max_bp,
            'density': lambda v: v > self.config.min_density,
            'lhv': lambda v: v > self.config.min_lhv,
            'dynamic_viscosity': lambda v: self.config.min_dynamic_viscosity < v <= self.config.max_dynamic_viscosity
        }

    def _safe_predict(self, predictions: List) -> List[Optional[float]]:
        """Safely convert predictions, mapping None/NaN/inf values to None."""
        return [
            float(pred) if pred is not None and np.isfinite(pred) else None
            for pred in predictions
        ]

    def predict_all_properties(self, smiles_list: List[str]) -> Dict[str, List[Optional[float]]]:
        """
        Predict all properties for a batch of SMILES.

        Features are computed once and reused for all predictors.

        Returns:
            {property: values} where every list is aligned index-for-index
            with ``smiles_list``; SMILES that fail featurization get None.
            (BUGFIX: the previous version silently dropped invalid SMILES,
            so the output lists could be shorter than — and misaligned
            with — the input.)
        """
        n = len(smiles_list)
        if n == 0:
            return {prop: [] for prop in self.predictors}

        # Featurize per molecule so we know which inputs were valid;
        # invalid SMILES keep a None placeholder in every output list.
        feats = [featurize(smi) for smi in smiles_list]
        valid = [i for i, f in enumerate(feats) if f is not None]

        results: Dict[str, List[Optional[float]]] = {
            prop: [None] * n for prop in self.predictors
        }
        if not valid:
            return results

        # OPTIMIZATION: stack the feature matrix once and reuse it for
        # every property model.
        X_full = np.vstack([feats[i] for i in valid])

        for prop_name, predictor in self.predictors.items():
            predictions = self._safe_predict(predictor.predict_from_features(X_full))
            aligned = results[prop_name]
            # Scatter predictions back to the original input positions.
            for pos, value in zip(valid, predictions):
                aligned[pos] = value

        return results

    def is_valid(self, name, value):
        """Return True when `value` passes the (lo, hi) filter configured
        for `name`; unknown property names and None values always pass."""
        if value is None or name not in self.config.filters:
            return True
        lo, hi = self.config.filters[name]
        if lo is not None and value < lo:
            return False
        if hi is not None and value > hi:
            return False
        return True
core/shared_features.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sqlite3
3
+ import pandas as pd
4
+ import numpy as np
5
+ from sklearn.model_selection import train_test_split
6
+
7
# Repository root (parent of this file's directory) and the path to the main
# SQLite database read by load_raw_data().
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
DB_PATH = os.path.join(PROJECT_ROOT, "data", "database", "database_main.db")
9
+
10
def load_raw_data():
    """Load raw fuel data from the SQLite database.

    Returns:
        DataFrame with Fuel_Name, SMILES and cetane number (`cn`); rows
        missing either `cn` or `SMILES` are dropped.
    """
    print("Connecting to SQLite database...")
    conn = sqlite3.connect(DB_PATH)

    query = """
    SELECT
        F.Fuel_Name,
        F.SMILES,
        T.Standardised_DCN AS cn
    FROM FUEL F
    LEFT JOIN TARGET T ON F.fuel_id = T.fuel_id
    """
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # BUGFIX: close the connection even when the query raises
        # (previously it leaked on error).
        conn.close()

    # Clean data
    df.dropna(subset=["cn", "SMILES"], inplace=True)

    return df
30
+
31
+
32
+ # ============================================================================
33
+ # 2. FEATURIZATION MODULE
34
+ # ============================================================================
35
+ from rdkit import Chem
36
+ from rdkit.Chem import Descriptors, rdFingerprintGenerator
37
+ from tqdm import tqdm
38
+
39
+ # Get descriptor names globally
40
+ DESCRIPTOR_NAMES = [d[0] for d in Descriptors._descList]
41
+ desc_functions = [d[1] for d in Descriptors._descList]
42
+
43
# Cache of Morgan fingerprint generators keyed by (radius, n_bits).
# PERF: the original rebuilt the generator on every call, i.e. once per
# molecule during featurization; the generator is reusable and stateless.
_MORGAN_GENERATORS = {}


def morgan_fp_from_mol(mol, radius=2, n_bits=2048):
    """Generate a Morgan fingerprint as a 0/1 int array of length `n_bits`."""
    key = (radius, n_bits)
    fpgen = _MORGAN_GENERATORS.get(key)
    if fpgen is None:
        fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
        _MORGAN_GENERATORS[key] = fpgen
    fp = fpgen.GetFingerprint(mol)
    return np.array(list(fp.ToBitString()), dtype=int)
49
+
50
def physchem_desc_from_mol(mol):
    """Calculate physicochemical descriptors for an RDKit molecule.

    Returns:
        float32 vector with NaN/inf replaced by 0.0, or None if any
        descriptor computation fails.
    """
    try:
        desc = np.array([fn(mol) for fn in desc_functions], dtype=np.float32)
        return np.nan_to_num(desc, nan=0.0, posinf=0.0, neginf=0.0)
    except Exception:
        # BUGFIX: was a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit; those now propagate.
        return None
58
+
59
def featurize(smiles):
    """Convert a single SMILES string into a feature vector.

    The vector is the Morgan fingerprint followed by the physchem
    descriptors; returns None when parsing or featurization fails.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    fingerprint = morgan_fp_from_mol(mol)
    descriptors = physchem_desc_from_mol(mol)
    if fingerprint is None or descriptors is None:
        return None

    return np.hstack([fingerprint, descriptors])
72
+
73
def featurize_df(df, smiles_col="SMILES", return_df=True):
    """
    Featurize a DataFrame or list of SMILES.

    Args:
        df: DataFrame with a `smiles_col` column, or a list/ndarray/Series
            of SMILES strings.
        smiles_col: name of the SMILES column.
        return_df: when True, return (X, df_valid) where df_valid contains
            only the rows that featurized successfully (index reset);
            when False, return X alone.

    Returns:
        (X, df_valid) or X; when no molecule featurizes, (None, None) or
        None respectively. NOTE: invalid SMILES are silently dropped, so X
        can have fewer rows than the input — callers that need alignment
        must use df_valid.
    """
    # Normalize the input to a DataFrame (list/ndarray/Series all accepted).
    if isinstance(df, (list, np.ndarray, pd.Series)):
        df = pd.DataFrame({smiles_col: df})

    # Convert all SMILES to molecules in batch
    mols = [Chem.MolFromSmiles(smi) for smi in df[smiles_col]]

    features = []
    valid_indices = []

    # Process valid molecules
    for i, mol in enumerate(tqdm(mols, desc="Featurizing")):
        if mol is None:
            continue

        try:
            fp = morgan_fp_from_mol(mol)
            desc = physchem_desc_from_mol(mol)

            if fp is not None and desc is not None:
                features.append(np.hstack([fp, desc]))
                valid_indices.append(i)
        except Exception:
            # BUGFIX: was a bare `except:`; narrowed so Ctrl-C is not
            # swallowed. Any per-molecule failure just skips that row.
            continue

    if len(features) == 0:
        return (None, None) if return_df else None

    X = np.vstack(features)

    if return_df:
        df_valid = df.iloc[valid_indices].reset_index(drop=True)
        return X, df_valid
    return X
114
+
115
+
116
+ # ============================================================================
117
+ # 3. FEATURE SELECTOR CLASS
118
+ # ============================================================================
119
+ import joblib
120
+
121
class FeatureSelector:
    """Feature selection pipeline that can be saved and reused.

    Pipeline: (1) split the matrix into Morgan bits and physchem
    descriptors, (2) drop highly correlated descriptors, (3) keep the
    `top_k` most important features per an ExtraTrees importance ranking.
    """

    def __init__(self, n_morgan=2048, corr_threshold=0.95, top_k=300):
        # Number of leading columns that are Morgan fingerprint bits.
        self.n_morgan = n_morgan
        # Absolute-correlation cutoff above which a descriptor is dropped.
        self.corr_threshold = corr_threshold
        # Number of features kept after importance ranking.
        self.top_k = top_k

        # Filled during fit()
        self.corr_cols_to_drop = None
        self.selected_indices = None
        self.is_fitted = False

    def fit(self, X, y):
        """Fit the feature selector on training data.

        Args:
            X: 2D array with `n_morgan` Morgan columns first, then
               descriptor columns.
            y: regression target used for importance ranking.

        Returns:
            self (for chaining).
        """
        print("\n" + "="*70)
        print("FITTING FEATURE SELECTOR")
        print("="*70)

        # Step 1: Split Morgan and descriptors
        X_mfp = X[:, :self.n_morgan]
        X_desc = X[:, self.n_morgan:]

        print(f"Morgan fingerprints: {X_mfp.shape[1]}")
        print(f"Descriptors: {X_desc.shape[1]}")

        # Step 2: Remove correlated descriptors. The upper triangle ensures
        # each correlated pair drops only one member.
        desc_df = pd.DataFrame(X_desc)
        corr_matrix = desc_df.corr().abs()
        upper = corr_matrix.where(
            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        )

        self.corr_cols_to_drop = [
            col for col in upper.columns if any(upper[col] > self.corr_threshold)
        ]

        print(f"Correlated descriptors removed: {len(self.corr_cols_to_drop)}")

        # FIX: dropped the redundant `axis=1` that was passed alongside
        # `columns=` (pandas ignores axis when columns is given).
        desc_filtered = desc_df.drop(columns=self.corr_cols_to_drop).values
        X_corr = np.hstack([X_mfp, desc_filtered])

        print(f"Features after correlation filter: {X_corr.shape[1]}")

        # Step 3: Feature importance selection
        from sklearn.ensemble import ExtraTreesRegressor

        print("Running feature importance selection...")
        model = ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        model.fit(X_corr, y)

        importances = model.feature_importances_
        # Indices sorted by descending importance; keep the top_k.
        indices = np.argsort(importances)[::-1]
        self.selected_indices = indices[:self.top_k]

        print(f"Final selected features: {len(self.selected_indices)}")

        self.is_fitted = True
        return self

    def transform(self, X):
        """Apply the fitted feature selection to new data.

        Raises:
            RuntimeError: if called before fit().
        """
        if not self.is_fitted:
            raise RuntimeError("FeatureSelector must be fitted before transform!")

        # Step 1: Split Morgan and descriptors
        X_mfp = X[:, :self.n_morgan]
        X_desc = X[:, self.n_morgan:]

        # Step 2: Remove same correlated descriptors
        desc_df = pd.DataFrame(X_desc)
        desc_filtered = desc_df.drop(columns=self.corr_cols_to_drop).values
        X_corr = np.hstack([X_mfp, desc_filtered])

        # Step 3: Select same important features
        return X_corr[:, self.selected_indices]

    def fit_transform(self, X, y):
        """Fit and transform in one step."""
        return self.fit(X, y).transform(X)

    def save(self, filepath='feature_selector.joblib'):
        """Persist the fitted selector with joblib.

        Raises:
            RuntimeError: if the selector has not been fitted.
        """
        if not self.is_fitted:
            raise RuntimeError("Cannot save unfitted selector!")

        # Create directory if it doesn't exist ('' dirname means CWD).
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)

        joblib.dump(self, filepath)
        print(f"✓ Feature selector saved to {filepath}")

    @staticmethod
    def load(filepath='feature_selector.joblib'):
        """Load a fitted selector from disk.

        Raises:
            RuntimeError: if the loaded object was never fitted.
        """
        selector = joblib.load(filepath)
        if not selector.is_fitted:
            raise RuntimeError("Loaded selector is not fitted!")
        print(f"✓ Feature selector loaded from {filepath}")
        return selector