SalZa2004 commited on
Commit
da421be
·
1 Parent(s): 9250f3b

new structure

Browse files
applications/3_molecule_generator/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (143 Bytes). View file
 
applications/3_molecule_generator/__pycache__/cli.cpython-310.pyc ADDED
Binary file (1.76 kB). View file
 
applications/3_molecule_generator/__pycache__/main.cpython-310.pyc ADDED
Binary file (918 Bytes). View file
 
applications/3_molecule_generator/__pycache__/results.cpython-310.pyc ADDED
Binary file (1.85 kB). View file
 
applications/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (122 Bytes). View file
 
core/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (138 Bytes). View file
 
core/__pycache__/config.cpython-310.pyc ADDED
Binary file (1.25 kB). View file
 
core/__pycache__/data_prep.cpython-310.pyc ADDED
Binary file (1 kB). View file
 
{src → core}/__pycache__/shared_features.cpython-310.pyc RENAMED
Binary files a/src/__pycache__/shared_features.cpython-310.pyc and b/core/__pycache__/shared_features.cpython-310.pyc differ
 
core/evolution/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (148 Bytes). View file
 
core/evolution/__pycache__/evolution.cpython-310.pyc ADDED
Binary file (7.88 kB). View file
 
core/evolution/__pycache__/molecule.cpython-310.pyc ADDED
Binary file (1.57 kB). View file
 
core/evolution/__pycache__/population.cpython-310.pyc ADDED
Binary file (3.7 kB). View file
 
core/predictors/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (149 Bytes). View file
 
core/predictors/pure_component/__pycache__/generic.cpython-310.pyc ADDED
Binary file (1.68 kB). View file
 
core/predictors/pure_component/__pycache__/hf_models.cpython-310.pyc ADDED
Binary file (866 Bytes). View file
 
core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc ADDED
Binary file (3.31 kB). View file
 
src/__pycache__/data_prep.cpython-310.pyc DELETED
Binary file (1.14 kB)
 
src/data_prep.py DELETED
@@ -1,36 +0,0 @@
1
- import os
2
- import sqlite3
3
- import pandas as pd
4
- from sklearn.model_selection import train_test_split
5
- import os
6
-
7
- BASE_DIR = os.path.dirname(os.path.abspath(__file__))
8
- DB_PATH = os.path.join(BASE_DIR, "database_main.db")
9
-
10
- TARGET_CN = "cn" # Cetane number
11
- N_FOLDS = 5
12
- TOP_K = 5
13
- print("Connecting to SQLite database...")
14
- conn = sqlite3.connect(DB_PATH)
15
- cursor = conn.cursor()
16
- cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
17
- print(cursor.fetchall())
18
-
19
- query = """
20
- SELECT
21
- F.Fuel_Name,
22
- F.SMILES,
23
- T.Standardised_DCN AS cn
24
- FROM FUEL F
25
- LEFT JOIN TARGET T ON F.fuel_id = T.fuel_id
26
- """
27
- df = pd.read_sql_query(query, conn)
28
- conn.close()
29
- df.dropna(subset=[TARGET_CN, "SMILES"], inplace=True)
30
-
31
- train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
32
- print(df.head())
33
- print(df.columns)
34
-
35
- def load_data():
36
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/database_main.db DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b14779692bb401ac9fc714a3aa8919d4e14f75aef9f92c6004195d89102ebcff
3
- size 344064
 
 
 
 
src/diesel_fragments.db DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e76b070ca56ecaaf083602224e59dbff6d5f94c43960e139643c52d93472acb
3
- size 10002432
 
 
 
 
src/main.py DELETED
@@ -1,704 +0,0 @@
1
- import os
2
- import sys
3
- from pathlib import Path
4
- from dataclasses import dataclass, asdict, field
5
- from typing import List, Dict, Optional, Tuple, Callable
6
- import joblib
7
- import numpy as np
8
- import pandas as pd
9
- import random
10
- from rdkit import Chem
11
- from crem.crem import mutate_mol
12
- from sklearn.base import BaseEstimator, RegressorMixin
13
- from joblib import Parallel, delayed
14
- from tqdm import tqdm
15
- from huggingface_hub import snapshot_download
16
-
17
-
18
-
19
- HF_MODEL_REPOS = {
20
- "cn": "SalZa2004/Cetane_Number_Predictor",
21
- "ysi": "SalZa2004/YSI_Predictor",
22
- "density": "SalZa2004/Density_Predictor",
23
- "lhv": "SalZa2004/LHV_Predictor",
24
- "dynamic_viscosity": "SalZa2004/Dynamic_Viscosity_Predictor",
25
- "bp": "SalZa2004/Boiling_Point_Predictor",
26
- }
27
-
28
- def get_hf_model_dir(repo_id: str) -> Path:
29
- """
30
- Download a Hugging Face model repo and return local path.
31
- Uses HF cache automatically.
32
- """
33
- local_dir = snapshot_download(
34
- repo_id=repo_id,
35
- repo_type="model",
36
- local_dir=None, # use HF cache
37
- local_dir_use_symlinks=True
38
- )
39
- return Path(local_dir)
40
-
41
-
42
- # === Project Setup ===
43
- PROJECT_ROOT = Path.cwd()
44
- SRC_DIR = PROJECT_ROOT / "src"
45
- sys.path.append(str(PROJECT_ROOT))
46
-
47
- INITIAL_PRED_CACHE = PROJECT_ROOT / "cache" / "initial_predictions.parquet"
48
-
49
-
50
- from shared_features import FeatureSelector, featurize_df
51
- from data_prep import df
52
-
53
-
54
- class GenericPredictor:
55
- """Generic predictor that works for any property model."""
56
-
57
- def __init__(self, model_dir: Path, property_name: str):
58
- self.property_name = property_name
59
-
60
-
61
- model_path = model_dir / "model.joblib"
62
- selector_path = model_dir / "selector.joblib"
63
-
64
- if not model_path.exists():
65
- raise FileNotFoundError(f"Missing model.joblib in {model_dir}")
66
- if not selector_path.exists():
67
- raise FileNotFoundError(f"Missing selector.joblib in {model_dir}")
68
-
69
- self.model = joblib.load(model_path)
70
- self.selector = FeatureSelector.load(selector_path)
71
-
72
- print(f"✓ Loaded {property_name} predictor")
73
-
74
-
75
- def predict(self, smiles_list):
76
- """Inference on a list of SMILES strings."""
77
- if isinstance(smiles_list, str):
78
- smiles_list = [smiles_list]
79
-
80
- X_full = featurize_df(smiles_list, return_df=False)
81
-
82
- if X_full is None:
83
- print(f"⚠ Warning: No valid molecules found for {self.property_name}!")
84
- return []
85
-
86
- X_selected = self.selector.transform(X_full)
87
- predictions = self.model.predict(X_selected)
88
- return predictions.tolist()
89
-
90
- def predict_with_details(self, smiles_list):
91
- """Inference with valid/invalid info."""
92
- if isinstance(smiles_list, str):
93
- smiles_list = [smiles_list]
94
-
95
- df = pd.DataFrame({"SMILES": smiles_list})
96
- X_full, df_valid = featurize_df(df, return_df=True)
97
-
98
- col_name = f"Predicted_{self.property_name}"
99
-
100
- if X_full is None:
101
- return pd.DataFrame(columns=["SMILES", col_name, "Valid"])
102
-
103
- X_selected = self.selector.transform(X_full)
104
- predictions = self.model.predict(X_selected)
105
-
106
- df_valid[col_name] = predictions
107
- df_valid["Valid"] = True
108
-
109
- all_results = pd.DataFrame({"SMILES": smiles_list})
110
- all_results = all_results.merge(
111
- df_valid[["SMILES", col_name, "Valid"]],
112
- on="SMILES", how="left"
113
- )
114
- all_results["Valid"] = all_results["Valid"].fillna(False)
115
-
116
- return all_results
117
-
118
-
119
-
120
- @dataclass
121
- class EvolutionConfig:
122
- """Configuration for evolutionary algorithm."""
123
- target_cn: float
124
- minimize_ysi: bool = True
125
- generations: int = 6
126
- population_size: int = 50
127
- mutations_per_parent: int = 5
128
- survivor_fraction: float = 0.5
129
- min_bp: float = 60
130
- max_bp: float = 250
131
- min_dynamic_viscosity: float = 0.0
132
- max_dynamic_viscosity: float = 2.0
133
- min_density: float = 720
134
- min_lhv: float = 30
135
- use_bp_filter: bool = True
136
- use_density_filter: bool = True
137
- use_lhv_filter: bool = True
138
- use_dynamic_viscosity_filter: bool = True
139
- batch_size: int = 200 # Increased default for better throughput
140
- max_offspring_attempts: int = 10
141
- n_jobs: int = -1 # Number of parallel jobs for mutation (-1 = all cores)
142
-
143
- def __post_init__(self):
144
- """Validate configuration parameters."""
145
- if self.target_cn < 0:
146
- raise ValueError("target_cn must be positive")
147
- if not 0 < self.survivor_fraction < 1:
148
- raise ValueError("survivor_fraction must be between 0 and 1")
149
- if self.min_bp >= self.max_bp:
150
- raise ValueError("min_bp must be less than max_bp")
151
- if self.population_size < 2:
152
- raise ValueError("population_size must be at least 2")
153
-
154
- @dataclass
155
- class Molecule:
156
- """Represents a molecule with its properties."""
157
- smiles: str
158
- cn: float
159
- cn_error: float
160
- bp: Optional[float] = None
161
- ysi: Optional[float] = None
162
- density: Optional[float] = None
163
- lhv: Optional[float] = None
164
- dynamic_viscosity: Optional[float] = None
165
- _mol_cache: Optional[Chem.Mol] = field(default=None, repr=False, compare=False)
166
-
167
- def get_mol(self) -> Optional[Chem.Mol]:
168
- """Get cached RDKit Mol object to avoid repeated conversions."""
169
- if self._mol_cache is None:
170
- self._mol_cache = Chem.MolFromSmiles(self.smiles)
171
- return self._mol_cache
172
-
173
- def dominates(self, other: 'Molecule') -> bool:
174
- """Check if this molecule Pareto-dominates another."""
175
- better_cn = self.cn_error <= other.cn_error
176
- better_ysi = self.ysi <= other.ysi if self.ysi is not None else True
177
- strictly_better = (self.cn_error < other.cn_error or
178
- (self.ysi is not None and self.ysi < other.ysi))
179
- return better_cn and better_ysi and strictly_better
180
-
181
- def to_dict(self) -> Dict:
182
- """Convert to dictionary for DataFrame creation."""
183
- return {k: v for k, v in asdict(self).items()
184
- if v is not None and k != '_mol_cache'}
185
-
186
-
187
- class PropertyPredictor:
188
- """Handles batch prediction for all molecular properties with caching."""
189
-
190
- def __init__(self, config: EvolutionConfig):
191
- self.config = config
192
- self.predictors = {}
193
- self.prediction_cache = {}
194
-
195
- # --- Always load CN ---
196
- cn_dir = get_hf_model_dir(HF_MODEL_REPOS["cn"])
197
- self.predictors["cn"] = GenericPredictor(
198
- cn_dir,
199
- "Cetane Number"
200
- )
201
-
202
- # --- Optional predictors ---
203
- if config.minimize_ysi:
204
- ysi_dir = get_hf_model_dir(HF_MODEL_REPOS["ysi"])
205
- self.predictors["ysi"] = GenericPredictor(
206
- ysi_dir,
207
- "YSI"
208
- )
209
-
210
- if config.use_bp_filter:
211
- bp_dir = get_hf_model_dir(HF_MODEL_REPOS["bp"])
212
- self.predictors["bp"] = GenericPredictor(
213
- bp_dir,
214
- "Boiling Point"
215
- )
216
-
217
- if config.use_density_filter:
218
- density_dir = get_hf_model_dir(HF_MODEL_REPOS["density"])
219
- self.predictors["density"] = GenericPredictor(
220
- density_dir,
221
- "Density"
222
- )
223
-
224
- if config.use_lhv_filter:
225
- lhv_dir = get_hf_model_dir(HF_MODEL_REPOS["lhv"])
226
- self.predictors["lhv"] = GenericPredictor(
227
- lhv_dir,
228
- "Lower Heating Value"
229
- )
230
-
231
- if config.use_dynamic_viscosity_filter:
232
- dv_dir = get_hf_model_dir(HF_MODEL_REPOS["dynamic_viscosity"])
233
- self.predictors["dynamic_viscosity"] = GenericPredictor(
234
- dv_dir,
235
- "Dynamic Viscosity"
236
- )
237
-
238
-
239
- def _safe_predict(self, predictions: List) -> List[Optional[float]]:
240
- """Safely convert predictions, handling None/NaN/inf values."""
241
- return [
242
- float(pred) if pred is not None and np.isfinite(pred) else None
243
- for pred in predictions
244
- ]
245
-
246
- def _predict_batch(self, property_name: str, smiles_list: List[str]) -> List[Optional[float]]:
247
- """Generic batch prediction method."""
248
- predictor = self.predictors.get(property_name)
249
- if not smiles_list or predictor is None:
250
- return [None] * len(smiles_list)
251
-
252
- try:
253
- predictions = predictor.predict(smiles_list)
254
- return self._safe_predict(predictions)
255
- except Exception as e:
256
- print(f"⚠️ Warning: {property_name.upper()} prediction failed: {e}")
257
- return [None] * len(smiles_list)
258
-
259
- def predict_all_properties(self, smiles_list: List[str]) -> Dict[str, List[Optional[float]]]:
260
-
261
- if not smiles_list:
262
- return {}
263
-
264
- # --- ONE featurization ---
265
- X_full = featurize_df(smiles_list, return_df=False)
266
- if X_full is None:
267
- return {}
268
-
269
- results = {}
270
-
271
- for prop, predictor in self.predictors.items():
272
- try:
273
- X_sel = predictor.selector.transform(X_full)
274
- preds = predictor.model.predict(X_sel)
275
- results[prop] = self._safe_predict(preds)
276
- except Exception as e:
277
- print(f"⚠️ {prop} prediction failed: {e}")
278
- results[prop] = [None] * len(smiles_list)
279
-
280
- return results
281
-
282
-
283
-
284
- class Population:
285
- """Manages the population of molecules."""
286
-
287
- def __init__(self, config: EvolutionConfig):
288
- self.config = config
289
- self.molecules: List[Molecule] = []
290
- self.seen_smiles: set = set()
291
-
292
- def add_molecule(self, mol: Molecule) -> bool:
293
- """Add a molecule if it's not already in the population."""
294
- if mol.smiles in self.seen_smiles:
295
- return False
296
- self.molecules.append(mol)
297
- self.seen_smiles.add(mol.smiles)
298
- return True
299
-
300
- def add_molecules(self, molecules: List[Molecule]) -> int:
301
- """Add multiple molecules, return count added."""
302
- return sum(self.add_molecule(mol) for mol in molecules)
303
-
304
- def pareto_front(self) -> List[Molecule]:
305
- """Extract Pareto front using optimized vectorized operations."""
306
- if not self.config.minimize_ysi:
307
- return []
308
-
309
- n = len(self.molecules)
310
- if n == 0:
311
- return []
312
-
313
- # Create numpy arrays for vectorized operations
314
- cn_errors = np.array([m.cn_error for m in self.molecules])
315
- ysis = np.array([m.ysi for m in self.molecules])
316
-
317
- # Vectorized dominance check
318
- is_pareto = np.ones(n, dtype=bool)
319
- for i in range(n):
320
- if not is_pareto[i]:
321
- continue
322
- # Check if any other point dominates point i
323
- dominates_i = (
324
- (cn_errors <= cn_errors[i]) &
325
- (ysis <= ysis[i]) &
326
- ((cn_errors < cn_errors[i]) | (ysis < ysis[i]))
327
- )
328
- dominates_i[i] = False # Don't compare with itself
329
- is_pareto[i] = not np.any(dominates_i)
330
-
331
- return [self.molecules[i] for i in np.where(is_pareto)[0]]
332
-
333
- def get_survivors(self) -> List[Molecule]:
334
- """Select survivors for the next generation."""
335
- target_size = int(self.config.population_size * self.config.survivor_fraction)
336
-
337
- if self.config.minimize_ysi:
338
- survivors = self.pareto_front()
339
-
340
- # Sort key for combined objectives
341
- sort_key = lambda m: m.cn_error + m.ysi
342
-
343
- if len(survivors) > target_size:
344
- survivors = sorted(survivors, key=sort_key)[:target_size]
345
- elif len(survivors) < target_size:
346
- remainder = [m for m in self.molecules if m not in survivors]
347
- remainder = sorted(remainder, key=sort_key)
348
- survivors.extend(remainder[:target_size - len(survivors)])
349
- else:
350
- survivors = sorted(self.molecules, key=lambda m: m.cn_error)[:target_size]
351
-
352
- return survivors
353
-
354
- def to_dataframe(self) -> pd.DataFrame:
355
- """Convert population to DataFrame."""
356
- df = pd.DataFrame([m.to_dict() for m in self.molecules])
357
-
358
- sort_cols = ["cn_error", "ysi"] if self.config.minimize_ysi else ["cn_error"]
359
- df = df.sort_values(sort_cols, ascending=True)
360
- df.insert(0, 'rank', range(1, len(df) + 1))
361
- return df
362
-
363
-
364
- class MolecularEvolution:
365
- """Main evolutionary algorithm coordinator with optimized performance."""
366
-
367
- REP_DB_PATH = "diesel_fragments.db"
368
-
369
- def __init__(self, config: EvolutionConfig):
370
- self.config = config
371
- self.predictor = PropertyPredictor(config)
372
- self.population = Population(config)
373
-
374
- def _mutate_molecule(self, mol: Chem.Mol) -> List[str]:
375
- """Generate mutations for a molecule using CREM with set-based deduplication."""
376
- try:
377
- mutants = set(mutate_mol(
378
- mol,
379
- db_name=str(self.REP_DB_PATH),
380
- max_size=2,
381
- return_mol=False
382
- ))
383
- # Single set operation instead of list comprehension
384
- return list(mutants - self.population.seen_smiles)
385
- except Exception:
386
- return []
387
-
388
- def _create_molecules(self, smiles_list: List[str]) -> List[Molecule]:
389
- """Create Molecule objects from SMILES with vectorized validation."""
390
- if not smiles_list:
391
- return []
392
-
393
- # Get all predictions at once
394
- predictions = self.predictor.predict_all_properties(smiles_list)
395
-
396
- # Vectorized validation using numpy
397
- n = len(smiles_list)
398
- cn_vals = np.array(predictions.get('cn', [None] * n))
399
- valid_mask = ~np.isnan(cn_vals) # Start with CN validity
400
-
401
- if self.config.minimize_ysi:
402
- ysi_vals = np.array(predictions.get('ysi', [None] * n))
403
- valid_mask &= ~np.isnan(ysi_vals)
404
-
405
- # Vectorized constraint checking
406
- if 'bp' in predictions and self.config.use_bp_filter:
407
- bp_vals = np.array([v if v is not None else np.nan for v in predictions['bp']])
408
- valid_mask &= (bp_vals >= self.config.min_bp) & (bp_vals <= self.config.max_bp)
409
-
410
- if 'density' in predictions and self.config.use_density_filter:
411
- density_vals = np.array([v if v is not None else np.nan for v in predictions['density']])
412
- valid_mask &= (density_vals > self.config.min_density)
413
-
414
- if 'lhv' in predictions and self.config.use_lhv_filter:
415
- lhv_vals = np.array([v if v is not None else np.nan for v in predictions['lhv']])
416
- valid_mask &= (lhv_vals > self.config.min_lhv)
417
-
418
- if 'dynamic_viscosity' in predictions and self.config.use_dynamic_viscosity_filter:
419
- dv_vals = np.array([v if v is not None else np.nan for v in predictions['dynamic_viscosity']])
420
- valid_mask &= (dv_vals > self.config.min_dynamic_viscosity) & (dv_vals <= self.config.max_dynamic_viscosity)
421
-
422
- # Create molecules only for valid indices
423
- molecules = []
424
- for i in np.where(valid_mask)[0]:
425
- molecules.append(Molecule(
426
- smiles=smiles_list[i],
427
- cn=predictions['cn'][i],
428
- cn_error=abs(predictions['cn'][i] - self.config.target_cn),
429
- bp=predictions.get('bp', [None]*n)[i],
430
- ysi=predictions.get('ysi', [None]*n)[i],
431
- density=predictions.get('density', [None]*n)[i],
432
- lhv=predictions.get('lhv', [None]*n)[i],
433
- dynamic_viscosity=predictions.get('dynamic_viscosity', [None]*n)[i]
434
- ))
435
-
436
- return molecules
437
-
438
- def initialize_population(self, initial_smiles: List[str]) -> int:
439
-
440
- cache_path = INITIAL_PRED_CACHE
441
- cache_path.parent.mkdir(exist_ok=True)
442
-
443
- if cache_path.exists():
444
- print("✓ Loading cached initial predictions")
445
- df_pred = pd.read_parquet(cache_path)
446
- else:
447
- print("Predicting properties for initial population (cached)...")
448
- predictions = self.predictor.predict_all_properties(initial_smiles)
449
-
450
- df_pred = pd.DataFrame({
451
- "smiles": initial_smiles,
452
- **predictions
453
- })
454
-
455
- df_pred.to_parquet(cache_path)
456
-
457
- # --- Apply constraints & build Molecules ---
458
- molecules = []
459
-
460
- for _, row in df_pred.iterrows():
461
- if row["cn"] is None:
462
- continue
463
-
464
- if self.config.minimize_ysi and pd.isna(row.get("ysi")):
465
- continue
466
-
467
- if self.config.use_bp_filter:
468
- if not (self.config.min_bp <= row["bp"] <= self.config.max_bp):
469
- continue
470
-
471
- if self.config.use_density_filter:
472
- if row["density"] <= self.config.min_density:
473
- continue
474
-
475
- if self.config.use_lhv_filter:
476
- if row["lhv"] <= self.config.min_lhv:
477
- continue
478
-
479
- if self.config.use_dynamic_viscosity_filter:
480
- if not (
481
- self.config.min_dynamic_viscosity
482
- < row["dynamic_viscosity"]
483
- <= self.config.max_dynamic_viscosity
484
- ):
485
- continue
486
-
487
- molecules.append(
488
- Molecule(
489
- smiles=row["smiles"],
490
- cn=row["cn"],
491
- cn_error=abs(row["cn"] - self.config.target_cn),
492
- bp=row["bp"],
493
- ysi=row.get("ysi"),
494
- density=row["density"],
495
- lhv=row["lhv"],
496
- dynamic_viscosity=row["dynamic_viscosity"]
497
- )
498
- )
499
-
500
- return self.population.add_molecules(molecules)
501
-
502
-
503
- def _log_generation_stats(self, generation: int):
504
- """Log statistics for the current generation."""
505
- mols = self.population.molecules
506
- best_cn = min(mols, key=lambda m: m.cn_error)
507
- avg_cn_err = np.mean([m.cn_error for m in mols])
508
-
509
- log_dict = {
510
- "generation": generation,
511
- "best_cn_error": best_cn.cn_error,
512
- "population_size": len(mols),
513
- "avg_cn_error": avg_cn_err,
514
- }
515
-
516
- print_msg = (f"Gen {generation}/{self.config.generations} | "
517
- f"Pop {len(mols)} | "
518
- f"Best CN err: {best_cn.cn_error:.3f} | "
519
- f"Avg CN err: {avg_cn_err:.3f}")
520
-
521
- if self.config.minimize_ysi:
522
- front = self.population.pareto_front()
523
- best_ysi = min(mols, key=lambda m: m.ysi)
524
- avg_ysi = np.mean([m.ysi for m in mols])
525
-
526
- log_dict.update({
527
- "best_ysi": best_ysi.ysi,
528
- "pareto_size": len(front),
529
- "avg_ysi": avg_ysi,
530
- })
531
-
532
- print_msg += (f" | Best YSI: {best_ysi.ysi:.3f} | "
533
- f"Avg YSI: {avg_ysi:.3f} | "
534
- f"Pareto size: {len(front)}")
535
-
536
- print(print_msg)
537
-
538
-
539
- def _generate_offspring(self, survivors: List[Molecule]) -> List[Molecule]:
540
- """Generate offspring from survivors with parallel mutation."""
541
- target_count = self.config.population_size - len(survivors)
542
- max_attempts = target_count * self.config.max_offspring_attempts
543
-
544
- # Generate parent pool
545
- parents = [random.choice(survivors) for _ in range(max_attempts)]
546
- parent_mols = [p.get_mol() for p in parents] # Use cached Mol objects
547
- parent_mols = [m for m in parent_mols if m is not None]
548
-
549
- # Parallel mutation generation
550
- print(f" → Generating mutations in parallel ({len(parent_mols)} parents)...")
551
- all_children_nested = Parallel(n_jobs=self.config.n_jobs, batch_size=10)(
552
- delayed(self._mutate_molecule)(mol) for mol in parent_mols
553
- )
554
-
555
- # Flatten and limit
556
- all_children = [child for children in all_children_nested for child in children]
557
- all_children = all_children[:target_count * 3] # Reasonable limit
558
-
559
- # Batch evaluation
560
- if all_children:
561
- print(f" → Evaluating {len(all_children)} offspring...")
562
- new_molecules = self._create_molecules(all_children)
563
- all_children.clear() # Explicit memory cleanup
564
- return new_molecules
565
-
566
- return []
567
-
568
- def _run_evolution_loop(self):
569
- """Run the main evolution loop with progress tracking."""
570
- for gen in tqdm(range(1, self.config.generations + 1), desc="Evolution"):
571
- self._log_generation_stats(gen)
572
-
573
- survivors = self.population.get_survivors()
574
- offspring = self._generate_offspring(survivors)
575
-
576
- # Create new population
577
- new_pop = Population(self.config)
578
- new_pop.add_molecules(survivors + offspring)
579
- self.population = new_pop
580
-
581
- def _generate_results(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
582
- """Generate final results DataFrames."""
583
- final_df = self.population.to_dataframe()
584
-
585
- if self.config.minimize_ysi and "ysi" in final_df.columns:
586
- final_df = final_df[
587
- (final_df["cn_error"] < 5) &
588
- (final_df["ysi"] < 50)
589
- ].sort_values(["cn_error", "ysi"], ascending=True)
590
-
591
- # overwrite rank safely
592
- final_df["rank"] = range(1, len(final_df) + 1)
593
-
594
- if self.config.minimize_ysi:
595
- pareto_mols = self.population.pareto_front()
596
- pareto_df = pd.DataFrame([m.to_dict() for m in pareto_mols])
597
-
598
- if not pareto_df.empty:
599
- pareto_df = pareto_df[
600
- (pareto_df['cn_error'] < 5) & (pareto_df['ysi'] < 50)
601
- ].sort_values(["cn_error", "ysi"], ascending=True)
602
- pareto_df.insert(0, 'rank', range(1, len(pareto_df) + 1))
603
- else:
604
- pareto_df = pd.DataFrame()
605
-
606
- return final_df, pareto_df
607
-
608
- def evolve(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
609
- """Run the evolutionary algorithm."""
610
- # Initialize
611
- init_smiles = df["SMILES"].sample(200, random_state=42).tolist()
612
- init_count = self.initialize_population(init_smiles)
613
-
614
- if init_count == 0:
615
- print("❌ No valid initial molecules")
616
- return pd.DataFrame(), pd.DataFrame()
617
-
618
- print(f"✓ Initial population size: {init_count}")
619
-
620
- # Evolution
621
- self._run_evolution_loop()
622
-
623
- # Results
624
- return self._generate_results()
625
-
626
-
627
- def get_user_config() -> EvolutionConfig:
628
- """Get configuration from user input."""
629
- print("\n" + "="*70)
630
- print("MOLECULAR EVOLUTION WITH GENETIC ALGORITHM (OPTIMIZED)")
631
- print("="*70)
632
-
633
- while True:
634
- target = float(input("Enter target CN: ") or "50")
635
- if target > 40:
636
- break
637
- print("⚠️ Target CN is too low, optimization may be challenging.")
638
- print("Consider using a higher target CN for better results.\n")
639
-
640
- minimize_ysi = input("Minimise YSI (y/n): ").strip().lower() in ['y', 'yes']
641
-
642
- return EvolutionConfig(target_cn=target, minimize_ysi=minimize_ysi)
643
-
644
-
645
- def save_results(final_df: pd.DataFrame, pareto_df: pd.DataFrame, minimize_ysi: bool):
646
- """Save results to CSV files."""
647
- results_dir = Path("results")
648
- results_dir.mkdir(exist_ok=True)
649
-
650
- final_df.to_csv(results_dir / "final_population.csv", index=False)
651
- if minimize_ysi and not pareto_df.empty:
652
- pareto_df.to_csv(results_dir / "pareto_front.csv", index=False)
653
-
654
- print("\n✓ Saved to results/")
655
-
656
-
657
- def display_results(final_df: pd.DataFrame, pareto_df: pd.DataFrame, minimize_ysi: bool):
658
- """Display results to console."""
659
- cols = (["rank", "smiles", "cn", "cn_error", "ysi", "bp", "density", "lhv", "dynamic_viscosity"])
660
-
661
- print("\n=== TOP 10 (sorted) ===")
662
- print(final_df.head(10)[cols].to_string(index=False))
663
-
664
- if minimize_ysi and not pareto_df.empty:
665
- print("\n=== PARETO FRONT (ranked) ===")
666
- print(pareto_df[["rank", "smiles", "cn", "cn_error", "ysi", "bp", "density", "lhv", "dynamic_viscosity"]]
667
- .head(20).to_string(index=False))
668
-
669
-
670
- def main():
671
- """Main execution function with optional profiling."""
672
- import cProfile
673
- import pstats
674
-
675
- config = get_user_config()
676
-
677
- # Optional profiling
678
- profiler = None
679
- if os.environ.get('PROFILE'):
680
- profiler = cProfile.Profile()
681
- profiler.enable()
682
-
683
- project_name = "cetane-ysi-pareto" if config.minimize_ysi else "cetane-optimization"
684
-
685
-
686
- evolution = MolecularEvolution(config)
687
- final_df, pareto_df = evolution.evolve()
688
-
689
-
690
- # Display and save results
691
- display_results(final_df, pareto_df, config.minimize_ysi)
692
- save_results(final_df, pareto_df, config.minimize_ysi)
693
-
694
- # Print profiling stats if enabled
695
- if profiler:
696
- profiler.disable()
697
- stats = pstats.Stats(profiler)
698
- stats.sort_stats('cumulative')
699
- print("\n=== PROFILING STATS (Top 20) ===")
700
- stats.print_stats(20)
701
-
702
-
703
- if __name__ == "__main__":
704
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/model_config.py DELETED
@@ -1,53 +0,0 @@
1
- """
2
- Model configuration for loading from Hugging Face Hub.
3
-
4
- Instructions:
5
- 1. Upload your models to Hugging Face
6
- 2. Update the repo IDs below with your actual repository names
7
- 3. Set USE_LOCAL_MODELS=false in your environment (default)
8
- """
9
-
10
- import os
11
-
12
- # Toggle between local files and HF Hub
13
- # Set to 'true' for local development, 'false' for deployment
14
- USE_LOCAL_MODELS = os.getenv('USE_LOCAL_MODELS', 'false').lower() == 'true'
15
-
16
- # ============================================================================
17
- # HUGGING FACE MODEL REPOSITORIES
18
- # ============================================================================
19
- # Update these with your actual Hugging Face repository IDs
20
- # Format: "username/repo-name" or "organization/repo-name"
21
-
22
- HF_MODEL_REPOS = {
23
- 'cn': "SalZa2004/Cetane_Number_Predictor", # Example: "john-doe/cetane-predictor"
24
- 'ysi': "SalZa2004/YSI_Predictor", # Example: "john-doe/ysi-predictor"
25
- 'bp': "SalZa2004/BP_Predictor", # Example: "john-doe/bp-predictor"
26
- 'density': "SalZa2004/Density_Predictor", # Example: "john-doe/density-predictor"
27
- 'lhv': "SalZa2004/LHV_Predictor", # Example: "john-doe/lhv-predictor"
28
- }
29
-
30
- # ============================================================================
31
- # VALIDATION
32
- # ============================================================================
33
-
34
- def validate_config():
35
- """Validate that configuration is properly set up."""
36
- if not USE_LOCAL_MODELS:
37
- # Check if HF repos are configured
38
- for prop, repo in HF_MODEL_REPOS.items():
39
- if repo == f"SalZa2004/{prop}-predictor":
40
- print(f"⚠️ Warning: {prop} model repo not configured!")
41
- print(f" Update HF_MODEL_REPOS['{prop}'] in model_config.py")
42
- return False
43
- return True
44
-
45
- # Run validation on import
46
- if __name__ != "__main__":
47
- if not validate_config() and not USE_LOCAL_MODELS:
48
- print("\n" + "="*70)
49
- print("❌ MODEL CONFIGURATION INCOMPLETE")
50
- print("="*70)
51
- print("\nPlease update model_config.py with your Hugging Face repository IDs.")
52
- print("Example: HF_MODEL_REPOS['cn'] = 'john-doe/cetane-predictor'")
53
- print("="*70 + "\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/shared_features.py DELETED
@@ -1,233 +0,0 @@
1
import os
import sqlite3
from functools import lru_cache

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Repository root (parent of this file's directory) and the location of the
# main SQLite database used by load_raw_data().
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
DB_PATH = os.path.join(PROJECT_ROOT, "data", "database", "database_main.db")
10
-
11
-
12
-
13
-
14
def load_raw_data():
    """Load fuel names, SMILES and standardised DCN targets from SQLite.

    Rows missing either the CN target or the SMILES string are dropped.

    Returns:
        pandas.DataFrame with columns Fuel_Name, SMILES, cn.
    """
    print("Connecting to SQLite database...")
    query = """
    SELECT
        F.Fuel_Name,
        F.SMILES,
        T.Standardised_DCN AS cn
    FROM FUEL F
    LEFT JOIN TARGET T ON F.fuel_id = T.fuel_id
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # FIX: the original leaked the connection if read_sql_query raised.
        conn.close()

    # Drop rows without a usable target or structure.
    df.dropna(subset=["cn", "SMILES"], inplace=True)

    return df
34
-
35
-
36
# ============================================================================
# 2. FEATURIZATION MODULE
# ============================================================================
from rdkit import Chem
from rdkit.Chem import Descriptors, rdFingerprintGenerator
from tqdm import tqdm

# Names and callables of RDKit's full descriptor set, unpacked once at import
# time from the (name, function) pairs in Descriptors._descList.
DESCRIPTOR_NAMES = [name for name, _fn in Descriptors._descList]
desc_functions = [fn for _name, fn in Descriptors._descList]
46
-
47
@lru_cache(maxsize=None)
def _morgan_generator(radius, n_bits):
    """Return a cached Morgan fingerprint generator for the given settings."""
    return rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)


def morgan_fp_from_mol(mol, radius=2, n_bits=2048):
    """Generate a Morgan fingerprint bit vector for an RDKit Mol.

    Args:
        mol: RDKit Mol object.
        radius: Morgan radius (default 2, ECFP4-like).
        n_bits: fingerprint length in bits.

    Returns:
        np.ndarray of shape (n_bits,), dtype int, containing 0/1 bits.
    """
    # PERF FIX: the original constructed a new generator on every call;
    # the generator is reusable, so cache one per (radius, n_bits).
    fp = _morgan_generator(radius, n_bits).GetFingerprint(mol)
    return np.array(list(fp.ToBitString()), dtype=int)
53
-
54
def physchem_desc_from_mol(mol):
    """Calculate all RDKit physicochemical descriptors for a molecule.

    NaN/inf values are zeroed so downstream models never see non-finite
    input.

    Args:
        mol: RDKit Mol object.

    Returns:
        np.ndarray of float32 descriptor values, or None if any descriptor
        function raises for this molecule.
    """
    try:
        desc = np.array([fn(mol) for fn in desc_functions], dtype=np.float32)
    # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit.
    except Exception:
        return None
    return np.nan_to_num(desc, nan=0.0, posinf=0.0, neginf=0.0)
62
-
63
def featurize(smiles):
    """Turn a single SMILES string into a combined feature vector.

    The vector is the Morgan fingerprint bits followed by the RDKit
    physicochemical descriptors. Returns None when the SMILES cannot be
    parsed or either featurization step fails.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    fingerprint = morgan_fp_from_mol(mol)
    descriptors = physchem_desc_from_mol(mol)
    if fingerprint is None or descriptors is None:
        return None

    return np.hstack([fingerprint, descriptors])
76
-
77
def featurize_df(df, smiles_col="SMILES", return_df=True):
    """Featurize a DataFrame, Series, list or ndarray of SMILES strings.

    Args:
        df: DataFrame containing a SMILES column, or a bare sequence of
            SMILES strings.
        smiles_col: name of the SMILES column (also used when wrapping a
            bare sequence into a DataFrame).
        return_df: when True, also return the subset of rows whose SMILES
            featurized successfully.

    Returns:
        (X, df_valid) when return_df is True, else X. X is a 2-D feature
        matrix (fingerprint bits + descriptors per row); unparseable or
        failing SMILES are skipped. Returns (None, None) / None when no
        molecule could be featurized.
    """
    # Normalise the input to a DataFrame so positional indexing is uniform.
    if isinstance(df, (list, np.ndarray, pd.Series)):
        df = pd.DataFrame({smiles_col: df})

    # Parse every SMILES up front; invalid entries become None.
    mols = [Chem.MolFromSmiles(smi) for smi in df[smiles_col]]

    features = []
    valid_indices = []

    for i, mol in enumerate(tqdm(mols, desc="Featurizing")):
        if mol is None:
            continue

        try:
            fp = morgan_fp_from_mol(mol)
            desc = physchem_desc_from_mol(mol)

            if fp is not None and desc is not None:
                features.append(np.hstack([fp, desc]))
                valid_indices.append(i)
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit mid-run.
        except Exception:
            continue

    if not features:
        return (None, None) if return_df else None

    X = np.vstack(features)

    if return_df:
        df_valid = df.iloc[valid_indices].reset_index(drop=True)
        return X, df_valid
    return X
118
@lru_cache(maxsize=50_000)
def cached_featurize_smiles(smiles: str):
    """Featurize one SMILES string, memoising up to 50k distinct results.

    Returns a single 1-D feature vector, or None when featurization fails.
    """
    matrix = featurize_df([smiles], return_df=False)
    return None if matrix is None else matrix[0]
125
-
126
- # ============================================================================
127
- # 3. FEATURE SELECTOR CLASS
128
- # ============================================================================
129
- import joblib
130
-
131
class FeatureSelector:
    """Feature selection pipeline that can be saved and reused.

    Two-stage selection, fitted on training data:
      1. drop physicochemical descriptor columns whose pairwise
         absolute correlation exceeds ``corr_threshold`` (Morgan bits are
         left untouched);
      2. keep the ``top_k`` remaining features ranked by ExtraTrees
         feature importance.

    A fitted selector can be persisted with ``save`` and restored with
    ``load`` so inference applies exactly the same columns as training.
    """

    def __init__(self, n_morgan: int = 2048, corr_threshold: float = 0.95, top_k: int = 300):
        # Layout assumption: the first `n_morgan` columns of X are Morgan
        # fingerprint bits and the remainder are descriptors — this must
        # match how featurize() stacks them. TODO confirm against featurize.
        self.n_morgan = n_morgan
        self.corr_threshold = corr_threshold
        self.top_k = top_k

        # Filled during fit()
        self.corr_cols_to_drop = None   # descriptor column labels dropped by the corr filter
        self.selected_indices = None    # column indices into the corr-filtered matrix
        self.is_fitted = False

    def fit(self, X, y):
        """Fit the feature selector on training data.

        Args:
            X: 2-D feature matrix (Morgan bits first, then descriptors).
            y: regression targets aligned with X's rows.

        Returns:
            self, with ``corr_cols_to_drop`` and ``selected_indices`` set.
        """
        print("\n" + "="*70)
        print("FITTING FEATURE SELECTOR")
        print("="*70)

        # Step 1: Split Morgan and descriptors
        X_mfp = X[:, :self.n_morgan]
        X_desc = X[:, self.n_morgan:]

        print(f"Morgan fingerprints: {X_mfp.shape[1]}")
        print(f"Descriptors: {X_desc.shape[1]}")

        # Step 2: Remove correlated descriptors.
        # Keep only the upper triangle (k=1 excludes the diagonal) so each
        # pair is inspected once; the later column of a correlated pair is
        # the one dropped.
        desc_df = pd.DataFrame(X_desc)
        corr_matrix = desc_df.corr().abs()
        upper = corr_matrix.where(
            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        )

        self.corr_cols_to_drop = [
            col for col in upper.columns if any(upper[col] > self.corr_threshold)
        ]

        print(f"Correlated descriptors removed: {len(self.corr_cols_to_drop)}")

        desc_filtered = desc_df.drop(columns=self.corr_cols_to_drop, axis=1).values
        X_corr = np.hstack([X_mfp, desc_filtered])

        print(f"Features after correlation filter: {X_corr.shape[1]}")

        # Step 3: Feature importance selection (imported lazily so the
        # module can be loaded without sklearn for transform-only use).
        from sklearn.ensemble import ExtraTreesRegressor

        print("Running feature importance selection...")
        model = ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        model.fit(X_corr, y)

        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]  # descending importance

        self.selected_indices = indices[:self.top_k]

        print(f"Final selected features: {len(self.selected_indices)}")

        self.is_fitted = True
        return self

    def transform(self, X):
        """Apply the fitted feature selection to new data.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if not self.is_fitted:
            raise RuntimeError("FeatureSelector must be fitted before transform!")

        # Step 1: Split Morgan and descriptors
        X_mfp = X[:, :self.n_morgan]
        X_desc = X[:, self.n_morgan:]

        # Step 2: Remove same correlated descriptors
        desc_df = pd.DataFrame(X_desc)
        desc_filtered = desc_df.drop(columns=self.corr_cols_to_drop, axis=1).values
        X_corr = np.hstack([X_mfp, desc_filtered])

        # Step 3: Select same important features
        X_selected = X_corr[:, self.selected_indices]

        return X_selected

    def fit_transform(self, X, y):
        """Fit and transform in one step."""
        return self.fit(X, y).transform(X)

    def save(self, filepath='feature_selector.joblib'):
        """Save the fitted selector to ``filepath`` via joblib.

        Raises:
            RuntimeError: if the selector has not been fitted.
        """
        if not self.is_fitted:
            raise RuntimeError("Cannot save unfitted selector!")

        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(filepath) if os.path.dirname(filepath) else '.', exist_ok=True)

        joblib.dump(self, filepath)
        print(f"✓ Feature selector saved to {filepath}")

    @staticmethod
    def load(filepath='feature_selector.joblib'):
        """Load a previously saved selector and verify it is fitted.

        Raises:
            RuntimeError: if the loaded object is not fitted.
        """
        selector = joblib.load(filepath)
        if not selector.is_fitted:
            raise RuntimeError("Loaded selector is not fitted!")
        print(f"✓ Feature selector loaded from {filepath}")
        return selector
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/streamlit_app.py DELETED
@@ -1,161 +0,0 @@
1
import streamlit as st
from pathlib import Path
import pandas as pd
import os
from shared_features import FeatureSelector, featurize_df

# -----------------------------
# OPTIONAL: Disable wandb on HF
# -----------------------------
# Hugging Face Spaces has no wandb credentials; disable before importing main.
os.environ["WANDB_MODE"] = "disabled"

# -----------------------------
# Import your existing code
# -----------------------------
from dataclasses import asdict
from main import (
    EvolutionConfig,
    MolecularEvolution
)

# -----------------------------
# Page config
# -----------------------------
st.set_page_config(
    page_title="Molecular Evolution Optimizer",
    layout="wide"
)

st.title("🧬 Molecular Evolution for Cetane Optimization")
st.markdown(
    """
    This app runs a **genetic algorithm** to evolve molecules towards a
    **target Cetane Number (CN)**, optionally minimizing **YSI** and
    enforcing physical constraints.
    """
)

# -----------------------------
# Sidebar: Configuration
# -----------------------------
st.sidebar.header("⚙️ Evolution Configuration")

target_cn = st.sidebar.slider(
    "Target Cetane Number (CN)",
    min_value=40.0,
    max_value=80.0,
    value=50.0,
    step=1.0
)

minimize_ysi = st.sidebar.checkbox(
    "Minimize YSI",
    value=True
)

generations = st.sidebar.slider(
    "Generations",
    min_value=1,
    max_value=20,
    value=6
)

population_size = st.sidebar.slider(
    "Population Size",
    min_value=10,
    max_value=200,
    value=50,
    step=10
)

survivor_fraction = st.sidebar.slider(
    "Survivor Fraction",
    min_value=0.1,
    max_value=0.9,
    value=0.5,
    step=0.05
)

st.sidebar.subheader("🔬 Property Filters")

use_bp_filter = st.sidebar.checkbox("Use Boiling Point filter", True)
use_density_filter = st.sidebar.checkbox("Use Density filter", True)
use_lhv_filter = st.sidebar.checkbox("Use LHV filter", True)
use_dv_filter = st.sidebar.checkbox("Use Dynamic Viscosity filter", True)

# -----------------------------
# Build config
# -----------------------------
config = EvolutionConfig(
    target_cn=target_cn,
    minimize_ysi=minimize_ysi,
    generations=generations,
    population_size=population_size,
    survivor_fraction=survivor_fraction,
    use_bp_filter=use_bp_filter,
    use_density_filter=use_density_filter,
    use_lhv_filter=use_lhv_filter,
    use_dynamic_viscosity_filter=use_dv_filter,
)

# -----------------------------
# Run button
# -----------------------------
run = st.button("🚀 Run Evolution")

if run:
    with st.spinner("Running molecular evolution... This may take several minutes."):
        evolution = MolecularEvolution(config)
        final_df, pareto_df = evolution.evolve()

    st.success("Evolution completed!")

    # -----------------------------
    # Results: Final population
    # -----------------------------
    st.header("📊 Final Population")

    if final_df.empty:
        st.warning("No valid molecules found.")
    else:
        st.dataframe(final_df, use_container_width=True)

        csv = final_df.to_csv(index=False).encode("utf-8")
        st.download_button(
            "⬇️ Download Final Population CSV",
            csv,
            "final_population.csv",
            "text/csv"
        )

    # -----------------------------
    # Results: Pareto front
    # -----------------------------
    if minimize_ysi:
        st.header("🏆 Pareto Front")

        if pareto_df.empty:
            st.info("No Pareto-optimal molecules found.")
        else:
            st.dataframe(pareto_df, use_container_width=True)

            pareto_csv = pareto_df.to_csv(index=False).encode("utf-8")
            st.download_button(
                "⬇️ Download Pareto Front CSV",
                pareto_csv,
                "pareto_front.csv",
                "text/csv"
            )

    # -----------------------------
    # Quick plots
    # -----------------------------
    if not final_df.empty:
        st.header("📈 CN Error vs YSI")

        # FIX: the original guarded only on "ysi", so st.scatter_chart
        # raised when the "cn_error" column was missing from the results.
        if "ysi" in final_df.columns and "cn_error" in final_df.columns:
            st.scatter_chart(
                final_df,
                x="cn_error",
                y="ysi"
            )