carrotcake3 committed on
Commit
53b198d
·
verified ·
1 Parent(s): a6eaf05

Update core/evolution/evolution.py

Browse files
Files changed (1) hide show
  1. core/evolution/evolution.py +238 -233
core/evolution/evolution.py CHANGED
@@ -1,234 +1,239 @@
1
- from .population import Population
2
- from .molecule import Molecule
3
- from core.predictors.pure_component.property_predictor import PropertyPredictor
4
- from core.config import EvolutionConfig
5
- from crem.crem import mutate_mol
6
- from rdkit import Chem
7
- import pandas as pd
8
- import numpy as np
9
- import random
10
- from typing import List, Tuple
11
- from core.data_prep import df # Initial dataset for sampling
12
- from pathlib import Path
13
-
14
- class MolecularEvolution:
15
- """Main evolutionary algorithm coordinator."""
16
- BASE_DIR = Path(__file__).resolve().parent.parent.parent
17
- REP_DB_PATH = BASE_DIR / "data" / "fragments" / "diesel_fragments.db"
18
-
19
- def __init__(self, config: EvolutionConfig):
20
- self.config = config
21
- self.predictor = PropertyPredictor(config)
22
- self.population = Population(config)
23
-
24
- def _mutate_molecule(self, mol: Chem.Mol) -> List[str]:
25
- """Generate mutations for a molecule using CREM."""
26
- try:
27
- mutants = list(mutate_mol(
28
- mol,
29
- db_name=str(self.REP_DB_PATH),
30
- max_size=2,
31
- return_mol=False
32
- ))
33
- return [m for m in mutants if m and m not in self.population.seen_smiles]
34
- except Exception:
35
- return []
36
-
37
- def _create_molecules(self, smiles_list: List[str]) -> List[Molecule]:
38
- """Create Molecule objects from SMILES with predictions (OPTIMIZED)."""
39
- if not smiles_list:
40
- return []
41
-
42
- # OPTIMIZATION: Single featurization + all predictions
43
- predictions = self.predictor.predict_all_properties(smiles_list)
44
-
45
- molecules = []
46
- for i, smiles in enumerate(smiles_list):
47
- # Extract predictions for this molecule
48
- props = {k: v[i] for k, v in predictions.items()}
49
-
50
- # Validate required properties
51
- if props.get('cn') is None:
52
- continue
53
- if self.config.minimize_ysi and props.get('ysi') is None:
54
- continue
55
-
56
- # Validate filtered properties
57
- if not all(self.predictor.is_valid(k, props.get(k)) for k in ['bp', 'density', 'lhv', 'dynamic_viscosity']):
58
- continue
59
-
60
- molecules.append(Molecule(
61
- smiles=smiles,
62
- cn=props['cn'],
63
- cn_error=abs(props['cn'] - self.config.target_cn),
64
- cn_score=props['cn'], # For maximize mode
65
- bp=props.get('bp'),
66
- ysi=props.get('ysi'),
67
- density=props.get('density'),
68
- lhv=props.get('lhv'),
69
- dynamic_viscosity=props.get('dynamic_viscosity')
70
- ))
71
-
72
- return molecules
73
-
74
- def initialize_population(self, initial_smiles: List[str]) -> int:
75
- """Initialize the population from initial SMILES."""
76
- print("Predicting properties for initial population...")
77
- molecules = self._create_molecules(initial_smiles)
78
- return self.population.add_molecules(molecules)
79
-
80
- def _log_generation_stats(self, generation: int):
81
- """Log statistics for the current generation."""
82
- mols = self.population.molecules
83
-
84
- if self.config.maximize_cn:
85
- best_cn = max(mols, key=lambda m: m.cn)
86
- avg_cn = np.mean([m.cn for m in mols])
87
-
88
- print_msg = (f"Gen {generation}/{self.config.generations} | "
89
- f"Pop {len(mols)} | "
90
- f"Best CN: {best_cn.cn:.3f} | "
91
- f"Avg CN: {avg_cn:.3f}")
92
- else:
93
- best_cn = min(mols, key=lambda m: m.cn_error)
94
- avg_cn_err = np.mean([m.cn_error for m in mols])
95
-
96
- print_msg = (f"Gen {generation}/{self.config.generations} | "
97
- f"Pop {len(mols)} | "
98
- f"Best CN err: {best_cn.cn_error:.3f} | "
99
- f"Avg CN err: {avg_cn_err:.3f}")
100
-
101
- if self.config.minimize_ysi:
102
- front = self.population.pareto_front()
103
- best_ysi = min(mols, key=lambda m: m.ysi)
104
- avg_ysi = np.mean([m.ysi for m in mols])
105
-
106
- print_msg += (f" | Best YSI: {best_ysi.ysi:.3f} | "
107
- f"Avg YSI: {avg_ysi:.3f} | "
108
- f"Pareto: {len(front)}")
109
-
110
- print(print_msg)
111
-
112
- def _generate_offspring(self, survivors: List[Molecule]) -> List[Molecule]:
113
- """Generates offspring from survivors."""
114
- target_count = self.config.population_size - len(survivors)
115
- max_attempts = target_count * self.config.max_offspring_attempts
116
-
117
- all_children = []
118
- new_molecules = []
119
-
120
- print(f" → Generating offspring (target: {target_count})...")
121
-
122
- for attempt in range(max_attempts):
123
- if len(new_molecules) >= target_count:
124
- break
125
-
126
- # Generate mutations
127
- parent = random.choice(survivors)
128
- mol = Chem.MolFromSmiles(parent.smiles)
129
- if mol is None:
130
- continue
131
-
132
- children = self._mutate_molecule(mol)
133
- all_children.extend(children[:self.config.mutations_per_parent])
134
-
135
- # Process in larger batches (single featurization per batch)
136
- if len(all_children) >= self.config.batch_size:
137
- print(f" → Evaluating batch of {len(all_children)} (featurizing once)...")
138
- new_molecules.extend(self._create_molecules(all_children))
139
- all_children = []
140
-
141
- # Process remaining children
142
- if all_children:
143
- print(f" → Evaluating final batch of {len(all_children)}...")
144
- new_molecules.extend(self._create_molecules(all_children))
145
-
146
- print(f" ✓ Generated {len(new_molecules)} valid offspring")
147
- return new_molecules
148
-
149
- def _run_evolution_loop(self):
150
- """Run the main evolution loop."""
151
- for gen in range(1, self.config.generations + 1):
152
- self._log_generation_stats(gen)
153
-
154
- survivors = self.population.get_survivors()
155
- offspring = self._generate_offspring(survivors)
156
-
157
- # Create new population
158
- new_pop = Population(self.config)
159
- new_pop.add_molecules(survivors + offspring)
160
- self.population = new_pop
161
-
162
- def _generate_results(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
163
- """Generate final results DataFrames."""
164
- final_df = self.population.to_dataframe()
165
-
166
- # Apply different filtering based on mode
167
- if self.config.maximize_cn:
168
- if self.config.minimize_ysi and "ysi" in final_df.columns:
169
- # Maximize CN + minimize YSI: keep high CN, low YSI
170
- final_df = final_df[
171
- (final_df["cn"] > 50) &
172
- (final_df["ysi"] < 50)
173
- ].sort_values(["cn", "ysi"], ascending=[False, True])
174
- else:
175
- # Maximize CN only: just keep high CN
176
- final_df = final_df[final_df["cn"] > 50].sort_values("cn", ascending=False)
177
- else:
178
- if self.config.minimize_ysi and "ysi" in final_df.columns:
179
- # Target CN + minimize YSI: keep low error, low YSI
180
- final_df = final_df[
181
- (final_df["cn_error"] < 5) &
182
- (final_df["ysi"] < 50)
183
- ].sort_values(["cn_error", "ysi"], ascending=True)
184
- else:
185
- # Target CN only: just keep low error
186
- final_df = final_df[final_df["cn_error"] < 5].sort_values("cn_error", ascending=True)
187
-
188
- # Overwrite rank safely
189
- final_df["rank"] = range(1, len(final_df) + 1)
190
-
191
- if self.config.minimize_ysi:
192
- pareto_mols = self.population.pareto_front()
193
- pareto_df = pd.DataFrame([m.to_dict() for m in pareto_mols])
194
-
195
- if not pareto_df.empty:
196
- if self.config.maximize_cn:
197
- pareto_df = pareto_df[
198
- (pareto_df['cn'] > 50) & (pareto_df['ysi'] < 50)
199
- ].sort_values(["cn", "ysi"], ascending=[False, True])
200
- else:
201
- pareto_df = pareto_df[
202
- (pareto_df['cn_error'] < 5) & (pareto_df['ysi'] < 50)
203
- ].sort_values(["cn_error", "ysi"], ascending=True)
204
-
205
- pareto_df.insert(0, 'rank', range(1, len(pareto_df) + 1))
206
- else:
207
- pareto_df = pd.DataFrame()
208
-
209
- return final_df, pareto_df
210
-
211
-
212
- def evolve(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
213
- """Run the evolutionary algorithm."""
214
- # Initialize
215
- df_bins = pd.qcut(df["cn"], q=30)
216
- initial_smiles = (
217
- df.groupby(df_bins, observed=False)
218
- .apply(lambda x: x.sample(20, random_state=42))
219
- .reset_index(drop=True)["SMILES"]
220
- .tolist()
221
- )
222
- init_count = self.initialize_population(initial_smiles)
223
-
224
- if init_count == 0:
225
- print("No valid initial molecules")
226
- return pd.DataFrame(), pd.DataFrame()
227
-
228
- print(f"✓ Initial population size: {init_count}\n")
229
-
230
- # Evolution
231
- self._run_evolution_loop()
232
-
233
- # Results
 
 
 
 
 
234
  return self._generate_results()
 
1
+ from .population import Population
2
+ from .molecule import Molecule
3
+ from core.predictors.pure_component.property_predictor import PropertyPredictor
4
+ from core.config import EvolutionConfig
5
+ from crem.crem import mutate_mol
6
+ from rdkit import Chem
7
+ import pandas as pd
8
+ import numpy as np
9
+ import random
10
+ from typing import List, Tuple
11
+ from core.data_prep import df # Initial dataset for sampling
12
+ from pathlib import Path
13
+
14
+ class MolecularEvolution:
15
+ """Main evolutionary algorithm coordinator."""
16
+ BASE_DIR = Path(__file__).resolve().parent.parent.parent
17
+ REP_DB_PATH = BASE_DIR / "data" / "fragments" / "diesel_fragments.db"
18
+
19
+ def __init__(self, config: EvolutionConfig):
20
+ self.config = config
21
+ self.predictor = PropertyPredictor(config)
22
+ self.population = Population(config)
23
+
24
+ def _mutate_molecule(self, mol: Chem.Mol) -> List[str]:
25
+ """Generate mutations for a molecule using CREM."""
26
+ try:
27
+ mutants = list(mutate_mol(
28
+ mol,
29
+ db_name=str(self.REP_DB_PATH),
30
+ max_size=2,
31
+ return_mol=False
32
+ ))
33
+ return [m for m in mutants if m and m not in self.population.seen_smiles]
34
+
35
+ except SystemExit:
36
+ # CREM can call sys.exit(1) internally; this prevents Gunicorn worker crash
37
+ return []
38
+
39
+ except Exception:
40
+ return []
41
+
42
+ def _create_molecules(self, smiles_list: List[str]) -> List[Molecule]:
43
+ """Create Molecule objects from SMILES with predictions (OPTIMIZED)."""
44
+ if not smiles_list:
45
+ return []
46
+
47
+ # OPTIMIZATION: Single featurization + all predictions
48
+ predictions = self.predictor.predict_all_properties(smiles_list)
49
+
50
+ molecules = []
51
+ for i, smiles in enumerate(smiles_list):
52
+ # Extract predictions for this molecule
53
+ props = {k: v[i] for k, v in predictions.items()}
54
+
55
+ # Validate required properties
56
+ if props.get('cn') is None:
57
+ continue
58
+ if self.config.minimize_ysi and props.get('ysi') is None:
59
+ continue
60
+
61
+ # Validate filtered properties
62
+ if not all(self.predictor.is_valid(k, props.get(k)) for k in ['bp', 'density', 'lhv', 'dynamic_viscosity']):
63
+ continue
64
+
65
+ molecules.append(Molecule(
66
+ smiles=smiles,
67
+ cn=props['cn'],
68
+ cn_error=abs(props['cn'] - self.config.target_cn),
69
+ cn_score=props['cn'], # For maximize mode
70
+ bp=props.get('bp'),
71
+ ysi=props.get('ysi'),
72
+ density=props.get('density'),
73
+ lhv=props.get('lhv'),
74
+ dynamic_viscosity=props.get('dynamic_viscosity')
75
+ ))
76
+
77
+ return molecules
78
+
79
+ def initialize_population(self, initial_smiles: List[str]) -> int:
80
+ """Initialize the population from initial SMILES."""
81
+ print("Predicting properties for initial population...")
82
+ molecules = self._create_molecules(initial_smiles)
83
+ return self.population.add_molecules(molecules)
84
+
85
+ def _log_generation_stats(self, generation: int):
86
+ """Log statistics for the current generation."""
87
+ mols = self.population.molecules
88
+
89
+ if self.config.maximize_cn:
90
+ best_cn = max(mols, key=lambda m: m.cn)
91
+ avg_cn = np.mean([m.cn for m in mols])
92
+
93
+ print_msg = (f"Gen {generation}/{self.config.generations} | "
94
+ f"Pop {len(mols)} | "
95
+ f"Best CN: {best_cn.cn:.3f} | "
96
+ f"Avg CN: {avg_cn:.3f}")
97
+ else:
98
+ best_cn = min(mols, key=lambda m: m.cn_error)
99
+ avg_cn_err = np.mean([m.cn_error for m in mols])
100
+
101
+ print_msg = (f"Gen {generation}/{self.config.generations} | "
102
+ f"Pop {len(mols)} | "
103
+ f"Best CN err: {best_cn.cn_error:.3f} | "
104
+ f"Avg CN err: {avg_cn_err:.3f}")
105
+
106
+ if self.config.minimize_ysi:
107
+ front = self.population.pareto_front()
108
+ best_ysi = min(mols, key=lambda m: m.ysi)
109
+ avg_ysi = np.mean([m.ysi for m in mols])
110
+
111
+ print_msg += (f" | Best YSI: {best_ysi.ysi:.3f} | "
112
+ f"Avg YSI: {avg_ysi:.3f} | "
113
+ f"Pareto: {len(front)}")
114
+
115
+ print(print_msg)
116
+
117
+ def _generate_offspring(self, survivors: List[Molecule]) -> List[Molecule]:
118
+ """Generates offspring from survivors."""
119
+ target_count = self.config.population_size - len(survivors)
120
+ max_attempts = target_count * self.config.max_offspring_attempts
121
+
122
+ all_children = []
123
+ new_molecules = []
124
+
125
+ print(f" → Generating offspring (target: {target_count})...")
126
+
127
+ for attempt in range(max_attempts):
128
+ if len(new_molecules) >= target_count:
129
+ break
130
+
131
+ # Generate mutations
132
+ parent = random.choice(survivors)
133
+ mol = Chem.MolFromSmiles(parent.smiles)
134
+ if mol is None:
135
+ continue
136
+
137
+ children = self._mutate_molecule(mol)
138
+ all_children.extend(children[:self.config.mutations_per_parent])
139
+
140
+ # Process in larger batches (single featurization per batch)
141
+ if len(all_children) >= self.config.batch_size:
142
+ print(f" → Evaluating batch of {len(all_children)} (featurizing once)...")
143
+ new_molecules.extend(self._create_molecules(all_children))
144
+ all_children = []
145
+
146
+ # Process remaining children
147
+ if all_children:
148
+ print(f" → Evaluating final batch of {len(all_children)}...")
149
+ new_molecules.extend(self._create_molecules(all_children))
150
+
151
+ print(f" ✓ Generated {len(new_molecules)} valid offspring")
152
+ return new_molecules
153
+
154
+ def _run_evolution_loop(self):
155
+ """Run the main evolution loop."""
156
+ for gen in range(1, self.config.generations + 1):
157
+ self._log_generation_stats(gen)
158
+
159
+ survivors = self.population.get_survivors()
160
+ offspring = self._generate_offspring(survivors)
161
+
162
+ # Create new population
163
+ new_pop = Population(self.config)
164
+ new_pop.add_molecules(survivors + offspring)
165
+ self.population = new_pop
166
+
167
+ def _generate_results(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
168
+ """Generate final results DataFrames."""
169
+ final_df = self.population.to_dataframe()
170
+
171
+ # Apply different filtering based on mode
172
+ if self.config.maximize_cn:
173
+ if self.config.minimize_ysi and "ysi" in final_df.columns:
174
+ # Maximize CN + minimize YSI: keep high CN, low YSI
175
+ final_df = final_df[
176
+ (final_df["cn"] > 50) &
177
+ (final_df["ysi"] < 50)
178
+ ].sort_values(["cn", "ysi"], ascending=[False, True])
179
+ else:
180
+ # Maximize CN only: just keep high CN
181
+ final_df = final_df[final_df["cn"] > 50].sort_values("cn", ascending=False)
182
+ else:
183
+ if self.config.minimize_ysi and "ysi" in final_df.columns:
184
+ # Target CN + minimize YSI: keep low error, low YSI
185
+ final_df = final_df[
186
+ (final_df["cn_error"] < 5) &
187
+ (final_df["ysi"] < 50)
188
+ ].sort_values(["cn_error", "ysi"], ascending=True)
189
+ else:
190
+ # Target CN only: just keep low error
191
+ final_df = final_df[final_df["cn_error"] < 5].sort_values("cn_error", ascending=True)
192
+
193
+ # Overwrite rank safely
194
+ final_df["rank"] = range(1, len(final_df) + 1)
195
+
196
+ if self.config.minimize_ysi:
197
+ pareto_mols = self.population.pareto_front()
198
+ pareto_df = pd.DataFrame([m.to_dict() for m in pareto_mols])
199
+
200
+ if not pareto_df.empty:
201
+ if self.config.maximize_cn:
202
+ pareto_df = pareto_df[
203
+ (pareto_df['cn'] > 50) & (pareto_df['ysi'] < 50)
204
+ ].sort_values(["cn", "ysi"], ascending=[False, True])
205
+ else:
206
+ pareto_df = pareto_df[
207
+ (pareto_df['cn_error'] < 5) & (pareto_df['ysi'] < 50)
208
+ ].sort_values(["cn_error", "ysi"], ascending=True)
209
+
210
+ pareto_df.insert(0, 'rank', range(1, len(pareto_df) + 1))
211
+ else:
212
+ pareto_df = pd.DataFrame()
213
+
214
+ return final_df, pareto_df
215
+
216
+
217
+ def evolve(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
218
+ """Run the evolutionary algorithm."""
219
+ # Initialize
220
+ df_bins = pd.qcut(df["cn"], q=30)
221
+ initial_smiles = (
222
+ df.groupby(df_bins, observed=False)
223
+ .apply(lambda x: x.sample(20, random_state=42))
224
+ .reset_index(drop=True)["SMILES"]
225
+ .tolist()
226
+ )
227
+ init_count = self.initialize_population(initial_smiles)
228
+
229
+ if init_count == 0:
230
+ print("No valid initial molecules")
231
+ return pd.DataFrame(), pd.DataFrame()
232
+
233
+ print(f"✓ Initial population size: {init_count}\n")
234
+
235
+ # Evolution
236
+ self._run_evolution_loop()
237
+
238
+ # Results
239
  return self._generate_results()