SalZa2004 committed on
Commit
a24c075
·
verified ·
1 Parent(s): 16ea0d2

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. shared_features.py +223 -0
shared_features.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sqlite3
3
+ import pandas as pd
4
+ import numpy as np
5
+ from sklearn.model_selection import train_test_split
6
+
7
# Directory one level above the folder that contains this module.
PROJECT_ROOT = os.path.split(os.path.dirname(__file__))[0]
# SQLite database file, located at <PROJECT_ROOT>/data/database/database_main.db.
DB_PATH = os.path.join(os.path.join(PROJECT_ROOT, "data", "database"), "database_main.db")
9
+
10
def load_raw_data():
    """Load fuel names, SMILES strings and target values from the SQLite database.

    Opens the database at ``DB_PATH``, left-joins ``FUEL`` with ``TARGET`` on
    ``fuel_id`` and exposes ``Standardised_DCN`` under the column name ``cn``.

    Returns:
        pandas.DataFrame with columns ``Fuel_Name``, ``SMILES`` and ``cn``.
        Rows where ``cn`` or ``SMILES`` is missing are removed; the original
        row index is kept (it is not reset).

    Raises:
        Whatever ``sqlite3.connect`` / ``pandas.read_sql_query`` raise when the
        database or the queried tables are unavailable.
    """
    print("Connecting to SQLite database...")
    conn = sqlite3.connect(DB_PATH)

    query = """
        SELECT
            F.Fuel_Name,
            F.SMILES,
            T.Standardised_DCN AS cn
        FROM FUEL F
        LEFT JOIN TARGET T ON F.fuel_id = T.fuel_id
    """
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

    # The LEFT JOIN keeps fuels without a TARGET row (cn is NULL for them);
    # those rows, and rows without a SMILES string, cannot be used downstream.
    return df.dropna(subset=["cn", "SMILES"])
30
+
31
+
32
+ # ============================================================================
33
+ # 2. FEATURIZATION MODULE
34
+ # ============================================================================
35
+ from rdkit import Chem
36
+ from rdkit.Chem import Descriptors, rdFingerprintGenerator
37
+ from tqdm import tqdm
38
+
39
# Module-level descriptor tables built once from RDKit's (name, function) list.
# Both lists share the same ordering, so index i of one matches index i of the other.
DESCRIPTOR_NAMES = [name for name, _ in Descriptors._descList]
desc_functions = [func for _, func in Descriptors._descList]
42
+
43
def morgan_fp_from_mol(mol, radius=2, n_bits=2048):
    """Return the Morgan fingerprint of ``mol`` as a 1-D integer array of 0/1 values.

    Args:
        mol: RDKit molecule to fingerprint.
        radius: Morgan radius passed to the fingerprint generator.
        n_bits: Fingerprint size (``fpSize``) passed to the generator.

    Returns:
        numpy.ndarray of dtype ``int`` built from the fingerprint's bit string,
        one element per character of that string.
    """
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    bit_string = generator.GetFingerprint(mol).ToBitString()
    # Each character is "0" or "1"; NumPy converts the characters to integers.
    return np.array([*bit_string], dtype=int)
49
+
50
def physchem_desc_from_mol(mol):
    """Evaluate every function in ``desc_functions`` on ``mol``.

    Args:
        mol: RDKit molecule to describe.

    Returns:
        numpy.ndarray of dtype ``float32`` with one value per descriptor
        function, in ``desc_functions`` order. NaN and +/-inf entries are
        replaced by 0.0. Returns ``None`` when any descriptor computation (or
        the conversion to ``float32``) raises.
    """
    try:
        desc = np.array([fn(mol) for fn in desc_functions], dtype=np.float32)
    except Exception:
        # Best-effort: a molecule that cannot be described is reported as None.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long run can still be aborted.
        return None

    return np.nan_to_num(desc, nan=0.0, posinf=0.0, neginf=0.0)
58
+
59
def featurize(smiles):
    """Build the feature vector for a single SMILES string.

    Args:
        smiles: SMILES string to parse.

    Returns:
        1-D numpy.ndarray holding the Morgan fingerprint followed by the
        physicochemical descriptors, or ``None`` when the SMILES cannot be
        parsed or either feature part is unavailable.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Fingerprint first, descriptors second: this order defines the layout.
    parts = (morgan_fp_from_mol(mol), physchem_desc_from_mol(mol))
    if any(part is None for part in parts):
        return None

    return np.hstack(parts)
72
+
73
def featurize_df(df, smiles_col="SMILES", return_df=True):
    """Featurize every SMILES in a DataFrame, Series, list or array.

    Entries are processed one by one (with a progress bar); entries that
    cannot be parsed or featurized are skipped rather than aborting the batch.

    Args:
        df: DataFrame containing ``smiles_col``, or a list / numpy array /
            Series of SMILES strings (wrapped into a one-column DataFrame).
        smiles_col: Name of the column holding the SMILES strings.
        return_df: When true, also return the rows that were featurized.

    Returns:
        ``return_df=True``: ``(X, df_valid)`` where ``X`` is a 2-D array with
        one row per successfully featurized entry and ``df_valid`` holds the
        matching input rows with a fresh 0..n-1 index; ``(None, None)`` when
        nothing could be featurized.
        ``return_df=False``: ``X`` alone, or ``None`` when nothing could be
        featurized.
    """
    # Accept bare collections of SMILES as well as DataFrames.
    if isinstance(df, (list, np.ndarray, pd.Series)):
        df = pd.DataFrame({smiles_col: df})

    smiles_values = df[smiles_col]
    features = []
    valid_indices = []  # positional indices of rows that produced features

    for i, smi in enumerate(tqdm(smiles_values, desc="Featurizing")):
        try:
            # Parsing happens inside the try so that a non-string entry
            # (e.g. a missing value) only skips this row.
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                continue

            fp = morgan_fp_from_mol(mol)
            desc = physchem_desc_from_mol(mol)
            if fp is None or desc is None:
                continue

            row = np.hstack([fp, desc])
        except Exception:
            # Best-effort per row; KeyboardInterrupt/SystemExit still propagate.
            continue

        features.append(row)
        valid_indices.append(i)

    if not features:
        return (None, None) if return_df else None

    X = np.vstack(features)
    if not return_df:
        return X

    df_valid = df.iloc[valid_indices].reset_index(drop=True)
    return X, df_valid
114
+
115
+
116
+ # ============================================================================
117
+ # 3. FEATURE SELECTOR CLASS
118
+ # ============================================================================
119
+ import joblib
120
+
121
class FeatureSelector:
    """Two-stage feature selection pipeline that can be saved and reused.

    The input matrix is expected to hold ``n_morgan`` fingerprint columns
    followed by descriptor columns. ``fit`` (1) records which descriptor
    columns are highly correlated with an earlier descriptor column and
    (2) ranks the remaining features with an ``ExtraTreesRegressor`` and keeps
    the ``top_k`` most important ones. ``transform`` replays both steps.

    Attributes set by ``fit``:
        corr_cols_to_drop: descriptor column positions (relative to the
            descriptor part) removed by the correlation filter.
        selected_indices: column indices, into the correlation-filtered
            matrix, of the retained features in decreasing importance.
        n_features_in_: number of columns seen during ``fit``.
        is_fitted: ``True`` once ``fit`` has completed.
    """

    def __init__(self, n_morgan=2048, corr_threshold=0.95, top_k=300):
        """Store the configuration; nothing is computed until ``fit``.

        Args:
            n_morgan: Number of leading columns that are fingerprint bits.
            corr_threshold: Absolute correlation above which a descriptor
                column is dropped.
            top_k: Maximum number of features kept after importance ranking.
        """
        self.n_morgan = n_morgan
        self.corr_threshold = corr_threshold
        self.top_k = top_k

        # Filled during fit()
        self.corr_cols_to_drop = None
        self.selected_indices = None
        self.n_features_in_ = None
        self.is_fitted = False

    @staticmethod
    def _as_2d_array(X):
        """Convert ``X`` (array, list of rows, DataFrame) to a 2-D ndarray."""
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"Expected a 2-D feature matrix, got an array with {X.ndim} dimension(s)"
            )
        return X

    def _correlated_descriptor_columns(self, X_desc):
        """Return descriptor columns whose |corr| with an earlier column exceeds the threshold."""
        corr_matrix = pd.DataFrame(X_desc).corr().abs()
        # Keep only the strict upper triangle so each pair is examined once and
        # the later column of a correlated pair is the one that gets dropped.
        upper = corr_matrix.where(
            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        )
        return [
            col for col in upper.columns
            if (upper[col] > self.corr_threshold).any()
        ]

    def _apply_correlation_filter(self, X):
        """Drop the recorded descriptor columns and re-join with the fingerprint bits."""
        X_mfp = X[:, :self.n_morgan]
        desc_df = pd.DataFrame(X[:, self.n_morgan:])
        desc_filtered = desc_df.drop(columns=self.corr_cols_to_drop).values
        return np.hstack([X_mfp, desc_filtered])

    def fit(self, X, y):
        """Fit the feature selector on training data.

        Args:
            X: 2-D feature matrix (fingerprint columns first, then descriptors).
            y: Target values used to rank feature importance.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If ``X`` is not 2-dimensional.
        """
        print("\n" + "="*70)
        print("FITTING FEATURE SELECTOR")
        print("="*70)

        X = self._as_2d_array(X)

        # Step 1: Split Morgan and descriptors
        X_mfp = X[:, :self.n_morgan]
        X_desc = X[:, self.n_morgan:]

        print(f"Morgan fingerprints: {X_mfp.shape[1]}")
        print(f"Descriptors: {X_desc.shape[1]}")

        # Step 2: Remove correlated descriptors
        self.corr_cols_to_drop = self._correlated_descriptor_columns(X_desc)

        print(f"Correlated descriptors removed: {len(self.corr_cols_to_drop)}")

        X_corr = self._apply_correlation_filter(X)

        print(f"Features after correlation filter: {X_corr.shape[1]}")

        # Step 3: Feature importance selection
        # Imported here so the module can be imported without scikit-learn's
        # ensemble package being loaded until a fit is actually requested.
        from sklearn.ensemble import ExtraTreesRegressor

        print("Running feature importance selection...")
        model = ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        model.fit(X_corr, y)

        # Indices sorted by decreasing importance; keep the first top_k.
        ranked = np.argsort(model.feature_importances_)[::-1]
        self.selected_indices = ranked[:self.top_k]

        print(f"Final selected features: {len(self.selected_indices)}")

        self.n_features_in_ = X.shape[1]
        self.is_fitted = True
        return self

    def transform(self, X):
        """Apply the fitted feature selection to new data.

        Args:
            X: 2-D feature matrix with the same column layout used in ``fit``.

        Returns:
            2-D array containing only the selected features, ordered by
            decreasing importance as determined in ``fit``.

        Raises:
            RuntimeError: If the selector has not been fitted.
            ValueError: If ``X`` is not 2-dimensional, or its column count
                differs from the one recorded during ``fit``.
        """
        if not self.is_fitted:
            raise RuntimeError("FeatureSelector must be fitted before transform!")

        X = self._as_2d_array(X)

        # Selectors pickled before n_features_in_ existed lack the attribute;
        # the width check is simply skipped for them.
        expected = getattr(self, "n_features_in_", None)
        if expected is not None and X.shape[1] != expected:
            raise ValueError(
                f"X has {X.shape[1]} features, but FeatureSelector was fitted with {expected}"
            )

        X_corr = self._apply_correlation_filter(X)
        return X_corr[:, self.selected_indices]

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the transformed ``X``."""
        return self.fit(X, y).transform(X)

    def save(self, filepath='feature_selector.joblib'):
        """Persist the fitted selector with joblib.

        Args:
            filepath: Destination file; missing parent directories are created.

        Raises:
            RuntimeError: If the selector has not been fitted.
        """
        if not self.is_fitted:
            raise RuntimeError("Cannot save unfitted selector!")

        # Create directory if it doesn't exist (a bare file name needs none).
        out_dir = os.path.dirname(filepath)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        joblib.dump(self, filepath)
        print(f"✓ Feature selector saved to {filepath}")

    @staticmethod
    def load(filepath='feature_selector.joblib'):
        """Load a fitted selector previously written by ``save``.

        Args:
            filepath: File to read.

        Returns:
            The deserialized selector.

        Raises:
            RuntimeError: If the loaded object is not a fitted selector.
        """
        # SECURITY NOTE: joblib.load unpickles the file, which can execute
        # arbitrary code. Only load files from a trusted source.
        selector = joblib.load(filepath)
        if not getattr(selector, "is_fitted", False):
            raise RuntimeError("Loaded selector is not fitted!")
        print(f"✓ Feature selector loaded from {filepath}")
        return selector