Upload Scenario_heldout_final_PRECISE.py
Browse files
Scenario_heldout_final_PRECISE.py
ADDED
|
@@ -0,0 +1,369 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import warnings
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
import json
|
| 6 |
+
import time
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
import os
|
| 9 |
+
from datetime import datetime as _dt, timezone as _tz
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
from sklearn.exceptions import ConvergenceWarning
|
| 13 |
+
from sklearn.mixture import GaussianMixture
|
| 14 |
+
from sklearn.preprocessing import StandardScaler
|
| 15 |
+
from sklearn.linear_model import LassoCV
|
| 16 |
+
from sklearn.svm import SVC
|
| 17 |
+
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, VotingClassifier
|
| 18 |
+
from sklearn.pipeline import Pipeline
|
| 19 |
+
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
|
| 20 |
+
from sklearn.metrics import (
|
| 21 |
+
accuracy_score, precision_score, recall_score,
|
| 22 |
+
f1_score, balanced_accuracy_score, matthews_corrcoef
|
| 23 |
+
)
|
| 24 |
+
from joblib import dump
|
| 25 |
+
|
| 26 |
+
# -------------------------
# Logging & warnings
# -------------------------
logging.basicConfig(
    filename='nested_lodo_groupsv1.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Silence the noisy warning categories emitted during repeated model fits.
for _warning_cls in (UserWarning, ConvergenceWarning):
    warnings.filterwarnings('ignore', category=_warning_cls)

# Create directories for saving models if they don't exist.
# NOTE(review): these pre-created '...v1' directories do not match the
# 'models_{sig_name}' paths used when models are saved later in this file
# (those have no 'v1' suffix and call os.makedirs themselves), so these
# may be vestigial — confirm before relying on them.
for _model_dir in (
    'models_GBMv1/scenario_1',
    'models_GBMv1/scenario_2',
    'models_GBMv1/scenario_3',
    'models_LM22v1/scenario_1',
    'models_LM22v1/scenario_2',
    'models_LM22v1/scenario_3',
):
    os.makedirs(_model_dir, exist_ok=True)

# -------------------------
# Caching for pipelines
# -------------------------
# Joblib.Memory cache disabled to avoid creating cache directories and
# PermissionError race conditions on Windows when using parallel workers.
memory = None
logging.info("Joblib Memory disabled; no pipeline caching will be used")
|
| 52 |
+
|
| 53 |
+
# Helper: convert numpy scalars/arrays and dicts into JSON-serializable Python types
|
| 54 |
+
|
| 55 |
+
def _convert_obj(o):
|
| 56 |
+
"""Recursively convert numpy types/arrays to native Python objects for JSON dumping."""
|
| 57 |
+
# numpy arrays -> lists
|
| 58 |
+
if hasattr(o, 'tolist') and not isinstance(o, (dict, list, str, bytes)):
|
| 59 |
+
try:
|
| 60 |
+
return o.tolist()
|
| 61 |
+
except Exception:
|
| 62 |
+
return str(o)
|
| 63 |
+
# dict -> convert values
|
| 64 |
+
if isinstance(o, dict):
|
| 65 |
+
return {k: _convert_obj(v) for k, v in o.items()}
|
| 66 |
+
# list/tuple -> convert items
|
| 67 |
+
if isinstance(o, (list, tuple)):
|
| 68 |
+
return [_convert_obj(v) for v in o]
|
| 69 |
+
# numpy scalar -> python native
|
| 70 |
+
if isinstance(o, (np.integer, np.floating, np.bool_)):
|
| 71 |
+
return o.item()
|
| 72 |
+
# otherwise return as-is
|
| 73 |
+
return o
|
| 74 |
+
|
| 75 |
+
def _cv_results_to_serializable(cv_dict):
|
| 76 |
+
"""Convert sklearn cv_results_ dict values (numpy arrays) into lists where needed."""
|
| 77 |
+
out = {}
|
| 78 |
+
for k, v in cv_dict.items():
|
| 79 |
+
if hasattr(v, 'tolist'):
|
| 80 |
+
try:
|
| 81 |
+
out[k] = v.tolist()
|
| 82 |
+
except Exception:
|
| 83 |
+
out[k] = str(v)
|
| 84 |
+
else:
|
| 85 |
+
out[k] = _convert_obj(v)
|
| 86 |
+
return out
|
| 87 |
+
|
| 88 |
+
# -------------------------
|
| 89 |
+
# Utility: two-step Lasso selection
|
| 90 |
+
# -------------------------
|
| 91 |
+
def select_features(X, y, alphas=(0.1, 0.01), cv=5, max_iter=10000, n_jobs=1, random_state=42):
    """Two-step Lasso feature selection.

    Fits a LassoCV pinned to each candidate alpha in turn (strongest
    regularization first) and returns the indices of the non-zero
    coefficients for the first alpha that keeps at least one feature.

    Raises:
        ValueError: if every alpha zeroes out all coefficients.
    """
    for candidate_alpha in alphas:
        model = LassoCV(
            alphas=[candidate_alpha],
            cv=cv,
            max_iter=max_iter,
            n_jobs=n_jobs,
            random_state=random_state,
        )
        # fit separately so static analyzers can see the correct type
        model.fit(X, y)
        # flatnonzero yields the selected column indices as a 1-D array
        selected = np.flatnonzero(model.coef_ != 0)
        if selected.size:
            return selected
    raise ValueError(f"No features selected at alphas {alphas}")
|
| 105 |
+
|
| 106 |
+
# -------------------------
# Define two groups of scenarios with actual paths
# Each scenario pairs a training split with a held-out split; every split
# needs a radiomics CSV and a CIBERSORTx immune-fraction CSV, merged later
# on their sample index. Scenario ids appear to follow a leave-one-dataset-
# out scheme (1 = Ivy held out, 2 = TCGA, 3 = CPTAC) — inferred from the
# file names; confirm against the data layout.
# Scenario definitions_LM22
scenarios_LM22 = {
    1: {
        'train_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_CP_TC.csv",
        'train_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_Ivy/Cbx_LOOCV_heldout_Ivy_Lm22/CIBERSORTx_Job49_Results.csv",
        'heldout_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_Ivy.csv",
        'heldout_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/IvyGAP/Test_Ivy_LM22/CIBERSORTx_Job55_Results.csv"
    },
    2: {
        'train_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_CP_ivy.csv",
        'train_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_TCGA/Cbx_heldoutTCGA_Lm22/CIBERSORTx_Job47_Results.csv",
        'heldout_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_TCGA.csv",
        'heldout_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/TCGA/Cbx_TCGA_Test_LM22/CIBERSORTx_Job53_Results.csv"
    },
    3: {
        'train_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_TC_ivy.csv",
        'train_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_CPTAC/CBx_LOOCV_heldout_CPTAC_LM22/CIBERSORTx_Job51_Results.csv",
        'heldout_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_CPTAC.csv",
        'heldout_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/CPTAC/Test_CPTAC_LM22/CIBERSORTx_Job57_Results.csv"
    }
}
# Scenario definitions_GBM (same radiomics splits, GBM-signature CIBERSORTx runs)
scenarios_GBM = {
    1: {
        'train_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_CP_TC.csv",
        'train_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_Ivy/Cbx_LOOCV_heldout_Ivy_GBM/CIBERSORTx_Job50_Results.csv",
        'heldout_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_Ivy.csv",
        'heldout_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/IvyGAP/Test_Ivy_GBM/CIBERSORTx_Job56_Results.csv"
    },
    2: {
        'train_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_CP_ivy.csv",
        'train_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_TCGA/Cbx_LOOCV_TCGA_heldout_GBM/CIBERSORTx_Job48_Results.csv",
        'heldout_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_TCGA.csv",
        'heldout_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/TCGA/TCGA_test_GBM/CIBERSORTx_Job54_Results.csv"
    },
    3: {
        'train_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_TC_ivy.csv",
        'train_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_CPTAC/Cbx_LOOCV_heldout_CPTAC_GBM/CIBERSORTx_Job52_Results.csv",
        'heldout_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_CPTAC.csv",
        'heldout_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/CPTAC/Test_CPTAC_GBM/CIBERSORTx_Job58_Results.csv"
    }
}

# Top-level iteration order for the main loop: one entry per immune
# deconvolution signature matrix.
signature_groups = {
    'LM22': scenarios_LM22,
    'GBM': scenarios_GBM
}
|
| 155 |
+
|
| 156 |
+
# -------------------------
# Hyperparameter grids
# -------------------------
# Randomized-search space for the standalone SVM pipeline; keys address the
# 'clf' step of pipe_svm via sklearn's step__parameter convention.
param_dist_svm = {
    'clf__C': [1, 10],
    'clf__gamma': [0.01, 0.1],
    'clf__kernel': ['rbf']
}
# Search space for the voting ensemble; keys drill into the 'ensemble' step
# of pipe_ens, then into its named sub-estimators ('svm' is itself a
# Pipeline whose SVC step is named 'classifier').
param_dist_ensemble = {
    'ensemble__svm__classifier__C': [1],
    'ensemble__svm__classifier__kernel': ['rbf'],
    'ensemble__rf__n_estimators': [100, 200],
    'ensemble__rf__max_depth': [None],
    'ensemble__gb__max_iter': [100],
    'ensemble__gb__learning_rate': [0.1]
}
|
| 172 |
+
|
| 173 |
+
# -------------------------
# Process each signature group
# -------------------------

def _binary_metrics(y_true, y_pred):
    """Held-out evaluation metrics for a binary classifier.

    zero_division=1 keeps precision/recall/F1 defined when a class is
    never predicted on a small held-out set.
    """
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=1),
        'Recall': recall_score(y_true, y_pred, zero_division=1),
        'F1 Score': f1_score(y_true, y_pred, zero_division=1),
        'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred),
        'MCC': matthews_corrcoef(y_true, y_pred)
    }

for sig_name, scenarios in signature_groups.items():
    all_results = {}
    all_features = {}
    all_cv = {}

    for scen_id, paths in scenarios.items():
        logging.info(f"[{sig_name}] Starting {scen_id}")
        t0 = time.time()

        # Load & align training data (inner join on the sample index keeps
        # only samples present in both the radiomics and immune tables)
        rad_tr = pd.read_csv(paths['train_radiomics'], index_col=0)
        imm_tr = pd.read_csv(paths['train_immune'], index_col=0)
        df_tr = pd.merge(rad_tr, imm_tr, left_index=True, right_index=True, how='inner')

        # Load & align held-out data
        rad_ho = pd.read_csv(paths['heldout_radiomics'], index_col=0)
        imm_ho = pd.read_csv(paths['heldout_immune'], index_col=0)
        df_ho = pd.merge(rad_ho, imm_ho, left_index=True, right_index=True, how='inner')

        scen_results = {}
        scen_features = {}
        scen_cv = {}
        inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        # Determine immune feature columns (may differ by signature)
        immune_cols = imm_tr.columns.intersection(imm_ho.columns)
        if immune_cols.empty:
            raise ValueError(f"{sig_name}:{scen_id} - no matching immune features between train and held-out")
        logging.info(f"{sig_name}:{scen_id} - {len(immune_cols)} immune features: {immune_cols.tolist()}")

        for col in tqdm(immune_cols, desc=f"{sig_name}:{scen_id}"):
            try:
                # GMM labeling on train: binarize the immune fraction into
                # two groups with a 2-component mixture
                gmm = GaussianMixture(n_components=2, random_state=42)
                y_tr = gmm.fit_predict(df_tr[[col]].values)
                if len(np.unique(y_tr)) < 2:
                    continue  # degenerate fit: single cluster, nothing to classify
                y_ho = gmm.predict(df_ho[[col]].values)
                # Ensure label 1 = higher-mean component. GMM label k
                # corresponds to component k, so a flip is needed only when
                # component 0 has the larger mean. BUGFIX: the original
                # flipped on m0 < m1 — i.e. exactly when label 1 already
                # marked the higher-mean group — inverting the labels and
                # contradicting the stated intent.
                m0, m1 = gmm.means_.flatten()
                if m0 > m1:
                    y_tr = 1 - y_tr
                    y_ho = 1 - y_ho
                # save gmm model
                gmm_model_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_gmm_model.joblib'
                os.makedirs(os.path.dirname(gmm_model_path), exist_ok=True)
                dump(gmm, gmm_model_path)
                logging.info(f"Saved GMM model to {gmm_model_path}")
                logging.info(f"GMM means for {sig_name}:{scen_id}, col {col}: {gmm.means_.flatten().tolist()}")

                # Feature selection: predict the binarized immune label from
                # everything except the column that defined it
                X_tr = df_tr.drop(columns=[col]).values
                X_ho = df_ho.drop(columns=[col]).values
                sel = select_features(X_tr, y_tr)
                X_tr_sel, X_ho_sel = X_tr[:, sel], X_ho[:, sel]
                feat_names = df_tr.drop(columns=[col]).columns.tolist()
                sel_names = [feat_names[i] for i in sel]

                # Save selected feature names for this model so retraining can reuse them
                sel_feat_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_selected_features.json'
                os.makedirs(os.path.dirname(sel_feat_path), exist_ok=True)
                ts = _dt.now(_tz.utc).strftime('%Y%m%d_%H%M%S')
                meta = {'saved_at': _dt.now(_tz.utc).isoformat(), 'version': ts, 'selected_features': sel_names}
                with open(sel_feat_path, 'w') as _f:
                    json.dump(meta, _f, indent=2)

                # SVM nested CV. Pipeline caching stays disabled (memory=None):
                # joblib.Memory can hit PermissionError races on Windows when
                # parallel CV workers share cache files. This does NOT affect
                # saving final models/params (written explicitly below).
                pipe_svm = Pipeline([
                    ('scaler', StandardScaler()),
                    ('clf', SVC(class_weight='balanced', probability=True, random_state=42))
                ], memory=None)
                search_svm = RandomizedSearchCV(
                    pipe_svm, param_dist_svm, n_iter=5,
                    cv=inner_cv, scoring='balanced_accuracy',
                    n_jobs=1, refit=True, error_score='raise'
                )
                search_svm.fit(X_tr_sel, y_tr)
                y_pred_svm = search_svm.predict(X_ho_sel)
                # use the shared serializer so nested numpy values (e.g. the
                # 'params' list of dicts) are JSON-safe in the group output too
                cv_svm = _cv_results_to_serializable(search_svm.cv_results_)
                # save SVM model
                svm_model_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_svm_model.joblib'
                os.makedirs(os.path.dirname(svm_model_path), exist_ok=True)
                dump(search_svm.best_estimator_, svm_model_path)
                logging.info(f"Saved SVM model to {svm_model_path}")
                logging.info(f"SVM best params for {sig_name}:{scen_id}, col {col}: {search_svm.best_params_}")

                # Save SVM best params and cv results for reproducibility / retraining (with metadata)
                svm_params_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_svm_params.json'
                svm_cv_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_svm_cv.json'
                os.makedirs(os.path.dirname(svm_params_path), exist_ok=True)
                svm_meta = {
                    'saved_at': _dt.now(_tz.utc).isoformat(),
                    'version': _dt.now(_tz.utc).strftime('%Y%m%d_%H%M%S'),
                    'best_params': _convert_obj(search_svm.best_params_)
                }
                with open(svm_params_path, 'w') as _f:
                    json.dump(svm_meta, _f, indent=2)
                svm_cv_meta = {
                    'saved_at': _dt.now(_tz.utc).isoformat(),
                    'version': _dt.now(_tz.utc).strftime('%Y%m%d_%H%M%S'),
                    'cv_results': _cv_results_to_serializable(search_svm.cv_results_)
                }
                with open(svm_cv_path, 'w') as _f:
                    json.dump(svm_cv_meta, _f, indent=2)

                # Ensemble nested CV: soft-voting SVM + RF + HistGB
                base_pipe = Pipeline([
                    ('scaler', StandardScaler()),
                    ('classifier', SVC(class_weight='balanced', probability=True, random_state=42))
                ], memory=None)
                ensemble = VotingClassifier([
                    ('svm', base_pipe),
                    ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
                    ('gb', HistGradientBoostingClassifier(random_state=42))
                ], voting='soft', weights=[1, 1, 1], n_jobs=1)
                pipe_ens = Pipeline([
                    ('scaler', StandardScaler()),
                    ('ensemble', ensemble)
                ], memory=None)
                search_ens = RandomizedSearchCV(
                    pipe_ens, param_dist_ensemble, n_iter=3,
                    cv=inner_cv, scoring='balanced_accuracy',
                    n_jobs=1, refit=True, error_score='raise'
                )
                search_ens.fit(X_tr_sel, y_tr)
                y_pred_ens = search_ens.predict(X_ho_sel)
                cv_ens = _cv_results_to_serializable(search_ens.cv_results_)
                # save Ensemble model
                ens_model_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_ens_model.joblib'
                os.makedirs(os.path.dirname(ens_model_path), exist_ok=True)
                dump(search_ens.best_estimator_, ens_model_path)
                logging.info(f"Saved Ensemble model to {ens_model_path}")
                logging.info(f"Ensemble best params for {sig_name}:{scen_id}, col {col}: {search_ens.best_params_}")

                # Save Ensemble best params and cv results for reproducibility / retraining (with metadata)
                ens_params_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_ens_params.json'
                ens_cv_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_ens_cv.json'
                os.makedirs(os.path.dirname(ens_params_path), exist_ok=True)
                ens_meta = {
                    'saved_at': _dt.now(_tz.utc).isoformat(),
                    'version': _dt.now(_tz.utc).strftime('%Y%m%d_%H%M%S'),
                    'best_params': _convert_obj(search_ens.best_params_)
                }
                with open(ens_params_path, 'w') as _f:
                    json.dump(ens_meta, _f, indent=2)
                ens_cv_meta = {
                    'saved_at': _dt.now(_tz.utc).isoformat(),
                    'version': _dt.now(_tz.utc).strftime('%Y%m%d_%H%M%S'),
                    'cv_results': _cv_results_to_serializable(search_ens.cv_results_)
                }
                with open(ens_cv_path, 'w') as _f:
                    json.dump(ens_cv_meta, _f, indent=2)

                # Held-out metrics for both models
                scen_results[col] = {'SVM': _binary_metrics(y_ho, y_pred_svm), 'Ensemble': _binary_metrics(y_ho, y_pred_ens)}
                scen_features[col] = sel_names
                scen_cv[col] = {'svm_cv': cv_svm, 'ensemble_cv': cv_ens}

            except Exception as e:
                # log full traceback for easier debugging (written to nested_lodo_groupsv1.log);
                # best-effort per column: a failure skips this column only
                logging.exception(f"{sig_name}:{scen_id}, col {col}: unexpected error")
                print(f"[ERROR] {sig_name}:{scen_id}, column {col}: {e}")

        # Save for this scenario
        all_results[scen_id] = scen_results
        all_features[scen_id] = scen_features
        all_cv[scen_id] = scen_cv
        logging.info(f"[{sig_name}] {scen_id} done in {time.time()-t0:.1f}s")

    # Write group-level JSONs
    with open(f'nestedv1_results111_{sig_name}.json', 'w') as f:
        json.dump(all_results, f, indent=2)
    with open(f'nestedv1_features111_{sig_name}.json', 'w') as f:
        json.dump(all_features, f, indent=2)
    with open(f'nestedv1_cv111_{sig_name}.json', 'w') as f:
        json.dump(all_cv, f, indent=2)
    print(f"✅ {sig_name} group complete: scenarios={list(all_results.keys())}")

print("All signature groups processed.")
|