Upload Scenario_heldout_final_PRECISE.py
Browse files
Scenario_heldout_final_PRECISE.py
ADDED
|
@@ -0,0 +1,369 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import warnings
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
import json
|
| 6 |
+
import time
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
import os
|
| 9 |
+
from datetime import datetime as _dt, timezone as _tz
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
from sklearn.exceptions import ConvergenceWarning
|
| 13 |
+
from sklearn.mixture import GaussianMixture
|
| 14 |
+
from sklearn.preprocessing import StandardScaler
|
| 15 |
+
from sklearn.linear_model import LassoCV
|
| 16 |
+
from sklearn.svm import SVC
|
| 17 |
+
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, VotingClassifier
|
| 18 |
+
from sklearn.pipeline import Pipeline
|
| 19 |
+
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
|
| 20 |
+
from sklearn.metrics import (
|
| 21 |
+
accuracy_score, precision_score, recall_score,
|
| 22 |
+
f1_score, balanced_accuracy_score, matthews_corrcoef
|
| 23 |
+
)
|
| 24 |
+
from joblib import dump
|
| 25 |
+
|
| 26 |
+
# -------------------------
# Logging & warnings
# -------------------------
logging.basicConfig(
    filename='nested_lodo_groupsv1.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Silence the noisy warning categories emitted during repeated model fits.
for _warning_cls in (UserWarning, ConvergenceWarning):
    warnings.filterwarnings('ignore', category=_warning_cls)

# Create directories for saving models if they don't exist.
# NOTE(review): these pre-created '...v1' directories do not match the
# 'models_{sig_name}' paths used when models are saved later in this file
# (those have no 'v1' suffix and call os.makedirs themselves), so these
# may be vestigial — confirm before relying on them.
for _model_dir in (
    'models_GBMv1/scenario_1',
    'models_GBMv1/scenario_2',
    'models_GBMv1/scenario_3',
    'models_LM22v1/scenario_1',
    'models_LM22v1/scenario_2',
    'models_LM22v1/scenario_3',
):
    os.makedirs(_model_dir, exist_ok=True)

# -------------------------
# Caching for pipelines
# -------------------------
# Joblib.Memory cache disabled to avoid creating cache directories and
# PermissionError race conditions on Windows when using parallel workers.
memory = None
logging.info("Joblib Memory disabled; no pipeline caching will be used")
|
| 52 |
+
|
| 53 |
+
# Helper: convert numpy scalars/arrays and dicts into JSON-serializable Python types
|
| 54 |
+
|
| 55 |
+
def _convert_obj(o):
|
| 56 |
+
"""Recursively convert numpy types/arrays to native Python objects for JSON dumping."""
|
| 57 |
+
# numpy arrays -> lists
|
| 58 |
+
if hasattr(o, 'tolist') and not isinstance(o, (dict, list, str, bytes)):
|
| 59 |
+
try:
|
| 60 |
+
return o.tolist()
|
| 61 |
+
except Exception:
|
| 62 |
+
return str(o)
|
| 63 |
+
# dict -> convert values
|
| 64 |
+
if isinstance(o, dict):
|
| 65 |
+
return {k: _convert_obj(v) for k, v in o.items()}
|
| 66 |
+
# list/tuple -> convert items
|
| 67 |
+
if isinstance(o, (list, tuple)):
|
| 68 |
+
return [_convert_obj(v) for v in o]
|
| 69 |
+
# numpy scalar -> python native
|
| 70 |
+
if isinstance(o, (np.integer, np.floating, np.bool_)):
|
| 71 |
+
return o.item()
|
| 72 |
+
# otherwise return as-is
|
| 73 |
+
return o
|
| 74 |
+
|
| 75 |
+
def _cv_results_to_serializable(cv_dict):
|
| 76 |
+
"""Convert sklearn cv_results_ dict values (numpy arrays) into lists where needed."""
|
| 77 |
+
out = {}
|
| 78 |
+
for k, v in cv_dict.items():
|
| 79 |
+
if hasattr(v, 'tolist'):
|
| 80 |
+
try:
|
| 81 |
+
out[k] = v.tolist()
|
| 82 |
+
except Exception:
|
| 83 |
+
out[k] = str(v)
|
| 84 |
+
else:
|
| 85 |
+
out[k] = _convert_obj(v)
|
| 86 |
+
return out
|
| 87 |
+
|
| 88 |
+
# -------------------------
|
| 89 |
+
# Utility: two-step Lasso selection
|
| 90 |
+
# -------------------------
|
| 91 |
+
def select_features(X, y, alphas=(0.1, 0.01), cv=5, max_iter=10000, n_jobs=1, random_state=42):
    """Two-step Lasso feature selection.

    Fits a LassoCV pinned to each candidate alpha in turn (strongest
    regularization first) and returns the indices of the non-zero
    coefficients for the first alpha that keeps at least one feature.

    Raises:
        ValueError: if every alpha zeroes out all coefficients.
    """
    for candidate_alpha in alphas:
        model = LassoCV(
            alphas=[candidate_alpha],
            cv=cv,
            max_iter=max_iter,
            n_jobs=n_jobs,
            random_state=random_state,
        )
        # fit separately so static analyzers can see the correct type
        model.fit(X, y)
        # flatnonzero yields the selected column indices as a 1-D array
        selected = np.flatnonzero(model.coef_ != 0)
        if selected.size:
            return selected
    raise ValueError(f"No features selected at alphas {alphas}")
|
| 105 |
+
|
| 106 |
+
# -------------------------
# Define two groups of scenarios with actual paths
# Each scenario pairs a training split with a held-out split; every split
# needs a radiomics CSV and a CIBERSORTx immune-fraction CSV, merged later
# on their sample index. Scenario ids appear to follow a leave-one-dataset-
# out scheme (1 = Ivy held out, 2 = TCGA, 3 = CPTAC) — inferred from the
# file names; confirm against the data layout.
# Scenario definitions_LM22
scenarios_LM22 = {
    1: {
        'train_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_CP_TC.csv",
        'train_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_Ivy/Cbx_LOOCV_heldout_Ivy_Lm22/CIBERSORTx_Job49_Results.csv",
        'heldout_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_Ivy.csv",
        'heldout_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/IvyGAP/Test_Ivy_LM22/CIBERSORTx_Job55_Results.csv"
    },
    2: {
        'train_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_CP_ivy.csv",
        'train_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_TCGA/Cbx_heldoutTCGA_Lm22/CIBERSORTx_Job47_Results.csv",
        'heldout_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_TCGA.csv",
        'heldout_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/TCGA/Cbx_TCGA_Test_LM22/CIBERSORTx_Job53_Results.csv"
    },
    3: {
        'train_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_TC_ivy.csv",
        'train_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_CPTAC/CBx_LOOCV_heldout_CPTAC_LM22/CIBERSORTx_Job51_Results.csv",
        'heldout_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_CPTAC.csv",
        'heldout_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/CPTAC/Test_CPTAC_LM22/CIBERSORTx_Job57_Results.csv"
    }
}
# Scenario definitions_GBM (same radiomics splits, GBM-signature CIBERSORTx runs)
scenarios_GBM = {
    1: {
        'train_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_CP_TC.csv",
        'train_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_Ivy/Cbx_LOOCV_heldout_Ivy_GBM/CIBERSORTx_Job50_Results.csv",
        'heldout_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_Ivy.csv",
        'heldout_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/IvyGAP/Test_Ivy_GBM/CIBERSORTx_Job56_Results.csv"
    },
    2: {
        'train_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_CP_ivy.csv",
        'train_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_TCGA/Cbx_LOOCV_TCGA_heldout_GBM/CIBERSORTx_Job48_Results.csv",
        'heldout_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_TCGA.csv",
        'heldout_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/TCGA/TCGA_test_GBM/CIBERSORTx_Job54_Results.csv"
    },
    3: {
        'train_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_TC_ivy.csv",
        'train_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_CPTAC/Cbx_LOOCV_heldout_CPTAC_GBM/CIBERSORTx_Job52_Results.csv",
        'heldout_radiomics': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_CPTAC.csv",
        'heldout_immune': r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/CPTAC/Test_CPTAC_GBM/CIBERSORTx_Job58_Results.csv"
    }
}

# Top-level iteration order for the main loop: one entry per immune
# deconvolution signature matrix.
signature_groups = {
    'LM22': scenarios_LM22,
    'GBM': scenarios_GBM
}
|
| 155 |
+
|
| 156 |
+
# -------------------------
# Hyperparameter grids
# -------------------------
# Randomized-search space for the standalone SVM pipeline; keys address the
# 'clf' step of pipe_svm via sklearn's step__parameter convention.
param_dist_svm = {
    'clf__C': [1, 10],
    'clf__gamma': [0.01, 0.1],
    'clf__kernel': ['rbf']
}
# Search space for the voting ensemble; keys drill into the 'ensemble' step
# of pipe_ens, then into its named sub-estimators ('svm' is itself a
# Pipeline whose SVC step is named 'classifier').
param_dist_ensemble = {
    'ensemble__svm__classifier__C': [1],
    'ensemble__svm__classifier__kernel': ['rbf'],
    'ensemble__rf__n_estimators': [100, 200],
    'ensemble__rf__max_depth': [None],
    'ensemble__gb__max_iter': [100],
    'ensemble__gb__learning_rate': [0.1]
}
|
| 172 |
+
|
| 173 |
+
# -------------------------
# Process each signature group
# -------------------------

def _binary_metrics(y_true, y_pred):
    """Held-out evaluation metrics for a binary classifier.

    zero_division=1 keeps precision/recall/F1 defined when a class is
    never predicted on a small held-out set.
    """
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=1),
        'Recall': recall_score(y_true, y_pred, zero_division=1),
        'F1 Score': f1_score(y_true, y_pred, zero_division=1),
        'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred),
        'MCC': matthews_corrcoef(y_true, y_pred)
    }

for sig_name, scenarios in signature_groups.items():
    all_results = {}
    all_features = {}
    all_cv = {}

    for scen_id, paths in scenarios.items():
        logging.info(f"[{sig_name}] Starting {scen_id}")
        t0 = time.time()

        # Load & align training data (inner join on the sample index keeps
        # only samples present in both the radiomics and immune tables)
        rad_tr = pd.read_csv(paths['train_radiomics'], index_col=0)
        imm_tr = pd.read_csv(paths['train_immune'], index_col=0)
        df_tr = pd.merge(rad_tr, imm_tr, left_index=True, right_index=True, how='inner')

        # Load & align held-out data
        rad_ho = pd.read_csv(paths['heldout_radiomics'], index_col=0)
        imm_ho = pd.read_csv(paths['heldout_immune'], index_col=0)
        df_ho = pd.merge(rad_ho, imm_ho, left_index=True, right_index=True, how='inner')

        scen_results = {}
        scen_features = {}
        scen_cv = {}
        inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        # Determine immune feature columns (may differ by signature)
        immune_cols = imm_tr.columns.intersection(imm_ho.columns)
        if immune_cols.empty:
            raise ValueError(f"{sig_name}:{scen_id} - no matching immune features between train and held-out")
        logging.info(f"{sig_name}:{scen_id} - {len(immune_cols)} immune features: {immune_cols.tolist()}")

        for col in tqdm(immune_cols, desc=f"{sig_name}:{scen_id}"):
            try:
                # GMM labeling on train: binarize the immune fraction into
                # two groups with a 2-component mixture
                gmm = GaussianMixture(n_components=2, random_state=42)
                y_tr = gmm.fit_predict(df_tr[[col]].values)
                if len(np.unique(y_tr)) < 2:
                    continue  # degenerate fit: single cluster, nothing to classify
                y_ho = gmm.predict(df_ho[[col]].values)
                # Ensure label 1 = higher-mean component. GMM label k
                # corresponds to component k, so a flip is needed only when
                # component 0 has the larger mean. BUGFIX: the original
                # flipped on m0 < m1 — i.e. exactly when label 1 already
                # marked the higher-mean group — inverting the labels and
                # contradicting the stated intent.
                m0, m1 = gmm.means_.flatten()
                if m0 > m1:
                    y_tr = 1 - y_tr
                    y_ho = 1 - y_ho
                # save gmm model
                gmm_model_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_gmm_model.joblib'
                os.makedirs(os.path.dirname(gmm_model_path), exist_ok=True)
                dump(gmm, gmm_model_path)
                logging.info(f"Saved GMM model to {gmm_model_path}")
                logging.info(f"GMM means for {sig_name}:{scen_id}, col {col}: {gmm.means_.flatten().tolist()}")

                # Feature selection: predict the binarized immune label from
                # everything except the column that defined it
                X_tr = df_tr.drop(columns=[col]).values
                X_ho = df_ho.drop(columns=[col]).values
                sel = select_features(X_tr, y_tr)
                X_tr_sel, X_ho_sel = X_tr[:, sel], X_ho[:, sel]
                feat_names = df_tr.drop(columns=[col]).columns.tolist()
                sel_names = [feat_names[i] for i in sel]

                # Save selected feature names for this model so retraining can reuse them
                sel_feat_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_selected_features.json'
                os.makedirs(os.path.dirname(sel_feat_path), exist_ok=True)
                ts = _dt.now(_tz.utc).strftime('%Y%m%d_%H%M%S')
                meta = {'saved_at': _dt.now(_tz.utc).isoformat(), 'version': ts, 'selected_features': sel_names}
                with open(sel_feat_path, 'w') as _f:
                    json.dump(meta, _f, indent=2)

                # SVM nested CV. Pipeline caching stays disabled (memory=None):
                # joblib.Memory can hit PermissionError races on Windows when
                # parallel CV workers share cache files. This does NOT affect
                # saving final models/params (written explicitly below).
                pipe_svm = Pipeline([
                    ('scaler', StandardScaler()),
                    ('clf', SVC(class_weight='balanced', probability=True, random_state=42))
                ], memory=None)
                search_svm = RandomizedSearchCV(
                    pipe_svm, param_dist_svm, n_iter=5,
                    cv=inner_cv, scoring='balanced_accuracy',
                    n_jobs=1, refit=True, error_score='raise'
                )
                search_svm.fit(X_tr_sel, y_tr)
                y_pred_svm = search_svm.predict(X_ho_sel)
                # use the shared serializer so nested numpy values (e.g. the
                # 'params' list of dicts) are JSON-safe in the group output too
                cv_svm = _cv_results_to_serializable(search_svm.cv_results_)
                # save SVM model
                svm_model_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_svm_model.joblib'
                os.makedirs(os.path.dirname(svm_model_path), exist_ok=True)
                dump(search_svm.best_estimator_, svm_model_path)
                logging.info(f"Saved SVM model to {svm_model_path}")
                logging.info(f"SVM best params for {sig_name}:{scen_id}, col {col}: {search_svm.best_params_}")

                # Save SVM best params and cv results for reproducibility / retraining (with metadata)
                svm_params_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_svm_params.json'
                svm_cv_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_svm_cv.json'
                os.makedirs(os.path.dirname(svm_params_path), exist_ok=True)
                svm_meta = {
                    'saved_at': _dt.now(_tz.utc).isoformat(),
                    'version': _dt.now(_tz.utc).strftime('%Y%m%d_%H%M%S'),
                    'best_params': _convert_obj(search_svm.best_params_)
                }
                with open(svm_params_path, 'w') as _f:
                    json.dump(svm_meta, _f, indent=2)
                svm_cv_meta = {
                    'saved_at': _dt.now(_tz.utc).isoformat(),
                    'version': _dt.now(_tz.utc).strftime('%Y%m%d_%H%M%S'),
                    'cv_results': _cv_results_to_serializable(search_svm.cv_results_)
                }
                with open(svm_cv_path, 'w') as _f:
                    json.dump(svm_cv_meta, _f, indent=2)

                # Ensemble nested CV: soft-voting SVM + RF + HistGB
                base_pipe = Pipeline([
                    ('scaler', StandardScaler()),
                    ('classifier', SVC(class_weight='balanced', probability=True, random_state=42))
                ], memory=None)
                ensemble = VotingClassifier([
                    ('svm', base_pipe),
                    ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
                    ('gb', HistGradientBoostingClassifier(random_state=42))
                ], voting='soft', weights=[1, 1, 1], n_jobs=1)
                pipe_ens = Pipeline([
                    ('scaler', StandardScaler()),
                    ('ensemble', ensemble)
                ], memory=None)
                search_ens = RandomizedSearchCV(
                    pipe_ens, param_dist_ensemble, n_iter=3,
                    cv=inner_cv, scoring='balanced_accuracy',
                    n_jobs=1, refit=True, error_score='raise'
                )
                search_ens.fit(X_tr_sel, y_tr)
                y_pred_ens = search_ens.predict(X_ho_sel)
                cv_ens = _cv_results_to_serializable(search_ens.cv_results_)
                # save Ensemble model
                ens_model_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_ens_model.joblib'
                os.makedirs(os.path.dirname(ens_model_path), exist_ok=True)
                dump(search_ens.best_estimator_, ens_model_path)
                logging.info(f"Saved Ensemble model to {ens_model_path}")
                logging.info(f"Ensemble best params for {sig_name}:{scen_id}, col {col}: {search_ens.best_params_}")

                # Save Ensemble best params and cv results for reproducibility / retraining (with metadata)
                ens_params_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_ens_params.json'
                ens_cv_path = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}_ens_cv.json'
                os.makedirs(os.path.dirname(ens_params_path), exist_ok=True)
                ens_meta = {
                    'saved_at': _dt.now(_tz.utc).isoformat(),
                    'version': _dt.now(_tz.utc).strftime('%Y%m%d_%H%M%S'),
                    'best_params': _convert_obj(search_ens.best_params_)
                }
                with open(ens_params_path, 'w') as _f:
                    json.dump(ens_meta, _f, indent=2)
                ens_cv_meta = {
                    'saved_at': _dt.now(_tz.utc).isoformat(),
                    'version': _dt.now(_tz.utc).strftime('%Y%m%d_%H%M%S'),
                    'cv_results': _cv_results_to_serializable(search_ens.cv_results_)
                }
                with open(ens_cv_path, 'w') as _f:
                    json.dump(ens_cv_meta, _f, indent=2)

                # Held-out metrics for both models
                scen_results[col] = {'SVM': _binary_metrics(y_ho, y_pred_svm), 'Ensemble': _binary_metrics(y_ho, y_pred_ens)}
                scen_features[col] = sel_names
                scen_cv[col] = {'svm_cv': cv_svm, 'ensemble_cv': cv_ens}

            except Exception as e:
                # log full traceback for easier debugging (written to nested_lodo_groupsv1.log);
                # best-effort per column: a failure skips this column only
                logging.exception(f"{sig_name}:{scen_id}, col {col}: unexpected error")
                print(f"[ERROR] {sig_name}:{scen_id}, column {col}: {e}")

        # Save for this scenario
        all_results[scen_id] = scen_results
        all_features[scen_id] = scen_features
        all_cv[scen_id] = scen_cv
        logging.info(f"[{sig_name}] {scen_id} done in {time.time()-t0:.1f}s")

    # Write group-level JSONs
    with open(f'nestedv1_results111_{sig_name}.json', 'w') as f:
        json.dump(all_results, f, indent=2)
    with open(f'nestedv1_features111_{sig_name}.json', 'w') as f:
        json.dump(all_features, f, indent=2)
    with open(f'nestedv1_cv111_{sig_name}.json', 'w') as f:
        json.dump(all_cv, f, indent=2)
    print(f"✅ {sig_name} group complete: scenarios={list(all_results.keys())}")

print("All signature groups processed.")
|