# File size: 18,376 Bytes

import logging
import warnings
import pandas as pd
import numpy as np
import json
import time
from tqdm import tqdm
import os
from datetime import datetime as _dt, timezone as _tz


from sklearn.exceptions import ConvergenceWarning
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, balanced_accuracy_score, matthews_corrcoef
)
from joblib import Memory, dump

# -------------------------
# Logging & warnings
# -------------------------
# All log records go to a single file at INFO level and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='nested_lodo_groups.log',
)
# Silence every UserWarning as well as sklearn convergence warnings.
for _ignored_category in (UserWarning, ConvergenceWarning):
    warnings.filterwarnings('ignore', category=_ignored_category)

# Make sure each per-signature / per-scenario model directory exists before
# anything is written into it.
for _model_dir in (
    'models_GBM/scenario_1',
    'models_GBM/scenario_2',
    'models_GBM/scenario_3',
    'models_LM22/scenario_1',
    'models_LM22/scenario_2',
    'models_LM22/scenario_3',
):
    os.makedirs(_model_dir, exist_ok=True)

# -------------------------
# Caching for pipelines
# -------------------------
# joblib cache handed to the sklearn Pipelines built further down.
memory = Memory(location='cache_dir', verbose=0)

# Helper: convert numpy scalars/arrays and dicts into JSON-serializable Python types
import numpy as _np

def _convert_obj(o):
    """Recursively convert numpy types/arrays to native Python objects for JSON dumping.

    - dict: values are converted recursively; numpy-scalar keys are replaced by
      their native equivalent (``json`` rejects e.g. ``numpy.int64`` keys).
    - list / tuple: returned as a list of converted items.
    - str / bytes: returned unchanged.
    - anything exposing ``tolist`` (numpy arrays and scalars, among others):
      replaced by ``o.tolist()``, or by ``str(o)`` if that call raises. For
      object-dtype containers the resulting list is converted again, because
      ``tolist`` leaves the stored objects untouched.
    - everything else: returned as-is.
    """
    if isinstance(o, dict):
        return {
            (k.item() if isinstance(k, _np.generic) else k): _convert_obj(v)
            for k, v in o.items()
        }
    if isinstance(o, (list, tuple)):
        return [_convert_obj(v) for v in o]
    if isinstance(o, (str, bytes)):
        return o
    if hasattr(o, 'tolist'):
        try:
            converted = o.tolist()
        except Exception:
            # Best effort: keep the dump going with a readable placeholder.
            return str(o)
        if getattr(o, 'dtype', None) == object:
            # Object arrays may still hold numpy scalars or nested arrays.
            return _convert_obj(converted)
        return converted
    return o

def _cv_results_to_serializable(cv_dict):
    """Return a copy of a ``cv_results_``-style dict whose values can be JSON-dumped.

    Values exposing ``tolist`` (e.g. numpy arrays) are replaced by the resulting
    list, or by ``str(value)`` if the conversion raises. Every other value is
    passed through ``_convert_obj``. Keys are kept unchanged.
    """
    def _as_plain(value):
        if not hasattr(value, 'tolist'):
            return _convert_obj(value)
        try:
            return value.tolist()
        except Exception:
            return str(value)

    return {key: _as_plain(value) for key, value in cv_dict.items()}

# -------------------------
# Utility: Lasso feature selection with fallback alphas
# -------------------------
def select_features(X, y, alphas=(0.1, 0.01), cv=5, max_iter=10000, n_jobs=-1, random_state=42):
    """Pick feature columns by Lasso, trying each penalty in ``alphas`` in order.

    For every alpha a ``LassoCV`` restricted to that single alpha is fitted on
    ``(X, y)``; the first fit that leaves at least one non-zero coefficient wins.

    Parameters
    ----------
    X, y : training matrix and target passed straight to ``LassoCV.fit``.
    alphas : iterable of penalties, tried in the given order.
    cv, max_iter, n_jobs, random_state : forwarded unchanged to ``LassoCV``.

    Returns
    -------
    1-D array with the positions of the non-zero coefficients.

    Raises
    ------
    ValueError
        If no alpha (or an empty ``alphas``) yields a non-zero coefficient.
    """
    shared_kwargs = dict(cv=cv, max_iter=max_iter, n_jobs=n_jobs, random_state=random_state)
    for penalty in alphas:
        model = LassoCV(alphas=[penalty], **shared_kwargs)
        model.fit(X, y)
        kept = np.flatnonzero(model.coef_ != 0)
        if kept.size == 0:
            # Everything was shrunk to zero; fall through to the next alpha.
            continue
        return kept
    raise ValueError(f"No features selected at alphas {alphas}")

# -------------------------
# Scenario path tables, one per signature group
# -------------------------
# Each scenario maps four roles to a CSV path:
#   'train_radiomics' / 'train_immune'     -> tables joined for model fitting
#   'heldout_radiomics' / 'heldout_immune' -> tables joined for evaluation

# Radiomics files are identical for both signature groups:
# scenario id -> (training file, held-out file).
_RADIOMICS_FILES = {
    1: (
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_CP_TC.csv",
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_Ivy.csv",
    ),
    2: (
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_CP_ivy.csv",
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_TCGA.csv",
    ),
    3: (
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/neuro_combat_radiomic_CGGA_Rem_TC_ivy.csv",
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Radiomics/Radiomics_LOOCV_test_CPTAC.csv",
    ),
}

# Immune files for the LM22 signature: scenario id -> (training file, held-out file).
_IMMUNE_FILES_LM22 = {
    1: (
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_Ivy/Cbx_LOOCV_heldout_Ivy_Lm22/CIBERSORTx_Job49_Results.csv",
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/IvyGAP/Test_Ivy_LM22/CIBERSORTx_Job55_Results.csv",
    ),
    2: (
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_TCGA/Cbx_heldoutTCGA_Lm22/CIBERSORTx_Job47_Results.csv",
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/TCGA/Cbx_TCGA_Test_LM22/CIBERSORTx_Job53_Results.csv",
    ),
    3: (
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_CPTAC/CBx_LOOCV_heldout_CPTAC_LM22/CIBERSORTx_Job51_Results.csv",
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/CPTAC/Test_CPTAC_LM22/CIBERSORTx_Job57_Results.csv",
    ),
}

# Immune files for the GBM signature: scenario id -> (training file, held-out file).
_IMMUNE_FILES_GBM = {
    1: (
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_Ivy/Cbx_LOOCV_heldout_Ivy_GBM/CIBERSORTx_Job50_Results.csv",
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/IvyGAP/Test_Ivy_GBM/CIBERSORTx_Job56_Results.csv",
    ),
    2: (
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_TCGA/Cbx_LOOCV_TCGA_heldout_GBM/CIBERSORTx_Job48_Results.csv",
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/TCGA/TCGA_test_GBM/CIBERSORTx_Job54_Results.csv",
    ),
    3: (
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Heldout/heldout_CPTAC/Cbx_LOOCV_heldout_CPTAC_GBM/CIBERSORTx_Job52_Results.csv",
        r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm/Genome/Testing/CPTAC/Test_CPTAC_GBM/CIBERSORTx_Job58_Results.csv",
    ),
}


def _assemble_scenarios(immune_files):
    """Pair each scenario's immune files with the shared radiomics files."""
    assembled = {}
    for scenario_no, (immune_train, immune_heldout) in immune_files.items():
        radiomics_train, radiomics_heldout = _RADIOMICS_FILES[scenario_no]
        assembled[scenario_no] = {
            'train_radiomics': radiomics_train,
            'train_immune': immune_train,
            'heldout_radiomics': radiomics_heldout,
            'heldout_immune': immune_heldout,
        }
    return assembled


# Scenario definitions_LM22
scenarios_LM22 = _assemble_scenarios(_IMMUNE_FILES_LM22)
# Scenario definitions_GBM
scenarios_GBM = _assemble_scenarios(_IMMUNE_FILES_GBM)

# Signature name -> scenario table; the processing loop iterates in this order.
signature_groups = {
    'LM22': scenarios_LM22,
    'GBM': scenarios_GBM
}

# -------------------------
# Hyperparameter grids
# -------------------------
# Keys follow sklearn's "<step>__<param>" routing. The "clf" prefix addresses
# the classifier step of the stand-alone SVM pipeline.
param_dist_svm = {
    "clf__C": [1, 10],
    "clf__gamma": [0.01, 0.1],
    "clf__kernel": ["rbf"],
}

# For the voting ensemble the path is
# "ensemble" (pipeline step) -> "svm" / "rf" / "gb" (voting member) -> parameter.
# The SVM member is itself a pipeline, hence the extra "classifier" level.
param_dist_ensemble = {
    "ensemble__svm__classifier__C": [1],
    "ensemble__svm__classifier__kernel": ["rbf"],
    "ensemble__rf__n_estimators": [100, 200],
    "ensemble__rf__max_depth": [None],
    "ensemble__gb__max_iter": [100],
    "ensemble__gb__learning_rate": [0.1],
}

# -------------------------
# Process each signature group
# -------------------------
# For every signature group and scenario: load the radiomics and immune tables,
# inner-join them on their index, then for each immune column present in both
# the training and the held-out immune table
#   1. derive binary labels from a 2-component GMM fitted on the training column,
#   2. select features with select_features() on the training data,
#   3. tune an SVM and a soft-voting ensemble with RandomizedSearchCV (inner CV),
#   4. score both refit models on the held-out table,
# saving models, selected features, best params and CV tables along the way.
#
# NOTE(review): feature selection runs on the whole training set before the CV
# search, so the inner-CV scores stored in the *_cv.json files are computed on
# pre-selected features; the held-out metrics are not affected by this.
# NOTE(review): every immune column except the current target stays in the
# feature matrix - confirm that predicting one immune column from the others
# is intended.


def _holdout_metrics(y_true, y_pred):
    """Score predictions; label 1 is the positive class for Precision/Recall/F1."""
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=1),
        'Recall': recall_score(y_true, y_pred, zero_division=1),
        'F1 Score': f1_score(y_true, y_pred, zero_division=1),
        'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred),
        'MCC': matthews_corrcoef(y_true, y_pred)
    }


def _write_stamped_json(path, key, value):
    """Write {'saved_at', 'version', key: value} to path as indented JSON.

    Both stamps come from one UTC clock reading so they always agree.
    """
    now = _dt.now(_tz.utc)
    record = {
        'saved_at': now.isoformat(),
        'version': now.strftime('%Y%m%d_%H%M%S'),
        key: value,
    }
    with open(path, 'w') as _f:
        json.dump(record, _f, indent=2)


for sig_name, scenarios in signature_groups.items():
    all_results = {}
    all_features = {}
    all_cv = {}

    for scen_id, paths in scenarios.items():
        logging.info(f"[{sig_name}] Starting {scen_id}")
        t0 = time.time()

        # Load & align training data
        rad_tr = pd.read_csv(paths['train_radiomics'], index_col=0)
        imm_tr = pd.read_csv(paths['train_immune'],    index_col=0)
        df_tr = pd.merge(rad_tr, imm_tr, left_index=True, right_index=True, how='inner')

        # Load & align held-out data
        rad_ho = pd.read_csv(paths['heldout_radiomics'], index_col=0)
        imm_ho = pd.read_csv(paths['heldout_immune'],    index_col=0)
        df_ho = pd.merge(rad_ho, imm_ho, left_index=True, right_index=True, how='inner')
        logging.info(f"{sig_name}:{scen_id} - train shape {df_tr.shape}, held-out shape {df_ho.shape}")

        scen_results = {}
        scen_features = {}
        scen_cv = {}
        inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        # Determine immune feature columns (may differ by signature)
        immune_cols = imm_tr.columns.intersection(imm_ho.columns)
        if immune_cols.empty:
            raise ValueError(f"{sig_name}:{scen_id} - no matching immune features between train and held-out")
        logging.info(f"{sig_name}:{scen_id} - {len(immune_cols)} immune features: {immune_cols.tolist()}")

        # Predictors must be the same columns, in the same order, on both sides:
        # the Lasso indices computed on the training matrix are applied to the
        # held-out matrix by position. Keep only columns present in both frames,
        # in training order.
        ho_columns = set(df_ho.columns)
        shared_cols = [c for c in df_tr.columns if c in ho_columns]
        if len(shared_cols) != df_tr.shape[1] or len(shared_cols) != df_ho.shape[1]:
            logging.warning(
                f"{sig_name}:{scen_id} - train/held-out columns differ; "
                f"using the {len(shared_cols)} shared columns"
            )

        # Every artefact of this scenario shares one directory and file-name prefix.
        model_dir = f'models_{sig_name}/scenario_{scen_id}'
        os.makedirs(model_dir, exist_ok=True)
        prefix = f'{model_dir}/{sig_name}_scen{scen_id}'

        for col in tqdm(immune_cols, desc=f"{sig_name}:{scen_id}"):
            try:
                # GMM labeling on train
                gmm = GaussianMixture(n_components=2, random_state=42)
                y_tr = gmm.fit_predict(df_tr[[col]].values)
                if len(np.unique(y_tr)) < 2:
                    logging.warning(f"{sig_name}:{scen_id}, col {col}: GMM assigned a single class on train; skipped")
                    continue
                y_ho = gmm.predict(df_ho[[col]].values)
                # ensure label 1 = higher mean. Labels are component indices, so a
                # swap is needed only when component 0 holds the larger mean.
                # (The earlier test `m0 < m1` swapped in the opposite case and
                # left label 1 on the lower-mean component.)
                m0, m1 = gmm.means_.ravel()
                if m0 > m1:
                    y_tr = 1 - y_tr
                    y_ho = 1 - y_ho
                # save gmm model; it still emits raw component indices, so anyone
                # reloading it has to re-apply the orientation above from means_.
                gmm_model_path = f'{prefix}_{col}_gmm_model.joblib'
                dump(gmm, gmm_model_path)
                logging.info(f"Saved GMM model to {gmm_model_path}")
                logging.info(f"GMM means for {sig_name}:{scen_id}, col {col}: {gmm.means_.flatten().tolist()}")

                # Feature selection (target column excluded, columns aligned by name)
                feat_names = [c for c in shared_cols if c != col]
                X_tr = df_tr[feat_names].values
                X_ho = df_ho[feat_names].values
                sel = select_features(X_tr, y_tr)
                X_tr_sel, X_ho_sel = X_tr[:, sel], X_ho[:, sel]
                sel_names = [feat_names[i] for i in sel]

                # Save selected feature names for this model so retraining can reuse them
                _write_stamped_json(f'{prefix}_{col}_selected_features.json', 'selected_features', sel_names)

                # SVM: hyperparameter search with inner CV on train, then held-out prediction
                pipe_svm = Pipeline([
                    ('scaler', StandardScaler()),
                    ('clf', SVC(class_weight='balanced', probability=True, random_state=42))
                ], memory=memory)
                search_svm = RandomizedSearchCV(
                    pipe_svm, param_dist_svm, n_iter=5,
                    cv=inner_cv, scoring='balanced_accuracy',
                    n_jobs=-1, refit=True, error_score='raise',
                    random_state=42
                )
                search_svm.fit(X_tr_sel, y_tr)
                y_pred_svm = search_svm.predict(X_ho_sel)
                cv_svm = _cv_results_to_serializable(search_svm.cv_results_)
                # save SVM model
                svm_model_path = f'{prefix}_{col}_svm_model.joblib'
                dump(search_svm.best_estimator_, svm_model_path)
                logging.info(f"Saved SVM model to {svm_model_path}")
                logging.info(f"SVM best params for {sig_name}:{scen_id}, col {col}: {search_svm.best_params_}")

                # Save SVM best params and cv results for reproducibility / retraining (with metadata)
                _write_stamped_json(f'{prefix}_{col}_svm_params.json', 'best_params',
                                    _convert_obj(search_svm.best_params_))
                _write_stamped_json(f'{prefix}_{col}_svm_cv.json', 'cv_results', cv_svm)

                # Ensemble: same search/evaluation scheme with a soft-voting classifier
                base_pipe = Pipeline([
                    ('scaler', StandardScaler()),
                    ('classifier', SVC(class_weight='balanced', probability=True, random_state=42))
                ], memory=memory)
                ensemble = VotingClassifier([
                    ('svm', base_pipe),
                    ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
                    ('gb', HistGradientBoostingClassifier(random_state=42))
                ], voting='soft', weights=[1,1,1], n_jobs=-1)
                pipe_ens = Pipeline([
                    ('scaler', StandardScaler()),
                    ('ensemble', ensemble)
                ], memory=memory)
                search_ens = RandomizedSearchCV(
                    pipe_ens, param_dist_ensemble, n_iter=3,
                    cv=inner_cv, scoring='balanced_accuracy',
                    n_jobs=-1, refit=True, error_score='raise',
                    random_state=42
                )
                search_ens.fit(X_tr_sel, y_tr)
                y_pred_ens = search_ens.predict(X_ho_sel)
                cv_ens = _cv_results_to_serializable(search_ens.cv_results_)
                # save Ensemble model
                ens_model_path = f'{prefix}_{col}_ens_model.joblib'
                dump(search_ens.best_estimator_, ens_model_path)
                logging.info(f"Saved Ensemble model to {ens_model_path}")
                logging.info(f"Ensemble best params for {sig_name}:{scen_id}, col {col}: {search_ens.best_params_}")

                # Save Ensemble best params and cv results for reproducibility / retraining (with metadata)
                _write_stamped_json(f'{prefix}_{col}_ens_params.json', 'best_params',
                                    _convert_obj(search_ens.best_params_))
                _write_stamped_json(f'{prefix}_{col}_ens_cv.json', 'cv_results', cv_ens)

                # Metrics on the held-out table
                scen_results[col] = {
                    'SVM': _holdout_metrics(y_ho, y_pred_svm),
                    'Ensemble': _holdout_metrics(y_ho, y_pred_ens)
                }
                scen_features[col] = sel_names
                scen_cv[col] = {'svm_cv': cv_svm, 'ensemble_cv': cv_ens}

            except Exception as e:
                # Best effort per column: record the failure (with traceback) and
                # carry on with the remaining columns.
                logging.exception(f"{sig_name}:{scen_id}, col {col}: {e}")
                print(f"[ERROR] {sig_name}:{scen_id}, column {col}: {e}")

        # Save for this scenario
        all_results[scen_id] = scen_results
        all_features[scen_id] = scen_features
        all_cv[scen_id] = scen_cv
        logging.info(f"[{sig_name}] {scen_id} done in {time.time()-t0:.1f}s")

    # Write group-level JSONs
    with open(f'nested_results111_{sig_name}.json', 'w') as f:
        json.dump(all_results, f, indent=2)
    with open(f'nested_features111_{sig_name}.json', 'w') as f:
        json.dump(all_features, f, indent=2)
    with open(f'nested_cv111_{sig_name}.json', 'w') as f:
        json.dump(all_cv, f, indent=2)
    print(f"✅ {sig_name} group complete: scenarios={list(all_results.keys())}")

print("All signature groups processed.")