""" SWAN Menopause Stage Prediction (pre / peri / post) using self-reported features Uses only the uploaded SWAN TSV file (no synthetic data, no external datasets). Outputs: - saved artifacts in ./swan_ml_output/ - documentation.md summarizing steps and results - optional CSV outputs for stage predictions and symptom predictions (separate files) Notes: - The script attempts to locate a menopause-stage column heuristically (common names like MENOSTAT, MENO, MENOSYM, MENOP etc.). Please verify the chosen stage column against the codebook. - Self-reported features are identified using name-pattern heuristics (VMS/HOT/SLEEP/CESD/STRESS/MOOD/SMOK/ALCOH/EXER/PHYS/VAG/URINE/SEX/PAIN etc). - Duplicate column names are tolerantly handled by renaming duplicates. """ import os, re, sys, argparse import numpy as np import pandas as pd import importlib import sklearn import matplotlib # Use a non-interactive backend by default so the script can run on servers/CI matplotlib.use('Agg') import matplotlib.pyplot as plt from datetime import datetime, timedelta from sklearn.model_selection import train_test_split from sklearn.impute import SimpleImputer from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc from sklearn.inspection import permutation_importance from sklearn.preprocessing import label_binarize # -------------------------- # Environment / CLI defaults # -------------------------- # Defaults may be overridden by environment variables or CLI args below DATA_PATH = os.environ.get('MENOPAUSE_DATA', "ICPSR_31901/DS0001/31901-0001-Data.tsv") OUTPUT_DIR = os.environ.get('MENOPAUSE_OUT', "swan_ml_output") # Parse CLI args (safe to parse here for a script; this will be ignored when imported) parser = argparse.ArgumentParser(description='Run menopause stage prediction pipeline') parser.add_argument('--data', '-d', default=DATA_PATH, help='Path to SWAN TSV file') parser.add_argument('--output', '-o', default=OUTPUT_DIR, help='Output directory for artifacts') parser.add_argument('--show', action='store_true', help='Show plots interactively (default: off)') parser.add_argument('--stage-col', default=None, help='Override detected stage column name') # Symptom cycle prediction CLI options parser.add_argument('--predict-symptoms', action='store_true', help='Run symptom cycle prediction from CSV input') parser.add_argument('--symptoms-input', default=None, help='Input CSV for symptom predictions') parser.add_argument('--symptoms-output', default=None, help='Output CSV to write symptom predictions') parser.add_argument('--lmp-col', default='LMP', help='Column name used as LMP (date string or day-of-month integer)') parser.add_argument('--date-col', default=None, help='Column name for target date; if omitted, uses today or VISIT date if present') parser.add_argument('--cycle-length', type=int, default=28, help='Average cycle length in days for symptom prediction') # Dual prediction CLI options (separate inputs/outputs for each model) parser.add_argument('--predict-dual', action='store_true', help='Run stage + symptom predictions using separate input/output files') parser.add_argument('--stage-input', default=None, help='Input CSV for menopause stage predictions') parser.add_argument('--stage-output', default=None, help='Output CSV for menopause stage predictions') parser.add_argument('--stage-model', default='RandomForest', help='Model for stage prediction: RandomForest or LogisticRegression') parser.add_argument('--forecast-dir', default=OUTPUT_DIR, help='Directory containing saved forecast models') parser.add_argument('--menopause-stage-col', default=None, help='(Deprecated) Kept for backward compatibility; symptom forecasting no longer uses menopause stage') # Parse CLI args only when script is run directly; when imported (e.g., during testing), avoid consuming external argv if __name__ == '__main__': args = parser.parse_args() else: # Use defaults when module is imported to avoid interfering with external CLI (pytest, etc.) args = parser.parse_args([]) DATA_PATH = args.data OUTPUT_DIR = args.output SHOW_PLOTS = bool(args.show) STAGE_COL_OVERRIDE = args.stage_col # If user only wants symptom-cycle predictions, provide a fast-path before loading the large TSV # Define a light-weight cycle-based symptom forecaster and CSV helper so users can run predictions # without training the menopause models (useful for small CSV inputs). class SymptomCycleForecaster: def __init__(self, cycle_length=28, hot_mu=14, hot_sigma=5, mood_mu=26, mood_sigma=4, base_hot=0.1, amp_hot=0.4, base_mood=0.1, amp_mood=0.45, threshold=0.5): self.cycle_length = cycle_length self.hot_mu = hot_mu self.hot_sigma = hot_sigma self.mood_mu = mood_mu self.mood_sigma = mood_sigma self.base_hot = base_hot self.amp_hot = amp_hot self.base_mood = base_mood self.amp_mood = amp_mood self.threshold = threshold def _parse_lmp(self, lmp, reference_date=None): if pd.isna(lmp): return None try: lmp_int = int(lmp) if reference_date is None: ref = pd.Timestamp(datetime.today()).to_pydatetime() else: ref = pd.to_datetime(reference_date, errors='coerce') if pd.isna(ref): ref = pd.Timestamp(datetime.today()).to_pydatetime() else: ref = ref.to_pydatetime() day = max(1, min(lmp_int, 28)) return datetime(ref.year, ref.month, day) except Exception: try: return pd.to_datetime(lmp, errors='coerce').to_pydatetime() except Exception: return None def compute_cycle_day(self, lmp, target_date=None): if target_date is None: tdate = datetime.today() else: tdate = pd.to_datetime(target_date, errors='coerce') if pd.isna(tdate): tdate = datetime.today() else: tdate = tdate.to_pydatetime() lmp_date = self._parse_lmp(lmp, reference_date=tdate) if lmp_date is None: return None delta = (tdate - lmp_date).days if delta < 0: lmp_date = lmp_date - timedelta(days=self.cycle_length) delta = (tdate - lmp_date).days cycle_day = (delta % self.cycle_length) + 1 return int(cycle_day) def _gauss_prob(self, day, mu, sigma, base, amp): if day is None: return np.nan val = base + amp * np.exp(-0.5 * ((day - mu) / float(sigma)) ** 2) return float(min(max(val, 0.0), 1.0)) def predict_single(self, lmp, target_date=None): day = self.compute_cycle_day(lmp, target_date=target_date) hot_p = self._gauss_prob(day, self.hot_mu, self.hot_sigma, self.base_hot, self.amp_hot) mood_p = self._gauss_prob(day, self.mood_mu, self.mood_sigma, self.base_mood, self.amp_mood) return { 'cycle_day': day, 'hotflash_prob': hot_p, 'hotflash_pred': hot_p >= self.threshold if not np.isnan(hot_p) else None, 'mood_prob': mood_p, 'mood_pred': mood_p >= self.threshold if not np.isnan(mood_p) else None } def predict_df(self, df, lmp_col='LMP', date_col=None, menopause_stage_col=None): df = df.copy() results = df.apply( lambda row: pd.Series(self.predict_single( lmp=row.get(lmp_col), target_date=(row.get(date_col) if date_col is not None else None) )), axis=1 ) out = pd.concat([df.reset_index(drop=True), results.reset_index(drop=True)], axis=1) return out def predict_symptoms_from_csv(input_csv, output_csv, lmp_col='LMP', date_col=None, menopause_stage_col=None, cycle_length=28, **kwargs): df = pd.read_csv(input_csv) fore = SymptomCycleForecaster(cycle_length=cycle_length) out_df = fore.predict_df(df, lmp_col=lmp_col, date_col=date_col, menopause_stage_col=menopause_stage_col) out_df.to_csv(output_csv, index=False) print(f"Wrote symptom predictions for {out_df.shape[0]} rows to {output_csv}") print("Sample predictions (first 5 rows):") print(out_df[[lmp_col] + ['cycle_day','hotflash_prob','hotflash_pred','mood_prob','mood_pred']].head().to_string()) # If the user requested only symptom predictions from a CSV, run fast-path and exit if args.predict_symptoms: if not args.symptoms_input or not args.symptoms_output: print("Error: --symptoms-input and --symptoms-output are required when --predict-symptoms is set") sys.exit(1) else: predict_symptoms_from_csv( input_csv=args.symptoms_input, output_csv=args.symptoms_output, lmp_col=args.lmp_col, date_col=args.date_col, menopause_stage_col=None, cycle_length=args.cycle_length ) sys.exit(0) # Fast-path for dual predictions (separate stage + symptoms) without loading large TSV if args.predict_dual: if not args.stage_input or not args.stage_output or not args.symptoms_input or not args.symptoms_output: print("Error: --stage-input, --stage-output, --symptoms-input, and --symptoms-output are required when --predict-dual is set") sys.exit(1) # Load saved pipeline directly via joblib to avoid initializing full training pipeline import joblib model_file = os.path.join(args.forecast_dir, 'rf_pipeline.pkl' if args.stage_model == 'RandomForest' else 'lr_pipeline.pkl') try: pipeline = joblib.load(model_file) except Exception as e: print(f"ERROR: Could not load model file '{model_file}': {e}") print("Please train the models first (run the script without --predict-dual) or provide correct --forecast-dir") sys.exit(1) # Stage predictions try: stage_data = pd.read_csv(args.stage_input) except Exception as e: print(f"ERROR: Could not read stage input CSV '{args.stage_input}': {e}") sys.exit(1) id_cols = ['ID', 'id', 'SWANID', 'individual', 'Individual', 'subject', 'Subject'] feature_cols = [c for c in stage_data.columns if c not in id_cols] # Attempt to load feature metadata so we can reindex inputs to expected features import json metadata_path = os.path.join(args.forecast_dir, 'forecast_metadata.json') try: with open(metadata_path, 'r') as f: metadata = json.load(f) expected_features = metadata.get('feature_names', feature_cols) except Exception: expected_features = feature_cols X = stage_data.reindex(columns=expected_features, fill_value=np.nan) preds = pd.DataFrame({'predicted_stage': pipeline.predict(X), 'model': args.stage_model}) try: proba = pipeline.predict_proba(X) final_est = pipeline.named_steps[list(pipeline.named_steps.keys())[-1]] preds['confidence'] = np.max(proba, axis=1) for i, cls in enumerate(final_est.classes_): preds[f'prob_{cls}'] = proba[:, i] except Exception: preds['confidence'] = np.nan id_data = stage_data[[c for c in id_cols if c in stage_data.columns]] if any(c in stage_data.columns for c in id_cols) else None if id_data is not None: stage_results = pd.concat([id_data.reset_index(drop=True), preds.reset_index(drop=True)], axis=1) else: stage_results = preds.reset_index(drop=True) stage_results.insert(0, 'individual', range(1, len(stage_results) + 1)) stage_results.to_csv(args.stage_output, index=False) print(f"Wrote stage predictions for {stage_results.shape[0]} rows to {args.stage_output}") # Symptom predictions (independent input/output) try: symptom_data = pd.read_csv(args.symptoms_input) except Exception as e: print(f"ERROR: Could not read symptom input CSV '{args.symptoms_input}': {e}") sys.exit(1) date_col = args.date_col if args.date_col else ('date' if 'date' in symptom_data.columns else None) fore = SymptomCycleForecaster(cycle_length=args.cycle_length) symptom_results = fore.predict_df(symptom_data, lmp_col=args.lmp_col, date_col=date_col) symptom_results.to_csv(args.symptoms_output, index=False) print(f"Wrote symptom predictions for {symptom_results.shape[0]} rows to {args.symptoms_output}") sys.exit(0) os.makedirs(OUTPUT_DIR, exist_ok=True) # -------------------------- # Utility: make column names unique (pandas allows duplicates) # -------------------------- def make_unique_columns(cols): counts = {} new_cols = [] for c in cols: if c not in counts: counts[c] = 0 new_cols.append(c) else: counts[c] += 1 new_cols.append(f"{c}__dup{counts[c]}") return new_cols # -------------------------- # 1. Load data # -------------------------- # Guard: only run training and heavy data loading when script is executed directly if __name__ == '__main__' and os.path.exists(DATA_PATH): print("Loading data from:", DATA_PATH) df = pd.read_csv(DATA_PATH, sep='\t', low_memory=False) print("Original shape:", df.shape) # make column names unique for robust selection (duplicates -> __dup1, __dup2) df.columns = make_unique_columns(df.columns.tolist()) # Show a few columns (first 40) so user can inspect if running interactively print("First 40 column names (for inspection):") print(df.columns[:40].tolist()) # -------------------------- # 2. Identify candidate self-reported features and menopause-stage variable # -------------------------- # Heuristic patterns for self-report variables (adjust if you'd like to include additional columns) selfreport_patterns = [ r'VMS', r'HOT', r'HOTFL', r'NIGHTSW', r'SLEEP', r'CESD', r'STRESS', r'MOOD', r'SMOK', r'ALCOH', r'ALCO', r'EXER', r'PHYS', r'ACTIV', r'VAG', r'URINE', r'SEX', r'PAIN', r'FATIG', r'IRRIT', r'ANXI', r'DEPRESS', r'BLEED', r'MENSE', r'PERIOD', r'LMP', r'HOTSW', r'QOL', r'DRY' ] # Exclude laboratory/biomarker variable name patterns biomarker_exclude = r'E2|FSH|GLUCOSE|CHOLESTEROL|HDL|TRIG|SHBG|DHEAS|INSULIN|BMD|BP|HEIGHT|WEIGHT' upper_cols = {c: c.upper() for c in df.columns} selfreport_cols = [] for orig, up in upper_cols.items(): for pat in selfreport_patterns: if re.search(pat, up): # skip biomarkers that match both symptom patterns and biomarker patterns if re.search(biomarker_exclude, up): continue selfreport_cols.append(orig) break # Also include basic self-report demographics commonly present (AGE, RACE) for dem in ['AGE7','AGE','RACE','LANGINT7','LANGINT']: if dem in df.columns and dem not in selfreport_cols: selfreport_cols.append(dem) # Deduplicate preserving order seen=set() selfreport_cols = [x for x in selfreport_cols if not (x in seen or seen.add(x))] print(f"Found {len(selfreport_cols)} candidate self-reported columns (first 50 shown):") print(selfreport_cols[:50]) # Identify menopause-stage variable heuristically stage_cand_patterns = [r'MENOSTAT', r'MENOSYM', r'MENO', r'MENOP', r'MENST', r'MENSE', r'STATUS'] stage_candidates = [c for c in df.columns if any(re.search(p, c, flags=re.I) for p in stage_cand_patterns)] print("Stage-like candidate columns (found):", stage_candidates[:10]) # If user provided an override for stage column via CLI, honor it (if present in data) if STAGE_COL_OVERRIDE: if STAGE_COL_OVERRIDE in df.columns: print(f"Using overridden stage column: {STAGE_COL_OVERRIDE}") stage_candidates = [STAGE_COL_OVERRIDE] else: print(f"Warning: requested stage column '{STAGE_COL_OVERRIDE}' not present in data; proceeding with heuristic detection") # If multiple candidates choose one with few unique values (likely coded categories) stage_col = None for c in stage_candidates: nunique = df[c].nunique(dropna=True) # prefer small discrete sets (e.g., 2-6 categories) if 1 < nunique <= 20: stage_col = c break if stage_col is None and stage_candidates: # fallback to first candidate stage_col = stage_candidates[0] if stage_col is None: raise RuntimeError("No menopause-stage-like column found automatically. Inspect df.columns and pick the proper variable (e.g., MENOSTAT).") print("Selected stage column:", stage_col, " unique values:", df[stage_col].nunique(dropna=True)) print("Sample raw counts:") print(df[stage_col].value_counts(dropna=False).head(20)) # -------------------------- # 3. Create working dataframe with self-report features + stage # -------------------------- use_cols = [stage_col] + [c for c in selfreport_cols if c in df.columns and c != stage_col] data = df[use_cols].copy() # Replace common SWAN missing codes with NaN missing_values = [-9, -8, -7, -1, '.', 'NA', 'N/A', '999', 9999] data.replace(missing_values, np.nan, inplace=True) # Try convert object columns to numeric when appropriate for col in data.columns: if data[col].dtype == object: coerced = pd.to_numeric(data[col].astype(str).str.strip(), errors='coerce') # If many values become numeric, use numeric version; else leave as categorical string if coerced.notna().sum() > len(coerced) * 0.5: data[col] = coerced else: # replace blank/'nan' strings with np.nan data[col] = data[col].astype(str).str.strip().replace({'nan': np.nan, '': np.nan}) # -------------------------- # 4. Map stage variable to standardized labels {pre, peri, post} # *Important*: this is heuristic. Verify using the codebook and adjust mapping if needed. # -------------------------- def map_stage_to_labels(series): # Try textual mapping first s = series.copy() try: uniques = [str(x).lower() for x in s.dropna().unique()] except Exception: uniques = [] # textual mapping if any(x in ['pre','premenopausal','premenopause','pre-menopausal'] for x in uniques): s = s.astype(str).str.lower() s = s.replace({'premenopausal':'pre','pre-menopausal':'pre','pre-menopause':'pre','pre':'pre'}) s = s.replace({'perimenopausal':'peri','peri-menopausal':'peri','peri':'peri'}) s = s.replace({'postmenopausal':'post','post-menopausal':'post','post':'post'}) return s.map({'pre':'pre','peri':'peri','post':'post'}) # numeric mapping heuristic: map min->pre, median->peri, max->post num = pd.to_numeric(s, errors='coerce') num_unique = sorted(num.dropna().unique().tolist()) if len(num_unique) >= 3: mapping = {num_unique[0]:'pre', num_unique[len(num_unique)//2]:'peri', num_unique[-1]:'post'} return num.map(mapping) # 2-level mapping (assume 1->pre,2->post) or fallback if len(num_unique) == 2: return num.map({num_unique[0]:'pre', num_unique[1]:'post'}) # If not mappable, return NaN series return pd.Series([np.nan]*len(s), index=s.index) mapped_stage = map_stage_to_labels(data[stage_col]) # If mapping failed (too many NaNs), attempt a simple bleed-based heuristic (last menstrual period) if mapped_stage.isna().mean() > 0.9: bleed_candidates = [c for c in data.columns if re.search(r'LMP|BLEED|PERIOD|MENSTR', c, flags=re.I)] if len(bleed_candidates) > 0: lcol = bleed_candidates[0] lnum = pd.to_numeric(data[lcol], errors='coerce') mapped_stage = pd.Series(index=data.index, dtype=object) mapped_stage[lnum.isna()] = 'post' mapped_stage[lnum.notna()] = 'pre' else: raise RuntimeError("Failed to map stage variable to pre/peri/post and no bleed/LMP variable found.") data['_menopause_stage'] = mapped_stage print("Mapped stage counts (after heuristic mapping):") print(data['_menopause_stage'].value_counts(dropna=False)) # Drop rows with no mapped stage data = data[~data['_menopause_stage'].isna()].copy() print("Rows available for modeling:", data.shape[0]) # -------------------------- # 5. Feature selection for modeling # Keep only self-report fields with enough non-missing values and >1 unique value # -------------------------- feature_candidates = [c for c in use_cols if c != stage_col] selected_features = [] for c in feature_candidates: non_null = data[c].notna().sum() # require at least 2% nonmissing or minimum 50 observations if non_null < max(50, len(data) * 0.02): continue if data[c].nunique(dropna=True) <= 1: continue selected_features.append(c) print("Number of features selected for modeling:", len(selected_features)) print("First 40 features (if many):", selected_features[:40]) # -------------------------- # 6. Preprocessing pipeline # Numeric features: impute mean # Categorical features: impute most frequent + one-hot encode # Normalization: only added for logistic regression pipeline (tree-based RF doesn't need scaling) # -------------------------- numeric_feats = [c for c in selected_features if pd.api.types.is_numeric_dtype(data[c])] cat_feats = [c for c in selected_features if c not in numeric_feats] from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer numeric_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='mean')) ]) # Construct OneHotEncoder in a sklearn-version compatible way try: ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False) except TypeError: # older sklearn versions use `sparse` kwarg ohe = OneHotEncoder(handle_unknown='ignore', sparse=False) categorical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', ohe) ]) preprocessor = ColumnTransformer(transformers=[ ('num', numeric_transformer, numeric_feats), ('cat', categorical_transformer, cat_feats) ], remainder='drop') # Two pipelines: RandomForest (no scaling) and LogisticRegression (scaling) rf_pipeline = Pipeline(steps=[ ('pre', preprocessor), ('rf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)) ]) lr_pipeline = Pipeline(steps=[ ('pre', preprocessor), ('scaler', StandardScaler()), ('lr', LogisticRegression(solver='lbfgs', max_iter=1000)) ]) # -------------------------- # 7. Prepare data, train/test split # -------------------------- X = data[selected_features].copy() y = data['_menopause_stage'].copy().astype(str) # values: 'pre','peri','post' (hopefully) print("Target class distribution:") print(y.value_counts()) # Stratified split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42) print("Train / test sizes:", X_train.shape[0], X_test.shape[0]) # -------------------------- # 8. Train models # -------------------------- print("Training RandomForest...") rf_pipeline.fit(X_train, y_train) print("RandomForest trained.") print("Training LogisticRegression (multinomial)...") lr_pipeline.fit(X_train, y_train) print("LogisticRegression trained.") # -------------------------- # 9. Predictions and assessment # -------------------------- def evaluate_model(pipeline, X_test, y_test, model_name, output_dir=OUTPUT_DIR): y_pred = pipeline.predict(X_test) report = classification_report(y_test, y_pred) print(f"\n=== {model_name} Classification Report ===\n{report}") # confusion matrix labels = sorted(y_test.unique()) cm = confusion_matrix(y_test, y_pred, labels=labels) print(f"{model_name} Confusion Matrix (rows=true, cols=pred):\nLabels: {labels}\n{cm}") # Save classification report with open(os.path.join(output_dir, f"classification_report_{model_name.replace(' ','_')}.txt"), "w") as f: f.write(report) # Plot confusion matrix with matplotlib fig, ax = plt.subplots(figsize=(5,4)) im = ax.imshow(cm, interpolation='nearest') ax.set_xticks(range(len(labels))); ax.set_xticklabels(labels, rotation=45) ax.set_yticks(range(len(labels))); ax.set_yticklabels(labels) ax.set_title(f"{model_name} Confusion Matrix") for i in range(cm.shape[0]): for j in range(cm.shape[1]): ax.text(j, i, format(cm[i, j], 'd'), ha="center", va="center") plt.tight_layout() plt.savefig(os.path.join(output_dir, f"{model_name.replace(' ','_')}_confusion_matrix.png")) # Show plots only when requested; otherwise close to free resources (non-interactive default) if SHOW_PLOTS: plt.show() else: plt.close('all') return y_pred, cm rf_pred, rf_cm = evaluate_model(rf_pipeline, X_test, y_test, "RandomForest") lr_pred, lr_cm = evaluate_model(lr_pipeline, X_test, y_test, "LogisticRegression") # 10. Feature importance # Extract feature names after preprocessing (numerics stay same; categorical one-hot create names) pre = rf_pipeline.named_steps['pre'] # Get numeric feature names feature_names = [] if len(numeric_feats) > 0: feature_names.extend(numeric_feats) if len(cat_feats) > 0: # Get onehot output names ohe = pre.named_transformers_['cat'].named_steps['onehot'] try: cat_onehot_names = ohe.get_feature_names_out(cat_feats) except Exception: # fallback cat_onehot_names = [] feature_names.extend(cat_onehot_names.tolist() if hasattr(cat_onehot_names, 'tolist') else list(cat_onehot_names)) # Feature importances from RandomForest rf_model = rf_pipeline.named_steps['rf'] importances = rf_model.feature_importances_ imp_df = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values('importance', ascending=False) imp_df.to_csv(os.path.join(OUTPUT_DIR, "rf_feature_importances.csv"), index=False) print("\nTop 20 RF feature importances:") print(imp_df.head(20).to_string(index=False)) # Permutation importance (robust) print("Computing permutation importance (this can take some time)...") perm = permutation_importance(rf_pipeline, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1) perm_idx = perm.importances_mean.argsort()[::-1] perm_df = pd.DataFrame({ 'feature': np.array(feature_names)[perm_idx], 'importance_mean': perm.importances_mean[perm_idx], 'importance_std': perm.importances_std[perm_idx] }) perm_df.to_csv(os.path.join(OUTPUT_DIR, "rf_permutation_importances.csv"), index=False) print("Top 20 permutation importances:") print(perm_df.head(20).to_string(index=False)) # Plot RF top features topn = min(20, imp_df.shape[0]) fig, ax = plt.subplots(figsize=(8,6)) ax.barh(imp_df['feature'].head(topn)[::-1], imp_df['importance'].head(topn)[::-1]) ax.set_title("RandomForest: Top feature importances") ax.set_xlabel("Importance") plt.tight_layout() plt.savefig(os.path.join(OUTPUT_DIR, "rf_top_feature_importances.png")) if SHOW_PLOTS: plt.show() else: plt.close('all') # 11. ROC curves (one-vs-rest) if predict_proba available def plot_multiclass_roc(pipeline, X_test, y_test, model_name): if not hasattr(pipeline, "predict_proba"): print(f"{model_name} has no predict_proba; skipping ROC plot.") return # Must use same class order as pipeline's final estimator final_est = pipeline.named_steps[list(pipeline.named_steps.keys())[-1]] classes = final_est.classes_ y_test_bin = label_binarize(y_test, classes=classes) y_score = pipeline.predict_proba(X_test) for i, cls in enumerate(classes): fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i]) roc_auc = auc(fpr, tpr) plt.figure() plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}") plt.plot([0,1],[0,1], linestyle='--') plt.title(f"{model_name} ROC for class {cls}") plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate") plt.legend(loc='lower right') plt.savefig(os.path.join(OUTPUT_DIR, f"{model_name.replace(' ','_')}_ROC_{cls}.png")) if SHOW_PLOTS: plt.show() else: plt.close('all') print("Plotting ROC curves for RandomForest and LogisticRegression (if available)...") if __name__ == '__main__' and 'rf_pipeline' in globals(): plot_multiclass_roc(rf_pipeline, X_test, y_test, "RandomForest") plot_multiclass_roc(lr_pipeline, X_test, y_test, "LogisticRegression") # ========================================================================================== # 12. FORECASTING MODULE: Predict menopausal stage for new individuals # ========================================================================================== class MenopauseForecast: """ Forecasting module for predicting menopausal stage (pre/peri/post) given self-reported features. This class encapsulates the trained models and preprocessing pipeline to make predictions on new data with the same features used during training. """ def __init__(self, rf_pipeline, lr_pipeline, feature_names, stage_classes): """ Initialize the forecaster with trained pipelines. Parameters: ----------- rf_pipeline : sklearn Pipeline Trained RandomForest pipeline lr_pipeline : sklearn Pipeline Trained LogisticRegression pipeline feature_names : list List of feature column names used for training stage_classes : list List of possible menopause stage classes (e.g., ['pre', 'peri', 'post']) """ self.rf_pipeline = rf_pipeline self.lr_pipeline = lr_pipeline self.feature_names = feature_names self.stage_classes = stage_classes self.models = { 'RandomForest': rf_pipeline, 'LogisticRegression': lr_pipeline } def predict_single(self, feature_dict, model='RandomForest', return_proba=True): """ Predict menopausal stage for a single individual. Parameters: ----------- feature_dict : dict Dictionary with feature names as keys and values for prediction. Example: {'HOT7': 1, 'SLEEP7': 2, 'CESD': 10, ...} model : str Which model to use for prediction: 'RandomForest' or 'LogisticRegression' return_proba : bool If True, return prediction probabilities; otherwise just the class label Returns: -------- dict : Contains 'stage', 'confidence', and optionally 'probabilities' """ if model not in self.models: raise ValueError(f"Model '{model}' not found. Available: {list(self.models.keys())}") # Create DataFrame with single row, reindex to match training features X = pd.DataFrame([feature_dict]).reindex(columns=self.feature_names, fill_value=np.nan) pipeline = self.models[model] prediction = pipeline.predict(X)[0] result = { 'stage': prediction, 'model': model, 'confidence': None, 'probabilities': None } if return_proba: try: proba = pipeline.predict_proba(X)[0] result['confidence'] = float(np.max(proba)) result['probabilities'] = { cls: float(prob) for cls, prob in zip(pipeline.named_steps[list(pipeline.named_steps.keys())[-1]].classes_, proba) } except Exception as e: print(f"Warning: Could not compute probabilities: {e}") return result def predict_batch(self, df, model='RandomForest', return_proba=True): """ Predict menopausal stage for multiple individuals (batch prediction). Parameters: ----------- df : pd.DataFrame DataFrame with feature columns matching training features. Missing values will be handled by the preprocessing pipeline. model : str Which model to use: 'RandomForest' or 'LogisticRegression' return_proba : bool If True, return prediction probabilities Returns: -------- pd.DataFrame : Contains 'predicted_stage', 'confidence', and probability columns """ if model not in self.models: raise ValueError(f"Model '{model}' not found. Available: {list(self.models.keys())}") # Reindex to match training features X = df.reindex(columns=self.feature_names, fill_value=np.nan) pipeline = self.models[model] predictions = pipeline.predict(X) result_df = pd.DataFrame({ 'predicted_stage': predictions, 'model': model }) if return_proba: try: proba = pipeline.predict_proba(X) final_est = pipeline.named_steps[list(pipeline.named_steps.keys())[-1]] result_df['confidence'] = np.max(proba, axis=1) # Add probability column for each class for i, cls in enumerate(final_est.classes_): result_df[f'prob_{cls}'] = proba[:, i] except Exception as e: print(f"Warning: Could not compute probabilities: {e}") return result_df def compare_models(self, feature_dict): """ Compare predictions from both RandomForest and LogisticRegression models. Parameters: ----------- feature_dict : dict Feature values for the individual Returns: -------- dict : Predictions and probabilities from both models """ rf_result = self.predict_single(feature_dict, model='RandomForest', return_proba=True) lr_result = self.predict_single(feature_dict, model='LogisticRegression', return_proba=True) return { 'RandomForest': rf_result, 'LogisticRegression': lr_result } def get_feature_info(self): """Return information about required features.""" return { 'num_features': len(self.feature_names), 'feature_names': self.feature_names, 'stage_classes': self.stage_classes } def create_forecast_example(): """ Create an example forecast instance and demonstrate usage. This function is robust: if the training artifacts (`rf_pipeline`, `lr_pipeline`, `selected_features`, `X_train`, `X_test`) are not available in memory (e.g., when the module is imported in another process), it attempts to load saved pipelines from `OUTPUT_DIR` via `load_forecast_model()` and uses placeholder inputs. """ print("\n" + "="*80) print("FORECASTING MODULE EXAMPLE: Predicting Menopausal Stage") print("="*80) # Determine pipelines and feature metadata (use in-memory if available, else load from disk) try: _rf = rf_pipeline _lr = lr_pipeline _features = selected_features _stage_classes = sorted(y.unique().tolist()) has_training = True except NameError: print("Training artifacts not present in memory; attempting to load from disk...") try: _loaded = load_forecast_model(OUTPUT_DIR) _rf = _loaded.rf_pipeline _lr = _loaded.lr_pipeline _features = _loaded.feature_names _stage_classes = _loaded.stage_classes has_training = False except Exception as e: raise RuntimeError(f"Failed to initialize forecaster from disk: {e}") forecast = MenopauseForecast( rf_pipeline=_rf, lr_pipeline=_lr, feature_names=_features, stage_classes=_stage_classes ) print(f"\nForecaster initialized with {len(_features)} features") print(f"Predicting stages: {_stage_classes}") # Example 1: Single individual prediction print("\n--- Example 1: Predict for a single individual ---") example_individual = {} n_example_feats = min(10, len(_features)) if has_training: for feat in _features[:n_example_feats]: try: example_individual[feat] = float(pd.to_numeric(X_train[feat], errors='coerce').median()) except Exception: # Fallback to mode or NaN try: example_individual[feat] = X_train[feat].mode().iloc[0] except Exception: example_individual[feat] = np.nan else: # No training DF available; provide NaN placeholders to let pipeline impute for feat in _features[:n_example_feats]: example_individual[feat] = np.nan result = forecast.predict_single(example_individual, model='RandomForest', return_proba=True) print(f"Predicted stage: {result.get('stage')}") print(f"Confidence: {result.get('confidence'):.3f}" if result.get('confidence') is not None else "Confidence: None") if result.get('probabilities'): print("Stage probabilities:") for stage, prob in sorted(result['probabilities'].items()): print(f" {stage}: {prob:.3f}") # Example 2: Compare models print("\n--- Example 2: Compare RandomForest vs LogisticRegression ---") comparison = forecast.compare_models(example_individual) for model_name, cres in comparison.items(): print(f"\n{model_name}:") print(f" Predicted stage: {cres.get('stage')}") print(f" Confidence: {cres.get('confidence'):.3f}" if cres.get('confidence') is not None else " Confidence: None") # Example 3: Batch prediction on a small sample (either X_test if available or placeholder rows) print("\n--- Example 3: Batch prediction (small sample) ---") if has_training: try: test_sample = X_test.iloc[:5].copy() batch_results = forecast.predict_batch(test_sample, model='RandomForest', return_proba=True) print(batch_results.to_string()) except Exception as e: print(f"Batch prediction failed on training sample: {e}") else: # Create a small placeholder DataFrame with feature columns filled with NaN placeholder = pd.DataFrame([{f: np.nan for f in _features[:n_example_feats]}]) batch_results = forecast.predict_batch(placeholder, model='RandomForest', return_proba=True) print(batch_results.to_string()) return forecast def save_forecast_model(forecast_instance, output_dir=OUTPUT_DIR): """ Save the forecast model instance for later use (optional: can use joblib for production). For now, saves metadata about features and classes that can be used to reinitialize the forecaster. Parameters: ----------- forecast_instance : MenopauseForecast The forecaster to save output_dir : str Directory to save metadata """ import json import joblib metadata = { 'feature_names': forecast_instance.feature_names, 'stage_classes': forecast_instance.stage_classes, 'num_features': len(forecast_instance.feature_names) } # Save metadata as JSON with open(os.path.join(output_dir, 'forecast_metadata.json'), 'w') as f: json.dump(metadata, f, indent=2) # Save trained pipelines using joblib (allows full reuse) joblib.dump(forecast_instance.rf_pipeline, os.path.join(output_dir, 'rf_pipeline.pkl')) joblib.dump(forecast_instance.lr_pipeline, os.path.join(output_dir, 'lr_pipeline.pkl')) print(f"Forecast model saved to {output_dir}") print(f" - forecast_metadata.json") print(f" - rf_pipeline.pkl") print(f" - lr_pipeline.pkl") def load_forecast_model(output_dir=OUTPUT_DIR): """ Load a previously saved forecast model. Parameters: ----------- output_dir : str Directory containing saved models Returns: -------- MenopauseForecast : The loaded forecaster """ import json import joblib # Load metadata with open(os.path.join(output_dir, 'forecast_metadata.json'), 'r') as f: metadata = json.load(f) # Load pipelines rf_pipeline_loaded = joblib.load(os.path.join(output_dir, 'rf_pipeline.pkl')) lr_pipeline_loaded = joblib.load(os.path.join(output_dir, 'lr_pipeline.pkl')) # Recreate forecaster forecast = MenopauseForecast( rf_pipeline=rf_pipeline_loaded, lr_pipeline=lr_pipeline_loaded, feature_names=metadata['feature_names'], stage_classes=metadata['stage_classes'] ) print(f"Forecast model loaded from {output_dir}") return forecast # Initialize and demonstrate the forecasting module # Symptom cycle forecasting (defined earlier near CLI args) class SymptomCycleForecaster: """ Predicts the probability of hot flashes and mood changes within a menstrual cycle based on last menstrual period (LMP) date and target date. """ def __init__(self, cycle_length=28, hot_mu=14, hot_sigma=5, mood_mu=26, mood_sigma=4, base_hot=0.1, amp_hot=0.4, base_mood=0.1, amp_mood=0.45, threshold=0.5): self.cycle_length = cycle_length self.hot_mu = hot_mu self.hot_sigma = hot_sigma self.mood_mu = mood_mu self.mood_sigma = mood_sigma self.base_hot = base_hot self.amp_hot = amp_hot self.base_mood = base_mood self.amp_mood = amp_mood self.threshold = threshold def _parse_lmp(self, lmp, reference_date=None): """Parse LMP input which may be a full date string or an integer day-of-month.""" if pd.isna(lmp): return None # If numeric day (int-like), construct a date in the same month as reference_date try: lmp_int = int(lmp) if reference_date is None: ref = pd.Timestamp(datetime.today()).to_pydatetime() else: ref = pd.to_datetime(reference_date, errors='coerce') if pd.isna(ref): ref = pd.Timestamp(datetime.today()).to_pydatetime() else: ref = ref.to_pydatetime() # Clamp day to valid range day = max(1, min(lmp_int, 28)) return datetime(ref.year, ref.month, day) except Exception: # Try parse as full date string try: return pd.to_datetime(lmp, errors='coerce').to_pydatetime() except Exception: return None def compute_cycle_day(self, lmp, target_date=None): """Return 1-based cycle day (1..cycle_length) or None if cannot compute.""" if target_date is None: tdate = datetime.today() else: tdate = pd.to_datetime(target_date, errors='coerce') if pd.isna(tdate): tdate = datetime.today() else: tdate = tdate.to_pydatetime() lmp_date = self._parse_lmp(lmp, reference_date=tdate) if lmp_date is None: return None delta = (tdate - lmp_date).days if delta < 0: # If LMP is in the future, assume it refers to previous cycle (subtract one month) lmp_date = lmp_date - timedelta(days=self.cycle_length) delta = (tdate - lmp_date).days cycle_day = (delta % self.cycle_length) + 1 return int(cycle_day) def _gauss_prob(self, day, mu, sigma, base, amp): if day is None: return np.nan val = base + amp * np.exp(-0.5 * ((day - mu) / float(sigma)) ** 2) return float(min(max(val, 0.0), 1.0)) def predict_single(self, lmp, target_date=None): day = self.compute_cycle_day(lmp, target_date=target_date) hot_p = self._gauss_prob(day, self.hot_mu, self.hot_sigma, self.base_hot, self.amp_hot) mood_p = self._gauss_prob(day, self.mood_mu, self.mood_sigma, self.base_mood, self.amp_mood) return { 'cycle_day': day, 'hotflash_prob': hot_p, 'hotflash_pred': hot_p >= self.threshold if not np.isnan(hot_p) else None, 'mood_prob': mood_p, 'mood_pred': mood_p >= self.threshold if not np.isnan(mood_p) else None } def predict_df(self, df, lmp_col='LMP', date_col=None, menopause_stage_col=None): df = df.copy() results = df.apply( lambda row: pd.Series(self.predict_single( lmp=row.get(lmp_col), target_date=(row.get(date_col) if date_col is not None else None) )), axis=1 ) out = pd.concat([df.reset_index(drop=True), results.reset_index(drop=True)], axis=1) return out def predict_symptoms_from_csv(input_csv, output_csv, lmp_col='LMP', date_col=None, menopause_stage_col=None, cycle_length=28, **kwargs): """Read input CSV, predict hot flashes/mood by cycle day, and write output CSV.""" df = pd.read_csv(input_csv) fore = SymptomCycleForecaster(cycle_length=cycle_length) out_df = fore.predict_df(df, lmp_col=lmp_col, date_col=date_col, menopause_stage_col=menopause_stage_col) out_df.to_csv(output_csv, index=False) # Print a brief summary print(f"Wrote symptom predictions for {out_df.shape[0]} rows to {output_csv}") print("Sample predictions (first 5 rows):") print(out_df[[lmp_col] + ['cycle_day','hotflash_prob','hotflash_pred','mood_prob','mood_pred']].head().to_string()) # CLI integration: run symptom prediction if requested if __name__ == '__main__': # If symptom prediction requested via CLI, run fast-path and exit if args.predict_symptoms: if not args.symptoms_input or not args.symptoms_output: print("Error: --symptoms-input and --symptoms-output are required when --predict-symptoms is set") sys.exit(1) else: predict_symptoms_from_csv( input_csv=args.symptoms_input, output_csv=args.symptoms_output, lmp_col=args.lmp_col, date_col=args.date_col, cycle_length=args.cycle_length ) sys.exit(0) # Dual predictions are handled in the early fast-path above to avoid training. # Default behavior: create demo forecaster, save trained models and show summary forecast_model = create_forecast_example() save_forecast_model(forecast_model) print("\n" + "="*80) print("FORECASTING MODULE SUMMARY") print("="*80) print(""" The MenopauseForecast class provides three main methods for predictions: 1. predict_single(feature_dict, model='RandomForest', return_proba=True) - Predict stage for one individual given feature values - Returns predicted stage and confidence scores 2. predict_batch(df, model='RandomForest', return_proba=True) - Predict stages for multiple individuals - Returns DataFrame with predictions and probabilities for each stage 3. compare_models(feature_dict) - Compare predictions from both RandomForest and LogisticRegression - Useful for validating model agreement Usage in your own code: from menopause import load_forecast_model # Load the trained forecaster forecast = load_forecast_model('swan_ml_output') # Predict for an individual features = {'HOT7': 1, 'SLEEP7': 2, 'CESD': 10, ...} result = forecast.predict_single(features, model='RandomForest') # Predict for multiple individuals results_df = forecast.predict_batch(your_dataframe, model='RandomForest') """) # ========================================================================================== # 13. CSV INPUT/OUTPUT FUNCTIONALITY: Batch prediction from CSV files # ========================================================================================== def predict_from_csv(input_csv, forecast_instance, output_csv=None, model='RandomForest', output_dir=OUTPUT_DIR): """ Read individual data from CSV, make predictions, and save results. Parameters: ----------- input_csv : str Path to input CSV file with feature columns for individuals CSV should have columns matching training features (or subset) forecast_instance : MenopauseForecast The trained forecaster instance output_csv : str Path to output CSV file (default: input_csv with '_predictions' appended) model : str Which model to use ('RandomForest' or 'LogisticRegression') output_dir : str Directory to save results (for metadata) Returns: -------- pd.DataFrame : Results with predictions and confidence scores Example: -------- forecast = load_forecast_model('swan_ml_output') results = predict_from_csv('individuals.csv', forecast) # Results saved to 'individuals_predictions.csv' """ import os # Read input CSV print(f"Reading input data from: {input_csv}") try: data = pd.read_csv(input_csv) except FileNotFoundError: print(f"ERROR: File not found: {input_csv}") return None n_samples = len(data) print(f"Loaded {n_samples} individuals") # Identify feature columns (exclude ID columns) id_cols = ['ID', 'id', 'SWANID', 'individual', 'Individual', 'subject', 'Subject'] feature_cols = [c for c in data.columns if c not in id_cols] # Separate ID columns from features id_data = data[[c for c in id_cols if c in data.columns]] if any(c in data.columns for c in id_cols) else None # Make predictions print(f"Making predictions using {model}...") predictions = forecast_instance.predict_batch( data[feature_cols], model=model, return_proba=True ) # Combine with original data if id_data is not None: results = pd.concat([id_data.reset_index(drop=True), predictions.reset_index(drop=True)], axis=1) else: results = predictions.reset_index(drop=True) # Add individual index if no ID column if id_data is None: results.insert(0, 'individual', range(1, n_samples + 1)) # Set output file path if output_csv is None: base, ext = os.path.splitext(input_csv) output_csv = f"{base}_predictions{ext}" # Save results print(f"Saving predictions to: {output_csv}") results.to_csv(output_csv, index=False) return results def predict_dual_from_csv(stage_input_csv, stage_output_csv, symptoms_input_csv, symptoms_output_csv, forecast_dir=OUTPUT_DIR, model='RandomForest', lmp_col='LMP', date_col=None, cycle_length=28): """Run menopause stage prediction and symptom-cycle prediction using separate input and output files for each model. Returns: -------- dict : {'stage': stage_results_df, 'symptoms': symptom_results_df} """ print(f"Reading stage input data from: {stage_input_csv}") try: stage_data = pd.read_csv(stage_input_csv) except FileNotFoundError: print(f"ERROR: File not found: {stage_input_csv}") return None # Load forecast model try: forecast = load_forecast_model(output_dir=forecast_dir) except Exception as e: print(f"ERROR: Could not load forecast model from '{forecast_dir}': {e}") return None # Identify id and feature columns id_cols = ['ID', 'id', 'SWANID', 'individual', 'Individual', 'subject', 'Subject'] feature_cols = [c for c in stage_data.columns if c not in id_cols] # Make stage predictions print(f"Making menopause stage predictions using {model}...") stage_preds = forecast.predict_batch(stage_data[feature_cols], model=model, return_proba=True) id_data = stage_data[[c for c in id_cols if c in stage_data.columns]] if any(c in stage_data.columns for c in id_cols) else None if id_data is not None: stage_results = pd.concat([id_data.reset_index(drop=True), stage_preds.reset_index(drop=True)], axis=1) else: stage_results = stage_preds.reset_index(drop=True) stage_results.insert(0, 'individual', range(1, len(stage_results) + 1)) # Default stage output path if not provided if stage_output_csv is None: base, ext = os.path.splitext(stage_input_csv) stage_output_csv = f"{base}_stage_predictions{ext}" print(f"Saving stage predictions to: {stage_output_csv}") stage_results.to_csv(stage_output_csv, index=False) # Symptom predictions (independent) print(f"Reading symptom input data from: {symptoms_input_csv}") try: symptom_data = pd.read_csv(symptoms_input_csv) except FileNotFoundError: print(f"ERROR: File not found: {symptoms_input_csv}") return None if date_col is None and 'date' in symptom_data.columns: date_col = 'date' fore = SymptomCycleForecaster(cycle_length=cycle_length) symptom_results = fore.predict_df(symptom_data, lmp_col=lmp_col, date_col=date_col) # Default symptom output path if not provided if symptoms_output_csv is None: base, ext = os.path.splitext(symptoms_input_csv) symptoms_output_csv = f"{base}_symptom_predictions{ext}" print(f"Saving symptom predictions to: {symptoms_output_csv}") symptom_results.to_csv(symptoms_output_csv, index=False) return {'stage': stage_results, 'symptoms': symptom_results} def predict_combined_from_csv(*args, **kwargs): """Deprecated: combined predictions are removed in favor of separate input/output files.""" raise ValueError( "Combined predictions are deprecated. Use predict_dual_from_csv() with separate stage and symptom input/output files." ) def create_demo_csv(forecast_instance, num_individuals=5, output_file='demo_individuals.csv', output_dir=OUTPUT_DIR): """ Create a demo CSV file with sample individuals for testing predictions. Uses statistics from the training data to generate realistic feature values. Parameters: ----------- forecast_instance : MenopauseForecast The trained forecaster (used to get feature names) num_individuals : int Number of demo individuals to generate output_file : str Path to output CSV file output_dir : str Directory to save demo file Returns: -------- str : Path to created CSV file """ # Get feature names from forecaster feature_names = forecast_instance.feature_names # Create demo data with random realistic values np.random.seed(42) demo_data = {} # Add individual ID demo_data['individual'] = [f"Individual_{i+1}" for i in range(num_individuals)] # Generate random feature values (using ranges typical for SWAN data) for feat in feature_names: # Random values between 1 and 5 (typical Likert scale for SWAN) demo_data[feat] = np.random.randint(1, 6, size=num_individuals) # Create DataFrame demo_df = pd.DataFrame(demo_data) # Create full path full_path = os.path.join(output_dir, output_file) # Ensure output directory exists os.makedirs(output_dir, exist_ok=True) # Save demo file demo_df.to_csv(full_path, index=False) print(f"✅ Demo CSV created: {full_path}") print(f" Individuals: {num_individuals}") print(f" Features: {len(feature_names)}") print(f" File shape: {demo_df.shape}") return full_path def add_performance_metrics_to_csv(results_df, y_test=None, model_name='RandomForest'): """ Add performance metrics to predictions CSV. If true labels available, computes accuracy, precision, recall, F1-score. Parameters: ----------- results_df : pd.DataFrame Results dataframe with predictions y_test : array-like True labels (optional) model_name : str Name of model used Returns: -------- pd.DataFrame : Results with metrics appended """ if y_test is not None: from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score acc = accuracy_score(y_test, results_df['predicted_stage']) prec = precision_score(y_test, results_df['predicted_stage'], average='weighted', zero_division=0) recall = recall_score(y_test, results_df['predicted_stage'], average='weighted', zero_division=0) f1 = f1_score(y_test, results_df['predicted_stage'], average='weighted', zero_division=0) # Add as metadata comment at bottom metrics_text = f"\n# Performance Metrics ({model_name})\n" metrics_text += f"# Accuracy: {acc:.3f}\n" metrics_text += f"# Precision (weighted): {prec:.3f}\n" metrics_text += f"# Recall (weighted): {recall:.3f}\n" metrics_text += f"# F1-Score (weighted): {f1:.3f}\n" return results_df, metrics_text return results_df, None