Spaces:
Sleeping
Sleeping
| """ | |
| SWAN Menopause Stage Prediction (pre / peri / post) using self-reported features | |
| Uses only the uploaded SWAN TSV file (no synthetic data, no external datasets). | |
| Outputs: | |
| - saved artifacts in ./swan_ml_output/ | |
| - documentation.md summarizing steps and results | |
| - optional CSV outputs for stage predictions and symptom predictions (separate files) | |
| Notes: | |
| - The script attempts to locate a menopause-stage column heuristically (common names like MENOSTAT, | |
| MENO, MENOSYM, MENOP etc.). Please verify the chosen stage column against the codebook. | |
| - Self-reported features are identified using name-pattern heuristics (VMS/HOT/SLEEP/CESD/STRESS/MOOD/SMOK/ALCOH/EXER/PHYS/VAG/URINE/SEX/PAIN etc). | |
| - Duplicate column names are tolerantly handled by renaming duplicates. | |
| """ | |
| import os, re, sys, argparse | |
| import numpy as np | |
| import pandas as pd | |
| import importlib | |
| import sklearn | |
| import matplotlib | |
| # Use a non-interactive backend by default so the script can run on servers/CI | |
| matplotlib.use('Agg') | |
| import matplotlib.pyplot as plt | |
| from datetime import datetime, timedelta | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.impute import SimpleImputer | |
| from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc | |
| from sklearn.inspection import permutation_importance | |
| from sklearn.preprocessing import label_binarize | |
# --------------------------
# Environment / CLI defaults
# --------------------------
# Defaults may be overridden by environment variables or CLI args below
DATA_PATH = os.environ.get('MENOPAUSE_DATA', "ICPSR_31901/DS0001/31901-0001-Data.tsv")
OUTPUT_DIR = os.environ.get('MENOPAUSE_OUT', "swan_ml_output")
# Parse CLI args (safe to parse here for a script; this will be ignored when imported)
parser = argparse.ArgumentParser(description='Run menopause stage prediction pipeline')
parser.add_argument('--data', '-d', default=DATA_PATH, help='Path to SWAN TSV file')
parser.add_argument('--output', '-o', default=OUTPUT_DIR, help='Output directory for artifacts')
parser.add_argument('--show', action='store_true', help='Show plots interactively (default: off)')
parser.add_argument('--stage-col', default=None, help='Override detected stage column name')
# Symptom cycle prediction CLI options
parser.add_argument('--predict-symptoms', action='store_true', help='Run symptom cycle prediction from CSV input')
parser.add_argument('--symptoms-input', default=None, help='Input CSV for symptom predictions')
parser.add_argument('--symptoms-output', default=None, help='Output CSV to write symptom predictions')
parser.add_argument('--lmp-col', default='LMP', help='Column name used as LMP (date string or day-of-month integer)')
parser.add_argument('--date-col', default=None, help='Column name for target date; if omitted, uses today or VISIT date if present')
parser.add_argument('--cycle-length', type=int, default=28, help='Average cycle length in days for symptom prediction')
# Dual prediction CLI options (separate inputs/outputs for each model)
parser.add_argument('--predict-dual', action='store_true', help='Run stage + symptom predictions using separate input/output files')
parser.add_argument('--stage-input', default=None, help='Input CSV for menopause stage predictions')
parser.add_argument('--stage-output', default=None, help='Output CSV for menopause stage predictions')
parser.add_argument('--stage-model', default='RandomForest', help='Model for stage prediction: RandomForest or LogisticRegression')
# NOTE: this default is captured from OUTPUT_DIR *before* the CLI override below.
parser.add_argument('--forecast-dir', default=OUTPUT_DIR, help='Directory containing saved forecast models')
parser.add_argument('--menopause-stage-col', default=None, help='(Deprecated) Kept for backward compatibility; symptom forecasting no longer uses menopause stage')
# Parse CLI args only when script is run directly; when imported (e.g., during testing), avoid consuming external argv
if __name__ == '__main__':
    args = parser.parse_args()
else:
    # Use defaults when module is imported to avoid interfering with external CLI (pytest, etc.)
    args = parser.parse_args([])
# Effective runtime configuration (CLI values override the env-derived defaults)
DATA_PATH = args.data
OUTPUT_DIR = args.output
SHOW_PLOTS = bool(args.show)
STAGE_COL_OVERRIDE = args.stage_col
# If user only wants symptom-cycle predictions, provide a fast-path before loading the large TSV.
# Define a light-weight cycle-based symptom forecaster and CSV helper so users can run predictions
# without training the menopause models (useful for small CSV inputs).
class SymptomCycleForecaster:
    """Light-weight, rule-based forecaster for cycle-linked symptoms.

    Models the probability of hot flashes and mood symptoms as Gaussian
    bumps over the menstrual cycle day (1..cycle_length), so predictions
    can be made from an LMP (last menstrual period) value alone, without
    training the heavyweight menopause-stage models.
    """

    def __init__(self, cycle_length=28, hot_mu=14, hot_sigma=5, mood_mu=26, mood_sigma=4,
                 base_hot=0.1, amp_hot=0.4, base_mood=0.1, amp_mood=0.45, threshold=0.5):
        # Each symptom probability is base + amp * Gaussian(day; mu, sigma),
        # clipped to [0, 1]; `threshold` converts probability to a boolean prediction.
        self.cycle_length = cycle_length
        self.hot_mu = hot_mu
        self.hot_sigma = hot_sigma
        self.mood_mu = mood_mu
        self.mood_sigma = mood_sigma
        self.base_hot = base_hot
        self.amp_hot = amp_hot
        self.base_mood = base_mood
        self.amp_mood = amp_mood
        self.threshold = threshold

    def _parse_lmp(self, lmp, reference_date=None):
        """Parse *lmp* into a datetime, or return None when unparseable.

        Accepts either an integer day-of-month (interpreted within the
        reference date's month, clamped to 1..28) or a date-like string.
        """
        if pd.isna(lmp):
            return None
        try:
            lmp_int = int(lmp)
            if reference_date is None:
                ref = pd.Timestamp(datetime.today()).to_pydatetime()
            else:
                ref = pd.to_datetime(reference_date, errors='coerce')
                if pd.isna(ref):
                    ref = pd.Timestamp(datetime.today()).to_pydatetime()
                else:
                    ref = ref.to_pydatetime()
            # Clamp to 28 so datetime() below is valid in every month.
            day = max(1, min(lmp_int, 28))
            return datetime(ref.year, ref.month, day)
        except Exception:
            try:
                parsed = pd.to_datetime(lmp, errors='coerce')
                # BUGFIX: errors='coerce' yields NaT (not an exception) for
                # unparseable input; previously NaT leaked out of this method
                # and crashed compute_cycle_day at int(nan). Return None instead.
                if pd.isna(parsed):
                    return None
                return parsed.to_pydatetime()
            except Exception:
                return None

    def compute_cycle_day(self, lmp, target_date=None):
        """Return the 1-based cycle day at *target_date*, or None if LMP is unusable."""
        if target_date is None:
            tdate = datetime.today()
        else:
            tdate = pd.to_datetime(target_date, errors='coerce')
            if pd.isna(tdate):
                tdate = datetime.today()
            else:
                tdate = tdate.to_pydatetime()
        lmp_date = self._parse_lmp(lmp, reference_date=tdate)
        if lmp_date is None:
            return None
        delta = (tdate - lmp_date).days
        if delta < 0:
            # LMP appears to be after the target date: assume it belongs to the
            # previous cycle and shift back one full cycle length.
            lmp_date = lmp_date - timedelta(days=self.cycle_length)
            delta = (tdate - lmp_date).days
        cycle_day = (delta % self.cycle_length) + 1
        return int(cycle_day)

    def _gauss_prob(self, day, mu, sigma, base, amp):
        """Gaussian-bump probability for *day*; NaN when the day is unknown."""
        if day is None:
            return np.nan
        val = base + amp * np.exp(-0.5 * ((day - mu) / float(sigma)) ** 2)
        return float(min(max(val, 0.0), 1.0))

    def predict_single(self, lmp, target_date=None):
        """Predict symptom probabilities for one LMP/date pair.

        Returns a dict with 'cycle_day', '<symptom>_prob' (float or NaN) and
        '<symptom>_pred' (bool, or None when the probability is NaN).
        """
        day = self.compute_cycle_day(lmp, target_date=target_date)
        hot_p = self._gauss_prob(day, self.hot_mu, self.hot_sigma, self.base_hot, self.amp_hot)
        mood_p = self._gauss_prob(day, self.mood_mu, self.mood_sigma, self.base_mood, self.amp_mood)
        return {
            'cycle_day': day,
            'hotflash_prob': hot_p,
            'hotflash_pred': hot_p >= self.threshold if not np.isnan(hot_p) else None,
            'mood_prob': mood_p,
            'mood_pred': mood_p >= self.threshold if not np.isnan(mood_p) else None
        }

    def predict_df(self, df, lmp_col='LMP', date_col=None, menopause_stage_col=None):
        """Row-wise prediction over a DataFrame.

        *menopause_stage_col* is accepted for backward compatibility only; the
        cycle-based forecaster does not use menopause stage.
        """
        df = df.copy()
        results = df.apply(
            lambda row: pd.Series(self.predict_single(
                lmp=row.get(lmp_col),
                target_date=(row.get(date_col) if date_col is not None else None)
            )), axis=1
        )
        out = pd.concat([df.reset_index(drop=True), results.reset_index(drop=True)], axis=1)
        return out
def predict_symptoms_from_csv(input_csv, output_csv, lmp_col='LMP', date_col=None,
                              menopause_stage_col=None, cycle_length=28, **kwargs):
    """Read a CSV, attach cycle-based symptom predictions, and write a new CSV.

    Parameters mirror the CLI flags; *menopause_stage_col* is accepted for
    backward compatibility but unused by the forecaster, and any extra
    keyword arguments are ignored.
    """
    frame = pd.read_csv(input_csv)
    forecaster = SymptomCycleForecaster(cycle_length=cycle_length)
    predictions = forecaster.predict_df(frame, lmp_col=lmp_col, date_col=date_col,
                                        menopause_stage_col=menopause_stage_col)
    predictions.to_csv(output_csv, index=False)
    print(f"Wrote symptom predictions for {predictions.shape[0]} rows to {output_csv}")
    print("Sample predictions (first 5 rows):")
    preview_cols = [lmp_col, 'cycle_day', 'hotflash_prob', 'hotflash_pred', 'mood_prob', 'mood_pred']
    print(predictions[preview_cols].head().to_string())
# If the user requested only symptom predictions from a CSV, run fast-path and exit
# before the expensive TSV load and model training below.
if args.predict_symptoms:
    if not args.symptoms_input or not args.symptoms_output:
        print("Error: --symptoms-input and --symptoms-output are required when --predict-symptoms is set")
        sys.exit(1)
    else:
        predict_symptoms_from_csv(
            input_csv=args.symptoms_input,
            output_csv=args.symptoms_output,
            lmp_col=args.lmp_col,
            date_col=args.date_col,
            menopause_stage_col=None,  # deprecated: the forecaster ignores stage
            cycle_length=args.cycle_length
        )
        sys.exit(0)
# Fast-path for dual predictions (separate stage + symptoms) without loading large TSV
if args.predict_dual:
    if not args.stage_input or not args.stage_output or not args.symptoms_input or not args.symptoms_output:
        print("Error: --stage-input, --stage-output, --symptoms-input, and --symptoms-output are required when --predict-dual is set")
        sys.exit(1)
    # Load saved pipeline directly via joblib to avoid initializing full training pipeline
    import joblib
    model_file = os.path.join(args.forecast_dir, 'rf_pipeline.pkl' if args.stage_model == 'RandomForest' else 'lr_pipeline.pkl')
    try:
        pipeline = joblib.load(model_file)
    except Exception as e:
        print(f"ERROR: Could not load model file '{model_file}': {e}")
        print("Please train the models first (run the script without --predict-dual) or provide correct --forecast-dir")
        sys.exit(1)
    # Stage predictions
    try:
        stage_data = pd.read_csv(args.stage_input)
    except Exception as e:
        print(f"ERROR: Could not read stage input CSV '{args.stage_input}': {e}")
        sys.exit(1)
    # Columns treated as identifiers (carried through to output, never used as features)
    id_cols = ['ID', 'id', 'SWANID', 'individual', 'Individual', 'subject', 'Subject']
    feature_cols = [c for c in stage_data.columns if c not in id_cols]
    # Attempt to load feature metadata so we can reindex inputs to expected features
    import json
    metadata_path = os.path.join(args.forecast_dir, 'forecast_metadata.json')
    try:
        with open(metadata_path, 'r') as f:
            metadata = json.load(f)
        expected_features = metadata.get('feature_names', feature_cols)
    except Exception:
        # Metadata missing/unreadable: fall back to the CSV's own non-id columns
        expected_features = feature_cols
    # Reindex so column presence/order matches what the pipeline was trained on;
    # missing features become NaN and are handled by the pipeline's imputers.
    X = stage_data.reindex(columns=expected_features, fill_value=np.nan)
    preds = pd.DataFrame({'predicted_stage': pipeline.predict(X), 'model': args.stage_model})
    try:
        proba = pipeline.predict_proba(X)
        # The last named step of a sklearn Pipeline is the final estimator
        final_est = pipeline.named_steps[list(pipeline.named_steps.keys())[-1]]
        preds['confidence'] = np.max(proba, axis=1)
        for i, cls in enumerate(final_est.classes_):
            preds[f'prob_{cls}'] = proba[:, i]
    except Exception:
        # Estimator may not support predict_proba; record unknown confidence
        preds['confidence'] = np.nan
    id_data = stage_data[[c for c in id_cols if c in stage_data.columns]] if any(c in stage_data.columns for c in id_cols) else None
    if id_data is not None:
        stage_results = pd.concat([id_data.reset_index(drop=True), preds.reset_index(drop=True)], axis=1)
    else:
        stage_results = preds.reset_index(drop=True)
        # No identifier columns supplied: synthesize a 1-based 'individual' index
        stage_results.insert(0, 'individual', range(1, len(stage_results) + 1))
    stage_results.to_csv(args.stage_output, index=False)
    print(f"Wrote stage predictions for {stage_results.shape[0]} rows to {args.stage_output}")
    # Symptom predictions (independent input/output)
    try:
        symptom_data = pd.read_csv(args.symptoms_input)
    except Exception as e:
        print(f"ERROR: Could not read symptom input CSV '{args.symptoms_input}': {e}")
        sys.exit(1)
    # Prefer an explicit --date-col; otherwise use a 'date' column when present
    date_col = args.date_col if args.date_col else ('date' if 'date' in symptom_data.columns else None)
    fore = SymptomCycleForecaster(cycle_length=args.cycle_length)
    symptom_results = fore.predict_df(symptom_data, lmp_col=args.lmp_col, date_col=date_col)
    symptom_results.to_csv(args.symptoms_output, index=False)
    print(f"Wrote symptom predictions for {symptom_results.shape[0]} rows to {args.symptoms_output}")
    sys.exit(0)
# Ensure the artifact directory exists before any training outputs are written
os.makedirs(OUTPUT_DIR, exist_ok=True)
# --------------------------
# Utility: make column names unique (pandas allows duplicates)
# --------------------------
def make_unique_columns(cols):
    """Return *cols* with repeated names suffixed as name__dup1, name__dup2, ...

    The first occurrence of each name is kept unchanged; pandas permits
    duplicate column labels, which breaks label-based selection downstream.
    """
    seen = {}
    result = []
    for name in cols:
        occurrence = seen.get(name)
        if occurrence is None:
            seen[name] = 0
            result.append(name)
        else:
            seen[name] = occurrence + 1
            result.append(f"{name}__dup{occurrence + 1}")
    return result
# --------------------------
# 1. Load data
# --------------------------
# Guard: only run training and heavy data loading when script is executed directly
if __name__ == '__main__' and os.path.exists(DATA_PATH):
    print("Loading data from:", DATA_PATH)
    # low_memory=False: read the whole TSV in one pass so dtypes are inferred consistently
    df = pd.read_csv(DATA_PATH, sep='\t', low_memory=False)
    print("Original shape:", df.shape)
    # make column names unique for robust selection (duplicates -> __dup1, __dup2)
    df.columns = make_unique_columns(df.columns.tolist())
    # Show a few columns (first 40) so user can inspect if running interactively
    print("First 40 column names (for inspection):")
    print(df.columns[:40].tolist())
# --------------------------
# 2. Identify candidate self-reported features and menopause-stage variable
# --------------------------
# Heuristic patterns for self-report variables (adjust if you'd like to include additional columns)
selfreport_patterns = [
    r'VMS', r'HOT', r'HOTFL', r'NIGHTSW', r'SLEEP', r'CESD', r'STRESS', r'MOOD',
    r'SMOK', r'ALCOH', r'ALCO', r'EXER', r'PHYS', r'ACTIV', r'VAG', r'URINE', r'SEX', r'PAIN',
    r'FATIG', r'IRRIT', r'ANXI', r'DEPRESS', r'BLEED', r'MENSE', r'PERIOD', r'LMP',
    r'HOTSW', r'QOL', r'DRY'
]
# Exclude laboratory/biomarker variable name patterns
biomarker_exclude = r'E2|FSH|GLUCOSE|CHOLESTEROL|HDL|TRIG|SHBG|DHEAS|INSULIN|BMD|BP|HEIGHT|WEIGHT'
# Match case-insensitively by comparing uppercased names against uppercase patterns
upper_cols = {c: c.upper() for c in df.columns}
selfreport_cols = []
for orig, up in upper_cols.items():
    for pat in selfreport_patterns:
        if re.search(pat, up):
            # skip biomarkers that match both symptom patterns and biomarker patterns
            if re.search(biomarker_exclude, up):
                continue
            selfreport_cols.append(orig)
            break
# Also include basic self-report demographics commonly present (AGE, RACE)
for dem in ['AGE7', 'AGE', 'RACE', 'LANGINT7', 'LANGINT']:
    if dem in df.columns and dem not in selfreport_cols:
        selfreport_cols.append(dem)
# Deduplicate preserving order (set.add returns None, so the condition keeps first occurrences)
seen = set()
selfreport_cols = [x for x in selfreport_cols if not (x in seen or seen.add(x))]
print(f"Found {len(selfreport_cols)} candidate self-reported columns (first 50 shown):")
print(selfreport_cols[:50])
# Identify menopause-stage variable heuristically
stage_cand_patterns = [r'MENOSTAT', r'MENOSYM', r'MENO', r'MENOP', r'MENST', r'MENSE', r'STATUS']
stage_candidates = [c for c in df.columns if any(re.search(p, c, flags=re.I) for p in stage_cand_patterns)]
print("Stage-like candidate columns (found):", stage_candidates[:10])
# If user provided an override for stage column via CLI, honor it (if present in data)
if STAGE_COL_OVERRIDE:
    if STAGE_COL_OVERRIDE in df.columns:
        print(f"Using overridden stage column: {STAGE_COL_OVERRIDE}")
        stage_candidates = [STAGE_COL_OVERRIDE]
    else:
        print(f"Warning: requested stage column '{STAGE_COL_OVERRIDE}' not present in data; proceeding with heuristic detection")
# If multiple candidates choose one with few unique values (likely coded categories)
stage_col = None
for c in stage_candidates:
    nunique = df[c].nunique(dropna=True)
    # prefer small discrete sets (e.g., 2-6 categories)
    if 1 < nunique <= 20:
        stage_col = c
        break
if stage_col is None and stage_candidates:
    # fallback to first candidate
    stage_col = stage_candidates[0]
if stage_col is None:
    raise RuntimeError("No menopause-stage-like column found automatically. Inspect df.columns and pick the proper variable (e.g., MENOSTAT).")
print("Selected stage column:", stage_col, " unique values:", df[stage_col].nunique(dropna=True))
print("Sample raw counts:")
print(df[stage_col].value_counts(dropna=False).head(20))
# --------------------------
# 3. Create working dataframe with self-report features + stage
# --------------------------
use_cols = [stage_col] + [c for c in selfreport_cols if c in df.columns and c != stage_col]
data = df[use_cols].copy()
# Replace common SWAN missing codes with NaN
# NOTE(review): '999' is listed as a string while 9999 is an int — a numeric
# column containing 999 would NOT be replaced; verify against the codebook.
missing_values = [-9, -8, -7, -1, '.', 'NA', 'N/A', '999', 9999]
data.replace(missing_values, np.nan, inplace=True)
# Try convert object columns to numeric when appropriate
for col in data.columns:
    if data[col].dtype == object:
        coerced = pd.to_numeric(data[col].astype(str).str.strip(), errors='coerce')
        # If many values become numeric, use numeric version; else leave as categorical string
        if coerced.notna().sum() > len(coerced) * 0.5:
            data[col] = coerced
        else:
            # replace blank/'nan' strings with np.nan
            data[col] = data[col].astype(str).str.strip().replace({'nan': np.nan, '': np.nan})
| # -------------------------- | |
| # 4. Map stage variable to standardized labels {pre, peri, post} | |
| # *Important*: this is heuristic. Verify using the codebook and adjust mapping if needed. | |
| # -------------------------- | |
def map_stage_to_labels(series):
    """Map a raw stage column to standardized labels {'pre', 'peri', 'post'}.

    Heuristic, tried in order:
    1. Textual values ('premenopausal', 'peri', ...) are normalized to the
       three canonical labels.
    2. Numeric codes with >= 3 levels: min -> 'pre', middle -> 'peri',
       max -> 'post' (other levels become NaN).
    3. Exactly two numeric levels: low -> 'pre', high -> 'post'.
    Unmappable values become NaN. Verify the result against the codebook.
    """
    # Try textual mapping first
    s = series.copy()
    try:
        uniques = [str(x).lower() for x in s.dropna().unique()]
    except Exception:
        uniques = []
    # textual mapping
    if any(x in ['pre', 'premenopausal', 'premenopause', 'pre-menopausal'] for x in uniques):
        s = s.astype(str).str.lower()
        # BUGFIX: the trigger list above accepts the no-hyphen '...menopause'
        # spellings, but the old replace tables omitted them, so those values
        # silently mapped to NaN. All spelling variants are now covered.
        s = s.replace({'premenopausal': 'pre', 'pre-menopausal': 'pre', 'pre-menopause': 'pre',
                       'premenopause': 'pre', 'pre': 'pre'})
        s = s.replace({'perimenopausal': 'peri', 'peri-menopausal': 'peri', 'peri-menopause': 'peri',
                       'perimenopause': 'peri', 'peri': 'peri'})
        s = s.replace({'postmenopausal': 'post', 'post-menopausal': 'post', 'post-menopause': 'post',
                       'postmenopause': 'post', 'post': 'post'})
        return s.map({'pre': 'pre', 'peri': 'peri', 'post': 'post'})
    # numeric mapping heuristic: map min->pre, median->peri, max->post
    num = pd.to_numeric(s, errors='coerce')
    num_unique = sorted(num.dropna().unique().tolist())
    if len(num_unique) >= 3:
        mapping = {num_unique[0]: 'pre', num_unique[len(num_unique) // 2]: 'peri', num_unique[-1]: 'post'}
        return num.map(mapping)
    # 2-level mapping (assume 1->pre,2->post) or fallback
    if len(num_unique) == 2:
        return num.map({num_unique[0]: 'pre', num_unique[1]: 'post'})
    # If not mappable, return NaN series
    return pd.Series([np.nan] * len(s), index=s.index)
mapped_stage = map_stage_to_labels(data[stage_col])
# If mapping failed (too many NaNs), attempt a simple bleed-based heuristic (last menstrual period)
if mapped_stage.isna().mean() > 0.9:
    bleed_candidates = [c for c in data.columns if re.search(r'LMP|BLEED|PERIOD|MENSTR', c, flags=re.I)]
    if len(bleed_candidates) > 0:
        lcol = bleed_candidates[0]
        lnum = pd.to_numeric(data[lcol], errors='coerce')
        mapped_stage = pd.Series(index=data.index, dtype=object)
        # Heuristic: missing LMP/bleed value -> assume post-menopausal; present -> pre.
        # Note this coarse fallback never assigns 'peri'.
        mapped_stage[lnum.isna()] = 'post'
        mapped_stage[lnum.notna()] = 'pre'
    else:
        raise RuntimeError("Failed to map stage variable to pre/peri/post and no bleed/LMP variable found.")
data['_menopause_stage'] = mapped_stage
print("Mapped stage counts (after heuristic mapping):")
print(data['_menopause_stage'].value_counts(dropna=False))
# Drop rows with no mapped stage
data = data[~data['_menopause_stage'].isna()].copy()
print("Rows available for modeling:", data.shape[0])
# --------------------------
# 5. Feature selection for modeling
#    Keep only self-report fields with enough non-missing values and >1 unique value
# --------------------------
feature_candidates = [c for c in use_cols if c != stage_col]
selected_features = []
for c in feature_candidates:
    non_null = data[c].notna().sum()
    # require at least 2% nonmissing or minimum 50 observations
    if non_null < max(50, len(data) * 0.02):
        continue
    # constant columns carry no signal
    if data[c].nunique(dropna=True) <= 1:
        continue
    selected_features.append(c)
print("Number of features selected for modeling:", len(selected_features))
print("First 40 features (if many):", selected_features[:40])
# --------------------------
# 6. Preprocessing pipeline
#    Numeric features: impute mean
#    Categorical features: impute most frequent + one-hot encode
#    Normalization: only added for logistic regression pipeline (tree-based RF doesn't need scaling)
# --------------------------
numeric_feats = [c for c in selected_features if pd.api.types.is_numeric_dtype(data[c])]
cat_feats = [c for c in selected_features if c not in numeric_feats]
# (Pipeline and ColumnTransformer are already imported at the top of the file;
# the redundant mid-file re-imports were removed.)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])
# Construct OneHotEncoder in a sklearn-version compatible way:
# sklearn >= 1.2 renamed `sparse` to `sparse_output`.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # older sklearn versions use `sparse` kwarg
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', ohe)
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_feats),
    ('cat', categorical_transformer, cat_feats)
], remainder='drop')
# Two pipelines: RandomForest (no scaling) and LogisticRegression (scaling)
rf_pipeline = Pipeline(steps=[
    ('pre', preprocessor),
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
])
lr_pipeline = Pipeline(steps=[
    ('pre', preprocessor),
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(solver='lbfgs', max_iter=1000))
])
# --------------------------
# 7. Prepare data, train/test split
# --------------------------
X = data[selected_features].copy()
y = data['_menopause_stage'].copy().astype(str)  # values: 'pre','peri','post' (hopefully)
print("Target class distribution:")
print(y.value_counts())
# Stratified split keeps the class proportions equal in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
print("Train / test sizes:", X_train.shape[0], X_test.shape[0])
# --------------------------
# 8. Train models
# --------------------------
print("Training RandomForest...")
rf_pipeline.fit(X_train, y_train)
print("RandomForest trained.")
print("Training LogisticRegression (multinomial)...")
lr_pipeline.fit(X_train, y_train)
print("LogisticRegression trained.")
# --------------------------
# 9. Predictions and assessment
# --------------------------
def evaluate_model(pipeline, X_test, y_test, model_name, output_dir=OUTPUT_DIR):
    """Evaluate a fitted pipeline on the held-out split.

    Prints and saves the classification report, prints the confusion matrix,
    and writes an annotated confusion-matrix PNG into *output_dir*.
    Returns (predictions, confusion_matrix).
    """
    predictions = pipeline.predict(X_test)
    report = classification_report(y_test, predictions)
    print(f"\n=== {model_name} Classification Report ===\n{report}")
    # Confusion matrix with a fixed, sorted label order
    labels = sorted(y_test.unique())
    cm = confusion_matrix(y_test, predictions, labels=labels)
    print(f"{model_name} Confusion Matrix (rows=true, cols=pred):\nLabels: {labels}\n{cm}")
    # Persist the text report alongside the plots
    report_path = os.path.join(output_dir, f"classification_report_{model_name.replace(' ','_')}.txt")
    with open(report_path, "w") as f:
        f.write(report)
    # Render the confusion matrix as an annotated heatmap
    fig, ax = plt.subplots(figsize=(5, 4))
    ax.imshow(cm, interpolation='nearest')
    ax.set_xticks(range(len(labels)))
    ax.set_xticklabels(labels, rotation=45)
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels)
    ax.set_title(f"{model_name} Confusion Matrix")
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            ax.text(col, row, format(cm[row, col], 'd'), ha="center", va="center")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"{model_name.replace(' ','_')}_confusion_matrix.png"))
    # Interactive display is opt-in; otherwise release figure memory (non-interactive default)
    if SHOW_PLOTS:
        plt.show()
    else:
        plt.close('all')
    return predictions, cm
# Evaluate both trained models on the held-out test split
rf_pred, rf_cm = evaluate_model(rf_pipeline, X_test, y_test, "RandomForest")
lr_pred, lr_cm = evaluate_model(lr_pipeline, X_test, y_test, "LogisticRegression")
# 10. Feature importance
# Extract feature names after preprocessing (numerics stay same; categorical one-hot create names)
pre = rf_pipeline.named_steps['pre']
# Get numeric feature names
feature_names = []
if len(numeric_feats) > 0:
    feature_names.extend(numeric_feats)
if len(cat_feats) > 0:
    # Get onehot output names
    ohe = pre.named_transformers_['cat'].named_steps['onehot']
    try:
        cat_onehot_names = ohe.get_feature_names_out(cat_feats)
    except Exception:
        # fallback: older sklearn without get_feature_names_out yields no categorical names
        cat_onehot_names = []
    feature_names.extend(cat_onehot_names.tolist() if hasattr(cat_onehot_names, 'tolist') else list(cat_onehot_names))
# Feature importances from RandomForest (impurity-based, from the fitted trees)
rf_model = rf_pipeline.named_steps['rf']
importances = rf_model.feature_importances_
imp_df = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values('importance', ascending=False)
imp_df.to_csv(os.path.join(OUTPUT_DIR, "rf_feature_importances.csv"), index=False)
print("\nTop 20 RF feature importances:")
print(imp_df.head(20).to_string(index=False))
# Permutation importance (robust)
print("Computing permutation importance (this can take some time)...")
perm = permutation_importance(rf_pipeline, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)
# Sort features by mean importance, descending
perm_idx = perm.importances_mean.argsort()[::-1]
perm_df = pd.DataFrame({
    'feature': np.array(feature_names)[perm_idx],
    'importance_mean': perm.importances_mean[perm_idx],
    'importance_std': perm.importances_std[perm_idx]
})
perm_df.to_csv(os.path.join(OUTPUT_DIR, "rf_permutation_importances.csv"), index=False)
print("Top 20 permutation importances:")
print(perm_df.head(20).to_string(index=False))
# Plot RF top features (horizontal bars, most important at the top)
topn = min(20, imp_df.shape[0])
fig, ax = plt.subplots(figsize=(8,6))
ax.barh(imp_df['feature'].head(topn)[::-1], imp_df['importance'].head(topn)[::-1])
ax.set_title("RandomForest: Top feature importances")
ax.set_xlabel("Importance")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "rf_top_feature_importances.png"))
if SHOW_PLOTS:
    plt.show()
else:
    plt.close('all')
# 11. ROC curves (one-vs-rest) if predict_proba available
def plot_multiclass_roc(pipeline, X_test, y_test, model_name):
    """Plot and save one-vs-rest ROC curves (one PNG per class) for a fitted pipeline.

    Skips with a message when the pipeline lacks predict_proba. Uses the
    module-level OUTPUT_DIR and SHOW_PLOTS settings.
    """
    if not hasattr(pipeline, "predict_proba"):
        print(f"{model_name} has no predict_proba; skipping ROC plot.")
        return
    # Must use same class order as pipeline's final estimator
    final_est = pipeline.named_steps[list(pipeline.named_steps.keys())[-1]]
    classes = final_est.classes_
    # Binarize true labels so each class gets its own one-vs-rest ROC curve
    y_test_bin = label_binarize(y_test, classes=classes)
    y_score = pipeline.predict_proba(X_test)
    for i, cls in enumerate(classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        roc_auc = auc(fpr, tpr)
        plt.figure()
        plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
        # Diagonal chance line for reference
        plt.plot([0,1],[0,1], linestyle='--')
        plt.title(f"{model_name} ROC for class {cls}")
        plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
        plt.legend(loc='lower right')
        plt.savefig(os.path.join(OUTPUT_DIR, f"{model_name.replace(' ','_')}_ROC_{cls}.png"))
        if SHOW_PLOTS:
            plt.show()
        else:
            plt.close('all')
| print("Plotting ROC curves for RandomForest and LogisticRegression (if available)...") | |
| if __name__ == '__main__' and 'rf_pipeline' in globals(): | |
| plot_multiclass_roc(rf_pipeline, X_test, y_test, "RandomForest") | |
| plot_multiclass_roc(lr_pipeline, X_test, y_test, "LogisticRegression") | |
| # ========================================================================================== | |
| # 12. FORECASTING MODULE: Predict menopausal stage for new individuals | |
| # ========================================================================================== | |
class MenopauseForecast:
    """
    Forecasting module for predicting menopausal stage (pre/peri/post) given self-reported features.
    This class encapsulates the trained models and preprocessing pipeline to make predictions
    on new data with the same features used during training.
    """
    def __init__(self, rf_pipeline, lr_pipeline, feature_names, stage_classes):
        """
        Initialize the forecaster with trained pipelines.
        Parameters:
        -----------
        rf_pipeline : sklearn Pipeline
            Trained RandomForest pipeline
        lr_pipeline : sklearn Pipeline
            Trained LogisticRegression pipeline
        feature_names : list
            List of feature column names used for training
        stage_classes : list
            List of possible menopause stage classes (e.g., ['pre', 'peri', 'post'])
        """
        self.rf_pipeline = rf_pipeline
        self.lr_pipeline = lr_pipeline
        self.feature_names = feature_names
        self.stage_classes = stage_classes
        # Registry used by the predict_* methods to select a pipeline by name
        self.models = {
            'RandomForest': rf_pipeline,
            'LogisticRegression': lr_pipeline
        }
    def predict_single(self, feature_dict, model='RandomForest', return_proba=True):
        """
        Predict menopausal stage for a single individual.
        Parameters:
        -----------
        feature_dict : dict
            Dictionary with feature names as keys and values for prediction.
            Example: {'HOT7': 1, 'SLEEP7': 2, 'CESD': 10, ...}
        model : str
            Which model to use for prediction: 'RandomForest' or 'LogisticRegression'
        return_proba : bool
            If True, return prediction probabilities; otherwise just the class label
        Returns:
        --------
        dict : Contains 'stage', 'confidence', and optionally 'probabilities'
        """
        if model not in self.models:
            raise ValueError(f"Model '{model}' not found. Available: {list(self.models.keys())}")
        # Create DataFrame with single row, reindex to match training features;
        # features absent from feature_dict become NaN and are imputed by the pipeline.
        X = pd.DataFrame([feature_dict]).reindex(columns=self.feature_names, fill_value=np.nan)
        pipeline = self.models[model]
        prediction = pipeline.predict(X)[0]
        result = {
            'stage': prediction,
            'model': model,
            'confidence': None,
            'probabilities': None
        }
        if return_proba:
            try:
                proba = pipeline.predict_proba(X)[0]
                # Confidence = probability of the most likely class
                result['confidence'] = float(np.max(proba))
                # Class order comes from the pipeline's final estimator
                result['probabilities'] = {
                    cls: float(prob)
                    for cls, prob in zip(pipeline.named_steps[list(pipeline.named_steps.keys())[-1]].classes_, proba)
                }
            except Exception as e:
                print(f"Warning: Could not compute probabilities: {e}")
        return result
| def predict_batch(self, df, model='RandomForest', return_proba=True): | |
| """ | |
| Predict menopausal stage for multiple individuals (batch prediction). | |
| Parameters: | |
| ----------- | |
| df : pd.DataFrame | |
| DataFrame with feature columns matching training features. | |
| Missing values will be handled by the preprocessing pipeline. | |
| model : str | |
| Which model to use: 'RandomForest' or 'LogisticRegression' | |
| return_proba : bool | |
| If True, return prediction probabilities | |
| Returns: | |
| -------- | |
| pd.DataFrame : Contains 'predicted_stage', 'confidence', and probability columns | |
| """ | |
| if model not in self.models: | |
| raise ValueError(f"Model '{model}' not found. Available: {list(self.models.keys())}") | |
| # Reindex to match training features | |
| X = df.reindex(columns=self.feature_names, fill_value=np.nan) | |
| pipeline = self.models[model] | |
| predictions = pipeline.predict(X) | |
| result_df = pd.DataFrame({ | |
| 'predicted_stage': predictions, | |
| 'model': model | |
| }) | |
| if return_proba: | |
| try: | |
| proba = pipeline.predict_proba(X) | |
| final_est = pipeline.named_steps[list(pipeline.named_steps.keys())[-1]] | |
| result_df['confidence'] = np.max(proba, axis=1) | |
| # Add probability column for each class | |
| for i, cls in enumerate(final_est.classes_): | |
| result_df[f'prob_{cls}'] = proba[:, i] | |
| except Exception as e: | |
| print(f"Warning: Could not compute probabilities: {e}") | |
| return result_df | |
| def compare_models(self, feature_dict): | |
| """ | |
| Compare predictions from both RandomForest and LogisticRegression models. | |
| Parameters: | |
| ----------- | |
| feature_dict : dict | |
| Feature values for the individual | |
| Returns: | |
| -------- | |
| dict : Predictions and probabilities from both models | |
| """ | |
| rf_result = self.predict_single(feature_dict, model='RandomForest', return_proba=True) | |
| lr_result = self.predict_single(feature_dict, model='LogisticRegression', return_proba=True) | |
| return { | |
| 'RandomForest': rf_result, | |
| 'LogisticRegression': lr_result | |
| } | |
| def get_feature_info(self): | |
| """Return information about required features.""" | |
| return { | |
| 'num_features': len(self.feature_names), | |
| 'feature_names': self.feature_names, | |
| 'stage_classes': self.stage_classes | |
| } | |
def create_forecast_example():
    """
    Create an example forecast instance and demonstrate usage.
    This function is robust: if the training artifacts (`rf_pipeline`, `lr_pipeline`,
    `selected_features`, `X_train`, `X_test`) are not available in memory (e.g., when
    the module is imported in another process), it attempts to load saved pipelines
    from `OUTPUT_DIR` via `load_forecast_model()` and uses placeholder inputs.

    Returns
    -------
    MenopauseForecast
        The initialized forecaster used for the demonstrations.

    Raises
    ------
    RuntimeError
        If neither in-memory training artifacts nor saved pipelines on disk
        can be used to build a forecaster.
    """
    print("\n" + "="*80)
    print("FORECASTING MODULE EXAMPLE: Predicting Menopausal Stage")
    print("="*80)
    # Determine pipelines and feature metadata (use in-memory if available, else load from disk)
    try:
        # Reading these module-level globals raises NameError when the training
        # section of the script did not run in this process; that NameError is
        # the branch condition here — do not reorder these lookups.
        _rf = rf_pipeline
        _lr = lr_pipeline
        _features = selected_features
        # `y` is the training target; its unique values define the stage labels
        _stage_classes = sorted(y.unique().tolist())
        has_training = True
    except NameError:
        print("Training artifacts not present in memory; attempting to load from disk...")
        try:
            _loaded = load_forecast_model(OUTPUT_DIR)
            _rf = _loaded.rf_pipeline
            _lr = _loaded.lr_pipeline
            _features = _loaded.feature_names
            _stage_classes = _loaded.stage_classes
            has_training = False
        except Exception as e:
            # Neither in-memory artifacts nor saved pipelines are available
            raise RuntimeError(f"Failed to initialize forecaster from disk: {e}")
    forecast = MenopauseForecast(
        rf_pipeline=_rf,
        lr_pipeline=_lr,
        feature_names=_features,
        stage_classes=_stage_classes
    )
    print(f"\nForecaster initialized with {len(_features)} features")
    print(f"Predicting stages: {_stage_classes}")
    # Example 1: Single individual prediction
    print("\n--- Example 1: Predict for a single individual ---")
    example_individual = {}
    # Use at most 10 features for the demo input; the pipeline imputes the rest
    n_example_feats = min(10, len(_features))
    if has_training:
        # Build a "typical" individual from training-data medians
        for feat in _features[:n_example_feats]:
            try:
                example_individual[feat] = float(pd.to_numeric(X_train[feat], errors='coerce').median())
            except Exception:
                # Fallback to mode or NaN
                try:
                    example_individual[feat] = X_train[feat].mode().iloc[0]
                except Exception:
                    example_individual[feat] = np.nan
    else:
        # No training DF available; provide NaN placeholders to let pipeline impute
        for feat in _features[:n_example_feats]:
            example_individual[feat] = np.nan
    result = forecast.predict_single(example_individual, model='RandomForest', return_proba=True)
    print(f"Predicted stage: {result.get('stage')}")
    print(f"Confidence: {result.get('confidence'):.3f}" if result.get('confidence') is not None else "Confidence: None")
    if result.get('probabilities'):
        print("Stage probabilities:")
        for stage, prob in sorted(result['probabilities'].items()):
            print(f" {stage}: {prob:.3f}")
    # Example 2: Compare models
    print("\n--- Example 2: Compare RandomForest vs LogisticRegression ---")
    comparison = forecast.compare_models(example_individual)
    for model_name, cres in comparison.items():
        print(f"\n{model_name}:")
        print(f" Predicted stage: {cres.get('stage')}")
        print(f" Confidence: {cres.get('confidence'):.3f}" if cres.get('confidence') is not None else " Confidence: None")
    # Example 3: Batch prediction on a small sample (either X_test if available or placeholder rows)
    print("\n--- Example 3: Batch prediction (small sample) ---")
    if has_training:
        try:
            test_sample = X_test.iloc[:5].copy()
            batch_results = forecast.predict_batch(test_sample, model='RandomForest', return_proba=True)
            print(batch_results.to_string())
        except Exception as e:
            # Demo only: report and continue rather than abort the example run
            print(f"Batch prediction failed on training sample: {e}")
    else:
        # Create a small placeholder DataFrame with feature columns filled with NaN
        placeholder = pd.DataFrame([{f: np.nan for f in _features[:n_example_feats]}])
        batch_results = forecast.predict_batch(placeholder, model='RandomForest', return_proba=True)
        print(batch_results.to_string())
    return forecast
def save_forecast_model(forecast_instance, output_dir=OUTPUT_DIR):
    """
    Persist a MenopauseForecast instance to disk.

    Writes three artifacts into `output_dir`:
      - forecast_metadata.json : feature names, stage classes, feature count
      - rf_pipeline.pkl        : joblib dump of the RandomForest pipeline
      - lr_pipeline.pkl        : joblib dump of the LogisticRegression pipeline

    Parameters
    ----------
    forecast_instance : MenopauseForecast
        The forecaster to save.
    output_dir : str
        Directory to write artifacts into (created if missing).
    """
    import json
    import joblib
    # Create the target directory if needed; previously saving into a fresh
    # directory raised FileNotFoundError.
    os.makedirs(output_dir, exist_ok=True)
    metadata = {
        'feature_names': forecast_instance.feature_names,
        'stage_classes': forecast_instance.stage_classes,
        'num_features': len(forecast_instance.feature_names)
    }
    # Save metadata as JSON so the feature contract stays human-readable
    with open(os.path.join(output_dir, 'forecast_metadata.json'), 'w') as f:
        json.dump(metadata, f, indent=2)
    # Save trained pipelines using joblib (allows full reuse)
    joblib.dump(forecast_instance.rf_pipeline, os.path.join(output_dir, 'rf_pipeline.pkl'))
    joblib.dump(forecast_instance.lr_pipeline, os.path.join(output_dir, 'lr_pipeline.pkl'))
    print(f"Forecast model saved to {output_dir}")
    print(f" - forecast_metadata.json")
    print(f" - rf_pipeline.pkl")
    print(f" - lr_pipeline.pkl")
def load_forecast_model(output_dir=OUTPUT_DIR):
    """
    Rebuild a MenopauseForecast from artifacts written by save_forecast_model.

    Parameters
    ----------
    output_dir : str
        Directory containing forecast_metadata.json, rf_pipeline.pkl and
        lr_pipeline.pkl.

    Returns
    -------
    MenopauseForecast
        Forecaster wired up with the loaded pipelines and metadata.
    """
    import json
    import joblib
    # Feature names and class labels come from the JSON metadata file
    with open(os.path.join(output_dir, 'forecast_metadata.json'), 'r') as fh:
        meta = json.load(fh)
    # The fitted pipelines themselves were dumped with joblib
    rf_loaded = joblib.load(os.path.join(output_dir, 'rf_pipeline.pkl'))
    lr_loaded = joblib.load(os.path.join(output_dir, 'lr_pipeline.pkl'))
    forecast = MenopauseForecast(
        rf_pipeline=rf_loaded,
        lr_pipeline=lr_loaded,
        feature_names=meta['feature_names'],
        stage_classes=meta['stage_classes']
    )
    print(f"Forecast model loaded from {output_dir}")
    return forecast
# Forecaster initialization/demonstration happens in the __main__ block below.
# Symptom cycle forecasting (CLI arguments for it are defined earlier in the file)
class SymptomCycleForecaster:
    """
    Heuristic forecaster for the probability of hot flashes and mood changes
    within a menstrual cycle, given a last menstrual period (LMP) and a
    target date.

    Each symptom probability is a Gaussian bump over the cycle day: a
    baseline (`base_*`) plus an amplitude (`amp_*`) peaking at day `*_mu`
    with spread `*_sigma`, clipped to [0, 1]. A symptom is "predicted"
    when its probability reaches `threshold`.
    """
    def __init__(self, cycle_length=28, hot_mu=14, hot_sigma=5, mood_mu=26, mood_sigma=4,
                 base_hot=0.1, amp_hot=0.4, base_mood=0.1, amp_mood=0.45, threshold=0.5):
        # Assumed cycle length in days
        self.cycle_length = cycle_length
        # Gaussian peak day / spread for hot flashes and mood changes
        self.hot_mu = hot_mu
        self.hot_sigma = hot_sigma
        self.mood_mu = mood_mu
        self.mood_sigma = mood_sigma
        # Baseline and amplitude of each probability curve
        self.base_hot = base_hot
        self.amp_hot = amp_hot
        self.base_mood = base_mood
        self.amp_mood = amp_mood
        # Probability cutoff for the boolean *_pred outputs
        self.threshold = threshold
    def _parse_lmp(self, lmp, reference_date=None):
        """Parse an LMP given as a full date (string/datetime) or an integer day-of-month.

        Returns a datetime, or None when the input is missing or unparseable.
        (Previously an unparseable string leaked pandas NaT, which crashed
        compute_cycle_day later on.)
        """
        if pd.isna(lmp):
            return None
        try:
            lmp_int = int(lmp)
        except (TypeError, ValueError):
            # Not integer-like: try to parse as a full date; coerce failures to None
            parsed = pd.to_datetime(lmp, errors='coerce')
            if pd.isna(parsed):
                return None
            return parsed.to_pydatetime()
        # Integer day-of-month: anchor it in the same month as reference_date
        if reference_date is None:
            ref = datetime.today()
        else:
            ref = pd.to_datetime(reference_date, errors='coerce')
            if pd.isna(ref):
                ref = datetime.today()
            else:
                ref = ref.to_pydatetime()
        # Clamp the day to the actual length of the reference month
        # (the previous hard-coded cap of 28 silently moved days 29-31)
        import calendar
        last_day = calendar.monthrange(ref.year, ref.month)[1]
        day = max(1, min(lmp_int, last_day))
        return datetime(ref.year, ref.month, day)
    def compute_cycle_day(self, lmp, target_date=None):
        """Return 1-based cycle day (1..cycle_length) or None if it cannot be computed."""
        if target_date is None:
            tdate = datetime.today()
        else:
            tdate = pd.to_datetime(target_date, errors='coerce')
            if pd.isna(tdate):
                tdate = datetime.today()
            else:
                tdate = tdate.to_pydatetime()
        lmp_date = self._parse_lmp(lmp, reference_date=tdate)
        if lmp_date is None:
            return None
        delta = (tdate - lmp_date).days
        if delta < 0:
            # LMP after the target date: assume it refers to the previous cycle
            lmp_date = lmp_date - timedelta(days=self.cycle_length)
            delta = (tdate - lmp_date).days
        cycle_day = (delta % self.cycle_length) + 1
        return int(cycle_day)
    def _gauss_prob(self, day, mu, sigma, base, amp):
        """Gaussian-bump probability for a given cycle day, clipped to [0, 1]."""
        if day is None:
            return np.nan
        val = base + amp * np.exp(-0.5 * ((day - mu) / float(sigma)) ** 2)
        return float(min(max(val, 0.0), 1.0))
    def predict_single(self, lmp, target_date=None):
        """Predict hot-flash and mood-change probabilities for one LMP/date pair.

        Returns a dict with 'cycle_day', '*_prob' (float, NaN when the cycle
        day is unknown) and '*_pred' (bool, or None when unknown).
        """
        day = self.compute_cycle_day(lmp, target_date=target_date)
        hot_p = self._gauss_prob(day, self.hot_mu, self.hot_sigma, self.base_hot, self.amp_hot)
        mood_p = self._gauss_prob(day, self.mood_mu, self.mood_sigma, self.base_mood, self.amp_mood)
        return {
            'cycle_day': day,
            'hotflash_prob': hot_p,
            'hotflash_pred': hot_p >= self.threshold if not np.isnan(hot_p) else None,
            'mood_prob': mood_p,
            'mood_pred': mood_p >= self.threshold if not np.isnan(mood_p) else None
        }
    def predict_df(self, df, lmp_col='LMP', date_col=None, menopause_stage_col=None):
        """Row-wise symptom predictions for a DataFrame; appends the prediction columns.

        `menopause_stage_col` is accepted for interface compatibility but unused.
        """
        df = df.copy()
        results = df.apply(
            lambda row: pd.Series(self.predict_single(
                lmp=row.get(lmp_col),
                target_date=(row.get(date_col) if date_col is not None else None)
            )), axis=1
        )
        out = pd.concat([df.reset_index(drop=True), results.reset_index(drop=True)], axis=1)
        return out
def predict_symptoms_from_csv(input_csv, output_csv, lmp_col='LMP', date_col=None,
                              menopause_stage_col=None, cycle_length=28, **kwargs):
    """Read input CSV, predict hot flashes/mood by cycle day, and write output CSV.

    Parameters
    ----------
    input_csv, output_csv : str
        Paths of the input features CSV and the predictions CSV to write.
    lmp_col : str
        Column holding the last menstrual period (date string or day-of-month).
    date_col : str or None
        Optional column with the target date; None means "today" per row.
    menopause_stage_col : str or None
        Accepted for interface compatibility; passed through to predict_df.
    cycle_length : int
        Assumed menstrual cycle length in days.
    **kwargs
        Extra keyword arguments forwarded to SymptomCycleForecaster
        (e.g. hot_mu, mood_sigma, threshold). Previously these were
        accepted but silently ignored.
    """
    df = pd.read_csv(input_csv)
    # Forward tuning parameters instead of dropping them on the floor
    fore = SymptomCycleForecaster(cycle_length=cycle_length, **kwargs)
    out_df = fore.predict_df(df, lmp_col=lmp_col, date_col=date_col, menopause_stage_col=menopause_stage_col)
    out_df.to_csv(output_csv, index=False)
    # Print a brief summary
    print(f"Wrote symptom predictions for {out_df.shape[0]} rows to {output_csv}")
    print("Sample predictions (first 5 rows):")
    print(out_df[[lmp_col] + ['cycle_day','hotflash_prob','hotflash_pred','mood_prob','mood_pred']].head().to_string())
# CLI integration: run symptom prediction if requested
if __name__ == '__main__':
    # If symptom prediction requested via CLI, run fast-path and exit
    # NOTE(review): `args` is assumed to be the argparse Namespace produced by
    # the CLI-defaults section earlier in the file — confirm against that code.
    if args.predict_symptoms:
        if not args.symptoms_input or not args.symptoms_output:
            # Both paths are mandatory for the symptom fast-path
            print("Error: --symptoms-input and --symptoms-output are required when --predict-symptoms is set")
            sys.exit(1)
        else:
            predict_symptoms_from_csv(
                input_csv=args.symptoms_input,
                output_csv=args.symptoms_output,
                lmp_col=args.lmp_col,
                date_col=args.date_col,
                cycle_length=args.cycle_length
            )
            # Exit without running the training/demo path below
            sys.exit(0)
    # Dual predictions are handled in the early fast-path above to avoid training.
    # Default behavior: create demo forecaster, save trained models and show summary
    forecast_model = create_forecast_example()
    save_forecast_model(forecast_model)
    print("\n" + "="*80)
    print("FORECASTING MODULE SUMMARY")
    print("="*80)
    print("""
The MenopauseForecast class provides three main methods for predictions:
1. predict_single(feature_dict, model='RandomForest', return_proba=True)
- Predict stage for one individual given feature values
- Returns predicted stage and confidence scores
2. predict_batch(df, model='RandomForest', return_proba=True)
- Predict stages for multiple individuals
- Returns DataFrame with predictions and probabilities for each stage
3. compare_models(feature_dict)
- Compare predictions from both RandomForest and LogisticRegression
- Useful for validating model agreement
Usage in your own code:
from menopause import load_forecast_model
# Load the trained forecaster
forecast = load_forecast_model('swan_ml_output')
# Predict for an individual
features = {'HOT7': 1, 'SLEEP7': 2, 'CESD': 10, ...}
result = forecast.predict_single(features, model='RandomForest')
# Predict for multiple individuals
results_df = forecast.predict_batch(your_dataframe, model='RandomForest')
""")
| # ========================================================================================== | |
| # 13. CSV INPUT/OUTPUT FUNCTIONALITY: Batch prediction from CSV files | |
| # ========================================================================================== | |
def predict_from_csv(input_csv, forecast_instance, output_csv=None, model='RandomForest', output_dir=OUTPUT_DIR):
    """
    Read individual data from CSV, make predictions, and save results.

    Parameters
    ----------
    input_csv : str
        Path to input CSV file with feature columns for individuals.
        Columns should match training features (or a subset); missing
        features are imputed by the pipeline.
    forecast_instance : MenopauseForecast
        The trained forecaster instance.
    output_csv : str or None
        Path to output CSV file (default: input_csv with '_predictions' appended).
    model : str
        Which model to use ('RandomForest' or 'LogisticRegression').
    output_dir : str
        Kept for interface compatibility (metadata directory); not used here.

    Returns
    -------
    pd.DataFrame or None
        Results with predictions and confidence scores, or None when the
        input file does not exist.

    Example
    -------
    forecast = load_forecast_model('swan_ml_output')
    results = predict_from_csv('individuals.csv', forecast)
    # Results saved to 'individuals_predictions.csv'
    """
    # (the redundant local `import os` was removed; os is imported at file top)
    print(f"Reading input data from: {input_csv}")
    try:
        data = pd.read_csv(input_csv)
    except FileNotFoundError:
        print(f"ERROR: File not found: {input_csv}")
        return None
    n_samples = len(data)
    print(f"Loaded {n_samples} individuals")
    # Identifier columns are carried through to the output but not used as features
    id_cols = ['ID', 'id', 'SWANID', 'individual', 'Individual', 'subject', 'Subject']
    feature_cols = [c for c in data.columns if c not in id_cols]
    present_ids = [c for c in id_cols if c in data.columns]
    id_data = data[present_ids] if present_ids else None
    # Make predictions
    print(f"Making predictions using {model}...")
    predictions = forecast_instance.predict_batch(
        data[feature_cols],
        model=model,
        return_proba=True
    )
    # Combine predictions with the original identifiers (if any)
    if id_data is not None:
        results = pd.concat([id_data.reset_index(drop=True), predictions.reset_index(drop=True)], axis=1)
    else:
        results = predictions.reset_index(drop=True)
        # No ID column in the input: synthesize a 1-based individual index
        results.insert(0, 'individual', range(1, n_samples + 1))
    # Set output file path
    if output_csv is None:
        base, ext = os.path.splitext(input_csv)
        output_csv = f"{base}_predictions{ext}"
    # Save results
    print(f"Saving predictions to: {output_csv}")
    results.to_csv(output_csv, index=False)
    return results
def predict_dual_from_csv(stage_input_csv, stage_output_csv, symptoms_input_csv, symptoms_output_csv,
                          forecast_dir=OUTPUT_DIR, model='RandomForest', lmp_col='LMP',
                          date_col=None, cycle_length=28):
    """Run the menopause-stage model and the symptom-cycle model back to back,
    each with its own input and output CSV.

    Returns
    -------
    dict : {'stage': stage_results_df, 'symptoms': symptom_results_df},
        or None when an input file or the saved forecast model is unavailable.
    """
    # --- Stage predictions ---
    print(f"Reading stage input data from: {stage_input_csv}")
    try:
        stage_data = pd.read_csv(stage_input_csv)
    except FileNotFoundError:
        print(f"ERROR: File not found: {stage_input_csv}")
        return None
    # Load the previously saved forecast model
    try:
        forecast = load_forecast_model(output_dir=forecast_dir)
    except Exception as e:
        print(f"ERROR: Could not load forecast model from '{forecast_dir}': {e}")
        return None
    # Identifier columns are passed through; everything else is a feature
    id_cols = ['ID', 'id', 'SWANID', 'individual', 'Individual', 'subject', 'Subject']
    feature_cols = [c for c in stage_data.columns if c not in id_cols]
    print(f"Making menopause stage predictions using {model}...")
    stage_preds = forecast.predict_batch(stage_data[feature_cols], model=model, return_proba=True)
    present_ids = [c for c in id_cols if c in stage_data.columns]
    if present_ids:
        stage_results = pd.concat(
            [stage_data[present_ids].reset_index(drop=True), stage_preds.reset_index(drop=True)],
            axis=1
        )
    else:
        stage_results = stage_preds.reset_index(drop=True)
        # No ID column: synthesize a 1-based individual index
        stage_results.insert(0, 'individual', range(1, len(stage_results) + 1))
    if stage_output_csv is None:
        base, ext = os.path.splitext(stage_input_csv)
        stage_output_csv = f"{base}_stage_predictions{ext}"
    print(f"Saving stage predictions to: {stage_output_csv}")
    stage_results.to_csv(stage_output_csv, index=False)
    # --- Symptom predictions (independent of the stage model) ---
    print(f"Reading symptom input data from: {symptoms_input_csv}")
    try:
        symptom_data = pd.read_csv(symptoms_input_csv)
    except FileNotFoundError:
        print(f"ERROR: File not found: {symptoms_input_csv}")
        return None
    # Auto-detect a 'date' column when no target-date column was given
    if date_col is None and 'date' in symptom_data.columns:
        date_col = 'date'
    cycle_model = SymptomCycleForecaster(cycle_length=cycle_length)
    symptom_results = cycle_model.predict_df(symptom_data, lmp_col=lmp_col, date_col=date_col)
    if symptoms_output_csv is None:
        base, ext = os.path.splitext(symptoms_input_csv)
        symptoms_output_csv = f"{base}_symptom_predictions{ext}"
    print(f"Saving symptom predictions to: {symptoms_output_csv}")
    symptom_results.to_csv(symptoms_output_csv, index=False)
    return {'stage': stage_results, 'symptoms': symptom_results}
def predict_combined_from_csv(*args, **kwargs):
    """Deprecated: combined predictions are removed in favor of separate input/output files."""
    # Fail loudly so callers migrate to the split-file API.
    message = (
        "Combined predictions are deprecated. Use predict_dual_from_csv() with separate stage and symptom input/output files."
    )
    raise ValueError(message)
def create_demo_csv(forecast_instance, num_individuals=5, output_file='demo_individuals.csv', output_dir=OUTPUT_DIR):
    """
    Create a demo CSV file with sample individuals for testing predictions.

    Feature values are drawn uniformly from 1-5, matching the Likert-style
    scales typical of SWAN self-report items.

    Parameters
    ----------
    forecast_instance : MenopauseForecast
        The trained forecaster (used only for its feature names).
    num_individuals : int
        Number of demo individuals to generate.
    output_file : str
        Name of the CSV file to create.
    output_dir : str
        Directory to save the demo file (created if missing).

    Returns
    -------
    str : Path to the created CSV file.
    """
    # Get feature names from forecaster
    feature_names = forecast_instance.feature_names
    # A local Generator keeps the demo deterministic without reseeding the
    # process-wide numpy RNG (np.random.seed would affect unrelated code).
    rng = np.random.default_rng(42)
    demo_data = {}
    # Add individual ID
    demo_data['individual'] = [f"Individual_{i+1}" for i in range(num_individuals)]
    # Generate random feature values (using ranges typical for SWAN data)
    for feat in feature_names:
        # Random values between 1 and 5 (typical Likert scale for SWAN)
        demo_data[feat] = rng.integers(1, 6, size=num_individuals)
    demo_df = pd.DataFrame(demo_data)
    full_path = os.path.join(output_dir, output_file)
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    demo_df.to_csv(full_path, index=False)
    print(f"✅ Demo CSV created: {full_path}")
    print(f" Individuals: {num_individuals}")
    print(f" Features: {len(feature_names)}")
    print(f" File shape: {demo_df.shape}")
    return full_path
def add_performance_metrics_to_csv(results_df, y_test=None, model_name='RandomForest'):
    """
    Compute summary performance metrics for a predictions DataFrame.

    When true labels are supplied, accuracy and weighted precision/recall/F1
    are computed against the 'predicted_stage' column and formatted as a
    '#'-commented text block suitable for appending to a CSV file.

    Parameters
    ----------
    results_df : pd.DataFrame
        Results dataframe containing a 'predicted_stage' column.
    y_test : array-like, optional
        True labels aligned with results_df rows.
    model_name : str
        Name of the model used (for labeling the metrics block).

    Returns
    -------
    tuple(pd.DataFrame, str or None)
        The unchanged results_df plus the metrics text, or (results_df, None)
        when y_test is not provided. (The previous docstring incorrectly
        claimed a bare DataFrame was returned.)
    """
    # Guard clause: nothing to score without ground-truth labels
    if y_test is None:
        return results_df, None
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    preds = results_df['predicted_stage']
    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, average='weighted', zero_division=0)
    rec = recall_score(y_test, preds, average='weighted', zero_division=0)
    f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
    # Emit '#'-prefixed lines so the block can be appended to a CSV and
    # skipped by parsers that honor comment characters
    metrics_text = f"\n# Performance Metrics ({model_name})\n"
    metrics_text += f"# Accuracy: {acc:.3f}\n"
    metrics_text += f"# Precision (weighted): {prec:.3f}\n"
    metrics_text += f"# Recall (weighted): {rec:.3f}\n"
    metrics_text += f"# F1-Score (weighted): {f1:.3f}\n"
    return results_df, metrics_text