# menopause-ml / menopause.py
# (repository header: uploaded by techatcreated, v1, commit 66d45ea verified)
"""
SWAN Menopause Stage Prediction (pre / peri / post) using self-reported features
Uses only the uploaded SWAN TSV file (no synthetic data, no external datasets).
Outputs:
- saved artifacts in ./swan_ml_output/
- documentation.md summarizing steps and results
- optional CSV outputs for stage predictions and symptom predictions (separate files)
Notes:
- The script attempts to locate a menopause-stage column heuristically (common names like MENOSTAT,
MENO, MENOSYM, MENOP etc.). Please verify the chosen stage column against the codebook.
- Self-reported features are identified using name-pattern heuristics (VMS/HOT/SLEEP/CESD/STRESS/MOOD/SMOK/ALCOH/EXER/PHYS/VAG/URINE/SEX/PAIN etc).
- Duplicate column names are tolerantly handled by renaming duplicates.
"""
import os, re, sys, argparse
import numpy as np
import pandas as pd
import importlib
import sklearn
import matplotlib
# Use a non-interactive backend by default so the script can run on servers/CI
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import label_binarize
# --------------------------
# Environment / CLI defaults
# --------------------------
# Defaults may be overridden by environment variables or CLI args below
DATA_PATH = os.environ.get('MENOPAUSE_DATA', "ICPSR_31901/DS0001/31901-0001-Data.tsv")
OUTPUT_DIR = os.environ.get('MENOPAUSE_OUT', "swan_ml_output")
# Parse CLI args (safe to parse here for a script; this will be ignored when imported)
parser = argparse.ArgumentParser(description='Run menopause stage prediction pipeline')
parser.add_argument('--data', '-d', default=DATA_PATH, help='Path to SWAN TSV file')
parser.add_argument('--output', '-o', default=OUTPUT_DIR, help='Output directory for artifacts')
parser.add_argument('--show', action='store_true', help='Show plots interactively (default: off)')
parser.add_argument('--stage-col', default=None, help='Override detected stage column name')
# Symptom cycle prediction CLI options
parser.add_argument('--predict-symptoms', action='store_true', help='Run symptom cycle prediction from CSV input')
parser.add_argument('--symptoms-input', default=None, help='Input CSV for symptom predictions')
parser.add_argument('--symptoms-output', default=None, help='Output CSV to write symptom predictions')
parser.add_argument('--lmp-col', default='LMP', help='Column name used as LMP (date string or day-of-month integer)')
parser.add_argument('--date-col', default=None, help='Column name for target date; if omitted, uses today or VISIT date if present')
parser.add_argument('--cycle-length', type=int, default=28, help='Average cycle length in days for symptom prediction')
# Dual prediction CLI options (separate inputs/outputs for each model)
parser.add_argument('--predict-dual', action='store_true', help='Run stage + symptom predictions using separate input/output files')
parser.add_argument('--stage-input', default=None, help='Input CSV for menopause stage predictions')
parser.add_argument('--stage-output', default=None, help='Output CSV for menopause stage predictions')
parser.add_argument('--stage-model', default='RandomForest', help='Model for stage prediction: RandomForest or LogisticRegression')
parser.add_argument('--forecast-dir', default=OUTPUT_DIR, help='Directory containing saved forecast models')
parser.add_argument('--menopause-stage-col', default=None, help='(Deprecated) Kept for backward compatibility; symptom forecasting no longer uses menopause stage')
# Parse CLI args only when script is run directly; when imported (e.g., during testing), avoid consuming external argv
if __name__ == '__main__':
    args = parser.parse_args()
else:
    # Use defaults when module is imported to avoid interfering with external CLI (pytest, etc.)
    # parse_args([]) parses an explicit empty argv, so sys.argv is never touched on import
    args = parser.parse_args([])
# Re-bind module-level settings from the parsed (or defaulted) arguments
DATA_PATH = args.data
OUTPUT_DIR = args.output
SHOW_PLOTS = bool(args.show)
STAGE_COL_OVERRIDE = args.stage_col
# If user only wants symptom-cycle predictions, provide a fast-path before loading the large TSV
# Define a light-weight cycle-based symptom forecaster and CSV helper so users can run predictions
# without training the menopause models (useful for small CSV inputs).
class SymptomCycleForecaster:
    """Light-weight, rule-based forecaster for cycle-linked symptoms.

    Models hot-flash and mood-symptom likelihood as Gaussian bumps over the
    menstrual cycle day (computed from a last-menstrual-period value), so
    predictions can be produced from a small CSV without training the heavy
    menopause-stage models.
    """
    def __init__(self, cycle_length=28, hot_mu=14, hot_sigma=5, mood_mu=26, mood_sigma=4,
                 base_hot=0.1, amp_hot=0.4, base_mood=0.1, amp_mood=0.45, threshold=0.5):
        # cycle_length: assumed average cycle length in days
        self.cycle_length = cycle_length
        # Gaussian peak day (mu) and spread (sigma) for each symptom curve
        self.hot_mu = hot_mu
        self.hot_sigma = hot_sigma
        self.mood_mu = mood_mu
        self.mood_sigma = mood_sigma
        # baseline probability + peak amplitude for each symptom curve
        self.base_hot = base_hot
        self.amp_hot = amp_hot
        self.base_mood = base_mood
        self.amp_mood = amp_mood
        # probability cutoff used to turn probabilities into boolean predictions
        self.threshold = threshold
    def _parse_lmp(self, lmp, reference_date=None):
        """Parse an LMP value (date string or day-of-month integer) to datetime.

        Integers are interpreted as a day in the reference month (clamped to
        1..28, which is valid for every month). Returns None when the value
        cannot be interpreted.
        """
        if pd.isna(lmp):
            return None
        try:
            lmp_int = int(lmp)
            if reference_date is None:
                ref = pd.Timestamp(datetime.today()).to_pydatetime()
            else:
                ref = pd.to_datetime(reference_date, errors='coerce')
                if pd.isna(ref):
                    ref = pd.Timestamp(datetime.today()).to_pydatetime()
                else:
                    ref = ref.to_pydatetime()
            # clamp to a valid day-of-month (28 is safe for every month)
            day = max(1, min(lmp_int, 28))
            return datetime(ref.year, ref.month, day)
        except (TypeError, ValueError):
            # Not an integer: try parsing as a date string.
            parsed = pd.to_datetime(lmp, errors='coerce')
            # BUGFIX: errors='coerce' yields NaT (not an exception) for
            # unparseable input; previously NaT escaped as a "parsed" LMP and
            # broke the day arithmetic in compute_cycle_day. Treat as missing.
            if pd.isna(parsed):
                return None
            try:
                return parsed.to_pydatetime()
            except Exception:
                return None
    def compute_cycle_day(self, lmp, target_date=None):
        """Return the 1-based cycle day at target_date, or None if LMP unknown.

        target_date defaults to today; unparseable target dates also fall back
        to today rather than failing.
        """
        if target_date is None:
            tdate = datetime.today()
        else:
            tdate = pd.to_datetime(target_date, errors='coerce')
            if pd.isna(tdate):
                tdate = datetime.today()
            else:
                tdate = tdate.to_pydatetime()
        lmp_date = self._parse_lmp(lmp, reference_date=tdate)
        if lmp_date is None:
            return None
        delta = (tdate - lmp_date).days
        if delta < 0:
            # LMP appears to be after the target date (e.g. a later day of the
            # same month); assume it belongs to the previous cycle.
            lmp_date = lmp_date - timedelta(days=self.cycle_length)
            delta = (tdate - lmp_date).days
        cycle_day = (delta % self.cycle_length) + 1
        return int(cycle_day)
    def _gauss_prob(self, day, mu, sigma, base, amp):
        """Gaussian bump probability at `day`, clipped to [0, 1]; NaN if day is None."""
        if day is None:
            return np.nan
        val = base + amp * np.exp(-0.5 * ((day - mu) / float(sigma)) ** 2)
        return float(min(max(val, 0.0), 1.0))
    def predict_single(self, lmp, target_date=None):
        """Predict hot-flash / mood probabilities for one LMP + target date.

        Returns a dict with 'cycle_day', per-symptom probabilities, and boolean
        predictions thresholded at self.threshold (None when day is unknown).
        """
        day = self.compute_cycle_day(lmp, target_date=target_date)
        hot_p = self._gauss_prob(day, self.hot_mu, self.hot_sigma, self.base_hot, self.amp_hot)
        mood_p = self._gauss_prob(day, self.mood_mu, self.mood_sigma, self.base_mood, self.amp_mood)
        return {
            'cycle_day': day,
            'hotflash_prob': hot_p,
            'hotflash_pred': hot_p >= self.threshold if not np.isnan(hot_p) else None,
            'mood_prob': mood_p,
            'mood_pred': mood_p >= self.threshold if not np.isnan(mood_p) else None
        }
    def predict_df(self, df, lmp_col='LMP', date_col=None, menopause_stage_col=None):
        """Row-wise symptom prediction over a DataFrame.

        Parameters
        ----------
        df : pd.DataFrame with an LMP column (and optionally a date column)
        lmp_col : name of the LMP column
        date_col : optional name of the target-date column (None -> today)
        menopause_stage_col : deprecated, accepted for backward compatibility

        Returns the input columns with the prediction columns appended.
        """
        df = df.copy()
        results = df.apply(
            lambda row: pd.Series(self.predict_single(
                lmp=row.get(lmp_col),
                target_date=(row.get(date_col) if date_col is not None else None)
            )), axis=1
        )
        out = pd.concat([df.reset_index(drop=True), results.reset_index(drop=True)], axis=1)
        return out
def predict_symptoms_from_csv(input_csv, output_csv, lmp_col='LMP', date_col=None,
                              menopause_stage_col=None, cycle_length=28, **kwargs):
    """Read a CSV with LMP values, append cycle-based symptom predictions, and
    write the augmented table to output_csv.

    `menopause_stage_col` is accepted only for backward compatibility and is
    passed through unused; extra keyword arguments are ignored.
    """
    frame = pd.read_csv(input_csv)
    forecaster = SymptomCycleForecaster(cycle_length=cycle_length)
    predictions = forecaster.predict_df(
        frame, lmp_col=lmp_col, date_col=date_col, menopause_stage_col=menopause_stage_col
    )
    predictions.to_csv(output_csv, index=False)
    print(f"Wrote symptom predictions for {predictions.shape[0]} rows to {output_csv}")
    print("Sample predictions (first 5 rows):")
    preview_cols = [lmp_col] + ['cycle_day','hotflash_prob','hotflash_pred','mood_prob','mood_pred']
    print(predictions[preview_cols].head().to_string())
# If the user requested only symptom predictions from a CSV, run fast-path and exit
if args.predict_symptoms:
    if not args.symptoms_input or not args.symptoms_output:
        print("Error: --symptoms-input and --symptoms-output are required when --predict-symptoms is set")
        sys.exit(1)
    else:
        predict_symptoms_from_csv(
            input_csv=args.symptoms_input,
            output_csv=args.symptoms_output,
            lmp_col=args.lmp_col,
            date_col=args.date_col,
            menopause_stage_col=None,
            cycle_length=args.cycle_length
        )
    # Successful fast-path run: stop before the heavy TSV load below
    sys.exit(0)
# Fast-path for dual predictions (separate stage + symptoms) without loading large TSV
if args.predict_dual:
    if not args.stage_input or not args.stage_output or not args.symptoms_input or not args.symptoms_output:
        print("Error: --stage-input, --stage-output, --symptoms-input, and --symptoms-output are required when --predict-dual is set")
        sys.exit(1)
    # Load saved pipeline directly via joblib to avoid initializing full training pipeline
    import joblib
    model_file = os.path.join(args.forecast_dir, 'rf_pipeline.pkl' if args.stage_model == 'RandomForest' else 'lr_pipeline.pkl')
    try:
        pipeline = joblib.load(model_file)
    except Exception as e:
        print(f"ERROR: Could not load model file '{model_file}': {e}")
        print("Please train the models first (run the script without --predict-dual) or provide correct --forecast-dir")
        sys.exit(1)
    # Stage predictions
    try:
        stage_data = pd.read_csv(args.stage_input)
    except Exception as e:
        print(f"ERROR: Could not read stage input CSV '{args.stage_input}': {e}")
        sys.exit(1)
    # Columns treated as identifiers (carried through, never used as features)
    id_cols = ['ID', 'id', 'SWANID', 'individual', 'Individual', 'subject', 'Subject']
    feature_cols = [c for c in stage_data.columns if c not in id_cols]
    # Attempt to load feature metadata so we can reindex inputs to expected features
    import json
    metadata_path = os.path.join(args.forecast_dir, 'forecast_metadata.json')
    try:
        with open(metadata_path, 'r') as f:
            metadata = json.load(f)
        expected_features = metadata.get('feature_names', feature_cols)
    except Exception:
        # No metadata available: trust the input file's own columns
        expected_features = feature_cols
    # Reindex so missing training features become NaN (imputed by the pipeline)
    X = stage_data.reindex(columns=expected_features, fill_value=np.nan)
    preds = pd.DataFrame({'predicted_stage': pipeline.predict(X), 'model': args.stage_model})
    try:
        proba = pipeline.predict_proba(X)
        # The last named step of the pipeline is the fitted classifier
        final_est = pipeline.named_steps[list(pipeline.named_steps.keys())[-1]]
        preds['confidence'] = np.max(proba, axis=1)
        for i, cls in enumerate(final_est.classes_):
            preds[f'prob_{cls}'] = proba[:, i]
    except Exception:
        preds['confidence'] = np.nan
    id_data = stage_data[[c for c in id_cols if c in stage_data.columns]] if any(c in stage_data.columns for c in id_cols) else None
    if id_data is not None:
        stage_results = pd.concat([id_data.reset_index(drop=True), preds.reset_index(drop=True)], axis=1)
    else:
        # No identifier column supplied: synthesize a 1-based 'individual' index
        stage_results = preds.reset_index(drop=True)
        stage_results.insert(0, 'individual', range(1, len(stage_results) + 1))
    stage_results.to_csv(args.stage_output, index=False)
    print(f"Wrote stage predictions for {stage_results.shape[0]} rows to {args.stage_output}")
    # Symptom predictions (independent input/output)
    try:
        symptom_data = pd.read_csv(args.symptoms_input)
    except Exception as e:
        print(f"ERROR: Could not read symptom input CSV '{args.symptoms_input}': {e}")
        sys.exit(1)
    # Prefer an explicit --date-col; otherwise fall back to a 'date' column if present
    date_col = args.date_col if args.date_col else ('date' if 'date' in symptom_data.columns else None)
    fore = SymptomCycleForecaster(cycle_length=args.cycle_length)
    symptom_results = fore.predict_df(symptom_data, lmp_col=args.lmp_col, date_col=date_col)
    symptom_results.to_csv(args.symptoms_output, index=False)
    print(f"Wrote symptom predictions for {symptom_results.shape[0]} rows to {args.symptoms_output}")
    sys.exit(0)
# Ensure the artifact directory exists before the training run writes into it
os.makedirs(OUTPUT_DIR, exist_ok=True)
# --------------------------
# Utility: make column names unique (pandas allows duplicates)
# --------------------------
def make_unique_columns(cols):
    """Return a copy of `cols` where repeated names get `__dupN` suffixes.

    The first occurrence keeps its original name; the k-th repeat becomes
    `name__dupk`, matching the duplicate-handling note in the module docstring.
    """
    occurrences = {}
    renamed = []
    for name in cols:
        if name in occurrences:
            occurrences[name] += 1
            renamed.append(f"{name}__dup{occurrences[name]}")
        else:
            occurrences[name] = 0
            renamed.append(name)
    return renamed
# --------------------------
# 1. Load data
# --------------------------
# Guard: only run training and heavy data loading when script is executed directly
if __name__ == '__main__' and os.path.exists(DATA_PATH):
    print("Loading data from:", DATA_PATH)
    df = pd.read_csv(DATA_PATH, sep='\t', low_memory=False)
    print("Original shape:", df.shape)
    # make column names unique for robust selection (duplicates -> __dup1, __dup2)
    df.columns = make_unique_columns(df.columns.tolist())
    # Show a few columns (first 40) so user can inspect if running interactively
    print("First 40 column names (for inspection):")
    print(df.columns[:40].tolist())
    # --------------------------
    # 2. Identify candidate self-reported features and menopause-stage variable
    # --------------------------
    # Heuristic patterns for self-report variables (adjust if you'd like to include additional columns)
    selfreport_patterns = [
        r'VMS', r'HOT', r'HOTFL', r'NIGHTSW', r'SLEEP', r'CESD', r'STRESS', r'MOOD',
        r'SMOK', r'ALCOH', r'ALCO', r'EXER', r'PHYS', r'ACTIV', r'VAG', r'URINE', r'SEX', r'PAIN',
        r'FATIG', r'IRRIT', r'ANXI', r'DEPRESS', r'BLEED', r'MENSE', r'PERIOD', r'LMP',
        r'HOTSW', r'QOL', r'DRY'
    ]
    # Exclude laboratory/biomarker variable name patterns
    biomarker_exclude = r'E2|FSH|GLUCOSE|CHOLESTEROL|HDL|TRIG|SHBG|DHEAS|INSULIN|BMD|BP|HEIGHT|WEIGHT'
    upper_cols = {c: c.upper() for c in df.columns}
    selfreport_cols = []
    for orig, up in upper_cols.items():
        for pat in selfreport_patterns:
            if re.search(pat, up):
                # skip biomarkers that match both symptom patterns and biomarker patterns
                if re.search(biomarker_exclude, up):
                    continue
                selfreport_cols.append(orig)
                break
    # Also include basic self-report demographics commonly present (AGE, RACE)
    for dem in ['AGE7','AGE','RACE','LANGINT7','LANGINT']:
        if dem in df.columns and dem not in selfreport_cols:
            selfreport_cols.append(dem)
    # Deduplicate preserving order
    seen=set()
    selfreport_cols = [x for x in selfreport_cols if not (x in seen or seen.add(x))]
    print(f"Found {len(selfreport_cols)} candidate self-reported columns (first 50 shown):")
    print(selfreport_cols[:50])
    # Identify menopause-stage variable heuristically
    stage_cand_patterns = [r'MENOSTAT', r'MENOSYM', r'MENO', r'MENOP', r'MENST', r'MENSE', r'STATUS']
    stage_candidates = [c for c in df.columns if any(re.search(p, c, flags=re.I) for p in stage_cand_patterns)]
    print("Stage-like candidate columns (found):", stage_candidates[:10])
    # If user provided an override for stage column via CLI, honor it (if present in data)
    if STAGE_COL_OVERRIDE:
        if STAGE_COL_OVERRIDE in df.columns:
            print(f"Using overridden stage column: {STAGE_COL_OVERRIDE}")
            stage_candidates = [STAGE_COL_OVERRIDE]
        else:
            print(f"Warning: requested stage column '{STAGE_COL_OVERRIDE}' not present in data; proceeding with heuristic detection")
    # If multiple candidates choose one with few unique values (likely coded categories)
    stage_col = None
    for c in stage_candidates:
        nunique = df[c].nunique(dropna=True)
        # prefer small discrete sets (e.g., 2-6 categories)
        if 1 < nunique <= 20:
            stage_col = c
            break
    if stage_col is None and stage_candidates:
        # fallback to first candidate
        stage_col = stage_candidates[0]
    if stage_col is None:
        raise RuntimeError("No menopause-stage-like column found automatically. Inspect df.columns and pick the proper variable (e.g., MENOSTAT).")
    print("Selected stage column:", stage_col, " unique values:", df[stage_col].nunique(dropna=True))
    print("Sample raw counts:")
    print(df[stage_col].value_counts(dropna=False).head(20))
    # --------------------------
    # 3. Create working dataframe with self-report features + stage
    # --------------------------
    use_cols = [stage_col] + [c for c in selfreport_cols if c in df.columns and c != stage_col]
    data = df[use_cols].copy()
    # Replace common SWAN missing codes with NaN
    missing_values = [-9, -8, -7, -1, '.', 'NA', 'N/A', '999', 9999]
    data.replace(missing_values, np.nan, inplace=True)
    # Try convert object columns to numeric when appropriate
    for col in data.columns:
        if data[col].dtype == object:
            coerced = pd.to_numeric(data[col].astype(str).str.strip(), errors='coerce')
            # If many values become numeric, use numeric version; else leave as categorical string
            if coerced.notna().sum() > len(coerced) * 0.5:
                data[col] = coerced
            else:
                # replace blank/'nan' strings with np.nan
                data[col] = data[col].astype(str).str.strip().replace({'nan': np.nan, '': np.nan})
    # --------------------------
    # 4. Map stage variable to standardized labels {pre, peri, post}
    # *Important*: this is heuristic. Verify using the codebook and adjust mapping if needed.
    # --------------------------
    def map_stage_to_labels(series):
        """Map a raw stage column to {'pre','peri','post'} labels (heuristic).

        Tries textual labels first, then a numeric min/median/max mapping;
        returns an all-NaN series when no mapping applies.
        """
        # Try textual mapping first
        s = series.copy()
        try:
            uniques = [str(x).lower() for x in s.dropna().unique()]
        except Exception:
            uniques = []
        # textual mapping
        if any(x in ['pre','premenopausal','premenopause','pre-menopausal'] for x in uniques):
            s = s.astype(str).str.lower()
            s = s.replace({'premenopausal':'pre','pre-menopausal':'pre','pre-menopause':'pre','pre':'pre'})
            s = s.replace({'perimenopausal':'peri','peri-menopausal':'peri','peri':'peri'})
            s = s.replace({'postmenopausal':'post','post-menopausal':'post','post':'post'})
            return s.map({'pre':'pre','peri':'peri','post':'post'})
        # numeric mapping heuristic: map min->pre, median->peri, max->post
        num = pd.to_numeric(s, errors='coerce')
        num_unique = sorted(num.dropna().unique().tolist())
        if len(num_unique) >= 3:
            mapping = {num_unique[0]:'pre', num_unique[len(num_unique)//2]:'peri', num_unique[-1]:'post'}
            return num.map(mapping)
        # 2-level mapping (assume 1->pre,2->post) or fallback
        if len(num_unique) == 2:
            return num.map({num_unique[0]:'pre', num_unique[1]:'post'})
        # If not mappable, return NaN series
        return pd.Series([np.nan]*len(s), index=s.index)
    mapped_stage = map_stage_to_labels(data[stage_col])
    # If mapping failed (too many NaNs), attempt a simple bleed-based heuristic (last menstrual period)
    if mapped_stage.isna().mean() > 0.9:
        bleed_candidates = [c for c in data.columns if re.search(r'LMP|BLEED|PERIOD|MENSTR', c, flags=re.I)]
        if len(bleed_candidates) > 0:
            lcol = bleed_candidates[0]
            lnum = pd.to_numeric(data[lcol], errors='coerce')
            mapped_stage = pd.Series(index=data.index, dtype=object)
            # missing bleed/LMP value -> assume post; value present -> assume pre
            mapped_stage[lnum.isna()] = 'post'
            mapped_stage[lnum.notna()] = 'pre'
        else:
            raise RuntimeError("Failed to map stage variable to pre/peri/post and no bleed/LMP variable found.")
    data['_menopause_stage'] = mapped_stage
    print("Mapped stage counts (after heuristic mapping):")
    print(data['_menopause_stage'].value_counts(dropna=False))
    # Drop rows with no mapped stage
    data = data[~data['_menopause_stage'].isna()].copy()
    print("Rows available for modeling:", data.shape[0])
    # --------------------------
    # 5. Feature selection for modeling
    # Keep only self-report fields with enough non-missing values and >1 unique value
    # --------------------------
    feature_candidates = [c for c in use_cols if c != stage_col]
    selected_features = []
    for c in feature_candidates:
        non_null = data[c].notna().sum()
        # require at least 2% nonmissing or minimum 50 observations
        if non_null < max(50, len(data) * 0.02):
            continue
        if data[c].nunique(dropna=True) <= 1:
            continue
        selected_features.append(c)
    print("Number of features selected for modeling:", len(selected_features))
    print("First 40 features (if many):", selected_features[:40])
    # --------------------------
    # 6. Preprocessing pipeline
    # Numeric features: impute mean
    # Categorical features: impute most frequent + one-hot encode
    # Normalization: only added for logistic regression pipeline (tree-based RF doesn't need scaling)
    # --------------------------
    numeric_feats = [c for c in selected_features if pd.api.types.is_numeric_dtype(data[c])]
    cat_feats = [c for c in selected_features if c not in numeric_feats]
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean'))
    ])
    # Construct OneHotEncoder in a sklearn-version compatible way
    try:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # older sklearn versions use `sparse` kwarg
        ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', ohe)
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_feats),
        ('cat', categorical_transformer, cat_feats)
    ], remainder='drop')
    # Two pipelines: RandomForest (no scaling) and LogisticRegression (scaling)
    rf_pipeline = Pipeline(steps=[
        ('pre', preprocessor),
        ('rf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
    ])
    lr_pipeline = Pipeline(steps=[
        ('pre', preprocessor),
        ('scaler', StandardScaler()),
        ('lr', LogisticRegression(solver='lbfgs', max_iter=1000))
    ])
    # --------------------------
    # 7. Prepare data, train/test split
    # --------------------------
    X = data[selected_features].copy()
    y = data['_menopause_stage'].copy().astype(str)  # values: 'pre','peri','post' (hopefully)
    print("Target class distribution:")
    print(y.value_counts())
    # Stratified split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
    print("Train / test sizes:", X_train.shape[0], X_test.shape[0])
    # --------------------------
    # 8. Train models
    # --------------------------
    print("Training RandomForest...")
    rf_pipeline.fit(X_train, y_train)
    print("RandomForest trained.")
    print("Training LogisticRegression (multinomial)...")
    lr_pipeline.fit(X_train, y_train)
    print("LogisticRegression trained.")
    # --------------------------
    # 9. Predictions and assessment
    # --------------------------
    def evaluate_model(pipeline, X_test, y_test, model_name, output_dir=OUTPUT_DIR):
        """Print/save a classification report and a confusion-matrix PNG.

        Returns (y_pred, confusion_matrix) for the fitted pipeline.
        """
        y_pred = pipeline.predict(X_test)
        report = classification_report(y_test, y_pred)
        print(f"\n=== {model_name} Classification Report ===\n{report}")
        # confusion matrix
        labels = sorted(y_test.unique())
        cm = confusion_matrix(y_test, y_pred, labels=labels)
        print(f"{model_name} Confusion Matrix (rows=true, cols=pred):\nLabels: {labels}\n{cm}")
        # Save classification report
        with open(os.path.join(output_dir, f"classification_report_{model_name.replace(' ','_')}.txt"), "w") as f:
            f.write(report)
        # Plot confusion matrix with matplotlib
        fig, ax = plt.subplots(figsize=(5,4))
        im = ax.imshow(cm, interpolation='nearest')
        ax.set_xticks(range(len(labels))); ax.set_xticklabels(labels, rotation=45)
        ax.set_yticks(range(len(labels))); ax.set_yticklabels(labels)
        ax.set_title(f"{model_name} Confusion Matrix")
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, format(cm[i, j], 'd'), ha="center", va="center")
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f"{model_name.replace(' ','_')}_confusion_matrix.png"))
        # Show plots only when requested; otherwise close to free resources (non-interactive default)
        if SHOW_PLOTS:
            plt.show()
        else:
            plt.close('all')
        return y_pred, cm
    rf_pred, rf_cm = evaluate_model(rf_pipeline, X_test, y_test, "RandomForest")
    lr_pred, lr_cm = evaluate_model(lr_pipeline, X_test, y_test, "LogisticRegression")
    # 10. Feature importance
    # Extract feature names after preprocessing (numerics stay same; categorical one-hot create names)
    pre = rf_pipeline.named_steps['pre']
    # Get numeric feature names
    feature_names = []
    if len(numeric_feats) > 0:
        feature_names.extend(numeric_feats)
    if len(cat_feats) > 0:
        # Get onehot output names
        ohe = pre.named_transformers_['cat'].named_steps['onehot']
        try:
            cat_onehot_names = ohe.get_feature_names_out(cat_feats)
        except Exception:
            # fallback
            cat_onehot_names = []
        feature_names.extend(cat_onehot_names.tolist() if hasattr(cat_onehot_names, 'tolist') else list(cat_onehot_names))
    # Feature importances from RandomForest
    rf_model = rf_pipeline.named_steps['rf']
    importances = rf_model.feature_importances_
    imp_df = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values('importance', ascending=False)
    imp_df.to_csv(os.path.join(OUTPUT_DIR, "rf_feature_importances.csv"), index=False)
    print("\nTop 20 RF feature importances:")
    print(imp_df.head(20).to_string(index=False))
    # Permutation importance (robust)
    print("Computing permutation importance (this can take some time)...")
    perm = permutation_importance(rf_pipeline, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)
    perm_idx = perm.importances_mean.argsort()[::-1]
    perm_df = pd.DataFrame({
        'feature': np.array(feature_names)[perm_idx],
        'importance_mean': perm.importances_mean[perm_idx],
        'importance_std': perm.importances_std[perm_idx]
    })
    perm_df.to_csv(os.path.join(OUTPUT_DIR, "rf_permutation_importances.csv"), index=False)
    print("Top 20 permutation importances:")
    print(perm_df.head(20).to_string(index=False))
    # Plot RF top features
    topn = min(20, imp_df.shape[0])
    fig, ax = plt.subplots(figsize=(8,6))
    ax.barh(imp_df['feature'].head(topn)[::-1], imp_df['importance'].head(topn)[::-1])
    ax.set_title("RandomForest: Top feature importances")
    ax.set_xlabel("Importance")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "rf_top_feature_importances.png"))
    if SHOW_PLOTS:
        plt.show()
    else:
        plt.close('all')
# 11. ROC curves (one-vs-rest) if predict_proba available
def plot_multiclass_roc(pipeline, X_test, y_test, model_name):
    """Save one-vs-rest ROC curve PNGs (one per class) for a fitted pipeline.

    Writes into OUTPUT_DIR; shows figures only when SHOW_PLOTS. Prints a
    message and returns early when the pipeline lacks predict_proba.
    """
    if not hasattr(pipeline, "predict_proba"):
        print(f"{model_name} has no predict_proba; skipping ROC plot.")
        return
    # Must use same class order as pipeline's final estimator
    final_est = pipeline.named_steps[list(pipeline.named_steps.keys())[-1]]
    classes = final_est.classes_
    y_test_bin = label_binarize(y_test, classes=classes)
    y_score = pipeline.predict_proba(X_test)
    for i, cls in enumerate(classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        roc_auc = auc(fpr, tpr)
        plt.figure()
        plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
        # diagonal = chance-level reference line
        plt.plot([0,1],[0,1], linestyle='--')
        plt.title(f"{model_name} ROC for class {cls}")
        plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
        plt.legend(loc='lower right')
        plt.savefig(os.path.join(OUTPUT_DIR, f"{model_name.replace(' ','_')}_ROC_{cls}.png"))
        if SHOW_PLOTS:
            plt.show()
        else:
            plt.close('all')
print("Plotting ROC curves for RandomForest and LogisticRegression (if available)...")
# The globals() check skips plotting when training above did not run
# (e.g. the data file was absent or the module was imported).
if __name__ == '__main__' and 'rf_pipeline' in globals():
    plot_multiclass_roc(rf_pipeline, X_test, y_test, "RandomForest")
    plot_multiclass_roc(lr_pipeline, X_test, y_test, "LogisticRegression")
# ==========================================================================================
# 12. FORECASTING MODULE: Predict menopausal stage for new individuals
# ==========================================================================================
class MenopauseForecast:
    """
    Forecasting module for predicting menopausal stage (pre/peri/post) given self-reported features.
    This class encapsulates the trained models and preprocessing pipeline to make predictions
    on new data with the same features used during training.
    """
    def __init__(self, rf_pipeline, lr_pipeline, feature_names, stage_classes):
        """
        Initialize the forecaster with trained pipelines.
        Parameters:
        -----------
        rf_pipeline : sklearn Pipeline
            Trained RandomForest pipeline
        lr_pipeline : sklearn Pipeline
            Trained LogisticRegression pipeline
        feature_names : list
            List of feature column names used for training
        stage_classes : list
            List of possible menopause stage classes (e.g., ['pre', 'peri', 'post'])
        """
        self.rf_pipeline = rf_pipeline
        self.lr_pipeline = lr_pipeline
        self.feature_names = feature_names
        self.stage_classes = stage_classes
        self.models = {
            'RandomForest': rf_pipeline,
            'LogisticRegression': lr_pipeline
        }

    @staticmethod
    def _final_estimator(pipeline):
        """Return the last named step of a pipeline (the fitted classifier).

        Centralizes the `named_steps` lookup that was previously duplicated
        in predict_single and predict_batch.
        """
        return pipeline.named_steps[list(pipeline.named_steps.keys())[-1]]

    def predict_single(self, feature_dict, model='RandomForest', return_proba=True):
        """
        Predict menopausal stage for a single individual.
        Parameters:
        -----------
        feature_dict : dict
            Dictionary with feature names as keys and values for prediction.
            Example: {'HOT7': 1, 'SLEEP7': 2, 'CESD': 10, ...}
        model : str
            Which model to use for prediction: 'RandomForest' or 'LogisticRegression'
        return_proba : bool
            If True, return prediction probabilities; otherwise just the class label
        Returns:
        --------
        dict : Contains 'stage', 'confidence', and optionally 'probabilities'
        """
        if model not in self.models:
            raise ValueError(f"Model '{model}' not found. Available: {list(self.models.keys())}")
        # Create DataFrame with single row, reindex to match training features
        # (missing features become NaN and are imputed by the pipeline)
        X = pd.DataFrame([feature_dict]).reindex(columns=self.feature_names, fill_value=np.nan)
        pipeline = self.models[model]
        prediction = pipeline.predict(X)[0]
        result = {
            'stage': prediction,
            'model': model,
            'confidence': None,
            'probabilities': None
        }
        if return_proba:
            try:
                proba = pipeline.predict_proba(X)[0]
                result['confidence'] = float(np.max(proba))
                result['probabilities'] = {
                    cls: float(prob)
                    for cls, prob in zip(self._final_estimator(pipeline).classes_, proba)
                }
            except Exception as e:
                # best-effort: prediction is still returned without probabilities
                print(f"Warning: Could not compute probabilities: {e}")
        return result

    def predict_batch(self, df, model='RandomForest', return_proba=True):
        """
        Predict menopausal stage for multiple individuals (batch prediction).
        Parameters:
        -----------
        df : pd.DataFrame
            DataFrame with feature columns matching training features.
            Missing values will be handled by the preprocessing pipeline.
        model : str
            Which model to use: 'RandomForest' or 'LogisticRegression'
        return_proba : bool
            If True, return prediction probabilities
        Returns:
        --------
        pd.DataFrame : Contains 'predicted_stage', 'confidence', and probability columns
        """
        if model not in self.models:
            raise ValueError(f"Model '{model}' not found. Available: {list(self.models.keys())}")
        # Reindex to match training features
        X = df.reindex(columns=self.feature_names, fill_value=np.nan)
        pipeline = self.models[model]
        predictions = pipeline.predict(X)
        result_df = pd.DataFrame({
            'predicted_stage': predictions,
            'model': model
        })
        if return_proba:
            try:
                proba = pipeline.predict_proba(X)
                final_est = self._final_estimator(pipeline)
                result_df['confidence'] = np.max(proba, axis=1)
                # Add probability column for each class
                for i, cls in enumerate(final_est.classes_):
                    result_df[f'prob_{cls}'] = proba[:, i]
            except Exception as e:
                # best-effort: predictions are still returned without probabilities
                print(f"Warning: Could not compute probabilities: {e}")
        return result_df

    def compare_models(self, feature_dict):
        """
        Compare predictions from both RandomForest and LogisticRegression models.
        Parameters:
        -----------
        feature_dict : dict
            Feature values for the individual
        Returns:
        --------
        dict : Predictions and probabilities from both models
        """
        rf_result = self.predict_single(feature_dict, model='RandomForest', return_proba=True)
        lr_result = self.predict_single(feature_dict, model='LogisticRegression', return_proba=True)
        return {
            'RandomForest': rf_result,
            'LogisticRegression': lr_result
        }

    def get_feature_info(self):
        """Return information about required features."""
        return {
            'num_features': len(self.feature_names),
            'feature_names': self.feature_names,
            'stage_classes': self.stage_classes
        }
def create_forecast_example():
    """
    Create an example forecast instance and demonstrate usage.

    This function is robust: if the training artifacts (`rf_pipeline`, `lr_pipeline`,
    `selected_features`, `X_train`, `X_test`) are not available in memory (e.g., when
    the module is imported in another process), it attempts to load saved pipelines
    from `OUTPUT_DIR` via `load_forecast_model()` and uses placeholder inputs.

    Returns:
    --------
    MenopauseForecast : the forecaster instance used for the demonstration
    """
    print("\n" + "="*80)
    print("FORECASTING MODULE EXAMPLE: Predicting Menopausal Stage")
    print("="*80)
    # Determine pipelines and feature metadata (use in-memory if available, else load from disk)
    # NOTE(review): rf_pipeline, lr_pipeline, selected_features and y are
    # module-level globals created by the training section of this script;
    # referencing them raises NameError when training has not run in-process,
    # which is what triggers the disk-loading fallback below.
    try:
        _rf = rf_pipeline
        _lr = lr_pipeline
        _features = selected_features
        _stage_classes = sorted(y.unique().tolist())
        has_training = True
    except NameError:
        print("Training artifacts not present in memory; attempting to load from disk...")
        try:
            _loaded = load_forecast_model(OUTPUT_DIR)
            _rf = _loaded.rf_pipeline
            _lr = _loaded.lr_pipeline
            _features = _loaded.feature_names
            _stage_classes = _loaded.stage_classes
            has_training = False
        except Exception as e:
            # Neither in-memory artifacts nor saved pipelines: cannot proceed.
            raise RuntimeError(f"Failed to initialize forecaster from disk: {e}")
    forecast = MenopauseForecast(
        rf_pipeline=_rf,
        lr_pipeline=_lr,
        feature_names=_features,
        stage_classes=_stage_classes
    )
    print(f"\nForecaster initialized with {len(_features)} features")
    print(f"Predicting stages: {_stage_classes}")
    # Example 1: Single individual prediction
    print("\n--- Example 1: Predict for a single individual ---")
    example_individual = {}
    # Only the first few features are filled in; the pipeline imputes the rest.
    n_example_feats = min(10, len(_features))
    if has_training:
        # Use per-feature medians from the training data so the demo input
        # looks realistic.
        for feat in _features[:n_example_feats]:
            try:
                example_individual[feat] = float(pd.to_numeric(X_train[feat], errors='coerce').median())
            except Exception:
                # Fallback to mode or NaN
                try:
                    example_individual[feat] = X_train[feat].mode().iloc[0]
                except Exception:
                    example_individual[feat] = np.nan
    else:
        # No training DF available; provide NaN placeholders to let pipeline impute
        for feat in _features[:n_example_feats]:
            example_individual[feat] = np.nan
    result = forecast.predict_single(example_individual, model='RandomForest', return_proba=True)
    # NOTE(review): the keys 'stage', 'confidence' and 'probabilities' assume a
    # dict return from predict_single(); predict_batch() in the same class uses
    # the column name 'predicted_stage' instead -- confirm the two APIs agree.
    print(f"Predicted stage: {result.get('stage')}")
    print(f"Confidence: {result.get('confidence'):.3f}" if result.get('confidence') is not None else "Confidence: None")
    if result.get('probabilities'):
        print("Stage probabilities:")
        for stage, prob in sorted(result['probabilities'].items()):
            print(f" {stage}: {prob:.3f}")
    # Example 2: Compare models
    print("\n--- Example 2: Compare RandomForest vs LogisticRegression ---")
    comparison = forecast.compare_models(example_individual)
    for model_name, cres in comparison.items():
        print(f"\n{model_name}:")
        print(f" Predicted stage: {cres.get('stage')}")
        print(f" Confidence: {cres.get('confidence'):.3f}" if cres.get('confidence') is not None else " Confidence: None")
    # Example 3: Batch prediction on a small sample (either X_test if available or placeholder rows)
    print("\n--- Example 3: Batch prediction (small sample) ---")
    if has_training:
        try:
            test_sample = X_test.iloc[:5].copy()
            batch_results = forecast.predict_batch(test_sample, model='RandomForest', return_proba=True)
            print(batch_results.to_string())
        except Exception as e:
            print(f"Batch prediction failed on training sample: {e}")
    else:
        # Create a small placeholder DataFrame with feature columns filled with NaN
        placeholder = pd.DataFrame([{f: np.nan for f in _features[:n_example_feats]}])
        batch_results = forecast.predict_batch(placeholder, model='RandomForest', return_proba=True)
        print(batch_results.to_string())
    return forecast
def save_forecast_model(forecast_instance, output_dir=OUTPUT_DIR):
    """
    Save the forecast model instance for later use.

    Persists feature/class metadata as JSON plus both fitted sklearn
    pipelines via joblib, so `load_forecast_model()` can fully rebuild the
    forecaster in another process.

    Parameters:
    -----------
    forecast_instance : MenopauseForecast
        The forecaster to save
    output_dir : str
        Directory to save metadata and pipelines (created if missing)
    """
    import json
    import joblib
    # Bug fix: previously this function raised FileNotFoundError when
    # output_dir did not exist yet (e.g. fresh checkout, custom --out dir).
    os.makedirs(output_dir, exist_ok=True)
    metadata = {
        'feature_names': forecast_instance.feature_names,
        'stage_classes': forecast_instance.stage_classes,
        'num_features': len(forecast_instance.feature_names)
    }
    # Save metadata as JSON
    with open(os.path.join(output_dir, 'forecast_metadata.json'), 'w') as f:
        json.dump(metadata, f, indent=2)
    # Save trained pipelines using joblib (allows full reuse)
    joblib.dump(forecast_instance.rf_pipeline, os.path.join(output_dir, 'rf_pipeline.pkl'))
    joblib.dump(forecast_instance.lr_pipeline, os.path.join(output_dir, 'lr_pipeline.pkl'))
    print(f"Forecast model saved to {output_dir}")
    print(f" - forecast_metadata.json")
    print(f" - rf_pipeline.pkl")
    print(f" - lr_pipeline.pkl")
def load_forecast_model(output_dir=OUTPUT_DIR):
    """
    Load a previously saved forecast model.

    Parameters:
    -----------
    output_dir : str
        Directory containing 'forecast_metadata.json', 'rf_pipeline.pkl' and
        'lr_pipeline.pkl' (as written by save_forecast_model())

    Returns:
    --------
    MenopauseForecast : The loaded forecaster
    """
    import json
    import joblib

    def _artifact(name):
        # All saved artifacts live directly inside output_dir.
        return os.path.join(output_dir, name)

    # Metadata first: feature names and predictable stage classes.
    with open(_artifact('forecast_metadata.json'), 'r') as fh:
        meta = json.load(fh)
    # Then the two fitted sklearn pipelines, and rebuild the forecaster.
    restored = MenopauseForecast(
        rf_pipeline=joblib.load(_artifact('rf_pipeline.pkl')),
        lr_pipeline=joblib.load(_artifact('lr_pipeline.pkl')),
        feature_names=meta['feature_names'],
        stage_classes=meta['stage_classes']
    )
    print(f"Forecast model loaded from {output_dir}")
    return restored
# Initialize and demonstrate the forecasting module
# Symptom cycle forecasting (defined earlier near CLI args)
class SymptomCycleForecaster:
"""
Predicts the probability of hot flashes and mood changes within a menstrual cycle
based on last menstrual period (LMP) date and target date.
"""
def __init__(self, cycle_length=28, hot_mu=14, hot_sigma=5, mood_mu=26, mood_sigma=4,
base_hot=0.1, amp_hot=0.4, base_mood=0.1, amp_mood=0.45, threshold=0.5):
self.cycle_length = cycle_length
self.hot_mu = hot_mu
self.hot_sigma = hot_sigma
self.mood_mu = mood_mu
self.mood_sigma = mood_sigma
self.base_hot = base_hot
self.amp_hot = amp_hot
self.base_mood = base_mood
self.amp_mood = amp_mood
self.threshold = threshold
def _parse_lmp(self, lmp, reference_date=None):
"""Parse LMP input which may be a full date string or an integer day-of-month."""
if pd.isna(lmp):
return None
# If numeric day (int-like), construct a date in the same month as reference_date
try:
lmp_int = int(lmp)
if reference_date is None:
ref = pd.Timestamp(datetime.today()).to_pydatetime()
else:
ref = pd.to_datetime(reference_date, errors='coerce')
if pd.isna(ref):
ref = pd.Timestamp(datetime.today()).to_pydatetime()
else:
ref = ref.to_pydatetime()
# Clamp day to valid range
day = max(1, min(lmp_int, 28))
return datetime(ref.year, ref.month, day)
except Exception:
# Try parse as full date string
try:
return pd.to_datetime(lmp, errors='coerce').to_pydatetime()
except Exception:
return None
def compute_cycle_day(self, lmp, target_date=None):
"""Return 1-based cycle day (1..cycle_length) or None if cannot compute."""
if target_date is None:
tdate = datetime.today()
else:
tdate = pd.to_datetime(target_date, errors='coerce')
if pd.isna(tdate):
tdate = datetime.today()
else:
tdate = tdate.to_pydatetime()
lmp_date = self._parse_lmp(lmp, reference_date=tdate)
if lmp_date is None:
return None
delta = (tdate - lmp_date).days
if delta < 0:
# If LMP is in the future, assume it refers to previous cycle (subtract one month)
lmp_date = lmp_date - timedelta(days=self.cycle_length)
delta = (tdate - lmp_date).days
cycle_day = (delta % self.cycle_length) + 1
return int(cycle_day)
def _gauss_prob(self, day, mu, sigma, base, amp):
if day is None:
return np.nan
val = base + amp * np.exp(-0.5 * ((day - mu) / float(sigma)) ** 2)
return float(min(max(val, 0.0), 1.0))
def predict_single(self, lmp, target_date=None):
day = self.compute_cycle_day(lmp, target_date=target_date)
hot_p = self._gauss_prob(day, self.hot_mu, self.hot_sigma, self.base_hot, self.amp_hot)
mood_p = self._gauss_prob(day, self.mood_mu, self.mood_sigma, self.base_mood, self.amp_mood)
return {
'cycle_day': day,
'hotflash_prob': hot_p,
'hotflash_pred': hot_p >= self.threshold if not np.isnan(hot_p) else None,
'mood_prob': mood_p,
'mood_pred': mood_p >= self.threshold if not np.isnan(mood_p) else None
}
def predict_df(self, df, lmp_col='LMP', date_col=None, menopause_stage_col=None):
df = df.copy()
results = df.apply(
lambda row: pd.Series(self.predict_single(
lmp=row.get(lmp_col),
target_date=(row.get(date_col) if date_col is not None else None)
)), axis=1
)
out = pd.concat([df.reset_index(drop=True), results.reset_index(drop=True)], axis=1)
return out
def predict_symptoms_from_csv(input_csv, output_csv, lmp_col='LMP', date_col=None,
                              menopause_stage_col=None, cycle_length=28, **kwargs):
    """Read input CSV, predict hot flashes/mood by cycle day, and write output CSV."""
    frame = pd.read_csv(input_csv)
    forecaster = SymptomCycleForecaster(cycle_length=cycle_length)
    predictions = forecaster.predict_df(
        frame, lmp_col=lmp_col, date_col=date_col, menopause_stage_col=menopause_stage_col
    )
    predictions.to_csv(output_csv, index=False)
    # Brief console summary of what was written.
    print(f"Wrote symptom predictions for {predictions.shape[0]} rows to {output_csv}")
    print("Sample predictions (first 5 rows):")
    preview_cols = [lmp_col, 'cycle_day', 'hotflash_prob', 'hotflash_pred', 'mood_prob', 'mood_pred']
    print(predictions[preview_cols].head().to_string())
# CLI integration: run symptom prediction if requested
if __name__ == '__main__':
    # If symptom prediction requested via CLI, run fast-path and exit before
    # any expensive model training/loading takes place.
    # NOTE(review): `args` comes from the argparse section earlier in this
    # file; attribute names used here must stay in sync with those flags.
    if args.predict_symptoms:
        if not args.symptoms_input or not args.symptoms_output:
            print("Error: --symptoms-input and --symptoms-output are required when --predict-symptoms is set")
            sys.exit(1)
        else:
            predict_symptoms_from_csv(
                input_csv=args.symptoms_input,
                output_csv=args.symptoms_output,
                lmp_col=args.lmp_col,
                date_col=args.date_col,
                cycle_length=args.cycle_length
            )
            sys.exit(0)
    # Dual predictions are handled in the early fast-path above to avoid training.
    # Default behavior: create demo forecaster, save trained models and show summary
    forecast_model = create_forecast_example()
    save_forecast_model(forecast_model)
    print("\n" + "="*80)
    print("FORECASTING MODULE SUMMARY")
    print("="*80)
    # The triple-quoted block below is user-facing help text printed verbatim;
    # it is runtime output, not a docstring.
    print("""
The MenopauseForecast class provides three main methods for predictions:
1. predict_single(feature_dict, model='RandomForest', return_proba=True)
- Predict stage for one individual given feature values
- Returns predicted stage and confidence scores
2. predict_batch(df, model='RandomForest', return_proba=True)
- Predict stages for multiple individuals
- Returns DataFrame with predictions and probabilities for each stage
3. compare_models(feature_dict)
- Compare predictions from both RandomForest and LogisticRegression
- Useful for validating model agreement
Usage in your own code:
from menopause import load_forecast_model
# Load the trained forecaster
forecast = load_forecast_model('swan_ml_output')
# Predict for an individual
features = {'HOT7': 1, 'SLEEP7': 2, 'CESD': 10, ...}
result = forecast.predict_single(features, model='RandomForest')
# Predict for multiple individuals
results_df = forecast.predict_batch(your_dataframe, model='RandomForest')
""")
# ==========================================================================================
# 13. CSV INPUT/OUTPUT FUNCTIONALITY: Batch prediction from CSV files
# ==========================================================================================
def predict_from_csv(input_csv, forecast_instance, output_csv=None, model='RandomForest', output_dir=OUTPUT_DIR):
    """
    Read individual data from CSV, make predictions, and save results.

    Parameters:
    -----------
    input_csv : str
        Path to input CSV file with feature columns for individuals.
        CSV should have columns matching training features (or subset).
    forecast_instance : MenopauseForecast
        The trained forecaster instance
    output_csv : str or None
        Path to output CSV file (default: input_csv with '_predictions' appended)
    model : str
        Which model to use ('RandomForest' or 'LogisticRegression')
    output_dir : str
        Unused here; kept for interface compatibility with other helpers.

    Returns:
    --------
    pd.DataFrame or None : Results with predictions and confidence scores,
        or None when the input file does not exist.

    Example:
    --------
        forecast = load_forecast_model('swan_ml_output')
        results = predict_from_csv('individuals.csv', forecast)
        # Results saved to 'individuals_predictions.csv'
    """
    # Read input CSV (os is already imported at module level; the previous
    # redundant function-local `import os` has been removed).
    print(f"Reading input data from: {input_csv}")
    try:
        data = pd.read_csv(input_csv)
    except FileNotFoundError:
        print(f"ERROR: File not found: {input_csv}")
        return None
    n_samples = len(data)
    print(f"Loaded {n_samples} individuals")
    # Identify feature columns (exclude ID columns)
    id_cols = ['ID', 'id', 'SWANID', 'individual', 'Individual', 'subject', 'Subject']
    feature_cols = [c for c in data.columns if c not in id_cols]
    # Keep any present ID columns aside so they can be re-attached to output
    # (computed once instead of the previous duplicated membership checks).
    present_ids = [c for c in id_cols if c in data.columns]
    id_data = data[present_ids] if present_ids else None
    # Make predictions
    print(f"Making predictions using {model}...")
    predictions = forecast_instance.predict_batch(
        data[feature_cols],
        model=model,
        return_proba=True
    )
    # Combine with original IDs, or synthesize a 1-based 'individual' index
    if id_data is not None:
        results = pd.concat([id_data.reset_index(drop=True), predictions.reset_index(drop=True)], axis=1)
    else:
        results = predictions.reset_index(drop=True)
        results.insert(0, 'individual', range(1, n_samples + 1))
    # Set output file path
    if output_csv is None:
        base, ext = os.path.splitext(input_csv)
        output_csv = f"{base}_predictions{ext}"
    # Save results
    print(f"Saving predictions to: {output_csv}")
    results.to_csv(output_csv, index=False)
    return results
def predict_dual_from_csv(stage_input_csv, stage_output_csv, symptoms_input_csv, symptoms_output_csv,
                          forecast_dir=OUTPUT_DIR, model='RandomForest', lmp_col='LMP',
                          date_col=None, cycle_length=28):
    """Run menopause stage prediction and symptom-cycle prediction using separate
    input and output files for each model.

    Parameters:
    -----------
    stage_input_csv : str
        CSV with feature columns (plus optional ID columns) for the stage classifier
    stage_output_csv : str or None
        Where to write stage predictions (None: derived from stage_input_csv)
    symptoms_input_csv : str
        CSV with an LMP column (and optionally a date column) for the cycle model
    symptoms_output_csv : str or None
        Where to write symptom predictions (None: derived from symptoms_input_csv)
    forecast_dir : str
        Directory holding artifacts written by save_forecast_model()
    model : str
        'RandomForest' or 'LogisticRegression'
    lmp_col, date_col : str
        Column names for last menstrual period and target date
    cycle_length : int
        Assumed menstrual cycle length in days

    Returns:
    --------
    dict or None : {'stage': stage_results_df, 'symptoms': symptom_results_df},
        or None if an input file or the saved model cannot be loaded.
    """
    print(f"Reading stage input data from: {stage_input_csv}")
    try:
        stage_data = pd.read_csv(stage_input_csv)
    except FileNotFoundError:
        print(f"ERROR: File not found: {stage_input_csv}")
        return None
    # Load forecast model
    try:
        forecast = load_forecast_model(output_dir=forecast_dir)
    except Exception as e:
        print(f"ERROR: Could not load forecast model from '{forecast_dir}': {e}")
        return None
    # Identify id and feature columns
    id_cols = ['ID', 'id', 'SWANID', 'individual', 'Individual', 'subject', 'Subject']
    feature_cols = [c for c in stage_data.columns if c not in id_cols]
    # Make stage predictions
    print(f"Making menopause stage predictions using {model}...")
    stage_preds = forecast.predict_batch(stage_data[feature_cols], model=model, return_proba=True)
    id_data = stage_data[[c for c in id_cols if c in stage_data.columns]] if any(c in stage_data.columns for c in id_cols) else None
    if id_data is not None:
        # Re-attach original identifier columns to the predictions.
        stage_results = pd.concat([id_data.reset_index(drop=True), stage_preds.reset_index(drop=True)], axis=1)
    else:
        # No ID columns present: synthesize a 1-based 'individual' index.
        stage_results = stage_preds.reset_index(drop=True)
        stage_results.insert(0, 'individual', range(1, len(stage_results) + 1))
    # Default stage output path if not provided
    if stage_output_csv is None:
        base, ext = os.path.splitext(stage_input_csv)
        stage_output_csv = f"{base}_stage_predictions{ext}"
    print(f"Saving stage predictions to: {stage_output_csv}")
    stage_results.to_csv(stage_output_csv, index=False)
    # Symptom predictions (independent of the stage model)
    print(f"Reading symptom input data from: {symptoms_input_csv}")
    try:
        symptom_data = pd.read_csv(symptoms_input_csv)
    except FileNotFoundError:
        print(f"ERROR: File not found: {symptoms_input_csv}")
        return None
    # Auto-detect a 'date' column when the caller did not specify one.
    if date_col is None and 'date' in symptom_data.columns:
        date_col = 'date'
    fore = SymptomCycleForecaster(cycle_length=cycle_length)
    symptom_results = fore.predict_df(symptom_data, lmp_col=lmp_col, date_col=date_col)
    # Default symptom output path if not provided
    if symptoms_output_csv is None:
        base, ext = os.path.splitext(symptoms_input_csv)
        symptoms_output_csv = f"{base}_symptom_predictions{ext}"
    print(f"Saving symptom predictions to: {symptoms_output_csv}")
    symptom_results.to_csv(symptoms_output_csv, index=False)
    return {'stage': stage_results, 'symptoms': symptom_results}
def predict_combined_from_csv(*args, **kwargs):
    """Deprecated: combined predictions are removed in favor of separate input/output files.

    Raises:
    -------
    ValueError : always; callers must migrate to predict_dual_from_csv()
    """
    message = ("Combined predictions are deprecated. "
               "Use predict_dual_from_csv() with separate stage and symptom input/output files.")
    raise ValueError(message)
def create_demo_csv(forecast_instance, num_individuals=5, output_file='demo_individuals.csv', output_dir=OUTPUT_DIR):
    """
    Create a demo CSV file with sample individuals for testing predictions.

    Feature values are drawn uniformly from 1..5 (a typical SWAN Likert range);
    they are plausible placeholders, not samples from the real data.

    Parameters:
    -----------
    forecast_instance : MenopauseForecast
        The trained forecaster (used to get feature names)
    num_individuals : int
        Number of demo individuals to generate
    output_file : str
        Name of the output CSV file
    output_dir : str
        Directory to save demo file (created if missing)

    Returns:
    --------
    str : Path to created CSV file
    """
    # Get feature names from forecaster
    feature_names = forecast_instance.feature_names
    # Bug fix: use a local seeded Generator instead of np.random.seed(42),
    # which silently clobbered the caller's global NumPy random state.
    rng = np.random.default_rng(42)
    demo_data = {}
    # Add individual ID
    demo_data['individual'] = [f"Individual_{i+1}" for i in range(num_individuals)]
    # Generate random feature values (using ranges typical for SWAN data)
    for feat in feature_names:
        # Random values between 1 and 5 (typical Likert scale for SWAN)
        demo_data[feat] = rng.integers(1, 6, size=num_individuals)
    # Create DataFrame
    demo_df = pd.DataFrame(demo_data)
    # Create full path; ensure output directory exists before writing
    full_path = os.path.join(output_dir, output_file)
    os.makedirs(output_dir, exist_ok=True)
    # Save demo file
    demo_df.to_csv(full_path, index=False)
    print(f"✅ Demo CSV created: {full_path}")
    print(f" Individuals: {num_individuals}")
    print(f" Features: {len(feature_names)}")
    print(f" File shape: {demo_df.shape}")
    return full_path
def add_performance_metrics_to_csv(results_df, y_test=None, model_name='RandomForest'):
    """
    Compute summary performance metrics for a predictions DataFrame.

    If true labels are available, computes accuracy and weighted precision,
    recall and F1-score against the 'predicted_stage' column, formatted as
    '#'-prefixed comment lines suitable for appending to a CSV file.

    Parameters:
    -----------
    results_df : pd.DataFrame
        Results dataframe with a 'predicted_stage' column
    y_test : array-like, optional
        True labels; when None, no metrics are computed
    model_name : str
        Name of model used (echoed in the metrics header)

    Returns:
    --------
    tuple : (results_df, metrics_text) where metrics_text is a comment-line
        string, or (results_df, None) when y_test is None.
        (Fixed: the docstring previously claimed a bare DataFrame was returned.)
    """
    # Guard clause: nothing to score against.
    if y_test is None:
        return results_df, None
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    acc = accuracy_score(y_test, results_df['predicted_stage'])
    prec = precision_score(y_test, results_df['predicted_stage'], average='weighted', zero_division=0)
    recall = recall_score(y_test, results_df['predicted_stage'], average='weighted', zero_division=0)
    f1 = f1_score(y_test, results_df['predicted_stage'], average='weighted', zero_division=0)
    # Emit as '#' comment lines so appending to a CSV does not break parsers
    # configured to skip comments.
    metrics_text = f"\n# Performance Metrics ({model_name})\n"
    metrics_text += f"# Accuracy: {acc:.3f}\n"
    metrics_text += f"# Precision (weighted): {prec:.3f}\n"
    metrics_text += f"# Recall (weighted): {recall:.3f}\n"
    metrics_text += f"# F1-Score (weighted): {f1:.3f}\n"
    return results_df, metrics_text