Upload model_utils.py with huggingface_hub

566749d verified about 1 month ago

9.06 kB

	"""
	Model training utilities: data loading, CV, model selection,
	hyperparameter tuning, evaluation, and artifact persistence.
	"""

	import os
	import json
	import warnings
	import joblib
	import numpy as np
	import pandas as pd
	from datetime import datetime, timezone

	from sklearn.linear_model import LogisticRegression
	from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
	from sklearn.model_selection import (
	train_test_split, StratifiedKFold, cross_validate, GridSearchCV
	)
	from sklearn.metrics import (
	accuracy_score, precision_score, recall_score,
	f1_score, roc_auc_score
	)

	from .feature_engineering import prepare_feature_matrix, build_preprocessor, FEATURE_NAMES

	# Single seed passed as random_state to the train/test split, the
	# StratifiedKFold splitters and every candidate model in this module.
	RANDOM_STATE = 42


	# ---------------------------------------------------------------------------
	# Data loading
	# ---------------------------------------------------------------------------

	def load_and_merge_data(households_path: str, gold_path: str) -> pd.DataFrame:
	    """
	    Read the households and gold-label CSVs and join them on ``household_id``.

	    The join uses pandas' default merge behaviour with the gold labels as
	    the left frame, so gold-label columns come first in the result.

	    Args:
	        households_path: Path to the households CSV file.
	        gold_path: Path to the gold-labels CSV file.

	    Returns:
	        The merged DataFrame.

	    Raises:
	        FileNotFoundError: If either path does not exist (households is
	            checked first).
	        ValueError: If the merged frame has fewer than 50 rows.
	    """
	    households_missing = not os.path.exists(households_path)
	    if households_missing:
	        raise FileNotFoundError(f"Households file not found: {households_path}")

	    gold_missing = not os.path.exists(gold_path)
	    if gold_missing:
	        raise FileNotFoundError(f"Gold labels file not found: {gold_path}")

	    household_frame = pd.read_csv(households_path)
	    label_frame = pd.read_csv(gold_path)
	    merged = pd.merge(label_frame, household_frame, on='household_id')

	    # Guard clause inverted: hand back the frame as soon as it is big enough.
	    if len(merged) >= 50:
	        return merged

	    raise ValueError(
	        f"Merged dataset has only {len(merged)} rows. Minimum 50 required."
	    )


	def split_data(df: pd.DataFrame):
	    """
	    Build the feature matrix, hold out a stratified 20% test split, and
	    run both parts through the preprocessor.

	    The preprocessor returned by ``build_preprocessor()`` is fitted on the
	    training rows only and then applied to the test rows, so no test-set
	    statistics leak into the fit.

	    Args:
	        df: Merged DataFrame; must contain a ``stunting_flag`` column, which
	            is used both as the target and as the stratification key.

	    Returns:
	        A 7-tuple ``(X_train, X_test, y_train, y_test, scaler,
	        X_train_raw, X_test_raw)``: the transformed train/test features,
	        the train/test targets, the fitted preprocessor, and the
	        untransformed train/test features.
	    """
	    feature_matrix = prepare_feature_matrix(df)
	    target = df['stunting_flag'].values

	    raw_train, raw_test, target_train, target_test = train_test_split(
	        feature_matrix,
	        target,
	        test_size=0.2,
	        stratify=target,
	        random_state=RANDOM_STATE,
	    )

	    preprocessor = build_preprocessor()
	    transformed_train = preprocessor.fit_transform(raw_train)
	    transformed_test = preprocessor.transform(raw_test)

	    return (
	        transformed_train,
	        transformed_test,
	        target_train,
	        target_test,
	        preprocessor,
	        raw_train,
	        raw_test,
	    )


	# ---------------------------------------------------------------------------
	# Candidate models
	# ---------------------------------------------------------------------------

	def get_candidate_models() -> dict:
	    """
	    Return a fresh dict of unfitted candidate classifiers keyed by name.

	    Every estimator is seeded with ``RANDOM_STATE``. The logistic regression
	    and random forest use ``class_weight='balanced'``; the gradient boosting
	    model is constructed without a class-weight argument.
	    """
	    seed = RANDOM_STATE
	    candidates = {}

	    candidates['LogisticRegression'] = LogisticRegression(
	        C=1.0,
	        class_weight='balanced',
	        random_state=seed,
	        max_iter=1000,
	    )
	    candidates['RandomForest'] = RandomForestClassifier(
	        n_estimators=100,
	        class_weight='balanced',
	        random_state=seed,
	        n_jobs=-1,
	    )
	    candidates['GradientBoosting'] = GradientBoostingClassifier(
	        n_estimators=100,
	        learning_rate=0.1,
	        max_depth=3,
	        random_state=seed,
	    )

	    return candidates


	# ---------------------------------------------------------------------------
	# Cross-validation
	# ---------------------------------------------------------------------------

	def run_cross_validation(models: dict, X_train: np.ndarray, y_train: np.ndarray) -> dict:
	    """
	    Score every candidate model with 5-fold stratified cross-validation.

	    Args:
	        models: Mapping of model name to an estimator.
	        X_train: Training features.
	        y_train: Training targets.

	    Returns:
	        ``{model_name: {...}}`` where each inner dict holds the mean and
	        standard deviation across folds for AUC-ROC, F1, precision and
	        recall, under the keys ``auc_roc_mean``, ``auc_roc_std``,
	        ``f1_mean``, ``f1_std``, ``precision_mean``, ``precision_std``,
	        ``recall_mean`` and ``recall_std``. A one-line AUC summary is
	        printed per model.
	    """
	    def _mean_and_std(fold_scores):
	        # Cast to plain floats so the summary contains no numpy scalars.
	        return float(np.mean(fold_scores)), float(np.std(fold_scores))

	    # The same splitter (and therefore the same folds) is used for all models.
	    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
	    results = {}

	    for name in models:
	        scores = cross_validate(
	            models[name],
	            X_train,
	            y_train,
	            cv=folds,
	            scoring=['roc_auc', 'f1', 'precision', 'recall'],
	            return_train_score=False,
	        )

	        auc_mean, auc_std = _mean_and_std(scores['test_roc_auc'])
	        f1_mean, f1_std = _mean_and_std(scores['test_f1'])
	        precision_mean, precision_std = _mean_and_std(scores['test_precision'])
	        recall_mean, recall_std = _mean_and_std(scores['test_recall'])

	        results[name] = {
	            'auc_roc_mean': auc_mean,
	            'auc_roc_std': auc_std,
	            'f1_mean': f1_mean,
	            'f1_std': f1_std,
	            'precision_mean': precision_mean,
	            'precision_std': precision_std,
	            'recall_mean': recall_mean,
	            'recall_std': recall_std,
	        }
	        print(f" {name}: CV AUC-ROC = {results[name]['auc_roc_mean']:.4f} "
	              f"± {results[name]['auc_roc_std']:.4f}")

	    return results


	def select_best_model(cv_results: dict) -> str:
	    """
	    Return the name of the model with the highest mean CV AUC-ROC.

	    Ties on the mean are broken in favour of the lower standard deviation;
	    any remaining tie goes to the entry that appears first in
	    ``cv_results``. A NaN mean is ranked below every real score and a NaN
	    std is ranked as the worst possible spread, so a model whose CV
	    produced NaN can only be returned when no other model has a real score.

	    Args:
	        cv_results: Mapping of model name to a metrics dict that contains
	            at least ``'auc_roc_mean'`` and ``'auc_roc_std'``.

	    Returns:
	        The key of the best-ranked entry.

	    Raises:
	        ValueError: If ``cv_results`` is empty.
	    """
	    if not cv_results:
	        raise ValueError("cv_results is empty; no model to select.")

	    def _rank(item):
	        stats = item[1]
	        mean = stats['auc_roc_mean']
	        std = stats['auc_roc_std']
	        # NaN compares False against everything, so without this mapping a
	        # NaN-scored model that happens to come first would "win" max().
	        if np.isnan(mean):
	            mean = float('-inf')
	        if np.isnan(std):
	            std = float('inf')
	        return (mean, -std)

	    best_name, _ = max(cv_results.items(), key=_rank)
	    return best_name


	# ---------------------------------------------------------------------------
	# Hyperparameter tuning
	# ---------------------------------------------------------------------------

	# Hyperparameter search spaces, keyed by model name. The keys match the
	# names returned by get_candidate_models(); tune_hyperparameters() looks a
	# grid up by name and skips tuning when no entry is found.
	# Grid sizes: LogisticRegression 4, RandomForest 3*4*3 = 36,
	# GradientBoosting 3*3*3 = 27 combinations (each multiplied by the CV folds).
	PARAM_GRIDS = {
	'LogisticRegression': {
	'C': [0.01, 0.1, 1.0, 10.0],
	},
	'RandomForest': {
	'n_estimators': [50, 100, 200],
	'max_depth': [3, 5, 10, None],
	'min_samples_split': [2, 5, 10],
	},
	'GradientBoosting': {
	'n_estimators': [50, 100, 200],
	'learning_rate': [0.01, 0.1, 0.2],
	'max_depth': [3, 5, 7],
	},
	}


	def tune_hyperparameters(model_name: str, estimator, X_train: np.ndarray, y_train: np.ndarray):
	    """
	    Grid-search the estimator's hyperparameters with 5-fold stratified CV,
	    scored by AUC-ROC.

	    Args:
	        model_name: Key used to look up the search space in ``PARAM_GRIDS``.
	        estimator: Estimator to tune.
	        X_train: Training features.
	        y_train: Training targets.

	    Returns:
	        ``(best_estimator, best_params, best_score)``. When ``PARAM_GRIDS``
	        has no (or an empty) grid for ``model_name``, the given estimator
	        is fitted in place and ``(estimator, {}, None)`` is returned.
	    """
	    param_grid = PARAM_GRIDS.get(model_name, {})

	    if param_grid:
	        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
	        search = GridSearchCV(
	            estimator=estimator,
	            param_grid=param_grid,
	            scoring='roc_auc',
	            cv=folds,
	            n_jobs=-1,
	            refit=True,  # best_estimator_ is refitted on the full training set
	        )
	        search.fit(X_train, y_train)
	        print(f" Best params: {search.best_params_}")
	        print(f" Best CV AUC-ROC: {search.best_score_:.4f}")
	        return search.best_estimator_, search.best_params_, search.best_score_

	    # Nothing to search over: fit the estimator with its current settings.
	    print(f" No param grid for {model_name}, skipping tuning.")
	    estimator.fit(X_train, y_train)
	    return estimator, {}, None


	# ---------------------------------------------------------------------------
	# Evaluation
	# ---------------------------------------------------------------------------

	def evaluate_on_test_set(model, X_test: np.ndarray, y_test: np.ndarray) -> dict:
	    """
	    Score a fitted model on the held-out test set.

	    Args:
	        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
	        X_test: Test features.
	        y_test: Test targets.

	    Returns:
	        Dict of plain floats with keys ``accuracy``, ``precision``,
	        ``recall``, ``f1_score`` and ``auc_roc``. AUC-ROC is computed from
	        column 1 of ``predict_proba``; the other metrics use ``predict``.

	    Warns:
	        UserWarning: If the test AUC-ROC is below 0.70.
	    """
	    predicted_labels = model.predict(X_test)
	    positive_scores = model.predict_proba(X_test)[:, 1]

	    metrics = {'accuracy': float(accuracy_score(y_test, predicted_labels))}

	    # These three share the zero_division=0 setting, so an undefined ratio
	    # is reported as 0 rather than triggering a warning.
	    label_metrics = (
	        ('precision', precision_score),
	        ('recall', recall_score),
	        ('f1_score', f1_score),
	    )
	    for key, scorer in label_metrics:
	        metrics[key] = float(scorer(y_test, predicted_labels, zero_division=0))

	    metrics['auc_roc'] = float(roc_auc_score(y_test, positive_scores))

	    below_threshold = metrics['auc_roc'] < 0.70
	    if below_threshold:
	        warnings.warn(
	            f"Test AUC-ROC is {metrics['auc_roc']:.4f} (< 0.70). "
	            "Review feature engineering or data quality.",
	            UserWarning
	        )

	    return metrics


	# ---------------------------------------------------------------------------
	# Artifact persistence
	# ---------------------------------------------------------------------------

	def save_artifact(artifact: dict, output_dir: str) -> str:
	    """
	    Save a versioned and a canonical copy of the artifact and append an
	    entry to ``model_registry.json``.

	    Files written inside ``output_dir`` (created if missing):
	      * ``risk_model_<version_tag>.pkl`` - versioned copy
	      * ``risk_model.pkl``               - canonical copy, overwritten each call
	      * ``model_registry.json``          - JSON list, one entry per save

	    The version tag is the local wall-clock time at second resolution
	    (``YYYYmmdd_HHMMSS``), so two saves within the same second share a tag
	    and the later one overwrites the earlier versioned file. The registry
	    ``timestamp`` is the same instant expressed in UTC.

	    Args:
	        artifact: Dict to persist. It is mutated: ``'version_tag'`` is set
	            on it before dumping. ``'model_type'`` and ``'metrics'`` (with
	            an ``'auc_roc'`` entry) are read for the registry if present.
	        output_dir: Directory that receives the artifacts and the registry.

	    Returns:
	        Path of the versioned artifact file.

	    Raises:
	        TypeError: If the registry entry is not JSON-serializable. The
	            existing registry file is left untouched in that case.
	    """
	    os.makedirs(output_dir, exist_ok=True)

	    # Read the clock once so the version tag and the registry timestamp
	    # describe the same instant (tag in local time, timestamp in UTC).
	    saved_at = datetime.now(timezone.utc)
	    version_tag = saved_at.astimezone().strftime('%Y%m%d_%H%M%S')
	    artifact['version_tag'] = version_tag

	    versioned_path = os.path.join(output_dir, f"risk_model_{version_tag}.pkl")
	    canonical_path = os.path.join(output_dir, "risk_model.pkl")

	    joblib.dump(artifact, versioned_path)
	    joblib.dump(artifact, canonical_path)
	    print(f" Saved versioned artifact: {versioned_path}")
	    print(f" Saved canonical artifact: {canonical_path}")

	    # Update registry
	    registry_path = os.path.join(output_dir, "model_registry.json")
	    registry = []
	    if os.path.exists(registry_path):
	        with open(registry_path, 'r', encoding='utf-8') as f:
	            registry = json.load(f)

	    # `or {}` also covers an artifact whose 'metrics' value is None.
	    metrics = artifact.get('metrics') or {}
	    registry.append({
	        'version_tag': version_tag,
	        'model_type': artifact.get('model_type', 'Unknown'),
	        'test_auc_roc': metrics.get('auc_roc', None),
	        'timestamp': saved_at.isoformat(),
	        'artifact_path': versioned_path,
	    })

	    # Serialize before touching the file: opening the registry with 'w'
	    # truncates it, so a serialization error part-way through would wipe
	    # the existing history. Write to a sibling temp file and swap it in.
	    payload = json.dumps(registry, indent=2)
	    tmp_path = registry_path + ".tmp"
	    with open(tmp_path, 'w', encoding='utf-8') as f:
	        f.write(payload)
	    os.replace(tmp_path, registry_path)
	    print(f" Registry updated: {registry_path}")

	    return versioned_path