Spaces:
Sleeping
Sleeping
"""
train_models.py
───────────────
Step 3 of the pipeline. Trains three models and saves all artifacts.

Models:
    A) Task Failure Predictor — XGBoost binary classifier
        Input : 15 behavioral features
        Output: failure_probability (0–1)
        Artifact: model_artifacts/failure_predictor.pkl
    B) Work Style Classifier — Random Forest 3-class
        Input : 7 work-style features
        Output: "turtle" | "hare" | "hybrid"
        Artifact: model_artifacts/work_style_classifier.pkl
    C) Distraction Scorer — Gradient Boosted Regressor
        Input : 5 distraction signals
        Output: distraction_score (0–1)
        Artifact: model_artifacts/distraction_scorer.pkl

Also saves:
    model_artifacts/feature_scaler.pkl — StandardScaler fitted on training set
    model_artifacts/feature_columns.json — exact column lists for each model
    model_artifacts/metrics.json — evaluation metrics for dashboard display
"""
| import sys, json, warnings | |
| import numpy as np | |
| import pandas as pd | |
| import joblib | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from pathlib import Path | |
| from rich.console import Console | |
| from rich.table import Table | |
| from rich.panel import Panel | |
| from sklearn.preprocessing import StandardScaler, LabelEncoder | |
| from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score | |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor | |
| from sklearn.metrics import ( | |
| classification_report, roc_auc_score, f1_score, | |
| accuracy_score, confusion_matrix, mean_squared_error, r2_score | |
| ) | |
| from imblearn.over_sampling import SMOTE | |
| from xgboost import XGBClassifier | |
# Suppress every Python warning process-wide for the duration of the script.
warnings.filterwarnings("ignore")
# Shared rich console used for all rules, logs, panels and tables below.
console = Console()
# Put this file's directory first on sys.path; presumably so the sibling
# `config` module imports regardless of the current working directory.
# This must run before the `from config import ...` statement that follows.
sys.path.insert(0, str(Path(__file__).parent))
from config import (
    PROCESSED_DIR, ARTIFACTS_DIR,
    BEHAVIORAL_FEATURES, TARGET_FAILURE, TARGET_STYLE,
    XGBOOST_PARAMS, RF_PARAMS, RANDOM_STATE, TEST_SIZE
)
# Module-level metrics holder.
# NOTE(review): nothing in the visible code writes to this dict; main() builds
# and saves its own local `metrics`. Confirm whether another module reads it.
METRICS = {}
# ──────────────────────────────────────────────────────────────────────────────
# A — Task Failure Predictor (XGBoost)
# ──────────────────────────────────────────────────────────────────────────────
def train_failure_predictor(df: pd.DataFrame) -> dict:
    """Train, evaluate and persist the XGBoost task-failure classifier.

    The data is split into train/test *before* any fitting so that the
    hold-out set stays untouched: the ``StandardScaler`` is fitted on the
    training rows only and SMOTE oversampling is applied to the training
    rows only. Evaluating on SMOTE-resampled or globally scaled data would
    leak information into the test set and inflate every reported metric.

    Side effects: writes ``failure_predictor.pkl``, ``feature_scaler.pkl``,
    ``feature_importance_failure.png`` and ``confusion_matrix_failure.png``
    into ``ARTIFACTS_DIR``.

    Args:
        df: Training table containing the ``TARGET_FAILURE`` column and at
            least one of the ``BEHAVIORAL_FEATURES`` columns.

    Returns:
        ``{"failure_predictor": {...}}`` with hold-out AUC/F1/accuracy,
        5-fold CV AUC mean/std, row counts and the feature list used.
        ``n_train`` counts the rows actually fitted (after SMOTE).

    Raises:
        ValueError: If none of ``BEHAVIORAL_FEATURES`` is present in ``df``.
        KeyError: If ``TARGET_FAILURE`` is not a column of ``df``.
    """
    # Imported locally so this block does not depend on a file-level import.
    from sklearn.pipeline import make_pipeline

    console.rule("[bold cyan]A — Task Failure Predictor[/bold cyan]")

    feat_cols = [c for c in BEHAVIORAL_FEATURES if c in df.columns]
    if not feat_cols:
        raise ValueError("None of BEHAVIORAL_FEATURES are present in the training data.")
    X = df[feat_cols].values
    y = df[TARGET_FAILURE].values

    # Split first: the test rows must stay real, unscaled-by-test-statistics data.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    # Scale features using statistics from the training split only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # SMOTE balances the classes of the training split only.
    sm = SMOTE(random_state=RANDOM_STATE)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
    console.log(f"After SMOTE → {np.bincount(y_train_res)}")

    # --- Train ---
    model = XGBClassifier(**XGBOOST_PARAMS)
    # NOTE(review): eval_set is kept for compatibility with XGBOOST_PARAMS.
    # If those params enable early stopping, the test set is steering model
    # selection; a separate validation split would be cleaner.
    model.fit(
        X_train_res, y_train_res,
        eval_set=[(X_test, y_test)],
        verbose=False,
    )

    # --- Evaluate ---
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    f1 = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    console.print(classification_report(y_test, y_pred))
    console.log(f"[bold]AUC-ROC:[/bold] {auc:.4f} | [bold]F1:[/bold] {f1:.4f} | [bold]Accuracy:[/bold] {acc:.4f}")

    # 5-fold CV AUC on the original (not resampled) rows. The scaler lives
    # inside the pipeline so it is re-fitted on each fold's training part.
    cv_pipeline = make_pipeline(StandardScaler(), XGBClassifier(**XGBOOST_PARAMS))
    cv_scores = cross_val_score(
        cv_pipeline,
        X,
        y,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
        scoring="roc_auc",
        n_jobs=-1,
    )
    console.log(f"5-fold CV AUC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    # Feature importance plot
    fi_df = pd.DataFrame({"feature": feat_cols, "importance": model.feature_importances_})
    fi_df = fi_df.sort_values("importance", ascending=False)
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=fi_df, x="importance", y="feature", palette="viridis", ax=ax)
    ax.set_title("XGBoost Feature Importances — Task Failure Predictor")
    ax.set_xlabel("Importance")
    fig.tight_layout()
    fig.savefig(ARTIFACTS_DIR / "feature_importance_failure.png", dpi=150)
    plt.close(fig)

    # Confusion matrix plot. Rows/columns follow the label order [0, 1].
    # Class 1 is treated as "fail" (predict_proba[:, 1] is reported as the
    # failure probability), so index 0 is "complete" and index 1 is "fail".
    # NOTE(review): assumes TARGET_FAILURE encodes failure as 1 — confirm.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Predicted: Complete", "Predicted: Fail"],
                yticklabels=["Actual: Complete", "Actual: Fail"], ax=ax)
    ax.set_title("Confusion Matrix — Failure Predictor")
    fig.tight_layout()
    fig.savefig(ARTIFACTS_DIR / "confusion_matrix_failure.png", dpi=150)
    plt.close(fig)

    # --- Save ---
    joblib.dump(model, ARTIFACTS_DIR / "failure_predictor.pkl")
    joblib.dump(scaler, ARTIFACTS_DIR / "feature_scaler.pkl")
    console.log("[green]✓ Saved: failure_predictor.pkl, feature_scaler.pkl[/green]")

    return {
        "failure_predictor": {
            "auc_roc": round(float(auc), 4),
            "f1_score": round(float(f1), 4),
            "accuracy": round(float(acc), 4),
            "cv_auc_mean": round(float(cv_scores.mean()), 4),
            "cv_auc_std": round(float(cv_scores.std()), 4),
            "n_train": int(len(X_train_res)),
            "n_test": int(len(X_test)),
            "features": feat_cols,
        }
    }
# ──────────────────────────────────────────────────────────────────────────────
# B — Work Style Classifier (Random Forest)
# ──────────────────────────────────────────────────────────────────────────────
# Candidate input columns for the work-style classifier; only those present
# in the training table are actually used.
WORK_STYLE_FEATURES = [
    "session_duration_minutes",
    "break_count",
    "distraction_events",
    "stress_level",
    "motivation_level",
    "previous_completion_rate",
    "deadline_days_remaining",
]


def train_work_style_classifier(df: pd.DataFrame) -> dict:
    """Train, evaluate and persist the Random Forest work-style classifier.

    Only rows whose ``TARGET_STYLE`` value is "turtle", "hare" or "hybrid"
    are used. Training is skipped (an empty dict is returned) when the
    target column or every feature column is missing, or when fewer than
    100 labelled rows are available.

    Side effects: writes ``work_style_classifier.pkl`` and
    ``work_style_label_encoder.pkl`` into ``ARTIFACTS_DIR``.

    Args:
        df: Training table.

    Returns:
        ``{"work_style_classifier": {...}}`` with accuracy, macro-F1, the
        encoder's class order, row counts and the feature list used; or
        ``{}`` when training was skipped.
    """
    console.rule("[bold cyan]B — Work Style Classifier[/bold cyan]")

    feat_cols = [c for c in WORK_STYLE_FEATURES if c in df.columns]

    # Treat missing columns like too little data: skip instead of crashing
    # with a KeyError, so the other models still get trained.
    if TARGET_STYLE not in df.columns or not feat_cols:
        console.log("[yellow]⚠ Work-style target or feature columns missing. Skipping.[/yellow]")
        return {}

    # Only rows with labelled style
    df_ws = df[df[TARGET_STYLE].isin(["turtle", "hare", "hybrid"])]
    if len(df_ws) < 100:
        console.log("[yellow]⚠ Not enough labelled work-style rows. Skipping.[/yellow]")
        return {}

    le = LabelEncoder()
    y = le.fit_transform(df_ws[TARGET_STYLE])
    X = df_ws[feat_cols].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    model = RandomForestClassifier(**RF_PARAMS)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    # Explicit labels keep target_names aligned with the encoder's classes
    # even if a class happens to be absent from the test predictions.
    console.print(classification_report(
        y_test, y_pred,
        labels=np.arange(len(le.classes_)),
        target_names=le.classes_,
    ))
    console.log(f"[bold]Accuracy:[/bold] {acc:.4f} | [bold]Macro-F1:[/bold] {f1:.4f}")

    # Save
    joblib.dump(model, ARTIFACTS_DIR / "work_style_classifier.pkl")
    joblib.dump(le, ARTIFACTS_DIR / "work_style_label_encoder.pkl")
    console.log("[green]✓ Saved: work_style_classifier.pkl, work_style_label_encoder.pkl[/green]")

    return {
        "work_style_classifier": {
            "accuracy": round(float(acc), 4),
            "macro_f1": round(float(f1), 4),
            "classes": [str(c) for c in le.classes_],
            "n_train": int(len(X_train)),
            "n_test": int(len(X_test)),
            "features": feat_cols,
        }
    }
# ──────────────────────────────────────────────────────────────────────────────
# C — Distraction Scorer (Gradient Boosting Regressor)
# ──────────────────────────────────────────────────────────────────────────────
# Candidate input columns for the distraction scorer; only those present in
# the training table are actually used as model inputs.
DISTRACTION_FEATURES = [
    "distraction_events",
    "social_media_minutes_before",
    "break_count",
    "session_duration_minutes",
    "focus_score",
]


def _p95_ratio(values: pd.Series) -> pd.Series:
    """Scale *values* by their 95th percentile and clip the result to [0, 1].

    When the 95th percentile is exactly 0 a plain division yields 0/0 = NaN
    for zero entries. In that case positive entries map to 1.0 (what the
    clipped division would give) and all other entries map to 0.0.
    """
    p95 = values.quantile(0.95)
    if p95 != 0:
        return (values / p95).clip(0, 1)
    return (values > 0).astype(float)


def train_distraction_scorer(df: pd.DataFrame) -> dict:
    """Train, evaluate and persist the gradient-boosted distraction scorer.

    The regression target ``distraction_score`` is not read from the data;
    it is derived here as a weighted blend of distraction events (0.35),
    social-media minutes capped at 120 (0.30), break count (0.15) and
    ``1 - focus_score`` (0.20, using 0.5 when the column is absent),
    clipped to [0, 1].

    Side effects: writes ``distraction_scorer.pkl`` into ``ARTIFACTS_DIR``.

    Args:
        df: Training table. ``distraction_events``,
            ``social_media_minutes_before`` and ``break_count`` are required
            to build the target; the caller's frame is not modified.

    Returns:
        ``{"distraction_scorer": {...}}`` with RMSE, R², row counts and the
        feature list used.

    Raises:
        KeyError: If one of the columns required for the target is missing.
    """
    console.rule("[bold cyan]C — Distraction Scorer[/bold cyan]")

    feat_cols = [c for c in DISTRACTION_FEATURES if c in df.columns]

    # Derive a distraction_score target from available signals. Work on a
    # copy so the caller's DataFrame does not gain an extra column.
    df = df.copy()
    df["distraction_score"] = (
        0.35 * _p95_ratio(df["distraction_events"])
        + 0.30 * (df["social_media_minutes_before"] / 120).clip(0, 1)
        + 0.15 * _p95_ratio(df["break_count"])
        + 0.20 * (1 - df.get("focus_score", 0.5))
    ).clip(0, 1)

    X = df[feat_cols].values
    y = df["distraction_score"].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    model = GradientBoostingRegressor(
        n_estimators=300, max_depth=4,
        learning_rate=0.05, subsample=0.8,
        random_state=RANDOM_STATE,
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2 = float(r2_score(y_test, y_pred))
    console.log(f"[bold]RMSE:[/bold] {rmse:.4f} | [bold]R²:[/bold] {r2:.4f}")

    joblib.dump(model, ARTIFACTS_DIR / "distraction_scorer.pkl")
    console.log("[green]✓ Saved: distraction_scorer.pkl[/green]")

    return {
        "distraction_scorer": {
            "rmse": round(rmse, 4),
            "r2": round(r2, 4),
            "n_train": int(len(X_train)),
            "n_test": int(len(X_test)),
            "features": feat_cols,
        }
    }
# ──────────────────────────────────────────────────────────────────────────────
# Save feature column manifest (backend uses this for inference)
# ──────────────────────────────────────────────────────────────────────────────
def save_feature_manifest(metrics: dict):
    """Write ``feature_columns.json`` describing each model's inputs.

    Feature lists come from the training metrics when a model was trained
    and fall back to the module/config defaults otherwise. The work-style
    class list is likewise taken from the metrics (the fitted label
    encoder's order) so the manifest cannot drift from the saved encoder;
    the static list is used only when the classifier was skipped.

    Args:
        metrics: Merged result dicts of the ``train_*`` functions, keyed by
            model name. Missing models are tolerated.
    """
    fp_metrics = metrics.get("failure_predictor", {})
    ws_metrics = metrics.get("work_style_classifier", {})
    ds_metrics = metrics.get("distraction_scorer", {})

    manifest = {
        "failure_predictor": {
            "features": fp_metrics.get("features", BEHAVIORAL_FEATURES),
            "threshold": 0.65,
        },
        "work_style_classifier": {
            "features": ws_metrics.get("features", WORK_STYLE_FEATURES),
            "classes": [str(c) for c in ws_metrics.get("classes", ["hare", "hybrid", "turtle"])],
        },
        "distraction_scorer": {
            "features": ds_metrics.get("features", DISTRACTION_FEATURES),
        },
    }

    with open(ARTIFACTS_DIR / "feature_columns.json", "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)
    console.log("[green]✓ Saved: feature_columns.json[/green]")
# ──────────────────────────────────────────────────────────────────────────────
# MAIN
# ──────────────────────────────────────────────────────────────────────────────
def main():
    """Run step 3 of the pipeline: train all models and save their artifacts.

    Loads ``training_dataset.csv`` from ``PROCESSED_DIR``, trains the three
    models, writes ``feature_columns.json`` and ``metrics.json`` into
    ``ARTIFACTS_DIR`` and prints a summary table. Exits with status 1 when
    the training dataset does not exist.
    """
    console.print(Panel.fit("🤖 Step 3 — Model Training", style="bold magenta"))

    training_path = PROCESSED_DIR / "training_dataset.csv"
    if not training_path.exists():
        console.print("[red]✗ training_dataset.csv not found. Run preprocess.py first.[/red]")
        sys.exit(1)

    df = pd.read_csv(training_path)
    console.log(f"Loaded training data: {df.shape[0]:,} rows × {df.shape[1]} cols")

    # Every step below writes into ARTIFACTS_DIR; create it up front so a
    # fresh checkout does not fail on the first save.
    ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

    metrics = {}

    # ── Train all three models ──
    metrics.update(train_failure_predictor(df))
    metrics.update(train_work_style_classifier(df))
    metrics.update(train_distraction_scorer(df))

    # ── Save manifest and metrics JSON ──
    save_feature_manifest(metrics)
    with open(ARTIFACTS_DIR / "metrics.json", "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)
    console.log("[green]✓ Saved: metrics.json[/green]")

    # ── Print summary ──
    console.print(Panel.fit("📊 Training Summary", style="bold green"))
    table = Table()
    table.add_column("Model", style="cyan")
    table.add_column("Key Metric", style="yellow")
    table.add_column("Score", justify="right")

    fp = metrics.get("failure_predictor", {})
    ws = metrics.get("work_style_classifier", {})
    ds = metrics.get("distraction_scorer", {})

    # A model that was skipped contributes an empty dict and no rows.
    if fp:
        table.add_row("Failure Predictor", "AUC-ROC", str(fp.get("auc_roc")))
        table.add_row("Failure Predictor", "F1 Score", str(fp.get("f1_score")))
        table.add_row("Failure Predictor", "CV AUC (5-fold)", f"{fp.get('cv_auc_mean')} ± {fp.get('cv_auc_std')}")
    if ws:
        table.add_row("Work Style Classifier", "Accuracy", str(ws.get("accuracy")))
        table.add_row("Work Style Classifier", "Macro-F1", str(ws.get("macro_f1")))
    if ds:
        table.add_row("Distraction Scorer", "RMSE", str(ds.get("rmse")))
        table.add_row("Distraction Scorer", "R²", str(ds.get("r2")))

    console.print(table)
    console.print("\n[bold green]✓ All model artifacts saved → model_artifacts/[/bold green]")


if __name__ == "__main__":
    main()