# Provenance: data_pipeline/train_models.py, uploaded by agentbaba with
# huggingface_hub (commit 532cd37, verified).
"""
train_models.py
───────────────
Step 3 of the pipeline. Trains three models and saves all artifacts.

Models:
  A) Task Failure Predictor — XGBoost binary classifier
       Input   : 15 behavioral features
       Output  : failure_probability (0–1)
       Artifact: model_artifacts/failure_predictor.pkl
  B) Work Style Classifier — Random Forest 3-class
       Input   : 7 work-style features
       Output  : "turtle" | "hare" | "hybrid"
       Artifact: model_artifacts/work_style_classifier.pkl
  C) Distraction Scorer — Gradient Boosted Regressor
       Input   : 5 distraction signals
       Output  : distraction_score (0–1)
       Artifact: model_artifacts/distraction_scorer.pkl

Also saves:
  model_artifacts/feature_scaler.pkl           ← StandardScaler fitted on training set
  model_artifacts/work_style_label_encoder.pkl ← LabelEncoder for the work-style classes
  model_artifacts/feature_columns.json         ← exact column lists for each model
  model_artifacts/metrics.json                 ← evaluation metrics for dashboard display
"""
import sys, json, warnings
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import (
classification_report, roc_auc_score, f1_score,
accuracy_score, confusion_matrix, mean_squared_error, r2_score
)
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
# Suppress every warning category process-wide to keep the console output clean.
warnings.filterwarnings("ignore")
# Shared rich console used for all logging and tables in this script.
console = Console()
# Put this file's directory first on sys.path so that `config` below resolves
# from there; this must run before the `from config import ...` statement.
sys.path.insert(0, str(Path(__file__).parent))
from config import (
    PROCESSED_DIR, ARTIFACTS_DIR,
    BEHAVIORAL_FEATURES, TARGET_FAILURE, TARGET_STYLE,
    XGBOOST_PARAMS, RF_PARAMS, RANDOM_STATE, TEST_SIZE
)
# Intended as a module-level metrics accumulator.
# NOTE(review): nothing in the visible code reads or writes this name; main()
# builds its own local `metrics` dict — confirm whether this is still needed.
METRICS = {} # filled during training, saved to JSON at end
# ══════════════════════════════════════════════════════════════════════════════
# A — Task Failure Predictor (XGBoost)
# ══════════════════════════════════════════════════════════════════════════════
def _save_feature_importance_plot(model, feat_cols: list) -> None:
    """Write the fitted model's feature importances to a bar-chart PNG."""
    fi_df = pd.DataFrame(
        {"feature": feat_cols, "importance": model.feature_importances_}
    ).sort_values("importance", ascending=False)
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=fi_df, x="importance", y="feature", palette="viridis", ax=ax)
    ax.set_title("XGBoost Feature Importances — Task Failure Predictor")
    ax.set_xlabel("Importance")
    fig.tight_layout()
    fig.savefig(ARTIFACTS_DIR / "feature_importance_failure.png", dpi=150)
    # Close this specific figure rather than "whatever is current".
    plt.close(fig)


def _save_failure_confusion_matrix(y_true, y_pred) -> None:
    """Write the held-out confusion matrix to a heatmap PNG."""
    # Pin the label order so rows/columns are always [0, 1].
    # Assumes the target encodes 1 = failed, consistent with
    # predict_proba[:, 1] being treated as the failure probability.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        cm, annot=True, fmt="d", cmap="Blues",
        xticklabels=["Predicted: Complete", "Predicted: Fail"],
        yticklabels=["Actual: Complete", "Actual: Fail"],
        ax=ax,
    )
    ax.set_title("Confusion Matrix — Failure Predictor")
    fig.tight_layout()
    fig.savefig(ARTIFACTS_DIR / "confusion_matrix_failure.png", dpi=150)
    plt.close(fig)


def train_failure_predictor(df: pd.DataFrame) -> dict:
    """Train, evaluate and persist the XGBoost task-failure classifier.

    The held-out test set is split off *before* any fitting so that neither
    the scaler statistics nor SMOTE's synthetic samples leak into evaluation.
    The scaler is fitted on the training split only, SMOTE is applied to the
    training split only, and the test split contains real rows exclusively.

    Args:
        df: Training frame containing the ``TARGET_FAILURE`` column and some
            or all of ``BEHAVIORAL_FEATURES``; feature columns that are
            absent from ``df`` are skipped.

    Returns:
        ``{"failure_predictor": {...}}`` holding AUC-ROC, F1, accuracy,
        5-fold CV AUC mean/std, split sizes and the feature list used.

    Side effects:
        Writes ``failure_predictor.pkl``, ``feature_scaler.pkl``,
        ``feature_importance_failure.png`` and
        ``confusion_matrix_failure.png`` into ``ARTIFACTS_DIR``.
    """
    # Local import: only this function needs it, and it keeps the block
    # self-contained with respect to the file's import section.
    from sklearn.pipeline import make_pipeline

    console.rule("[bold cyan]A — Task Failure Predictor[/bold cyan]")
    feat_cols = [c for c in BEHAVIORAL_FEATURES if c in df.columns]
    X = df[feat_cols].values
    y = df[TARGET_FAILURE].values

    # 1) Hold out real, untouched rows for evaluation.
    X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    # 2) Fit the scaler on the training split only; reuse it on the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # 3) Balance classes in the training split only.
    sm = SMOTE(random_state=RANDOM_STATE)
    X_train, y_train = sm.fit_resample(X_train_scaled, y_train_raw)
    console.log(f"After SMOTE → {np.bincount(y_train)}")

    # --- Train ---
    model = XGBClassifier(**XGBOOST_PARAMS)
    # NOTE(review): eval_set is the test split and is used for monitoring only.
    # If XGBOOST_PARAMS ever enables early stopping, switch this to a separate
    # validation split so the test metrics stay unbiased.
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False,
    )

    # --- Evaluate ---
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    f1 = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    console.print(classification_report(y_test, y_pred))
    console.log(f"[bold]AUC-ROC:[/bold] {auc:.4f} | [bold]F1:[/bold] {f1:.4f} | [bold]Accuracy:[/bold] {acc:.4f}")

    # 5-fold CV AUC on the original (not resampled) data. The scaler lives
    # inside the pipeline so it is re-fitted on each training fold.
    cv_pipeline = make_pipeline(StandardScaler(), XGBClassifier(**XGBOOST_PARAMS))
    cv_scores = cross_val_score(
        cv_pipeline,
        X,
        y,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
        scoring="roc_auc",
        n_jobs=-1,
    )
    console.log(f"5-fold CV AUC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    # --- Plots ---
    _save_feature_importance_plot(model, feat_cols)
    _save_failure_confusion_matrix(y_test, y_pred)

    # --- Save ---
    joblib.dump(model, ARTIFACTS_DIR / "failure_predictor.pkl")
    joblib.dump(scaler, ARTIFACTS_DIR / "feature_scaler.pkl")
    console.log("[green]✓ Saved: failure_predictor.pkl, feature_scaler.pkl[/green]")

    return {
        "failure_predictor": {
            # float(...) keeps the values plain Python numbers for json.dump.
            "auc_roc": round(float(auc), 4),
            "f1_score": round(float(f1), 4),
            "accuracy": round(float(acc), 4),
            "cv_auc_mean": round(float(cv_scores.mean()), 4),
            "cv_auc_std": round(float(cv_scores.std()), 4),
            "n_train": int(len(X_train)),  # size after SMOTE resampling
            "n_test": int(len(X_test)),
            "features": feat_cols,
        }
    }
# ══════════════════════════════════════════════════════════════════════════════
# B — Work Style Classifier (Random Forest)
# ══════════════════════════════════════════════════════════════════════════════
# Candidate input columns for the work-style classifier; the list order is
# the column order handed to the model.
WORK_STYLE_FEATURES = [
    "session_duration_minutes", "break_count", "distraction_events",
    "stress_level", "motivation_level",
    "previous_completion_rate", "deadline_days_remaining",
]
def train_work_style_classifier(df: pd.DataFrame) -> dict:
    """Train, evaluate and persist the Random Forest work-style classifier.

    Only rows whose ``TARGET_STYLE`` value is one of ``"turtle"``, ``"hare"``
    or ``"hybrid"`` are used. Training is skipped (an empty dict is returned)
    when the label column or every feature column is missing, or when fewer
    than 100 labelled rows are available.

    Args:
        df: Training frame; feature columns from ``WORK_STYLE_FEATURES`` that
            are absent from ``df`` are skipped.

    Returns:
        ``{"work_style_classifier": {...}}`` holding accuracy, macro-F1, the
        encoder's class list (in label-index order), split sizes and the
        feature list used — or ``{}`` when training was skipped.

    Side effects:
        Writes ``work_style_classifier.pkl`` and
        ``work_style_label_encoder.pkl`` into ``ARTIFACTS_DIR``.
    """
    console.rule("[bold cyan]B — Work Style Classifier[/bold cyan]")
    feat_cols = [c for c in WORK_STYLE_FEATURES if c in df.columns]

    # Without labels or features there is nothing to train on; take the same
    # "skip" path as the too-few-rows case instead of raising.
    if TARGET_STYLE not in df.columns or not feat_cols:
        console.log("[yellow]⚠ Work-style label or feature columns missing. Skipping.[/yellow]")
        return {}

    # Only rows with labelled style
    df_ws = df[df[TARGET_STYLE].isin(["turtle", "hare", "hybrid"])]
    if len(df_ws) < 100:
        console.log("[yellow]⚠ Not enough labelled work-style rows. Skipping.[/yellow]")
        return {}

    le = LabelEncoder()
    y = le.fit_transform(df_ws[TARGET_STYLE])
    X = df_ws[feat_cols].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    model = RandomForestClassifier(**RF_PARAMS)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    console.print(classification_report(y_test, y_pred, target_names=le.classes_))
    console.log(f"[bold]Accuracy:[/bold] {acc:.4f} | [bold]Macro-F1:[/bold] {f1:.4f}")

    # Save
    joblib.dump(model, ARTIFACTS_DIR / "work_style_classifier.pkl")
    joblib.dump(le, ARTIFACTS_DIR / "work_style_label_encoder.pkl")
    console.log("[green]✓ Saved: work_style_classifier.pkl, work_style_label_encoder.pkl[/green]")

    return {
        "work_style_classifier": {
            # Plain Python types so the dict serialises cleanly to JSON.
            "accuracy": round(float(acc), 4),
            "macro_f1": round(float(f1), 4),
            "classes": [str(c) for c in le.classes_],
            "n_train": int(len(X_train)),
            "n_test": int(len(X_test)),
            "features": feat_cols,
        }
    }
# ══════════════════════════════════════════════════════════════════════════════
# C — Distraction Scorer (Gradient Boosting Regressor)
# ══════════════════════════════════════════════════════════════════════════════
# Candidate input columns for the distraction scorer; the list order is the
# column order handed to the model.
DISTRACTION_FEATURES = [
    "distraction_events", "social_media_minutes_before",
    "break_count", "session_duration_minutes",
    "focus_score",
]
def _capped_ratio(values: pd.Series, cap: float) -> pd.Series:
    """Scale ``values`` into [0, 1] relative to ``cap``, tolerating a bad cap.

    A cap that is zero, negative or NaN (e.g. the 95th percentile of a column
    that is almost always 0) would otherwise produce 0/0 = NaN targets; in
    that case any positive value maps to 1.0 and everything else to 0.0.
    """
    if not cap > 0:  # also true for NaN
        return (values > 0).astype(float)
    return (values / cap).clip(0, 1)


def train_distraction_scorer(df: pd.DataFrame) -> dict:
    """Train, evaluate and persist the gradient-boosted distraction scorer.

    The regression target ``distraction_score`` is not read from the data; it
    is derived here as a weighted sum (clipped to [0, 1]) of:

    * 0.35 x distraction_events scaled by its 95th percentile,
    * 0.30 x social_media_minutes_before scaled by 120,
    * 0.15 x break_count scaled by its 95th percentile,
    * 0.20 x (1 - focus_score), using 0.5 when the column is absent.

    Because the target is a deterministic function of columns that are also
    model inputs, the reported RMSE/R² measure how well the model reproduces
    this formula, not predictive skill against independent labels.

    Args:
        df: Training frame. ``distraction_events``,
            ``social_media_minutes_before`` and ``break_count`` are required;
            ``focus_score`` is optional (presumably on a 0–1 scale — TODO
            confirm). The caller's frame is not modified.

    Returns:
        ``{"distraction_scorer": {...}}`` holding RMSE, R², split sizes and
        the feature list used.

    Side effects:
        Writes ``distraction_scorer.pkl`` into ``ARTIFACTS_DIR``.
    """
    console.rule("[bold cyan]C — Distraction Scorer[/bold cyan]")
    feat_cols = [c for c in DISTRACTION_FEATURES if c in df.columns]

    # Derive a distraction_score target from available signals
    df = df.copy()
    df["distraction_score"] = (
        0.35 * _capped_ratio(df["distraction_events"], df["distraction_events"].quantile(0.95))
        + 0.30 * (df["social_media_minutes_before"] / 120).clip(0, 1)
        + 0.15 * _capped_ratio(df["break_count"], df["break_count"].quantile(0.95))
        + 0.20 * (1 - df.get("focus_score", 0.5))
    ).clip(0, 1)

    X = df[feat_cols].values
    y = df["distraction_score"].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    model = GradientBoostingRegressor(
        n_estimators=300, max_depth=4,
        learning_rate=0.05, subsample=0.8,
        random_state=RANDOM_STATE,
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    console.log(f"[bold]RMSE:[/bold] {rmse:.4f} | [bold]R²:[/bold] {r2:.4f}")

    joblib.dump(model, ARTIFACTS_DIR / "distraction_scorer.pkl")
    console.log("[green]✓ Saved: distraction_scorer.pkl[/green]")

    return {
        "distraction_scorer": {
            # Plain Python floats so the dict serialises cleanly to JSON.
            "rmse": round(float(rmse), 4),
            "r2": round(float(r2), 4),
            "n_train": int(len(X_train)),
            "n_test": int(len(X_test)),
            "features": feat_cols,
        }
    }
# ══════════════════════════════════════════════════════════════════════════════
# Save feature column manifest (backend uses this for inference)
# ══════════════════════════════════════════════════════════════════════════════
def save_feature_manifest(metrics: dict) -> None:
    """Write ``feature_columns.json`` describing each model's inputs.

    For every model the feature list actually used in training is taken from
    ``metrics``; when a model was skipped (its key is absent) the module-level
    default list is written instead.

    Args:
        metrics: Combined dict returned by the three ``train_*`` functions,
            keyed by model name.

    Side effects:
        Writes ``feature_columns.json`` into ``ARTIFACTS_DIR``.
    """
    fp = metrics.get("failure_predictor", {})
    ws = metrics.get("work_style_classifier", {})
    ds = metrics.get("distraction_scorer", {})

    manifest = {
        "failure_predictor": {
            "features": fp.get("features", BEHAVIORAL_FEATURES),
            "threshold": 0.65,
        },
        "work_style_classifier": {
            "features": ws.get("features", WORK_STYLE_FEATURES),
            # Prefer the class order recorded from the fitted label encoder so
            # the manifest always matches the model's label indices (e.g. when
            # one style is absent from the data); fall back to the default.
            "classes": ws.get("classes", ["hare", "hybrid", "turtle"]),
        },
        "distraction_scorer": {
            "features": ds.get("features", DISTRACTION_FEATURES),
        },
    }
    with open(ARTIFACTS_DIR / "feature_columns.json", "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)
    console.log("[green]✓ Saved: feature_columns.json[/green]")
# ══════════════════════════════════════════════════════════════════════════════
# MAIN
# ══════════════════════════════════════════════════════════════════════════════
def main():
    """Run step 3 of the pipeline: train all models and write their artifacts.

    Loads ``training_dataset.csv`` from ``PROCESSED_DIR`` (exiting with
    status 1 if it does not exist), trains the three models, writes
    ``feature_columns.json`` and ``metrics.json`` to ``ARTIFACTS_DIR`` and
    prints a summary table of the key metrics.
    """
    console.print(Panel.fit("🤖 Step 3 — Model Training", style="bold magenta"))

    training_path = PROCESSED_DIR / "training_dataset.csv"
    if not training_path.exists():
        console.print("[red]❌ training_dataset.csv not found. Run preprocess.py first.[/red]")
        sys.exit(1)

    # Every step below writes into ARTIFACTS_DIR; make sure it exists so a
    # fresh checkout does not fail on the first joblib.dump / savefig.
    ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(training_path)
    console.log(f"Loaded training data: {df.shape[0]:,} rows × {df.shape[1]} cols")

    metrics = {}
    # ── Train all three models ────────────────────────────────────────────────
    metrics.update(train_failure_predictor(df))
    metrics.update(train_work_style_classifier(df))
    metrics.update(train_distraction_scorer(df))

    # ── Save feature manifest and metrics JSON ────────────────────────────────
    save_feature_manifest(metrics)
    with open(ARTIFACTS_DIR / "metrics.json", "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)
    console.log("[green]✓ Saved: metrics.json[/green]")

    # ── Print summary ─────────────────────────────────────────────────────────
    console.print(Panel.fit("📊 Training Summary", style="bold green"))
    table = Table()
    table.add_column("Model", style="cyan")
    table.add_column("Key Metric", style="yellow")
    table.add_column("Score", justify="right")

    fp = metrics.get("failure_predictor", {})
    ws = metrics.get("work_style_classifier", {})
    ds = metrics.get("distraction_scorer", {})
    # A model that was skipped contributes no key, so its rows are omitted.
    if fp:
        table.add_row("Failure Predictor", "AUC-ROC", str(fp.get("auc_roc")))
        table.add_row("Failure Predictor", "F1 Score", str(fp.get("f1_score")))
        table.add_row("Failure Predictor", "CV AUC (5-fold)", f"{fp.get('cv_auc_mean')} ± {fp.get('cv_auc_std')}")
    if ws:
        table.add_row("Work Style Classifier", "Accuracy", str(ws.get("accuracy")))
        table.add_row("Work Style Classifier", "Macro-F1", str(ws.get("macro_f1")))
    if ds:
        table.add_row("Distraction Scorer", "RMSE", str(ds.get("rmse")))
        table.add_row("Distraction Scorer", "R²", str(ds.get("r2")))
    console.print(table)
    console.print("\n[bold green]✅ All model artifacts saved → model_artifacts/[/bold green]")


if __name__ == "__main__":
    main()