Spaces:
Sleeping
Sleeping
File size: 4,139 Bytes
6ccbbfd | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 | """
src/train_model.py
──────────────────
Trains Logistic Regression, Random Forest, and Gradient Boosting,
evaluates all three, picks the best, and saves it.
"""
import os, json
import numpy as np
import pandas as pd
import joblib
import warnings
# Install a blanket "ignore" filter: no category/module arguments are given,
# so every warning raised after this point is suppressed.
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
accuracy_score, f1_score, classification_report, confusion_matrix
)
from sklearn.model_selection import cross_val_score, StratifiedKFold
from preprocessing import load_and_preprocess
# Integer class id -> human-readable risk label.
LABEL_MAP_INV = dict(enumerate(("Low Risk", "Medium Risk", "High Risk")))

# Artefacts go into a "models" directory that sits beside this file's own
# directory; it is created at import time if it does not exist yet.
MODEL_DIR = os.path.join(os.path.dirname(__file__), os.pardir, "models")
os.makedirs(MODEL_DIR, exist_ok=True)
def evaluate(name, model, X_train, y_train, X_test, y_test):
    """Fit ``model``, score it on the test split, and print a summary.

    The model is fitted in place on the training split and then scored on
    the test split (accuracy and weighted F1).  In addition, a 5-fold
    stratified cross-validation is run on the training split and the mean
    weighted F1 across folds is reported.

    Args:
        name: Display name used in the printed report and the result dict.
        model: Classifier exposing ``fit`` and ``predict``; fitted in place.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        dict: ``{"name", "model", "accuracy", "f1", "cv_f1"}`` where
        ``model`` is the instance fitted on the full training split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    # Per the scikit-learn docs, cross_val_score works on clones of the
    # estimator, so the instance fitted above is left untouched.
    cv = cross_val_score(
        model,
        X_train,
        y_train,
        cv=StratifiedKFold(5),
        scoring="f1_weighted",
    ).mean()

    print(f"\n{'='*50}")
    print(f" {name}")
    print(f" Accuracy : {acc:.4f}")
    print(f" F1-score : {f1:.4f} | CV F1: {cv:.4f}")
    # Pass ``labels`` explicitly: with only ``target_names`` given,
    # classification_report raises when the test split / predictions do not
    # cover all three classes.  Both lists come from LABEL_MAP_INV so ids and
    # names cannot drift apart.
    # NOTE(review): assumes y is encoded with the integer keys of
    # LABEL_MAP_INV (0, 1, 2) -- confirm against preprocessing.
    print(
        classification_report(
            y_test,
            y_pred,
            labels=list(LABEL_MAP_INV),
            target_names=list(LABEL_MAP_INV.values()),
        )
    )

    return {
        "name": name,
        "model": model,
        "accuracy": acc,
        "f1": f1,
        "cv_f1": cv,
    }
def get_feature_importance(model, feature_names):
    """Rank features by the importance the fitted model assigns to them.

    Uses ``model.feature_importances_`` when the model has it; otherwise
    falls back to the mean absolute value of ``model.coef_`` taken over
    axis 0.  Models exposing neither attribute yield an empty list.

    Args:
        model: Fitted estimator to inspect.
        feature_names: Names paired positionally with the importance values.

    Returns:
        list: ``(feature, importance)`` tuples, largest importance first.
    """
    if hasattr(model, "feature_importances_"):
        weights = model.feature_importances_
    elif hasattr(model, "coef_"):
        # Collapse the rows of the coefficient matrix into a single
        # magnitude per feature.
        weights = np.abs(model.coef_).mean(axis=0)
    else:
        return []

    ranked = list(zip(feature_names, weights))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
def main():
    """Train the candidate classifiers, keep the best one, and persist it.

    Steps:
      1. Load the preprocessed train/test split via ``load_and_preprocess``.
      2. Evaluate Logistic Regression, Random Forest and Gradient Boosting
         with ``evaluate``.
      3. Select the candidate with the highest cross-validated F1.
      4. Print its top eight feature importances.
      5. Write ``best_model.pkl``, ``scaler.pkl`` and ``model_meta.json``
         into ``MODEL_DIR``.

    Returns:
        tuple: ``(best, features)`` -- the result dict of the selected model
        and the feature names returned by ``load_and_preprocess``.
    """
    print("Loading & preprocessing data …")
    X_train, X_test, y_train, y_test, scaler, features = load_and_preprocess(save=True)

    # Insertion order of the dict is the evaluation order.
    candidates = {
        "Logistic Regression": LogisticRegression(
            max_iter=1000, class_weight="balanced", random_state=42
        ),
        "Random Forest": RandomForestClassifier(
            n_estimators=200,
            max_depth=8,
            min_samples_leaf=1,
            max_features="sqrt",
            class_weight="balanced",
            random_state=42,
        ),
        "Gradient Boosting": GradientBoostingClassifier(
            n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42
        ),
    }
    results = [
        evaluate(label, estimator, X_train, y_train, X_test, y_test)
        for label, estimator in candidates.items()
    ]

    # Pick best by CV F1 (on a tie, max() keeps the earliest candidate).
    winner = max(results, key=lambda entry: entry["cv_f1"])
    print(f"\n>>> Best model: {winner['name']} (CV F1 = {winner['cv_f1']:.4f})")

    # Feature importance
    ranking = get_feature_importance(winner["model"], features)
    print("\nTop feature importances:")
    for column, weight in ranking[:8]:
        print(f" {column:25s} {weight:.4f}")

    def artefact(filename):
        # Resolve an output file name inside the model directory.
        return os.path.join(MODEL_DIR, filename)

    def rounded_scores(entry):
        # The three metrics of one result, rounded for the metadata file.
        return {
            "accuracy": round(entry["accuracy"], 4),
            "f1_weighted": round(entry["f1"], 4),
            "cv_f1": round(entry["cv_f1"], 4),
        }

    # Save
    joblib.dump(winner["model"], artefact("best_model.pkl"))
    joblib.dump(scaler, artefact("scaler.pkl"))

    # Save metadata for app
    meta = {
        "best_model_name": winner["name"],
        **rounded_scores(winner),
        "features": features,
        "feature_importance": [
            (column, round(float(weight), 4)) for column, weight in ranking
        ],
        "comparison": [
            {"model": entry["name"], **rounded_scores(entry)} for entry in results
        ],
    }
    with open(artefact("model_meta.json"), "w") as f:
        json.dump(meta, f, indent=2)

    print("\nAll artefacts saved to models/")
    return winner, features
# Run the training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
|