# studysmart / src / train_model.py
# DKatheesrupan's picture
# Upload 5 files
# 6ccbbfd verified
"""
src/train_model.py
──────────────────
Trains Logistic Regression, Random Forest, and Gradient Boosting,
evaluates all three, picks the best, and saves it.
"""
import os, json
import numpy as np
import pandas as pd
import joblib
import warnings
# Install a blanket "ignore" filter: from this point on no warning of any
# category is shown for the rest of the process.
# NOTE(review): this would also hide any warnings raised while the models
# below are fitted -- consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
accuracy_score, f1_score, classification_report, confusion_matrix
)
from sklearn.model_selection import cross_val_score, StratifiedKFold
from preprocessing import load_and_preprocess
# Output folder for trained artefacts: "<this file's directory>/../models".
# It is created at import time when it does not exist yet.
MODEL_DIR = os.path.join(os.path.dirname(__file__), os.pardir, "models")
os.makedirs(MODEL_DIR, exist_ok=True)

# Integer class code -> display name (the same three names, in the same
# order, that evaluate() prints in its classification report).
LABEL_MAP_INV = dict(enumerate(("Low Risk", "Medium Risk", "High Risk")))
def evaluate(name, model, X_train, y_train, X_test, y_test):
    """Fit ``model``, score it on the test split, and print a summary.

    The estimator is fitted in place on the training split and scored on the
    test split (accuracy and weighted F1). A 5-fold stratified
    cross-validation on the training split provides an additional weighted
    F1, averaged over the folds.

    Args:
        name: Display name used in the printed summary and the result dict.
        model: Classifier exposing ``fit`` and ``predict``; fitted in place.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        dict with the keys ``name``, ``model`` (the fitted estimator),
        ``accuracy`` and ``f1`` (both on the test split) and ``cv_f1``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    # cross_val_score works on clones of the estimator, so the model fitted
    # above is the one that gets returned.
    cv = cross_val_score(
        model, X_train, y_train,
        cv=StratifiedKFold(5), scoring="f1_weighted",
    ).mean()

    # Pin the class codes explicitly. Without ``labels`` the report infers the
    # classes from y_test/y_pred and raises ValueError as soon as one of the
    # three classes is absent from both, because ``target_names`` would no
    # longer match the number of classes found.
    labels = sorted(LABEL_MAP_INV)
    target_names = [LABEL_MAP_INV[label] for label in labels]

    print(f"\n{'='*50}")
    print(f" {name}")
    print(f" Accuracy : {acc:.4f}")
    print(f" F1-score : {f1:.4f} | CV F1: {cv:.4f}")
    print(classification_report(y_test, y_pred,
                                labels=labels,
                                target_names=target_names))

    return {"name": name, "model": model,
            "accuracy": acc, "f1": f1, "cv_f1": cv}
def get_feature_importance(model, feature_names):
    """Return ``(feature, importance)`` pairs, most important first.

    The importance source is chosen by what the fitted model exposes:

    * ``feature_importances_`` is used as is;
    * otherwise ``coef_`` is reduced to one value per feature by taking the
      absolute value and averaging over the first axis (one row per
      class/output);
    * a model with neither attribute yields an empty list.

    Args:
        model: Fitted estimator.
        feature_names: Feature names, in the same order as the model's
            importance/coefficient columns.

    Returns:
        List of ``(feature_name, importance)`` tuples sorted by descending
        importance; ties keep their original relative order.
    """
    if hasattr(model, "feature_importances_"):
        imp = model.feature_importances_
    elif hasattr(model, "coef_"):
        # coef_ may be 1-D (single-output linear models). Promote it to 2-D so
        # the mean over axis 0 always produces one value per feature instead
        # of collapsing to a scalar that zip() cannot iterate.
        imp = np.abs(np.atleast_2d(model.coef_)).mean(axis=0)
    else:
        return []
    return sorted(zip(feature_names, imp), key=lambda pair: pair[1], reverse=True)
def main():
    """Train the candidate classifiers, keep the best one, and save artefacts.

    Steps, in order:

    1. Obtain the train/test splits, the scaler and the feature names from
       ``load_and_preprocess(save=True)``.
    2. Run ``evaluate`` on Logistic Regression, Random Forest and Gradient
       Boosting.
    3. Select the result with the highest cross-validated F1.
    4. Print its top eight feature importances.
    5. Write ``best_model.pkl``, ``scaler.pkl`` and ``model_meta.json`` into
       ``MODEL_DIR``.

    Returns:
        Tuple ``(best, features)``: the result dict of the winning model (as
        returned by ``evaluate``) and the feature names.
    """
    print("Loading & preprocessing data …")
    X_train, X_test, y_train, y_test, scaler, features = load_and_preprocess(save=True)

    logreg = LogisticRegression(
        max_iter=1000, class_weight="balanced", random_state=42,
    )
    forest = RandomForestClassifier(
        n_estimators=200, max_depth=8,
        min_samples_leaf=1, max_features="sqrt",
        class_weight="balanced", random_state=42,
    )
    boosting = GradientBoostingClassifier(
        n_estimators=200, learning_rate=0.05,
        max_depth=4, random_state=42,
    )
    candidates = [
        ("Logistic Regression", logreg),
        ("Random Forest", forest),
        ("Gradient Boosting", boosting),
    ]

    results = [
        evaluate(label, estimator, X_train, y_train, X_test, y_test)
        for label, estimator in candidates
    ]

    # Model selection is driven by the cross-validated F1, not the test score.
    best = max(results, key=lambda res: res["cv_f1"])
    print(f"\n>>> Best model: {best['name']} (CV F1 = {best['cv_f1']:.4f})")

    fi = get_feature_importance(best["model"], features)
    print("\nTop feature importances:")
    for feat, imp in fi[:8]:
        print(f" {feat:25s} {imp:.4f}")

    def artefact(filename):
        # Absolute-or-relative path of an output file inside MODEL_DIR.
        return os.path.join(MODEL_DIR, filename)

    def rounded_scores(res):
        # The three metrics of one result, rounded to 4 decimals for the JSON.
        return {
            "accuracy": round(res["accuracy"], 4),
            "f1_weighted": round(res["f1"], 4),
            "cv_f1": round(res["cv_f1"], 4),
        }

    joblib.dump(best["model"], artefact("best_model.pkl"))
    joblib.dump(scaler, artefact("scaler.pkl"))

    # Metadata consumed by the app: winner, its scores, and a comparison table.
    meta = {
        "best_model_name": best["name"],
        **rounded_scores(best),
        "features": features,
        "feature_importance": [
            (feat, round(float(score), 4)) for feat, score in fi
        ],
        "comparison": [
            {"model": res["name"], **rounded_scores(res)} for res in results
        ],
    }
    with open(artefact("model_meta.json"), "w") as f:
        json.dump(meta, f, indent=2)

    print("\nAll artefacts saved to models/")
    return best, features
# Run the training pipeline only when this file is executed as a script;
# importing the module does not call main().
if __name__ == "__main__":
    main()