Spaces:

DKatheesrupan
/

studysmart

Sleeping

App Files Files Community

studysmart / src /train_model.py

DKatheesrupan

Upload 5 files

6ccbbfd verified 10 days ago

raw

history blame contribute delete

4.14 kB

	"""
	src/train_model.py
	──────────────────
	Trains Logistic Regression, Random Forest, and Gradient Boosting,
	evaluates all three, picks the best, and saves it.
	"""
	import os, json
	import numpy as np
	import pandas as pd
	import joblib
	import warnings
	warnings.filterwarnings("ignore")

	from sklearn.linear_model import LogisticRegression
	from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
	from sklearn.metrics import (
	accuracy_score, f1_score, classification_report, confusion_matrix
	)
	from sklearn.model_selection import cross_val_score, StratifiedKFold

	from preprocessing import load_and_preprocess

	# Trained artefacts go into a "models" directory one level above the
	# directory that contains this file; it is created on import if missing.
	MODEL_DIR = os.path.join(
	    os.path.dirname(__file__),
	    os.pardir,
	    "models",
	)
	os.makedirs(MODEL_DIR, exist_ok=True)

	# Integer class label -> human-readable risk category.
	LABEL_MAP_INV = {
	    0: "Low Risk",
	    1: "Medium Risk",
	    2: "High Risk",
	}


	def evaluate(name, model, X_train, y_train, X_test, y_test):
	    """Fit ``model``, score it on the held-out split, and print a report.

	    The model is fitted in place on the training data and scored on the
	    test data (accuracy and weighted F1). A 5-fold stratified
	    cross-validation over the training data supplies a second weighted-F1
	    estimate that does not depend on the single train/test split.

	    Args:
	        name: Display name used in the printed report and the result dict.
	        model: scikit-learn style classifier; it is fitted in place.
	        X_train: Training features.
	        y_train: Training labels.
	        X_test: Held-out features.
	        y_test: Held-out labels.

	    Returns:
	        dict: keys ``name``, ``model`` (the fitted estimator),
	        ``accuracy``, ``f1`` (weighted F1 on the test split) and
	        ``cv_f1`` (mean weighted F1 over the cross-validation folds).
	    """
	    model.fit(X_train, y_train)
	    y_pred = model.predict(X_test)

	    acc = accuracy_score(y_test, y_pred)
	    f1 = f1_score(y_test, y_pred, average="weighted")
	    cv = cross_val_score(
	        model, X_train, y_train,
	        cv=StratifiedKFold(5), scoring="f1_weighted",
	    ).mean()

	    print(f"\n{'='*50}")
	    print(f" {name}")
	    print(f" Accuracy : {acc:.4f}")
	    # Plain "|" separator: the previous "\|" was an invalid escape sequence
	    # that printed a stray backslash and warns on recent Python versions.
	    print(f" F1-score : {f1:.4f} | CV F1: {cv:.4f}")

	    # Pass the label ids explicitly so the report stays aligned with the
	    # class names even when a class is absent from y_test / y_pred;
	    # without ``labels`` a missing class makes classification_report raise
	    # because the number of names no longer matches the observed labels.
	    labels = sorted(LABEL_MAP_INV)
	    print(classification_report(
	        y_test, y_pred,
	        labels=labels,
	        target_names=[LABEL_MAP_INV[label] for label in labels],
	    ))
	    return {"name": name, "model": model,
	            "accuracy": acc, "f1": f1, "cv_f1": cv}


	def get_feature_importance(model, feature_names):
	    """Rank features by importance, most important first.

	    Uses ``model.feature_importances_`` when the model exposes it;
	    otherwise falls back to the mean absolute value of ``model.coef_``
	    taken over its first axis. Models with neither attribute yield an
	    empty list.

	    Returns:
	        list: ``(feature_name, importance)`` pairs in descending order of
	        importance.
	    """
	    has_importances = hasattr(model, "feature_importances_")
	    if not has_importances and not hasattr(model, "coef_"):
	        return []

	    if has_importances:
	        scores = model.feature_importances_
	    else:
	        scores = np.mean(np.abs(model.coef_), axis=0)

	    ranked = list(zip(feature_names, scores))
	    ranked.sort(key=lambda pair: pair[1], reverse=True)
	    return ranked


	def main():
	    """Train every candidate model, keep the best one, and persist it.

	    Loads the preprocessed split, runs each candidate through
	    ``evaluate``, selects the result with the highest mean
	    cross-validated weighted F1, prints its top feature importances, and
	    writes the winning model, the scaler and a JSON metadata file into
	    ``MODEL_DIR``.

	    Returns:
	        tuple: ``(best, features)`` -- the result dict of the winning
	        model and the feature names object returned by
	        ``load_and_preprocess``.
	    """
	    print("Loading & preprocessing data …")
	    X_train, X_test, y_train, y_test, scaler, features = load_and_preprocess(save=True)

	    # Candidate estimators; each one is seeded with the same random_state.
	    logistic = LogisticRegression(
	        max_iter=1000, class_weight="balanced", random_state=42
	    )
	    forest = RandomForestClassifier(
	        n_estimators=200,
	        max_depth=8,
	        min_samples_leaf=1,
	        max_features="sqrt",
	        class_weight="balanced",
	        random_state=42,
	    )
	    boosting = GradientBoostingClassifier(
	        n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42
	    )
	    candidates = [
	        ("Logistic Regression", logistic),
	        ("Random Forest", forest),
	        ("Gradient Boosting", boosting),
	    ]

	    results = [
	        evaluate(label, estimator, X_train, y_train, X_test, y_test)
	        for label, estimator in candidates
	    ]

	    # The winner is the candidate with the highest cross-validated F1.
	    best = max(results, key=lambda res: res["cv_f1"])
	    print(f"\n>>> Best model: {best['name']} (CV F1 = {best['cv_f1']:.4f})")

	    ranking = get_feature_importance(best["model"], features)
	    print("\nTop feature importances:")
	    for feature, score in ranking[:8]:
	        print(f" {feature:25s} {score:.4f}")

	    # Persist the winning estimator and the scaler from preprocessing.
	    joblib.dump(best["model"], os.path.join(MODEL_DIR, "best_model.pkl"))
	    joblib.dump(scaler, os.path.join(MODEL_DIR, "scaler.pkl"))

	    def _scores(res):
	        # Metric triple shared by the headline entry and the comparison rows.
	        return {
	            "accuracy": round(res["accuracy"], 4),
	            "f1_weighted": round(res["f1"], 4),
	            "cv_f1": round(res["cv_f1"], 4),
	        }

	    # Metadata for the app: headline metrics of the winner, the feature
	    # list, the full importance ranking and a per-model comparison table.
	    meta = {
	        "best_model_name": best["name"],
	        **_scores(best),
	        "features": features,
	        "feature_importance": [
	            (feature, round(float(score), 4)) for feature, score in ranking
	        ],
	        "comparison": [
	            {"model": res["name"], **_scores(res)} for res in results
	        ],
	    }
	    meta_path = os.path.join(MODEL_DIR, "model_meta.json")
	    with open(meta_path, "w") as handle:
	        json.dump(meta, handle, indent=2)

	    print("\nAll artefacts saved to models/")
	    return best, features


	if __name__ == "__main__":
	    # Script entry point: run the training pipeline only when this file is
	    # executed directly, not when it is imported.
	    main()