# studysmart / src/preprocessing.py
# (Hugging Face file-page residue preserved below, commented out so the
#  module stays importable: uploader "DKatheesrupan", "Upload 5 files",
#  commit 6ccbbfd verified.)
"""
src/preprocessing.py
────────────────────
Loads REAL Kaggle/UCI data (preferred) or synthetic fallback.
Auto-detects which files are present.
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib, os
# Project directory layout, resolved relative to this file so the module
# works regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
RAW_DIR = os.path.join(BASE_DIR, "..", "data", "raw")
PROC_DIR = os.path.join(BASE_DIR, "..", "data", "processed")
MODEL_DIR = os.path.join(BASE_DIR, "..", "models")
# NOTE: output directories are created at import time (no-op when present).
os.makedirs(PROC_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)
# Ordinal encoding of the risk label (and its inverse for decoding predictions).
RISK_ORDER = {"Low Risk": 0, "Medium Risk": 1, "High Risk": 2}
RISK_ORDER_INV = {0: "Low Risk", 1: "Medium Risk", 2: "High Risk"}
# Ordinal encoding for the raw family_support column.
FAM_MAP = {"none": 0, "low": 1, "medium": 2, "high": 3}
# Model feature columns, in the exact order fed to the scaler/model.
FEATURES = [
    "study_time", "absences", "failures", "G1", "G2",
    "internet", "higher_edu", "activities", "romantic",
    "family_support_num", "gender_bin", "high_absence", "study_efficiency",
]
def grade_to_risk(g3):
    """Map a final grade (G3) onto a categorical risk label.

    >= 14 -> "Low Risk"; >= 10 -> "Medium Risk"; otherwise "High Risk".
    """
    for cutoff, label in ((14, "Low Risk"), (10, "Medium Risk")):
        if g3 >= cutoff:
            return label
    return "High Risk"
def _engineer(df):
    """Return a copy of *df* with the engineered model features added.

    Adds: family_support_num (ordinal via FAM_MAP, unknowns -> 2),
    gender_bin (1 when gender == "F"), high_absence (absences > 10),
    and study_efficiency (mean of G1/G2 per unit of study time).
    Only pre-G3 information is used, so no target leakage is introduced.
    """
    out = df.copy()
    out["family_support_num"] = out["family_support"].map(FAM_MAP).fillna(2)
    out["gender_bin"] = (out["gender"] == "F").astype(int)
    out["high_absence"] = (out["absences"] > 10).astype(int)
    # Mean of the two period grades, normalised by (clipped) study time.
    mean_grade = (out["G1"] + out["G2"]) / 2
    effort = out["study_time"].clip(lower=0.5) + 0.5
    out["study_efficiency"] = mean_grade / effort
    return out
def _load_performance_csv():
    """Load the student-performance dataset.

    Prefers the real Kaggle/UCI CSV in data/raw/; falls back to the
    synthetic CSV otherwise.

    Returns:
        (DataFrame, source_tag) where source_tag is "real" or "synthetic".

    Raises:
        FileNotFoundError: when neither CSV is present.
    """
    real_path = os.path.join(RAW_DIR, "student_performance.csv")
    if os.path.exists(real_path):
        df = pd.read_csv(real_path)
        # Validate required columns exist before trusting the file.
        required = ["G1", "G2", "G3", "study_time", "absences", "failures"]
        missing = [c for c in required if c not in df.columns]
        if not missing:
            print(f" ✓ Using REAL dataset: {real_path} ({len(df)} rows)")
            return df, "real"
        # Previously this fell through silently; warn so schema problems in
        # the real file are visible instead of being masked by the fallback.
        print(f" ⚠ Real dataset missing columns {missing} → trying synthetic fallback")
    # Fallback
    synth_path = os.path.join(RAW_DIR, "student_performance_synthetic.csv")
    if os.path.exists(synth_path):
        print(f" ⚠ Using synthetic fallback: {synth_path}")
        return pd.read_csv(synth_path), "synthetic"
    raise FileNotFoundError(
        "No performance dataset found!\n"
        "Run: python data/raw/download_datasets.py\n"
        "or: python data/raw/generate_data.py"
    )
def load_and_preprocess(save=True):
    """Build scaled train/test splits from the performance dataset.

    Loads the CSV (real or synthetic), fills missing optional columns
    with training-time defaults, derives the risk target from G3,
    engineers features, standardises them, and returns an 80/20
    stratified split.

    Returns:
        (X_train, X_test, y_train, y_test, scaler, FEATURES)
    """
    print("Loading student performance data...")
    df, source = _load_performance_csv()
    # Optional columns receive sensible defaults when absent from the CSV.
    defaults = {
        "gender": "M",
        "family_support": "medium",
        "internet": 1,
        "higher_edu": 1,
        "activities": 0,
        "romantic": 0,
    }
    for col in (c for c in defaults if c not in df.columns):
        df[col] = defaults[col]
        print(f" ⚠ Column '{col}' missing → default: {defaults[col]!r}")
    # Target: ordinal risk class derived from the final grade G3.
    df["risk"] = df["G3"].apply(grade_to_risk)
    df["risk_code"] = df["risk"].map(RISK_ORDER)
    df = _engineer(df)
    feature_frame = df[FEATURES].astype(float)
    target = df["risk_code"]
    scaler = StandardScaler()
    scaled_frame = pd.DataFrame(scaler.fit_transform(feature_frame), columns=FEATURES)
    X_train, X_test, y_train, y_test = train_test_split(
        scaled_frame, target, test_size=0.2, random_state=42, stratify=target)
    if save:
        outputs = (
            (X_train, "X_train.csv"),
            (X_test, "X_test.csv"),
            (y_train, "y_train.csv"),
            (y_test, "y_test.csv"),
        )
        for frame, fname in outputs:
            frame.to_csv(os.path.join(PROC_DIR, fname), index=False)
        joblib.dump(scaler, os.path.join(MODEL_DIR, "scaler.pkl"))
        print(f" ✓ Data source: {source} | Rows: {len(df)} | Saved to data/processed/")
    return X_train, X_test, y_train, y_test, scaler, FEATURES
def preprocess_single_input(user_input, scaler, features):
    """Convert one raw user-input dict into a scaled single-row DataFrame.

    Args:
        user_input: dict of raw fields; missing keys get the same
            defaults used at training time.
        scaler: fitted scaler exposing a ``transform`` method.
        features: ordered list of feature names expected by the model.

    Returns:
        A 1-row DataFrame of scaled feature values, columns == features.
    """
    # Local mirror of the training-time FAM_MAP ordinal encoding
    # (unknown values fall back to 2 == "medium", as in _engineer).
    fam_map = {"none": 0, "low": 1, "medium": 2, "high": 3}

    study_time = float(user_input.get("study_time", 2))
    absences = float(user_input.get("absences", 5))
    g1 = float(user_input.get("G1", 10.0))
    g2 = float(user_input.get("G2", 10.0))
    # BUGFIX: match _engineer(), which clips study_time at 0.5 before the
    # +0.5 offset. The old code skipped the clip, so inference-time
    # study_efficiency diverged from the training pipeline (train/serve
    # skew) for study_time < 0.5.
    study_efficiency = ((g1 + g2) / 2) / (max(study_time, 0.5) + 0.5)

    row = {
        "study_time": study_time,
        "absences": absences,
        "failures": float(user_input.get("failures", 0)),
        "G1": g1,
        "G2": g2,
        "internet": float(user_input.get("internet", 1)),
        "higher_edu": float(user_input.get("higher_edu", 1)),
        "activities": float(user_input.get("activities", 0)),
        "romantic": float(user_input.get("romantic", 0)),
        "family_support_num": float(fam_map.get(user_input.get("family_support", "medium"), 2)),
        "gender_bin": float(user_input.get("gender", "F") == "F"),
        "high_absence": float(absences > 10),
        "study_efficiency": study_efficiency,
    }
    df_row = pd.DataFrame([[row[f] for f in features]], columns=features)
    return pd.DataFrame(scaler.transform(df_row), columns=features)
if __name__ == "__main__":
    # Run the full preprocessing pipeline when executed as a script.
    load_and_preprocess()
    print("Done.")