"""
src/preprocessing.py
────────────────────
Loads REAL Kaggle/UCI data (preferred) or synthetic fallback.
Auto-detects which files are present.
"""
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
RAW_DIR = os.path.join(BASE_DIR, "..", "data", "raw")
PROC_DIR = os.path.join(BASE_DIR, "..", "data", "processed")
MODEL_DIR = os.path.join(BASE_DIR, "..", "models")
os.makedirs(PROC_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# Label <-> integer code used as the classification target.
RISK_ORDER = {"Low Risk": 0, "Medium Risk": 1, "High Risk": 2}
RISK_ORDER_INV = {0: "Low Risk", 1: "Medium Risk", 2: "High Risk"}

# Ordinal encoding of the categorical "family_support" column.
FAM_MAP = {"none": 0, "low": 1, "medium": 2, "high": 3}

# Column order fed to the scaler; preprocess_single_input must produce
# exactly these keys.
FEATURES = [
    "study_time", "absences", "failures", "G1", "G2",
    "internet", "higher_edu", "activities", "romantic",
    "family_support_num", "gender_bin", "high_absence", "study_efficiency",
]

# Lower bound applied to study_time before it enters the study_efficiency
# denominator. Shared by the batch and single-row paths so both compute the
# same feature.
_MIN_STUDY_TIME = 0.5


def grade_to_risk(g3):
    """Map a final grade (G3) to a risk label.

    Returns "Low Risk" for g3 >= 14, "Medium Risk" for g3 >= 10 and
    "High Risk" otherwise.
    """
    if g3 >= 14:
        return "Low Risk"
    if g3 >= 10:
        return "Medium Risk"
    return "High Risk"


def _engineer(df):
    """Return a copy of *df* with the derived feature columns added.

    Adds ``family_support_num``, ``gender_bin``, ``high_absence`` and
    ``study_efficiency``. The input frame is not modified.
    """
    df = df.copy()
    # Unknown / missing family_support values fall back to 2 ("medium").
    df["family_support_num"] = df["family_support"].map(FAM_MAP).fillna(2)
    df["gender_bin"] = (df["gender"] == "F").astype(int)
    df["high_absence"] = (df["absences"] > 10).astype(int)
    # Use only information available before the final grade (G3) to avoid
    # data leakage.
    df["study_efficiency"] = ((df["G1"] + df["G2"]) / 2) / (
        df["study_time"].clip(lower=_MIN_STUDY_TIME) + 0.5
    )
    return df


def _load_performance_csv():
    """Try real data first, fall back to synthetic.

    Returns:
        A ``(DataFrame, source)`` tuple where *source* is ``"real"`` or
        ``"synthetic"``.

    Raises:
        FileNotFoundError: if no usable dataset file is present in RAW_DIR.
    """
    real_path = os.path.join(RAW_DIR, "student_performance.csv")
    if os.path.exists(real_path):
        df = pd.read_csv(real_path)
        # Validate required columns exist
        required = ["G1", "G2", "G3", "study_time", "absences", "failures"]
        missing = [c for c in required if c not in df.columns]
        if not missing:
            print(f" ✓ Using REAL dataset: {real_path} ({len(df)} rows)")
            return df, "real"
        # Say why the real file is being skipped instead of falling back
        # silently.
        print(f" ⚠ Real dataset {real_path} is missing columns: {missing}")

    # Fallback
    synth_path = os.path.join(RAW_DIR, "student_performance_synthetic.csv")
    if os.path.exists(synth_path):
        print(f" ⚠ Using synthetic fallback: {synth_path}")
        return pd.read_csv(synth_path), "synthetic"

    raise FileNotFoundError(
        "No performance dataset found!\n"
        "Run: python data/raw/download_datasets.py\n"
        "or: python data/raw/generate_data.py"
    )


def load_and_preprocess(save=True):
    """Load the dataset, engineer features, split and scale.

    Args:
        save: when true, write the four split CSVs to PROC_DIR and the
            fitted scaler to MODEL_DIR/scaler.pkl.

    Returns:
        ``(X_train, X_test, y_train, y_test, scaler, FEATURES)``. The X
        frames are standardized; the scaler is fitted on the training split
        only.
    """
    print("Loading student performance data...")
    df, source = _load_performance_csv()

    # Add missing optional columns with defaults if not present
    defaults = {
        "gender": "M",
        "family_support": "medium",
        "internet": 1,
        "higher_edu": 1,
        "activities": 0,
        "romantic": 0,
    }
    for col, default in defaults.items():
        if col not in df.columns:
            df[col] = default
            print(f" ⚠ Column '{col}' missing → default: {default!r}")

    df["risk"] = df["G3"].apply(grade_to_risk)
    df["risk_code"] = df["risk"].map(RISK_ORDER)
    df = _engineer(df)

    X = df[FEATURES].astype(float)
    y = df["risk_code"]

    # Split BEFORE scaling: fitting the scaler on the full dataset would let
    # test-set means/variances leak into the training features.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train_raw),
        columns=FEATURES, index=X_train_raw.index)
    X_test = pd.DataFrame(
        scaler.transform(X_test_raw),
        columns=FEATURES, index=X_test_raw.index)

    if save:
        X_train.to_csv(os.path.join(PROC_DIR, "X_train.csv"), index=False)
        X_test.to_csv(os.path.join(PROC_DIR, "X_test.csv"), index=False)
        y_train.to_csv(os.path.join(PROC_DIR, "y_train.csv"), index=False)
        y_test.to_csv(os.path.join(PROC_DIR, "y_test.csv"), index=False)
        joblib.dump(scaler, os.path.join(MODEL_DIR, "scaler.pkl"))
        print(f" ✓ Data source: {source} | Rows: {len(df)} | Saved to data/processed/")

    return X_train, X_test, y_train, y_test, scaler, FEATURES


def preprocess_single_input(user_input, scaler, features):
    """Turn one raw input mapping into a scaled single-row feature frame.

    Args:
        user_input: mapping of raw field names to values; missing keys use
            the defaults below. Numeric fields may be numbers or numeric
            strings.
        scaler: fitted object exposing ``transform``.
        features: ordered feature names to emit (normally FEATURES).

    Returns:
        A one-row DataFrame with columns *features*, after
        ``scaler.transform``.
    """
    # Coerce once so every derived feature sees the same numeric value
    # (a raw string such as "12" cannot be compared with 10).
    study_time = float(user_input.get("study_time", 2))
    absences = float(user_input.get("absences", 5))
    g1 = float(user_input.get("G1", 10.0))
    g2 = float(user_input.get("G2", 10.0))

    row = {
        "study_time": study_time,
        "absences": absences,
        "failures": float(user_input.get("failures", 0)),
        "G1": g1,
        "G2": g2,
        "internet": float(user_input.get("internet", 1)),
        "higher_edu": float(user_input.get("higher_edu", 1)),
        "activities": float(user_input.get("activities", 0)),
        "romantic": float(user_input.get("romantic", 0)),
        "family_support_num": float(
            FAM_MAP.get(user_input.get("family_support", "medium"), 2)),
        "gender_bin": float(user_input.get("gender", "F") == "F"),
        "high_absence": float(absences > 10),
        # Same formula as _engineer, including the lower clip on study_time,
        # so inference features match the ones the scaler was fitted on.
        "study_efficiency": ((g1 + g2) / 2)
        / (max(study_time, _MIN_STUDY_TIME) + 0.5),
    }
    df_row = pd.DataFrame([[row[f] for f in features]], columns=features)
    return pd.DataFrame(scaler.transform(df_row), columns=features)


if __name__ == "__main__":
    load_and_preprocess()
    print("Done.")