Spaces:
Sleeping
Sleeping
File size: 5,403 Bytes
6ccbbfd | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 | """
src/preprocessing.py
────────────────────
Loads REAL Kaggle/UCI data (preferred) or synthetic fallback.
Auto-detects which files are present.
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib, os
# Project directories, all resolved relative to this file's location so the
# paths do not depend on the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
RAW_DIR = os.path.join(BASE_DIR, "..", "data", "raw")
PROC_DIR = os.path.join(BASE_DIR, "..", "data", "processed")
MODEL_DIR = os.path.join(BASE_DIR, "..", "models")

# The two output directories are created as soon as the module is imported.
# RAW_DIR is not created here.
for _out_dir in (PROC_DIR, MODEL_DIR):
    os.makedirs(_out_dir, exist_ok=True)
del _out_dir
# Risk label -> ordinal class code used as the training target.
RISK_ORDER = {"Low Risk": 0, "Medium Risk": 1, "High Risk": 2}
# Class code -> risk label, derived from RISK_ORDER so the two cannot drift.
RISK_ORDER_INV = {code: label for label, code in RISK_ORDER.items()}

# Numeric encoding of the categorical family-support level.
FAM_MAP = dict(none=0, low=1, medium=2, high=3)

# Model input columns, in the order they are fed to the scaler.
FEATURES = [
    # columns taken directly from the dataset
    "study_time",
    "absences",
    "failures",
    "G1",
    "G2",
    "internet",
    "higher_edu",
    "activities",
    "romantic",
    # columns derived by _engineer()
    "family_support_num",
    "gender_bin",
    "high_absence",
    "study_efficiency",
]
def grade_to_risk(g3):
    """Translate a final grade into a risk label.

    Grades of 14 or more are "Low Risk", grades of 10 or more are
    "Medium Risk", and everything else is "High Risk". Any value that
    fails both ``>=`` comparisons (NaN included) lands in "High Risk".
    """
    # Cutoffs are checked from highest to lowest; the first one met wins.
    thresholds = ((14, "Low Risk"), (10, "Medium Risk"))
    for cutoff, label in thresholds:
        if g3 >= cutoff:
            return label
    return "High Risk"
def _engineer(df):
    """Return a copy of ``df`` with the four derived feature columns added.

    Reads the columns ``family_support``, ``gender``, ``absences``, ``G1``,
    ``G2`` and ``study_time``; the caller's frame is left unmodified.

    Added columns:
        family_support_num: ``family_support`` mapped through ``FAM_MAP``;
            values not in the map become 2.
        gender_bin: 1 where ``gender`` equals "F", otherwise 0.
        high_absence: 1 where ``absences`` exceeds 10, otherwise 0.
        study_efficiency: mean of G1 and G2 divided by the study time
            (floored at 0.5) plus 0.5.
    """
    out = df.copy()

    support_codes = out["family_support"].map(FAM_MAP)
    out["family_support_num"] = support_codes.fillna(2)

    is_female = out["gender"] == "F"
    out["gender_bin"] = is_female.astype(int)

    many_absences = out["absences"] > 10
    out["high_absence"] = many_absences.astype(int)

    # Only G1/G2 are used, never the final grade G3, so the feature does not
    # leak the target.
    mean_prior_grade = (out["G1"] + out["G2"]) / 2
    effective_study_time = out["study_time"].clip(lower=0.5) + 0.5
    out["study_efficiency"] = mean_prior_grade / effective_study_time

    return out
def _load_performance_csv():
    """Load the student-performance table, preferring the real dataset.

    ``student_performance.csv`` in ``RAW_DIR`` is used when it exists and
    contains every required column. Otherwise
    ``student_performance_synthetic.csv`` is tried.

    Returns:
        tuple: ``(DataFrame, source)`` where ``source`` is ``"real"`` or
        ``"synthetic"``.

    Raises:
        FileNotFoundError: if neither a usable real file nor the synthetic
            file is present.
    """
    required = ("G1", "G2", "G3", "study_time", "absences", "failures")

    real_path = os.path.join(RAW_DIR, "student_performance.csv")
    if os.path.exists(real_path):
        df = pd.read_csv(real_path)
        missing = [col for col in required if col not in df.columns]
        if not missing:
            print(f" ✓ Using REAL dataset: {real_path} ({len(df)} rows)")
            return df, "real"
        # A real file with the wrong schema used to be skipped without any
        # message, which made the switch to synthetic data hard to notice.
        print(f" ⚠ Ignoring {real_path}: missing required columns {missing}")

    # Fallback
    synth_path = os.path.join(RAW_DIR, "student_performance_synthetic.csv")
    if os.path.exists(synth_path):
        print(f" ⚠ Using synthetic fallback: {synth_path}")
        return pd.read_csv(synth_path), "synthetic"

    raise FileNotFoundError(
        "No performance dataset found!\n"
        "Run: python data/raw/download_datasets.py\n"
        "or: python data/raw/generate_data.py"
    )
def load_and_preprocess(save=True):
    """Load the dataset, build features, split, and scale.

    Steps: load the CSV, fill in missing optional columns with defaults,
    derive the risk label from ``G3``, add engineered features, make a
    stratified 80/20 train/test split, then standardize the features.

    The scaler is fitted on the training rows only and then applied to the
    test rows, so test-set statistics do not leak into preprocessing.

    Args:
        save: when true, write the four splits as CSV files to ``PROC_DIR``
            and the fitted scaler to ``MODEL_DIR/scaler.pkl``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, scaler, FEATURES)``.
    """
    print("Loading student performance data...")
    df, source = _load_performance_csv()

    # Add missing optional columns with defaults if not present
    defaults = {
        "gender": "M",
        "family_support": "medium",
        "internet": 1,
        "higher_edu": 1,
        "activities": 0,
        "romantic": 0,
    }
    for col, default in defaults.items():
        if col not in df.columns:
            df[col] = default
            print(f" ⚠ Column '{col}' missing → default: {default!r}")

    df["risk"] = df["G3"].apply(grade_to_risk)
    df["risk_code"] = df["risk"].map(RISK_ORDER)
    df = _engineer(df)

    X = df[FEATURES].astype(float)
    y = df["risk_code"]

    # Split BEFORE scaling: fitting the scaler on all rows would let the
    # test set influence the mean/variance used to transform the train set.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    scaler = StandardScaler()
    # Keep the original row index so X and y stay aligned after scaling.
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train_raw),
        columns=FEATURES,
        index=X_train_raw.index,
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test_raw),
        columns=FEATURES,
        index=X_test_raw.index,
    )

    if save:
        X_train.to_csv(os.path.join(PROC_DIR, "X_train.csv"), index=False)
        X_test.to_csv(os.path.join(PROC_DIR, "X_test.csv"), index=False)
        y_train.to_csv(os.path.join(PROC_DIR, "y_train.csv"), index=False)
        y_test.to_csv(os.path.join(PROC_DIR, "y_test.csv"), index=False)
        joblib.dump(scaler, os.path.join(MODEL_DIR, "scaler.pkl"))
        print(f" ✓ Data source: {source} | Rows: {len(df)} | Saved to data/processed/")

    return X_train, X_test, y_train, y_test, scaler, FEATURES
def preprocess_single_input(user_input, scaler, features):
    """Turn one record of raw inputs into a scaled single-row feature frame.

    The derived features follow the same formulas as ``_engineer`` so that
    a prediction-time row is encoded the same way as the training rows.

    Args:
        user_input: mapping of raw field names to values, read with
            ``.get``; absent keys fall back to the defaults below.
        scaler: fitted object exposing ``transform``.
        features: column names, in the order the scaler expects.

    Returns:
        pandas.DataFrame: one row with columns ``features``, after
        ``scaler.transform``.

    Raises:
        ValueError: if a numeric field cannot be converted by ``float``.
    """
    # Parse each raw numeric field once and reuse the parsed value.
    study_time = float(user_input.get("study_time", 2))
    absences = float(user_input.get("absences", 5))
    g1 = float(user_input.get("G1", 10.0))
    g2 = float(user_input.get("G2", 10.0))

    # NOTE(review): a missing gender defaults to "F" here, while
    # load_and_preprocess fills a missing gender column with "M" -- confirm
    # which default is intended.
    gender = user_input.get("gender", "F")
    family_support = user_input.get("family_support", "medium")

    row = {
        "study_time": study_time,
        "absences": absences,
        "failures": float(user_input.get("failures", 0)),
        "G1": g1,
        "G2": g2,
        "internet": float(user_input.get("internet", 1)),
        "higher_edu": float(user_input.get("higher_edu", 1)),
        "activities": float(user_input.get("activities", 0)),
        "romantic": float(user_input.get("romantic", 0)),
        "family_support_num": float(FAM_MAP.get(family_support, 2)),
        "gender_bin": float(gender == "F"),
        # Compare the parsed number: the raw value may be a string, and
        # comparing str to int raises TypeError.
        "high_absence": float(absences > 10),
        # Floor study_time at 0.5 exactly as _engineer does with
        # clip(lower=0.5); without it small study times were encoded
        # differently at prediction time than in training.
        "study_efficiency": ((g1 + g2) / 2) / (max(study_time, 0.5) + 0.5),
    }
    df_row = pd.DataFrame([[row[f] for f in features]], columns=features)
    return pd.DataFrame(scaler.transform(df_row), columns=features)
if __name__ == "__main__":
    # Script entry point: run the whole pipeline with its default argument
    # (save=True), then report completion.
    load_and_preprocess()
    print("Done.")
|