# studysmart / src/preprocessing.py
# (Hugging Face file-page residue preserved below, commented out so the
#  module stays importable: uploader "DKatheesrupan", "Upload 5 files",
#  commit 6ccbbfd verified.)
"""
src/preprocessing.py
────────────────────
Loads REAL Kaggle/UCI data (preferred) or synthetic fallback.
Auto-detects which files are present.
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib, os
# Project directory layout, resolved relative to this file so the module
# works regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
RAW_DIR = os.path.join(BASE_DIR, "..", "data", "raw")
PROC_DIR = os.path.join(BASE_DIR, "..", "data", "processed")
MODEL_DIR = os.path.join(BASE_DIR, "..", "models")
# NOTE: output directories are created at import time (no-op when present).
os.makedirs(PROC_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)
# Ordinal encoding of the risk label (and its inverse for decoding predictions).
RISK_ORDER = {"Low Risk": 0, "Medium Risk": 1, "High Risk": 2}
RISK_ORDER_INV = {0: "Low Risk", 1: "Medium Risk", 2: "High Risk"}
# Ordinal encoding for the raw family_support column.
FAM_MAP = {"none": 0, "low": 1, "medium": 2, "high": 3}
# Model feature columns, in the exact order fed to the scaler/model.
FEATURES = [
    "study_time", "absences", "failures", "G1", "G2",
    "internet", "higher_edu", "activities", "romantic",
    "family_support_num", "gender_bin", "high_absence", "study_efficiency",
]
def grade_to_risk(g3):
    """Map a final grade (G3) onto a categorical risk label.

    >= 14 -> "Low Risk"; >= 10 -> "Medium Risk"; otherwise "High Risk".
    """
    for cutoff, label in ((14, "Low Risk"), (10, "Medium Risk")):
        if g3 >= cutoff:
            return label
    return "High Risk"
def _engineer(df):
    """Return a copy of *df* with the engineered model features added.

    Adds: family_support_num (ordinal via FAM_MAP, unknowns -> 2),
    gender_bin (1 when gender == "F"), high_absence (absences > 10),
    and study_efficiency (mean of G1/G2 per unit of study time).
    Only pre-G3 information is used, so no target leakage is introduced.
    """
    out = df.copy()
    out["family_support_num"] = out["family_support"].map(FAM_MAP).fillna(2)
    out["gender_bin"] = (out["gender"] == "F").astype(int)
    out["high_absence"] = (out["absences"] > 10).astype(int)
    # Mean of the two period grades, normalised by (clipped) study time.
    mean_grade = (out["G1"] + out["G2"]) / 2
    effort = out["study_time"].clip(lower=0.5) + 0.5
    out["study_efficiency"] = mean_grade / effort
    return out
def _load_performance_csv():
    """Load the student-performance dataset.

    Prefers the real Kaggle/UCI CSV in data/raw/; falls back to the
    synthetic CSV otherwise.

    Returns:
        (DataFrame, source_tag) where source_tag is "real" or "synthetic".

    Raises:
        FileNotFoundError: when neither CSV is present.
    """
    real_path = os.path.join(RAW_DIR, "student_performance.csv")
    if os.path.exists(real_path):
        df = pd.read_csv(real_path)
        # Validate required columns exist before trusting the file.
        required = ["G1", "G2", "G3", "study_time", "absences", "failures"]
        missing = [c for c in required if c not in df.columns]
        if not missing:
            print(f" ✓ Using REAL dataset: {real_path} ({len(df)} rows)")
            return df, "real"
        # Previously this fell through silently; warn so schema problems in
        # the real file are visible instead of being masked by the fallback.
        print(f" ⚠ Real dataset missing columns {missing} → trying synthetic fallback")
    # Fallback
    synth_path = os.path.join(RAW_DIR, "student_performance_synthetic.csv")
    if os.path.exists(synth_path):
        print(f" ⚠ Using synthetic fallback: {synth_path}")
        return pd.read_csv(synth_path), "synthetic"
    raise FileNotFoundError(
        "No performance dataset found!\n"
        "Run: python data/raw/download_datasets.py\n"
        "or: python data/raw/generate_data.py"
    )
def load_and_preprocess(save=True):
    """Build scaled train/test splits from the performance dataset.

    Loads the CSV (real or synthetic), fills missing optional columns
    with training-time defaults, derives the risk target from G3,
    engineers features, standardises them, and returns an 80/20
    stratified split.

    Returns:
        (X_train, X_test, y_train, y_test, scaler, FEATURES)
    """
    print("Loading student performance data...")
    df, source = _load_performance_csv()
    # Optional columns receive sensible defaults when absent from the CSV.
    defaults = {
        "gender": "M",
        "family_support": "medium",
        "internet": 1,
        "higher_edu": 1,
        "activities": 0,
        "romantic": 0,
    }
    for col in (c for c in defaults if c not in df.columns):
        df[col] = defaults[col]
        print(f" ⚠ Column '{col}' missing → default: {defaults[col]!r}")
    # Target: ordinal risk class derived from the final grade G3.
    df["risk"] = df["G3"].apply(grade_to_risk)
    df["risk_code"] = df["risk"].map(RISK_ORDER)
    df = _engineer(df)
    feature_frame = df[FEATURES].astype(float)
    target = df["risk_code"]
    scaler = StandardScaler()
    scaled_frame = pd.DataFrame(scaler.fit_transform(feature_frame), columns=FEATURES)
    X_train, X_test, y_train, y_test = train_test_split(
        scaled_frame, target, test_size=0.2, random_state=42, stratify=target)
    if save:
        outputs = (
            (X_train, "X_train.csv"),
            (X_test, "X_test.csv"),
            (y_train, "y_train.csv"),
            (y_test, "y_test.csv"),
        )
        for frame, fname in outputs:
            frame.to_csv(os.path.join(PROC_DIR, fname), index=False)
        joblib.dump(scaler, os.path.join(MODEL_DIR, "scaler.pkl"))
        print(f" ✓ Data source: {source} | Rows: {len(df)} | Saved to data/processed/")
    return X_train, X_test, y_train, y_test, scaler, FEATURES
def preprocess_single_input(user_input, scaler, features):
    """Convert one raw user-input dict into a scaled single-row DataFrame.

    Args:
        user_input: dict of raw fields; missing keys get the same
            defaults used at training time.
        scaler: fitted scaler exposing a ``transform`` method.
        features: ordered list of feature names expected by the model.

    Returns:
        A 1-row DataFrame of scaled feature values, columns == features.
    """
    # Local mirror of the training-time FAM_MAP ordinal encoding
    # (unknown values fall back to 2 == "medium", as in _engineer).
    fam_map = {"none": 0, "low": 1, "medium": 2, "high": 3}

    study_time = float(user_input.get("study_time", 2))
    absences = float(user_input.get("absences", 5))
    g1 = float(user_input.get("G1", 10.0))
    g2 = float(user_input.get("G2", 10.0))
    # BUGFIX: match _engineer(), which clips study_time at 0.5 before the
    # +0.5 offset. The old code skipped the clip, so inference-time
    # study_efficiency diverged from the training pipeline (train/serve
    # skew) for study_time < 0.5.
    study_efficiency = ((g1 + g2) / 2) / (max(study_time, 0.5) + 0.5)

    row = {
        "study_time": study_time,
        "absences": absences,
        "failures": float(user_input.get("failures", 0)),
        "G1": g1,
        "G2": g2,
        "internet": float(user_input.get("internet", 1)),
        "higher_edu": float(user_input.get("higher_edu", 1)),
        "activities": float(user_input.get("activities", 0)),
        "romantic": float(user_input.get("romantic", 0)),
        "family_support_num": float(fam_map.get(user_input.get("family_support", "medium"), 2)),
        "gender_bin": float(user_input.get("gender", "F") == "F"),
        "high_absence": float(absences > 10),
        "study_efficiency": study_efficiency,
    }
    df_row = pd.DataFrame([[row[f] for f in features]], columns=features)
    return pd.DataFrame(scaler.transform(df_row), columns=features)
if __name__ == "__main__":
    # Run the full preprocessing pipeline when executed as a script.
    load_and_preprocess()
    print("Done.")