File size: 5,403 Bytes
6ccbbfd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
src/preprocessing.py
────────────────────
Loads REAL Kaggle/UCI data (preferred) or synthetic fallback.
Auto-detects which files are present.
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib, os

# Project directories, all resolved relative to the location of this file.
BASE_DIR  = os.path.dirname(os.path.abspath(__file__))
RAW_DIR   = os.path.join(BASE_DIR, "..", "data", "raw")        # input CSVs are read from here
PROC_DIR  = os.path.join(BASE_DIR, "..", "data", "processed")  # train/test CSVs are written here
MODEL_DIR = os.path.join(BASE_DIR, "..", "models")             # the fitted scaler is written here

# Import-time side effect: make sure the output directories exist.
os.makedirs(PROC_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# Ordinal encoding of the risk labels, and the reverse lookup derived from it.
RISK_ORDER = {
    "Low Risk": 0,
    "Medium Risk": 1,
    "High Risk": 2,
}
RISK_ORDER_INV = {code: label for label, code in RISK_ORDER.items()}

# Ordinal encoding of the family-support levels.
FAM_MAP = dict(none=0, low=1, medium=2, high=3)

# Model input columns, in the order they are handed to the scaler.
FEATURES = [
    # columns taken from the dataset as-is
    "study_time",
    "absences",
    "failures",
    "G1",
    "G2",
    "internet",
    "higher_edu",
    "activities",
    "romantic",
    # columns derived by _engineer()
    "family_support_num",
    "gender_bin",
    "high_absence",
    "study_efficiency",
]


def grade_to_risk(g3):
    """Map a final grade to its risk label.

    Grades of 14 or more are "Low Risk", grades from 10 up to (but not
    including) 14 are "Medium Risk", and everything else is "High Risk".
    """
    # Checked from the highest floor down; the first floor reached wins.
    thresholds = ((14, "Low Risk"), (10, "Medium Risk"))
    for floor, label in thresholds:
        if g3 >= floor:
            return label
    return "High Risk"


def _engineer(df):
    """Return a copy of *df* with the four derived feature columns added.

    Added columns:
        family_support_num: ``family_support`` mapped through ``FAM_MAP``;
            values not in the map become 2.
        gender_bin: 1 where ``gender`` equals "F", otherwise 0.
        high_absence: 1 where ``absences`` is greater than 10, otherwise 0.
        study_efficiency: mean of G1 and G2 divided by the study time
            (floored at 0.5) plus 0.5.

    The input frame is not modified.
    """
    # Only G1/G2 are used here: G3 is what the risk label is derived from,
    # so including it would leak the target into the features.
    mean_early_grade = (df["G1"] + df["G2"]) / 2
    effective_study_time = df["study_time"].clip(lower=0.5) + 0.5

    return df.assign(
        family_support_num=df["family_support"].map(FAM_MAP).fillna(2),
        gender_bin=(df["gender"] == "F").astype(int),
        high_absence=(df["absences"] > 10).astype(int),
        study_efficiency=mean_early_grade / effective_study_time,
    )


def _load_performance_csv():
    """Load the student-performance dataset, preferring real data over synthetic.

    ``RAW_DIR/student_performance.csv`` is tried first and used only if it
    contains every required column. If it exists but is missing columns, a
    warning naming them is printed (instead of silently discarding the file)
    and ``RAW_DIR/student_performance_synthetic.csv`` is tried next.

    Returns:
        A ``(df, source)`` tuple where ``source`` is ``"real"`` or
        ``"synthetic"``.

    Raises:
        FileNotFoundError: neither a usable real dataset nor the synthetic
            fallback file is present.
    """
    real_path = os.path.join(RAW_DIR, "student_performance.csv")
    if os.path.exists(real_path):
        df = pd.read_csv(real_path)
        # Validate required columns exist
        required = ["G1", "G2", "G3", "study_time", "absences", "failures"]
        missing = [c for c in required if c not in df.columns]
        if not missing:
            print(f"  ✓  Using REAL dataset: {real_path}  ({len(df)} rows)")
            return df, "real"
        # Say why the real file is being skipped; otherwise the fallback
        # (or the "not found" error below) is very confusing to debug.
        print(f"  ⚠  Ignoring real dataset {real_path}: missing required column(s) {missing}")

    # Fallback
    synth_path = os.path.join(RAW_DIR, "student_performance_synthetic.csv")
    if os.path.exists(synth_path):
        print(f"  ⚠  Using synthetic fallback: {synth_path}")
        return pd.read_csv(synth_path), "synthetic"

    raise FileNotFoundError(
        "No performance dataset found!\n"
        "Run: python data/raw/download_datasets.py\n"
        "or:  python data/raw/generate_data.py"
    )


def load_and_preprocess(save=True):
    """Load the dataset, build features, and return a scaled train/test split.

    Steps:
        1. Load the performance CSV via ``_load_performance_csv``.
        2. Fill in any missing optional columns with fixed defaults.
        3. Derive the ``risk`` label and its numeric ``risk_code`` from ``G3``.
        4. Add the engineered features via ``_engineer``.
        5. Split 80/20, stratified on ``risk_code``, with ``random_state=42``.
        6. Fit a ``StandardScaler`` on the training fold only and apply it to
           both folds.

    Args:
        save: when true, write ``X_train.csv``, ``X_test.csv``,
            ``y_train.csv`` and ``y_test.csv`` to ``PROC_DIR`` and the fitted
            scaler to ``MODEL_DIR/scaler.pkl``.

    Returns:
        ``(X_train, X_test, y_train, y_test, scaler, FEATURES)``.
    """
    print("Loading student performance data...")
    df, source = _load_performance_csv()

    # Add missing optional columns with defaults if not present
    defaults = {
        "gender":      "M",
        "family_support": "medium",
        "internet":    1,
        "higher_edu":  1,
        "activities":  0,
        "romantic":    0,
    }
    for col, default in defaults.items():
        if col not in df.columns:
            df[col] = default
            print(f"  ⚠  Column '{col}' missing → default: {default!r}")

    df["risk"]      = df["G3"].apply(grade_to_risk)
    df["risk_code"] = df["risk"].map(RISK_ORDER)
    df = _engineer(df)

    X = df[FEATURES].astype(float)
    y = df["risk_code"]

    # Split BEFORE scaling: fitting the scaler on the full dataset would let
    # the test fold's mean/variance leak into the training features.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    scaler  = StandardScaler()
    # Keep the original row index so X and y stay aligned label-wise.
    X_train = pd.DataFrame(scaler.fit_transform(X_train_raw),
                           columns=FEATURES, index=X_train_raw.index)
    X_test  = pd.DataFrame(scaler.transform(X_test_raw),
                           columns=FEATURES, index=X_test_raw.index)

    if save:
        X_train.to_csv(os.path.join(PROC_DIR, "X_train.csv"), index=False)
        X_test.to_csv(os.path.join(PROC_DIR,  "X_test.csv"),  index=False)
        y_train.to_csv(os.path.join(PROC_DIR, "y_train.csv"), index=False)
        y_test.to_csv(os.path.join(PROC_DIR,  "y_test.csv"),  index=False)
        joblib.dump(scaler, os.path.join(MODEL_DIR, "scaler.pkl"))
        print(f"  ✓  Data source: {source} | Rows: {len(df)} | Saved to data/processed/")

    return X_train, X_test, y_train, y_test, scaler, FEATURES


def preprocess_single_input(user_input, scaler, features):
    """Turn one user-supplied record into a scaled single-row feature frame.

    Args:
        user_input: mapping of raw field names to values. Missing fields get
            defaults (study_time 2, absences 5, failures 0, G1/G2 10.0,
            internet 1, higher_edu 1, activities 0, romantic 0,
            family_support "medium", gender "F"). Numeric fields are passed
            through ``float``, so numeric strings are accepted.
        scaler: fitted object exposing ``transform``.
        features: ordered column names to build the row from.

    Returns:
        A one-row ``DataFrame`` of scaled values with ``features`` as columns.
    """
    # Coerce once and reuse, so every derived feature sees the same numbers.
    study_time = float(user_input.get("study_time", 2))
    absences   = float(user_input.get("absences", 5))
    g1         = float(user_input.get("G1", 10.0))
    g2         = float(user_input.get("G2", 10.0))

    row = {
        "study_time":         study_time,
        "absences":           absences,
        "failures":           float(user_input.get("failures", 0)),
        "G1":                 g1,
        "G2":                 g2,
        "internet":           float(user_input.get("internet", 1)),
        "higher_edu":         float(user_input.get("higher_edu", 1)),
        "activities":         float(user_input.get("activities", 0)),
        "romantic":           float(user_input.get("romantic", 0)),
        "family_support_num": float(FAM_MAP.get(user_input.get("family_support", "medium"), 2)),
        # NOTE(review): the default here is "F" while load_and_preprocess fills
        # a missing gender column with "M" -- confirm which one is intended.
        "gender_bin":         float(user_input.get("gender", "F") == "F"),
        # Compare the coerced value: the raw input may be a string like "12",
        # and comparing that with an int raises TypeError.
        "high_absence":       float(absences > 10),
        # Floor study_time at 0.5 exactly as _engineer does for training data,
        # so inference features match training and the divisor is never 0.
        "study_efficiency":   ((g1 + g2) / 2) / (max(study_time, 0.5) + 0.5),
    }
    df_row = pd.DataFrame([[row[f] for f in features]], columns=features)
    return pd.DataFrame(scaler.transform(df_row), columns=features)


# Script entry point: run the full preprocessing pipeline with its default
# save=True, which writes the split CSVs and the scaler to disk.
if __name__ == "__main__":
    load_and_preprocess()
    print("Done.")