| |
|
| |
|
| | """
|
| | Student Dropout Prediction - PRACTICAL VERSION
|
| | Uses only features a teacher would realistically have access to.
|
| | """
|
| |
|
| | import pandas as pd
|
| | import numpy as np
|
| | import json
|
| | import joblib
|
| |
|
| | from sklearn.model_selection import StratifiedKFold
|
| | from sklearn.linear_model import LogisticRegression
|
| | from sklearn.pipeline import Pipeline
|
| | from sklearn.preprocessing import StandardScaler
|
| | from sklearn.metrics import roc_auc_score, accuracy_score
|
| |
|
| |
|
| |
|
| |
|
| |
|
# Load the raw dataset; the file is semicolon-separated.
df = pd.read_csv('../data.csv', sep=';')

# Normalize the header BEFORE touching any column by name. Indexing
# df['Target'] first would raise KeyError whenever that header carries the
# same stray whitespace this strip exists to remove.
df.columns = df.columns.str.strip()

# Drop rows labelled 'Enrolled': only 'Dropout' and 'Graduate' are encoded
# as training labels further down.
df = df[df['Target'] != 'Enrolled']

# Round every numeric column to 0 decimals (DataFrame.round default);
# non-numeric columns such as 'Target' are left untouched by pandas.
df = df.round()
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
# Columns used as model inputs. The order matters: it is the column order of
# the training matrix and is exported as "feature_order" in the config.
practical_features = [
    # 2nd-semester curricular unit counts
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (without evaluations)',
    # Fees / scholarship / debt flags
    'Tuition fees up to date',
    'Scholarship holder',
    'Debtor',
    # Demographics
    'Gender',
    'Age at enrollment',
    # Attendance mode and displacement
    'Daytime/evening attendance',
    'Displaced',
]

# Report any requested column the dataset does not have, then carry on with
# the ones that are present (original order preserved).
available_columns = set(df.columns)
missing = [name for name in practical_features if name not in available_columns]
if missing:
    print(f"Warning: Missing features: {missing}")
    practical_features = [
        name for name in practical_features if name in available_columns
    ]

# Numbered listing of the features that will actually be used.
print(f"Using {len(practical_features)} practical features:")
for position, feature_name in enumerate(practical_features, start=1):
    print(f" {position}. {feature_name}")
|
| |
|
| |
|
# Feature matrix: an independent copy of the selected columns, in list order.
x = df.loc[:, practical_features].copy()

# Binary target: Dropout -> 0, Graduate -> 1. Any other label would map to
# NaN and make the integer cast fail.
target_codes = {'Dropout': 0, 'Graduate': 1}
y = df['Target'].map(target_codes).astype(int)
|
| |
|
| |
|
| |
|
| |
|
| |
|
# Standardize the features, then fit a class-balanced logistic regression.
# Keeping the scaler inside the pipeline means it is re-fitted on whatever
# data `model.fit` receives, together with the classifier.
feature_scaler = StandardScaler()
classifier = LogisticRegression(
    C=1.0,
    solver='lbfgs',
    class_weight='balanced',
    random_state=42,
    max_iter=1000,
)
model = Pipeline(steps=[
    ('scaler', feature_scaler),
    ('clf', classifier),
])
|
| |
|
| |
|
# Section banner for the cross-validation report.
banner = "=" * 60
print(f"\n{banner}")
print("CROSS-VALIDATION RESULTS")
print(banner)

# 5-fold stratified CV with a fixed seed; per-fold scores are collected so
# they can be summarized here and reused later in the exported config.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []
acc_scores = []

for fold, (train_rows, val_rows) in enumerate(skf.split(x, y), start=1):
    # Positional split of features and labels for this fold.
    x_train, y_train = x.iloc[train_rows], y.iloc[train_rows]
    x_val, y_val = x.iloc[val_rows], y.iloc[val_rows]

    # Refit the whole pipeline (scaler + classifier) on the training part.
    model.fit(x_train, y_train)

    # Hard predictions feed accuracy; the class-1 probability column feeds
    # ROC-AUC.
    y_pred = model.predict(x_val)
    y_proba = model.predict_proba(x_val)[:, 1]

    acc = accuracy_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_proba)
    acc_scores.append(acc)
    auc_scores.append(auc)

    print(f"Fold {fold}: Accuracy={acc:.4f}, ROC-AUC={auc:.4f}")

# Mean and standard deviation across folds.
auc_mean, auc_std = np.mean(auc_scores), np.std(auc_scores)
acc_mean, acc_std = np.mean(acc_scores), np.std(acc_scores)
print(f"\nAverage ROC-AUC: {auc_mean:.4f} ± {auc_std:.4f}")
print(f"Average Accuracy: {acc_mean:.4f} ± {acc_std:.4f}")
|
| |
|
| |
|
# Refit the pipeline on every available row. `fit` returns the pipeline
# itself, so `final_model` and `model` are the same fitted object.
model.fit(x, y)
final_model = model

# Persist the fitted pipeline (scaler + classifier) into the working directory.
joblib.dump(final_model, "student_dropout_model.pkl")
|
| |
|
| |
|
# Description of every user-facing input field. "maps_to" names the training
# column the field feeds.
input_schema = {
    "units_approved": {
        "description": "Number of curricular units passed this semester",
        "type": "integer",
        "min": 0,
        "max": 20,
        "required": True,
        "maps_to": "Curricular units 2nd sem (approved)"
    },
    "evaluations_taken": {
        "description": "Number of evaluations/exams the student took",
        "type": "integer",
        "min": 0,
        "max": 30,
        "required": True,
        "maps_to": "Curricular units 2nd sem (evaluations)"
    },
    "evaluations_missed": {
        "description": "Number of evaluations the student missed/skipped",
        "type": "integer",
        "min": 0,
        "max": 20,
        "required": True,
        "maps_to": "Curricular units 2nd sem (without evaluations)"
    },
    "tuition_paid": {
        "description": "Is tuition up to date?",
        "type": "boolean",
        "required": True,
        "maps_to": "Tuition fees up to date"
    },
    "has_scholarship": {
        "description": "Does student have a scholarship?",
        "type": "boolean",
        "required": True,
        "maps_to": "Scholarship holder"
    },
    "has_debt": {
        "description": "Does student have outstanding debt?",
        "type": "boolean",
        "required": True,
        "maps_to": "Debtor"
    },
    "gender": {
        "description": "Student gender",
        "type": "integer",
        "values": {"0": "Female", "1": "Male"},
        "required": True,
        "maps_to": "Gender"
    },
    "age": {
        "description": "Student's age at enrollment",
        "type": "integer",
        "min": 17,
        "max": 70,
        "required": True,
        "maps_to": "Age at enrollment"
    },
    "is_daytime": {
        "description": "Is student enrolled in daytime classes?",
        "type": "boolean",
        "required": True,
        "maps_to": "Daytime/evening attendance"
    },
    "is_displaced": {
        "description": "Is student displaced from home region?",
        "type": "boolean",
        "required": True,
        "maps_to": "Displaced"
    }
}

# Keep the schema in step with the features the model was really trained on:
# if a column was dropped as missing earlier, a field that still claims to be
# "required" for it would contradict "feature_order".
input_schema = {
    field: spec
    for field, spec in input_schema.items()
    if spec["maps_to"] in practical_features
}

config = {
    "model_name": "Student Dropout Predictor",
    "version": "1.0",
    "description": "Predicts if a student will dropout or graduate based on available data",

    "input_schema": input_schema,

    "output_schema": {
        "prediction": {
            "type": "string",
            "values": ["Dropout", "Graduate"]
        },
        "dropout_probability": {
            "type": "float",
            "min": 0,
            "max": 1,
            # Dropout is encoded as class 0, so its probability is the FIRST
            # predict_proba column; column 1 is the Graduate probability.
            "description": "Probability of class 0 (Dropout): predict_proba(...)[:, 0]"
        },
        "risk_level": {
            "type": "string",
            "values": ["LOW", "MEDIUM", "HIGH"]
        }
    },

    # Must mirror the Target encoding used when y is built
    # (Dropout -> 0, Graduate -> 1); consumers need it to decode predictions.
    "label_encoding": {"Dropout": 0, "Graduate": 1},

    # Column order expected by the saved pipeline.
    "feature_order": practical_features,

    # Cross-validated means, stored as plain Python floats.
    "performance": {
        "roc_auc": float(round(np.mean(auc_scores), 4)),
        "accuracy": float(round(np.mean(acc_scores), 4))
    }
}

# Explicit encoding so the file is identical regardless of platform locale.
with open("model_config.json", 'w', encoding="utf-8") as f:
    json.dump(config, f, indent=2)
|
| |
|
# Closing summary: list the artifacts written by this script.
rule = "=" * 60
print(f"\n{rule}")
print("SAVED FILES")
print(rule)
saved_artifacts = ("student_dropout_model.pkl", "model_config.json")
for position, artifact in enumerate(saved_artifacts, start=1):
    print(f"{position}. {artifact}")