# sahlnizar's picture
# Add files using upload-large-folder tool
# 6114b9e verified
#!/usr/bin/env python
# coding: utf-8
"""
Student Dropout Prediction - PRACTICAL VERSION
Uses only features a teacher would realistically have access to.
"""
import pandas as pd
import numpy as np
import json
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score
# =============================================================================
# 1. LOAD DATA
# =============================================================================
# Read the semicolon-delimited student records.
df = pd.read_csv('../data.csv', sep=';')
# Normalise the header BEFORE any column lookup: the 'Target' filter below
# (and every later feature lookup) relies on the stripped names, so a padded
# header must not be able to break it.
df.columns = df.columns.str.strip()
# Keep only students with a final outcome; 'Enrolled' rows are discarded.
df = df[df['Target'] != 'Enrolled']
# Round all numeric columns to whole numbers.
df = df.round()
# =============================================================================
# 2. SELECT ONLY REALISTIC FEATURES
# =============================================================================
# Features a teacher would realistically have
practical_features = [
    # Academic performance (the teacher's primary data)
    'Curricular units 2nd sem (approved)',             # units passed
    'Curricular units 2nd sem (evaluations)',          # exams taken
    'Curricular units 2nd sem (without evaluations)',  # exams missed
    # Financial status (from the administrative system)
    'Tuition fees up to date',
    'Scholarship holder',
    'Debtor',
    # Basic demographics (from the student profile)
    'Gender',
    'Age at enrollment',
    # Enrollment information
    'Daytime/evening attendance',
    'Displaced',
]

# Warn about requested columns the dataset lacks and continue without them.
missing = [name for name in practical_features if name not in df.columns]
if missing:
    print(f"Warning: Missing features: {missing}")
    practical_features = [
        name for name in practical_features if name not in missing
    ]

# List the features that will actually be used, numbered from 1.
print(f"Using {len(practical_features)} practical features:")
for position, feature in enumerate(practical_features, start=1):
    print(f" {position}. {feature}")

# Feature matrix (an independent copy) and binary target:
# Dropout -> 0, Graduate -> 1.
x = df.loc[:, practical_features].copy()
target_codes = {'Dropout': 0, 'Graduate': 1}
y = df['Target'].map(target_codes).astype(int)
# =============================================================================
# 3. TRAIN MODEL
# =============================================================================
# Two-step pipeline: standardise the features, then fit a class-balanced
# logistic regression with a fixed seed.
scaler = StandardScaler()
classifier = LogisticRegression(
    C=1.0,
    solver='lbfgs',
    class_weight='balanced',
    random_state=42,
    max_iter=1000,
)
model = Pipeline(steps=[('scaler', scaler), ('clf', classifier)])
# Cross-validation
banner = "=" * 60
print("\n" + banner)
print("CROSS-VALIDATION RESULTS")
print(banner)

# Stratified 5-fold split, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []
acc_scores = []
for fold, (train_idx, val_idx) in enumerate(skf.split(x, y), 1):
    # Refit the whole pipeline on this fold's training rows only.
    model.fit(x.iloc[train_idx], y.iloc[train_idx])

    x_val, y_val = x.iloc[val_idx], y.iloc[val_idx]
    acc = accuracy_score(y_val, model.predict(x_val))
    # Column 1 of predict_proba is the probability of class 1 (Graduate).
    auc = roc_auc_score(y_val, model.predict_proba(x_val)[:, 1])

    acc_scores.append(acc)
    auc_scores.append(auc)
    print(f"Fold {fold}: Accuracy={acc:.4f}, ROC-AUC={auc:.4f}")

# Mean and standard deviation across the folds.
mean_auc, std_auc = np.mean(auc_scores), np.std(auc_scores)
mean_acc, std_acc = np.mean(acc_scores), np.std(acc_scores)
print(f"\nAverage ROC-AUC: {mean_auc:.4f} ± {std_auc:.4f}")
print(f"Average Accuracy: {mean_acc:.4f} ± {std_acc:.4f}")

# Final model: the same pipeline, refit on the full dataset.
final_model = model.fit(x, y)
# =============================================================================
# 4. SAVE MODEL AND PRACTICAL CONFIG
# =============================================================================
# Persist the fitted pipeline (scaler + classifier).
joblib.dump(final_model, "student_dropout_model.pkl")

# Agent-facing description of each input: a friendly field name, its type and
# allowed range, and the dataset column it maps to.
input_schema = {
    "units_approved": {
        "description": "Number of curricular units passed this semester",
        "type": "integer",
        "min": 0,
        "max": 20,
        "required": True,
        "maps_to": "Curricular units 2nd sem (approved)"
    },
    "evaluations_taken": {
        "description": "Number of evaluations/exams the student took",
        "type": "integer",
        "min": 0,
        "max": 30,
        "required": True,
        "maps_to": "Curricular units 2nd sem (evaluations)"
    },
    "evaluations_missed": {
        "description": "Number of evaluations the student missed/skipped",
        "type": "integer",
        "min": 0,
        "max": 20,
        "required": True,
        "maps_to": "Curricular units 2nd sem (without evaluations)"
    },
    "tuition_paid": {
        "description": "Is tuition up to date?",
        "type": "boolean",
        "required": True,
        "maps_to": "Tuition fees up to date"
    },
    "has_scholarship": {
        "description": "Does student have a scholarship?",
        "type": "boolean",
        "required": True,
        "maps_to": "Scholarship holder"
    },
    "has_debt": {
        "description": "Does student have outstanding debt?",
        "type": "boolean",
        "required": True,
        "maps_to": "Debtor"
    },
    "gender": {
        "description": "Student gender",
        "type": "integer",
        "values": {"0": "Female", "1": "Male"},
        "required": True,
        "maps_to": "Gender"
    },
    "age": {
        "description": "Student's age at enrollment",
        "type": "integer",
        "min": 17,
        "max": 70,
        "required": True,
        "maps_to": "Age at enrollment"
    },
    "is_daytime": {
        "description": "Is student enrolled in daytime classes?",
        "type": "boolean",
        "required": True,
        "maps_to": "Daytime/evening attendance"
    },
    "is_displaced": {
        "description": "Is student displaced from home region?",
        "type": "boolean",
        "required": True,
        "maps_to": "Displaced"
    }
}

# Columns absent from the dataset were dropped from practical_features before
# training. Drop the matching inputs here too, so the schema never marks as
# "required" an input that is not in "feature_order". With no columns missing
# this filter keeps every entry.
input_schema = {
    field: spec
    for field, spec in input_schema.items()
    if spec["maps_to"] in practical_features
}

# Create a PRACTICAL config for the agent
config = {
    "model_name": "Student Dropout Predictor",
    "version": "1.0",
    "description": "Predicts if a student will dropout or graduate based on available data",
    "input_schema": input_schema,
    # NOTE: the target was encoded Dropout=0 / Graduate=1 during training, so
    # the dropout probability is predict_proba(...)[:, 0], not column 1.
    "output_schema": {
        "prediction": {
            "type": "string",
            "values": ["Dropout", "Graduate"]
        },
        "dropout_probability": {
            "type": "float",
            "min": 0,
            "max": 1
        },
        "risk_level": {
            "type": "string",
            "values": ["LOW", "MEDIUM", "HIGH"]
        }
    },
    # Column order the model was fitted with; inputs must follow it.
    "feature_order": practical_features,
    # Mean cross-validation scores, rounded to four decimals.
    "performance": {
        "roc_auc": round(np.mean(auc_scores), 4),
        "accuracy": round(np.mean(acc_scores), 4)
    }
}
# Write the agent config as indented JSON.
with open("model_config.json", 'w') as config_file:
    config_file.write(json.dumps(config, indent=2))

# Tell the user which artefacts were produced.
separator = "=" * 60
summary_lines = (
    "\n" + separator,
    "SAVED FILES",
    separator,
    "1. student_dropout_model.pkl",
    "2. model_config.json",
)
for line in summary_lines:
    print(line)