# Upload metadata (not code): sahlnizar — "Add files using upload-large-folder tool" — commit 5da71f2 (verified)
#!/usr/bin/env python
# coding: utf-8
"""
Student Dropout Prediction Model
Trains a Logistic Regression model and saves it with feature configuration.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score
# =============================================================================
# 1. LOAD AND PREPROCESS DATA
# =============================================================================
# Read the semicolon-delimited dataset.
df = pd.read_csv('data.csv', sep=';')
print(f"Original dataset shape: {df.shape}")

# Keep only the two terminal outcomes; 'Enrolled' students are excluded.
df = df[df['Target'] != 'Enrolled']
print(f"After filtering 'Enrolled': {df.shape}")

# Round every numeric column to the nearest integer.
df = df.round()

# Continuous columns that are cast to int64 after rounding.
numeric_cols = [
    'Admission grade',
    'Previous qualification (grade)',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (grade)',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
]
df[numeric_cols] = df[numeric_cols].astype(np.int64)

# Columns excluded from the feature set (selected by your classmate).
columns_to_drop = [
    "Father's occupation",
    "Curricular units 2nd sem (credited)",
    "Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (approved)",
]
df = df.drop(columns=columns_to_drop)

# Encode the label: Dropout -> 0, Graduate -> 1.
df['Target'] = df['Target'].map({'Dropout': 0, 'Graduate': 1})

# Sanity-check the encoded label distribution.
print(f"\nTarget distribution:")
print(df['Target'].value_counts())

# Split into feature matrix and target vector.
x = df.drop('Target', axis=1)
y = df['Target'].astype(int)
print(f"\nFeatures shape: {x.shape}")
print(f"Target shape: {y.shape}")
# =============================================================================
# 2. DEFINE MODEL
# =============================================================================
# Standardize features, then fit an L2-regularized logistic regression.
# class_weight='balanced' reweights samples to offset class imbalance.
_logreg = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    class_weight='balanced',
    random_state=42,
    max_iter=1000,
)
model = Pipeline([('scaler', StandardScaler()), ('clf', _logreg)])
# =============================================================================
# 3. CROSS-VALIDATION
# =============================================================================
print("\n" + "="*60)
print("CROSS-VALIDATION RESULTS")
print("="*60)

# Stratified 5-fold CV keeps the class ratio stable in every split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_roc_scores = []
acc_scores = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(x, y), 1):
    x_tr, x_va = x.iloc[tr_idx], x.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    # Fit on the training split, score on the held-out split.
    model.fit(x_tr, y_tr)
    proba = model.predict_proba(x_va)[:, 1]
    preds = model.predict(x_va)

    fold_auc = roc_auc_score(y_va, proba)
    fold_acc = accuracy_score(y_va, preds)
    auc_roc_scores.append(fold_auc)
    acc_scores.append(fold_acc)

    print(f"\nFold {fold}:")
    print(f" Accuracy: {fold_acc:.4f}, ROC-AUC: {fold_auc:.4f}")

print("\n" + "-"*60)
print(f"Average ROC-AUC: {np.mean(auc_roc_scores):.4f} ± {np.std(auc_roc_scores):.4f}")
print(f"Average Accuracy: {np.mean(acc_scores):.4f} ± {np.std(acc_scores):.4f}")
# =============================================================================
# 4. TRAIN FINAL MODEL ON ALL DATA
# =============================================================================
print("\n" + "=" * 60)
print("TRAINING FINAL MODEL ON ALL DATA")
print("=" * 60)

# Pipeline.fit returns the fitted estimator itself, so final_model is `model`.
final_model = model.fit(x, y)
print("Final model trained successfully!")
# =============================================================================
# 5. FEATURE IMPORTANCE
# =============================================================================
# Rank features by the magnitude of their logistic-regression coefficients.
fitted_clf = final_model.named_steps['clf']
feature_importance = pd.DataFrame(
    {'feature': x.columns, 'coefficient': fitted_clf.coef_[0]}
)
feature_importance = feature_importance.sort_values(
    'coefficient', key=abs, ascending=False
)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10).to_string(index=False))

# Visualize the ten largest coefficients and save the figure to disk.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance.head(10), x='coefficient', y='feature')
plt.title('Top 10 Feature Importance (Logistic Regression Coefficients)')
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=150)
plt.show()
# =============================================================================
# 6. SAVE MODEL AND CONFIGURATION
# =============================================================================
print("\n" + "="*60)
print("SAVING MODEL AND CONFIGURATION")
print("="*60)

# Persist the fitted pipeline with joblib (preferred for sklearn estimators).
model_path = "student_dropout_model.pkl"
joblib.dump(final_model, model_path)
print(f"Model saved to: {model_path}")

# Build the JSON configuration. Every numpy scalar is cast to a builtin
# float/int before serialization: np.int64 is NOT JSON-serializable (raises
# TypeError), and np.float64 only serializes because it happens to subclass
# float — explicit casts keep json.dump robust across numpy versions.
config = {
    "model_name": "Student Dropout Prediction Model",
    "model_type": "LogisticRegression with StandardScaler",
    "target_mapping": {
        "0": "Dropout",
        "1": "Graduate"
    },
    "features": x.columns.tolist(),
    "num_features": len(x.columns),
    "dropped_columns": columns_to_drop,
    "feature_details": {},
    "model_performance": {
        "avg_roc_auc": round(float(np.mean(auc_roc_scores)), 4),
        "std_roc_auc": round(float(np.std(auc_roc_scores)), 4),
        "avg_accuracy": round(float(np.mean(acc_scores)), 4),
        "std_accuracy": round(float(np.std(acc_scores)), 4)
    },
    # Coefficients arrive as np.float64; force builtin types per record.
    "feature_importance": [
        {"feature": str(rec["feature"]), "coefficient": float(rec["coefficient"])}
        for rec in feature_importance.to_dict('records')
    ]
}

# Add per-feature metadata: dtype, value range, mean, and one example value.
for col in x.columns:
    series = x[col]
    # is_integer_dtype generalizes the original ['int64', 'int32'] string
    # check to any integer dtype (e.g. uint, nullable Int64).
    is_integer = pd.api.types.is_integer_dtype(series)
    config["feature_details"][col] = {
        "dtype": str(series.dtype),
        "min": float(series.min()),
        "max": float(series.max()),
        "mean": float(series.mean()),
        "example_value": int(series.iloc[0]) if is_integer else float(series.iloc[0])
    }

# Write the configuration next to the model artifact.
config_path = "model_config.json"
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)
print(f"Configuration saved to: {config_path}")
# =============================================================================
# 7. PRINT SUMMARY
# =============================================================================
print("\n" + "=" * 60)
print("SUMMARY: FEATURES YOUR CLASSMATE SELECTED")
print("=" * 60)

# Enumerate the retained features, then the dropped ones.
print(f"\nTotal features: {len(x.columns)}")
print("\nFeature list:")
for position, feature_name in enumerate(x.columns, 1):
    print(f" {position:2d}. {feature_name}")

print(f"\nDropped columns:")
for dropped in columns_to_drop:
    print(f" - {dropped}")

# Final recap of the artifacts written by this script.
print("\n" + "=" * 60)
print("DONE! Files created:")
print(f" 1. {model_path} (trained model)")
print(f" 2. {config_path} (feature configuration)")
print(f" 3. feature_importance.png (visualization)")
print("=" * 60)