# Spaces: Sleeping  (page-status text captured with this listing; not part of the script)
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

from prepare_data import load_and_prepare, get_features_and_target
# ── 1. Load & Prepare ────────────────────────────────────────
# Read the raw CSV through the project's prepare_data helpers and separate
# the feature matrix from the target.
# NOTE(review): X is used below via .shape/.columns and y via .value_counts(),
# so both are presumably pandas objects — confirm against prepare_data.
print("📦 Loading data...")
df = load_and_prepare("../data/raw/your_dataset.csv")
X, y = get_features_and_target(df)
print(f"✅ Dataset shape: {X.shape}")
print(f"✅ Class distribution:\n{y.value_counts()}")
# ── 2. Train/Test Split ──────────────────────────────────────
# Split BEFORE any resampling so the held-out rows remain untouched, real
# observations. Oversampling first would let SMOTE interpolate between rows
# that later land on opposite sides of the split, leaking test information
# into training and inflating every score reported below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# ── 3. Handle Class Imbalance (training split only) ──────────
# (If one class has far more samples than others)
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_train, y_train)
print(f"✅ After SMOTE: {pd.Series(y_balanced).value_counts().to_dict()}")

# ── 4. Train XGBoost ─────────────────────────────────────────
print("\n🚀 Training XGBoost model...")
model = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    # NOTE(review): this flag is deprecated in recent XGBoost releases —
    # confirm against the pinned version before removing it.
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
# Fit on the resampled training data. eval_set only drives the per-round
# log output here (no early stopping is configured), so passing the test
# split does not influence the fitted model.
model.fit(
    X_balanced, y_balanced,
    eval_set=[(X_test, y_test)],
    verbose=50,
)

# ── 5. Evaluate ──────────────────────────────────────────────
# Scores are computed on the original (non-synthetic) held-out rows.
print("\n📊 Evaluation Results:")
y_pred = model.predict(X_test)
print(classification_report(
    y_test, y_pred,
    target_names=['LOW 🟢', 'MEDIUM 🟡', 'HIGH 🔴'],
))

# Cross-validation score
# Wrap SMOTE and the classifier in an imblearn Pipeline so resampling is
# re-fit inside each training fold only; validation folds stay real data.
# cross_val_score fits clones of the estimator, so the already-trained
# `model` above is left as-is.
cv_pipeline = Pipeline([
    ("smote", SMOTE(random_state=42)),
    ("model", model),
])
cv_scores = cross_val_score(cv_pipeline, X, y, cv=5)
print(f"✅ Cross-val Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")
# ── 6. Confusion Matrix ──────────────────────────────────────
# Render the held-out confusion matrix and write it next to the data folder.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=['LOW', 'MEDIUM', 'HIGH'])
disp.plot(cmap='Blues')
plt.title('CrowdRoute — Confusion Matrix')
plt.tight_layout()
plt.savefig("../data/confusion_matrix.png")
# Release the figure once it is on disk so it does not linger in pyplot's
# global state for the rest of the run.
plt.close(disp.figure_)
print("✅ Confusion matrix saved!")
# ── 7. Feature Importance ────────────────────────────────────
# Pair each importance score with its column name and list the most
# influential features first.
feat_importance = pd.Series(
    model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
print(f"\n🔍 Feature Importances:\n{feat_importance}")
# ── 8. Save Model ────────────────────────────────────────────
# Persist the fitted model together with the training column order.
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails here, after
# the whole training run has already completed.
Path("saved_models").mkdir(parents=True, exist_ok=True)
joblib.dump(model, "saved_models/crowd_model.joblib")
joblib.dump(list(X.columns), "saved_models/feature_columns.joblib")
print("\n✅ Model saved to saved_models/crowd_model.joblib")