"""Train the placement predictor and build the career routing engine.

Running this script reads the placement CSV, trains a stacking ensemble and a
standalone XGBoost model, builds a ``RoutingEngine`` from the career CSV and
dumps everything into a single joblib artifact file.
"""
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
import shap
import networkx as nx
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings('ignore')

# Imported after the warning filter is installed, as in the original layout.
from routing_engine import RoutingEngine


def main():
    """Train all models and persist them together with the encoders.

    Steps:
      1. Load ``collegePlace.csv`` and label-encode ``Gender`` and ``Stream``.
      2. Hold out 20% of the rows as a test set.
      3. Oversample the training partition with SMOTE.
      4. Fit a median-impute + standard-scale pipeline on the training data.
      5. Train a stacking ensemble (RF, GBC, XGBoost -> logistic regression)
         and report its accuracy on the held-out rows.
      6. Train a standalone XGBoost model (kept separately for SHAP).
      7. Build a ``RoutingEngine`` from ``Tech_Data_Cleaned.csv``.
      8. Dump every fitted object to ``placement_artifacts.pkl``.

    Returns:
        None. Results are printed and written to disk.
    """
    print("=" * 60)
    print(" Placement Predictor & Career Router Training")
    print("=" * 60)

    # Configuration
    PLACE_DATA = "collegePlace.csv"
    CAREER_DATA = "Tech_Data_Cleaned.csv"
    OUTPUT_PATH = "placement_artifacts.pkl"

    print("[*] Loading placement dataset...")
    df = pd.read_csv(PLACE_DATA)

    # Preprocessing: the fitted encoders are saved so the same mapping can be
    # reused outside this script.
    le_gender = LabelEncoder()
    le_stream = LabelEncoder()
    df['Gender'] = le_gender.fit_transform(df['Gender'])
    df['Stream'] = le_stream.fit_transform(df['Stream'])

    X = df.drop('PlacedOrNot', axis=1)
    y = df['PlacedOrNot']

    # Split BEFORE oversampling. SMOTE builds synthetic rows by interpolating
    # between neighbouring samples; resampling the full dataset first would
    # let test rows seed synthetic training rows (leakage) and would put
    # synthetic rows into the test set, inflating the reported accuracy.
    # Stratifying keeps the original class ratio in the held-out partition.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print("\n[*] Balancing dataset with SMOTE...")
    smote = SMOTE(random_state=42)
    # Only the training partition is balanced; the test rows stay untouched.
    X_train, y_train = smote.fit_resample(X_train, y_train)

    print("\n[*] Building Preprocessor Pipeline...")
    preprocessor = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    # Fit on training data only; the test set is merely transformed.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Shared by the ensemble member and the standalone model so that both
    # XGBoost classifiers are configured identically.
    xgb_params = {
        'use_label_encoder': False,
        'eval_metric': 'logloss',
        'max_depth': 4,
    }

    print("\n[*] Training StackingClassifier Ensemble...")
    estimators = [
        ('rf', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)),
        ('gbc', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)),
        ('xgb', xgb.XGBClassifier(**xgb_params)),
    ]
    stacking_model = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(),
        cv=5,
        stack_method='predict_proba',
    )
    stacking_model.fit(X_train_processed, y_train)

    # Accuracy is measured on real, non-resampled held-out rows.
    accuracy = stacking_model.score(X_test_processed, y_test)
    print(f" [OK] Ensemble Training complete! Accuracy: {accuracy:.4f}")

    print("\n[*] Training Standalone XGBoost for SHAP...")
    standalone_xgb = xgb.XGBClassifier(**xgb_params)
    standalone_xgb.fit(X_train_processed, y_train)

    print("\n[*] Initializing RoutingEngine from Career Data...")
    routing_engine = RoutingEngine(CAREER_DATA)
    print(f" [OK] Extracted {len(routing_engine.all_jobs)} jobs and {len(routing_engine.all_unique_skills)} skills.")

    print(f"\n[*] Saving artifacts to '{OUTPUT_PATH}'...")
    artifacts = {
        'preprocessor': preprocessor,
        'model': stacking_model,
        'shap_model': standalone_xgb,
        'le_gender': le_gender,
        'le_stream': le_stream,
        'routing_engine': routing_engine,
    }
    joblib.dump(artifacts, OUTPUT_PATH)
    print(" [OK] Successfully saved!")
    print("=" * 60)


if __name__ == "__main__":
    main()