File size: 3,517 Bytes
6b98981
 
 
 
 
 
 
 
a43ebba
 
 
 
 
6b98981
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a43ebba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b98981
a43ebba
 
 
 
6b98981
a43ebba
 
 
6b98981
 
 
 
 
 
 
a43ebba
 
 
6b98981
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
import shap
import networkx as nx
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

from routing_engine import RoutingEngine

def main():
    """Train the placement models and persist every inference artifact.

    Pipeline:
      1. Load the placement CSV and label-encode 'Gender' and 'Stream'.
      2. Hold out a stratified 20% test split of the *real* data.
      3. Fit the imputer/scaler pipeline on the training split only.
      4. Oversample the training split with SMOTE.
      5. Train the stacking ensemble and report accuracy on the held-out split.
      6. Train a standalone XGBoost model on the same training data
         (stored under 'shap_model').
      7. Build the RoutingEngine from the career CSV and dump everything
         to a single joblib file.

    The split happens before SMOTE so that no synthetic sample is
    interpolated from (or placed into) the test set; resampling first would
    leak test information into training and inflate the reported accuracy.
    """
    print("=" * 60)
    print("    Placement Predictor & Career Router Training")
    print("=" * 60)
    
    # Configuration
    PLACE_DATA = "collegePlace.csv"
    CAREER_DATA = "Tech_Data_Cleaned.csv"
    OUTPUT_PATH = "placement_artifacts.pkl"
    
    print("[*] Loading placement dataset...")
    df = pd.read_csv(PLACE_DATA)
    
    # Preprocessing: the fitted encoders are saved so inference can apply
    # the same category -> integer mapping.
    le_gender = LabelEncoder()
    le_stream = LabelEncoder()
    
    df['Gender'] = le_gender.fit_transform(df['Gender'])
    df['Stream'] = le_stream.fit_transform(df['Stream'])
    
    X = df.drop('PlacedOrNot', axis=1)
    y = df['PlacedOrNot']
    
    # Train-Test Split on the original rows. Stratify so the held-out set
    # keeps the real class ratio the model will face in practice.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    print("\n[*] Building Preprocessor Pipeline...")
    preprocessor = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Fit on the training split only; the test split is merely transformed.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    print("\n[*] Balancing dataset with SMOTE...")
    # SMOTE runs after imputation (it cannot handle NaN) and only on the
    # training data, so the test set contains no synthetic samples.
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(
        X_train_processed, y_train
    )

    print("\n[*] Training StackingClassifier Ensemble...")
    estimators = [
        ('rf', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)),
        ('gbc', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)),
        ('xgb', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', max_depth=4))
    ]

    stacking_model = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(),
        cv=5,
        stack_method='predict_proba'
    )
    stacking_model.fit(X_train_balanced, y_train_balanced)

    # Accuracy is measured on untouched, non-resampled data.
    accuracy = stacking_model.score(X_test_processed, y_test)
    print(f"    [OK] Ensemble Training complete! Accuracy: {accuracy:.4f}")
    
    print("\n[*] Training Standalone XGBoost for SHAP...")
    standalone_xgb = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', max_depth=4)
    standalone_xgb.fit(X_train_balanced, y_train_balanced)
    
    print("\n[*] Initializing RoutingEngine from Career Data...")
    routing_engine = RoutingEngine(CAREER_DATA)
    print(f"    [OK] Extracted {len(routing_engine.all_jobs)} jobs and {len(routing_engine.all_unique_skills)} skills.")

    print(f"\n[*] Saving artifacts to '{OUTPUT_PATH}'...")
    artifacts = {
        'preprocessor': preprocessor,
        'model': stacking_model,
        'shap_model': standalone_xgb,
        'le_gender': le_gender,
        'le_stream': le_stream,
        'routing_engine': routing_engine
    }
    joblib.dump(artifacts, OUTPUT_PATH)
    print("    [OK] Successfully saved!")
    print("=" * 60)

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()