PlacementPredictor / train_model.py
google-labs-jules[bot]
feat: placement predictor v2 upgrade
a43ebba
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
import shap
import networkx as nx
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')
from routing_engine import RoutingEngine
def main():
    """Train the placement classifier and persist all inference artifacts.

    Pipeline, in order:

    1. Load the placement CSV and label-encode the ``Gender`` and ``Stream``
       columns.
    2. Split into train/test sets *before* any resampling so the held-out
       set contains only real, untouched rows.
    3. Fit the impute + scale preprocessor on the training split only.
    4. Oversample the (preprocessed) training split with SMOTE.
    5. Fit a stacking ensemble and a standalone XGBoost model (the latter is
       stored under ``'shap_model'``).
    6. Build the ``RoutingEngine`` from the career CSV.
    7. Dump everything into a single joblib file.

    Reads ``collegePlace.csv`` and ``Tech_Data_Cleaned.csv`` from the current
    working directory and writes ``placement_artifacts.pkl`` there.

    Returns:
        None. Progress is reported on stdout.
    """
    print("=" * 60)
    print(" Placement Predictor & Career Router Training")
    print("=" * 60)

    # Configuration
    PLACE_DATA = "collegePlace.csv"
    CAREER_DATA = "Tech_Data_Cleaned.csv"
    OUTPUT_PATH = "placement_artifacts.pkl"

    print("[*] Loading placement dataset...")
    df = pd.read_csv(PLACE_DATA)

    # Preprocessing: the encoders are kept so inference can apply the exact
    # same category -> integer mapping.
    le_gender = LabelEncoder()
    le_stream = LabelEncoder()
    df['Gender'] = le_gender.fit_transform(df['Gender'])
    df['Stream'] = le_stream.fit_transform(df['Stream'])

    X = df.drop('PlacedOrNot', axis=1)
    y = df['PlacedOrNot']
    feature_names = list(X.columns)

    # Train-Test Split.
    # This must happen BEFORE SMOTE: oversampling the full dataset first lets
    # synthetic points interpolated from future test rows leak into training
    # and puts synthetic rows into the test set, inflating the reported
    # accuracy. Stratify so both splits keep the original class ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print("\n[*] Building Preprocessor Pipeline...")
    preprocessor = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    # Fit on the training split only; the test split is merely transformed.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    print("\n[*] Balancing dataset with SMOTE...")
    # SMOTE runs after imputation (it cannot handle NaN) and only on the
    # training split, so evaluation below is done on real samples.
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(
        X_train_processed, y_train
    )

    print("\n[*] Training StackingClassifier Ensemble...")
    estimators = [
        ('rf', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)),
        ('gbc', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)),
        # random_state added so the whole ensemble is seeded consistently.
        ('xgb', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                                  max_depth=4, random_state=42))
    ]
    stacking_model = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(),
        cv=5,
        stack_method='predict_proba'
    )
    stacking_model.fit(X_train_balanced, y_train_balanced)

    # Scored on the untouched, non-resampled hold-out split.
    accuracy = stacking_model.score(X_test_processed, y_test)
    print(f" [OK] Ensemble Training complete! Accuracy: {accuracy:.4f}")

    print("\n[*] Training Standalone XGBoost for SHAP...")
    standalone_xgb = xgb.XGBClassifier(
        use_label_encoder=False, eval_metric='logloss', max_depth=4, random_state=42
    )
    standalone_xgb.fit(X_train_balanced, y_train_balanced)

    print("\n[*] Initializing RoutingEngine from Career Data...")
    routing_engine = RoutingEngine(CAREER_DATA)
    print(f" [OK] Extracted {len(routing_engine.all_jobs)} jobs and {len(routing_engine.all_unique_skills)} skills.")

    print(f"\n[*] Saving artifacts to '{OUTPUT_PATH}'...")
    artifacts = {
        'preprocessor': preprocessor,
        'model': stacking_model,
        'shap_model': standalone_xgb,
        'le_gender': le_gender,
        'le_stream': le_stream,
        'routing_engine': routing_engine,
        # Column order the preprocessor/model were fitted with; consumers
        # can use it to assemble inference rows in the same order.
        'feature_names': feature_names,
    }
    joblib.dump(artifacts, OUTPUT_PATH)
    print(" [OK] Successfully saved!")
    print("=" * 60)
# Script entry point: run the training pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()