Spaces:
Sleeping
Sleeping
File size: 3,517 Bytes
6b98981 a43ebba 6b98981 a43ebba 6b98981 a43ebba 6b98981 a43ebba 6b98981 a43ebba 6b98981 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 | import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
import shap
import networkx as nx
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')
from routing_engine import RoutingEngine
def main():
print("=" * 60)
print(" Placement Predictor & Career Router Training")
print("=" * 60)
# Configuration
PLACE_DATA = "collegePlace.csv"
CAREER_DATA = "Tech_Data_Cleaned.csv"
OUTPUT_PATH = "placement_artifacts.pkl"
print("[*] Loading placement dataset...")
df = pd.read_csv(PLACE_DATA)
# Preprocessing
le_gender = LabelEncoder()
le_stream = LabelEncoder()
df['Gender'] = le_gender.fit_transform(df['Gender'])
df['Stream'] = le_stream.fit_transform(df['Stream'])
X = df.drop('PlacedOrNot', axis=1)
y = df['PlacedOrNot']
feature_names = list(X.columns)
print("\n[*] Balancing dataset with SMOTE...")
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
X_resampled, y_resampled, test_size=0.2, random_state=42
)
print("\n[*] Building Preprocessor Pipeline...")
preprocessor = Pipeline([
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print("\n[*] Training StackingClassifier Ensemble...")
estimators = [
('rf', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)),
('gbc', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)),
('xgb', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', max_depth=4))
]
stacking_model = StackingClassifier(
estimators=estimators,
final_estimator=LogisticRegression(),
cv=5,
stack_method='predict_proba'
)
stacking_model.fit(X_train_processed, y_train)
accuracy = stacking_model.score(X_test_processed, y_test)
print(f" [OK] Ensemble Training complete! Accuracy: {accuracy:.4f}")
print("\n[*] Training Standalone XGBoost for SHAP...")
standalone_xgb = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', max_depth=4)
standalone_xgb.fit(X_train_processed, y_train)
print("\n[*] Initializing RoutingEngine from Career Data...")
routing_engine = RoutingEngine(CAREER_DATA)
print(f" [OK] Extracted {len(routing_engine.all_jobs)} jobs and {len(routing_engine.all_unique_skills)} skills.")
print(f"\n[*] Saving artifacts to '{OUTPUT_PATH}'...")
artifacts = {
'preprocessor': preprocessor,
'model': stacking_model,
'shap_model': standalone_xgb,
'le_gender': le_gender,
'le_stream': le_stream,
'routing_engine': routing_engine
}
joblib.dump(artifacts, OUTPUT_PATH)
print(" [OK] Successfully saved!")
print("=" * 60)
if __name__ == "__main__":
main()
|