# Spaces: Sleeping  (page-status text captured along with the code; not part of the program)
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import StandardScaler, LabelEncoder | |
| from sklearn.impute import SimpleImputer | |
| from sklearn.ensemble import ( | |
| RandomForestClassifier, AdaBoostClassifier, | |
| StackingClassifier, VotingClassifier | |
| ) | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.svm import SVC | |
| from xgboost import XGBClassifier | |
| from sklearn.metrics import ( | |
| accuracy_score, roc_auc_score, | |
| confusion_matrix, classification_report | |
| ) | |
| from imblearn.over_sampling import SMOTE | |
class HRTurnoverPredictor:
    """Prepare an HR dataset and benchmark several turnover classifiers.

    Constructing an instance runs the full data-preparation pipeline on
    ``data``: label-encode the categorical columns, make a stratified
    80/20 train/test split, median-impute, standardise, and oversample the
    training portion with SMOTE.  Call :meth:`train_models` afterwards to
    fit every model and collect test-set metrics.
    """

    # Columns passed to the models unchanged (apart from imputation/scaling).
    NUMERIC_COLUMNS = (
        'Salary', 'PerfScoreID', 'EngagementSurvey', 'EmpSatisfaction',
        'SpecialProjectsCount', 'DaysLateLast30', 'Absences',
    )
    # Columns that are label-encoded before modelling.
    CATEGORICAL_COLUMNS = (
        'MaritalDesc', 'CitizenDesc', 'RaceDesc',
        'Department', 'Position', 'RecruitmentSource',
    )
    # Binary target column.
    TARGET_COLUMN = 'Termd'

    def __init__(self, data):
        """Build train/test sets and the model zoo from ``data``.

        Args:
            data: DataFrame holding the feature columns listed in
                ``NUMERIC_COLUMNS`` / ``CATEGORICAL_COLUMNS`` and the
                ``Termd`` target.

        Raises:
            KeyError: if any required column is absent from ``data``.
        """
        self.df = data

        # Encode first but postpone imputation: the imputer, like the scaler
        # and SMOTE below, must only learn from the training rows so that no
        # test-set statistics leak into training.
        encoded, self.y = self.preprocess_data(impute=False)
        raw_train, raw_test, self.y_train, self.y_test = train_test_split(
            encoded, self.y, test_size=0.2, random_state=42, stratify=self.y
        )

        self.imputer = SimpleImputer(strategy='median')
        self.X_train = self._to_frame(self.imputer.fit_transform(raw_train), raw_train)
        self.X_test = self._to_frame(self.imputer.transform(raw_test), raw_test)
        # Keep ``self.X`` as a fully imputed frame, as before.
        self.X = self._to_frame(self.imputer.transform(encoded), encoded)

        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        # Oversample the training split only; the test split stays untouched.
        self.smote = SMOTE(random_state=42)
        self.X_train_resampled, self.y_train_resampled = self.smote.fit_resample(
            self.X_train_scaled, self.y_train
        )

        self.models = self.initialize_models()
        self.results = None

    @staticmethod
    def _to_frame(values, like):
        """Wrap ``values`` in a DataFrame that reuses ``like``'s columns and index."""
        return pd.DataFrame(values, columns=like.columns, index=like.index)

    def preprocess_data(self, *, impute=True):
        """Return the encoded feature matrix and integer target.

        Args:
            impute: when true (the default), fill missing values with the
                column median computed over the whole dataset.  Pass
                ``False`` to get the encoded but un-imputed features, e.g.
                to impute after a train/test split.

        Returns:
            Tuple ``(X, y)``: ``X`` is a DataFrame of the numeric columns
            followed by the label-encoded categorical columns; ``y`` is the
            ``Termd`` column cast to ``int``.

        Raises:
            KeyError: if any required column is absent; the message lists
                every missing column.
        """
        feature_names = [*self.NUMERIC_COLUMNS, *self.CATEGORICAL_COLUMNS]
        missing = [
            col for col in [*feature_names, self.TARGET_COLUMN]
            if col not in self.df.columns
        ]
        if missing:
            raise KeyError(
                f"Dataset is missing required column(s): {', '.join(missing)}"
            )

        df = self.df.copy()
        for col in self.CATEGORICAL_COLUMNS:
            # astype(str) turns missing values into the literal category 'nan'.
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

        X = df[feature_names]
        y = df[self.TARGET_COLUMN].astype(int)
        if impute:
            imputer = SimpleImputer(strategy='median')
            X = self._to_frame(imputer.fit_transform(X), X)
        return X, y

    def initialize_models(self):
        """Create the unfitted models, keyed by display name.

        Returns:
            Dict with six individual classifiers, a ``'Stacking'`` ensemble
            (logistic regression, random forest, XGBoost under a logistic
            regression meta-learner, 5-fold CV) and a soft ``'Voting'``
            ensemble over the six individual classifiers.
        """
        base_models = {
            'Logistic Regression': LogisticRegression(random_state=42),
            'Decision Tree': DecisionTreeClassifier(random_state=42),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'XGBoost': XGBClassifier(n_estimators=100, random_state=42),
            'AdaBoost': AdaBoostClassifier(random_state=42),
            'SVM': SVC(probability=True, random_state=42),
        }

        # Snapshot the individual models before any ensemble is added, so
        # the voting ensemble does not depend on dict insertion order.
        voting_members = list(base_models.items())

        stacking_members = [
            ('lr', LogisticRegression(random_state=42)),
            ('rf', RandomForestClassifier(random_state=42)),
            ('xgb', XGBClassifier(random_state=42)),
        ]
        base_models['Stacking'] = StackingClassifier(
            estimators=stacking_members,
            final_estimator=LogisticRegression(),
            cv=5,
        )
        base_models['Voting'] = VotingClassifier(
            estimators=voting_members,
            voting='soft',
        )
        return base_models

    def train_models(self):
        """Fit every model and evaluate it on the held-out test split.

        Models are fitted on the SMOTE-resampled, scaled training data and
        scored on the scaled test data.

        Returns:
            Dict mapping model name to a dict with keys ``'Accuracy'``,
            ``'ROC AUC'``, ``'Confusion Matrix'``,
            ``'Classification Report'`` (as a dict) and
            ``'Predicted Probabilities'`` (positive-class column of
            ``predict_proba``).  The same dict is stored on ``self.results``.
        """
        results = {}
        for name, model in self.models.items():
            model.fit(self.X_train_resampled, self.y_train_resampled)
            y_pred = model.predict(self.X_test_scaled)
            y_pred_proba = model.predict_proba(self.X_test_scaled)[:, 1]
            results[name] = {
                'Accuracy': accuracy_score(self.y_test, y_pred),
                'ROC AUC': roc_auc_score(self.y_test, y_pred_proba),
                'Confusion Matrix': confusion_matrix(self.y_test, y_pred),
                'Classification Report': classification_report(
                    self.y_test, y_pred, output_dict=True
                ),
                'Predicted Probabilities': y_pred_proba,
            }
        self.results = results
        return results

    def get_feature_importance(self):
        """Return the random forest's feature importances, highest first.

        Returns:
            DataFrame with ``Feature`` and ``Importance`` columns sorted by
            descending importance, or ``None`` when the ``'Random Forest'``
            model exposes no ``feature_importances_`` attribute.
        """
        rf_model = self.models['Random Forest']
        if not hasattr(rf_model, "feature_importances_"):
            return None
        return pd.DataFrame({
            "Feature": self.X.columns,
            "Importance": rf_model.feature_importances_,
        }).sort_values(by="Importance", ascending=False)
def _show_figure(fig):
    """Render ``fig`` in the Streamlit app, then release it."""
    fig.tight_layout()
    st.pyplot(fig)
    # Streamlit re-runs the script on every interaction; figures that are
    # not closed accumulate in matplotlib's global registry.
    plt.close(fig)


def main():
    """Run the Streamlit dashboard for employee-turnover prediction.

    Waits for a CSV upload in the sidebar, trains every model on it and
    shows four tabs: performance comparison, confusion matrices, random
    forest feature importance, and per-model classification reports.
    Problems reading the file or building the models are reported in the
    page instead of surfacing as a traceback.
    """
    st.set_page_config(page_title="HR Turnover Prediction", layout="wide")
    st.title("🏢 Employee Turnover Prediction Dashboard")

    # Sidebar for file upload
    st.sidebar.header("Upload HR Dataset")
    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    try:
        df = pd.read_csv(uploaded_file)
    except ValueError as exc:
        # pandas' empty-file and parser errors, and decoding errors, are
        # all ValueError subclasses.
        st.sidebar.error(f"Could not read the uploaded file: {exc}")
        return
    st.sidebar.success("File successfully uploaded!")

    # Build the predictor and train up front so a bad dataset (missing
    # columns, too few samples per class, ...) yields a readable message.
    try:
        predictor = HRTurnoverPredictor(df)
        results = predictor.train_models()
    except (KeyError, ValueError) as exc:
        st.error(f"Could not build the turnover models from this file: {exc}")
        return

    # Tabs for different analyses
    tab1, tab2, tab3, tab4 = st.tabs([
        "Model Performance",
        "Confusion Matrices",
        "Feature Importance",
        "Model Insights"
    ])

    with tab1:
        st.header("Model Performance Comparison")
        perf_df = pd.DataFrame([
            {
                'Model': model_name,
                'Accuracy': metrics['Accuracy'],
                'ROC AUC': metrics['ROC AUC']
            }
            for model_name, metrics in results.items()
        ]).sort_values('ROC AUC', ascending=False)
        st.dataframe(perf_df)

        fig, ax = plt.subplots(figsize=(10, 6))
        perf_df.plot(x='Model', y=['Accuracy', 'ROC AUC'], kind='bar', ax=ax)
        ax.set_title("Model Performance Comparison")
        ax.set_xlabel("Model")
        ax.set_ylabel("Score")
        _show_figure(fig)

    with tab2:
        st.header("Confusion Matrices")
        # Size the grid from the number of models rather than assuming it
        # fits in 3x3 (ceil division without importing math).
        n_cols = 3
        n_rows = max(1, -(-len(results) // n_cols))
        fig, axes = plt.subplots(
            n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False
        )
        axes = axes.ravel()
        for ax, (model_name, metrics) in zip(axes, results.items()):
            sns.heatmap(
                metrics['Confusion Matrix'],
                annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax,
            )
            ax.set_title(f"{model_name} Confusion Matrix")
            ax.set_xlabel("Predicted Label")
            ax.set_ylabel("True Label")
        # Hide grid cells that have no model to show.
        for ax in axes[len(results):]:
            ax.set_visible(False)
        _show_figure(fig)

    with tab3:
        st.header("Feature Importance")
        feature_importance = predictor.get_feature_importance()
        if feature_importance is not None:
            st.dataframe(feature_importance)

            fig, ax = plt.subplots(figsize=(10, 6))
            feature_importance.plot(x='Feature', y='Importance', kind='bar', ax=ax)
            ax.set_title("Random Forest Feature Importance")
            ax.set_xlabel("Features")
            ax.set_ylabel("Importance")
            _show_figure(fig)

    with tab4:
        st.header("Model Insights")
        # Display detailed classification reports
        for model_name, metrics in results.items():
            st.subheader(f"{model_name} Classification Report")
            report_df = pd.DataFrame(metrics['Classification Report']).transpose()
            st.dataframe(report_df)
# Script entry point: start the dashboard only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()