# Benchmark-Kit-26 / src/benchmarking.py
# Author: dwmk — "Update src/benchmarking.py" (commit bca40b9, verified)
# benchmarking.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
def run_benchmarking():
    """Render the model-benchmarking tab of the Streamlit app.

    Trains the user-selected classifiers on the preprocessed dataframe held
    in session state and displays per-model metrics, confusion matrices,
    classification reports, a leaderboard, and a comparison chart.

    Reads from st.session_state: processed_df, target_col, feature_cols
    (all expected to be set by the EDA tab). Renders widgets only; returns
    None early when the target/feature selection is missing.
    """
    st.header("βš–οΈ Professional Model Benchmarking")

    df = st.session_state.processed_df
    target_col = st.session_state.target_col
    feature_cols = st.session_state.feature_cols

    # Validation: both a target and at least one feature must be chosen first.
    if not target_col or target_col == "None":
        st.error("⚠️ Please select a Target variable in the EDA tab.")
        return
    if not feature_cols:
        st.error("⚠️ Please select Feature variables in the EDA tab.")
        return

    # Data preparation
    X = df[feature_cols]
    y = df[target_col]

    # Encode target labels to integers; keep readable names for the plots.
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    class_names = [str(c) for c in le.classes_]

    # Train/test split. Stratify so every class keeps its proportion in the
    # test set (otherwise rare classes can vanish from evaluation); stratify
    # is only valid when every class has at least 2 members.
    test_size = st.slider("Test Split Size", 0.1, 0.5, 0.2)
    stratify_on = y_encoded if pd.Series(y_encoded).value_counts().min() >= 2 else None
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=test_size, random_state=42, stratify=stratify_on
    )

    # ---------------- Model Configuration ----------------
    # Fixed random_state so repeated benchmark runs produce comparable scores.
    available_models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "SVM": SVC(probability=True, random_state=42),
    }
    selected_models = st.multiselect(
        "Select Models to Benchmark",
        list(available_models.keys()),
        default=["Random Forest"],
    )

    if st.button("πŸš€ Run Benchmark"):
        # Guard: with no selection the leaderboard sort below would raise
        # KeyError on the missing "F1 Score (Weighted)" column.
        if not selected_models:
            st.error("⚠️ Please select at least one model to benchmark.")
            return

        results_list = []

        # Preprocessing pipeline: numeric -> scale, categorical -> one-hot.
        num_cols = X.select_dtypes(include=np.number).columns
        cat_cols = X.select_dtypes(exclude=np.number).columns
        transformers = []
        if len(num_cols) > 0:
            transformers.append(('num', StandardScaler(), num_cols))
        if len(cat_cols) > 0:
            transformers.append(('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols))
        preprocessor = ColumnTransformer(transformers=transformers)

        st.markdown("### πŸ† Results")
        for name in selected_models:
            with st.status(f"Training {name}...", expanded=True) as status:
                # Fresh pipeline per model so preprocessing is re-fit each run.
                clf = available_models[name]
                model_pipeline = Pipeline([
                    ('preprocessor', preprocessor),
                    ('classifier', clf)
                ])

                # Train and predict
                model_pipeline.fit(X_train, y_train)
                y_pred = model_pipeline.predict(X_test)

                # Metrics — weighted F1 accounts for class imbalance in
                # multiclass targets.
                acc = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred, average="weighted")
                results_list.append({
                    "Model": name,
                    "Accuracy": acc,
                    "F1 Score (Weighted)": f1
                })
                status.write(f"Accuracy: {acc:.4f}")
                status.update(label=f"{name} Finished", state="complete")

            # Detailed analysis per model (expander)
            with st.expander(f"πŸ” Details: {name}"):
                c1, c2 = st.columns(2)

                # Confusion matrix heatmap
                cm = confusion_matrix(y_test, y_pred)
                fig_cm = px.imshow(cm, text_auto=True,
                                   x=class_names, y=class_names,
                                   labels=dict(x="Predicted", y="Actual"),
                                   title=f"Confusion Matrix ({name})",
                                   color_continuous_scale="Blues")
                c1.plotly_chart(fig_cm, use_container_width=True)

                # Per-class precision/recall/F1 table
                report = classification_report(y_test, y_pred, target_names=class_names, output_dict=True)
                df_report = pd.DataFrame(report).transpose()
                c2.dataframe(df_report.style.background_gradient(cmap="Greens", subset=["f1-score"]))

        # Summary leaderboard, best weighted-F1 first.
        st.subheader("🏁 Leaderboard")
        res_df = pd.DataFrame(results_list).sort_values(by="F1 Score (Weighted)", ascending=False)
        st.dataframe(res_df.style.highlight_max(axis=0, color="lightgreen"), use_container_width=True)

        # Comparison chart across models.
        fig_bench = px.bar(res_df, x="Model", y="F1 Score (Weighted)",
                           color="Accuracy", title="Model Performance Comparison",
                           range_y=[0, 1])
        st.plotly_chart(fig_bench, use_container_width=True)