# Spaces: Sleeping
# File size: 4,656 Bytes
# cb74654
import streamlit as st
import pandas as pd
import numpy as np
import os
import joblib
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
accuracy_score, mean_squared_error, r2_score,
classification_report, confusion_matrix
)
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings("ignore")
def _detect_problem_type(y, max_classes=20):
    """Return ``"classification"`` or ``"regression"`` for the target *y*.

    Args:
        y: Target column as a pandas Series.
        max_classes: Largest number of distinct integer values that is
            still treated as a set of class labels.

    Returns:
        ``"classification"`` when *y* is boolean or non-numeric (for example
        string labels), or when it has an integer dtype with at most
        *max_classes* distinct values; ``"regression"`` otherwise.
    """
    dtypes = pd.api.types
    # Booleans count as numeric in pandas, so they are checked explicitly;
    # non-numeric labels cannot be fed to a regressor at all.
    if dtypes.is_bool_dtype(y) or not dtypes.is_numeric_dtype(y):
        return "classification"
    if dtypes.is_integer_dtype(y) and y.nunique() <= max_classes:
        return "classification"
    return "regression"


def _build_models(problem_type, random_state=42):
    """Return a ``{display name: unfitted estimator}`` dict for *problem_type*.

    The tree-based estimators are seeded with *random_state* so that repeated
    executions on the same data give the same scores.
    """
    if problem_type == "classification":
        return {
            "Random Forest Classifier": RandomForestClassifier(random_state=random_state),
            "Decision Tree Classifier": DecisionTreeClassifier(random_state=random_state),
            "KNN Classifier": KNeighborsClassifier(),
        }
    return {
        "Random Forest Regressor": RandomForestRegressor(random_state=random_state),
        "Decision Tree Regressor": DecisionTreeRegressor(random_state=random_state),
        "KNN Regressor": KNeighborsRegressor(),
        "Linear Regression": LinearRegression(),
    }


def model_training():
    """Render the "Model Training & Evaluation" Streamlit page.

    Workflow:
        1. Let the user pick a CSV file from ``preprocessed_data/``.
        2. Let the user pick the target column; all other columns are features.
        3. Decide between classification and regression from the target.
        4. Hold out 20% of the rows as a test set (``random_state=42``).
        5. Fit every candidate model and score it on the test set
           (accuracy for classification, R² for regression, both times 100).
        6. Show the score table, the best model and its detailed metrics.
        7. On request, dump the best model to ``saved_models/`` with joblib
           and offer it as a download.

    Returns:
        None. All output goes to the Streamlit page; the only file written is
        the pickled model under ``saved_models/``.
    """
    st.title("Model Training & Evaluation")

    # Step 1: Load preprocessed data.
    data_dir = "preprocessed_data"
    # Only CSV files are offered: anything else in the folder would make
    # pd.read_csv fail or return garbage.
    files = (
        sorted(f for f in os.listdir(data_dir) if f.lower().endswith(".csv"))
        if os.path.isdir(data_dir)
        else []
    )
    if not files:
        st.warning("No preprocessed data found. Please preprocess a file first.")
        return

    selected_file = st.selectbox("Select a preprocessed CSV file", files)
    df = pd.read_csv(os.path.join(data_dir, selected_file))
    st.write("Data Preview:")
    st.dataframe(df.head())

    # A target plus at least one feature, and enough rows to split, are required.
    if df.shape[1] < 2 or len(df) < 2:
        st.error("The selected file needs at least two columns and two rows to train a model.")
        return

    # Step 2: Select target column.
    target_column = st.selectbox("Select Target Column", df.columns)
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Step 3: Detect problem type.
    problem_type = _detect_problem_type(y)
    if problem_type == "classification":
        st.info("Detected as Classification Problem")
    else:
        st.info("Detected as Regression Problem")

    # Step 4: Train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Step 5: Define models.
    models = _build_models(problem_type)

    # Step 6: Train and evaluate every candidate.
    score_fn = accuracy_score if problem_type == "classification" else r2_score
    results = []  # (name, score in percent, fitted model, test-set predictions)
    for name, model in models.items():
        try:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
        except (ValueError, TypeError) as exc:
            # e.g. non-numeric feature columns or missing values; report it
            # and keep going so the remaining models can still be compared.
            st.warning(f"Skipping {name}: {exc}")
            continue
        results.append((name, score_fn(y_test, y_pred) * 100, model, y_pred))

    if not results:
        st.error("None of the models could be trained on this data.")
        return

    # Step 7: Show model performance table. Only plain values go into the
    # frame; estimators and prediction arrays stay in `results`.
    results_df = pd.DataFrame(
        [(name, score) for name, score, _, _ in results],
        columns=["Model", "Score"],
    )
    st.write("Model Performance:")
    st.dataframe(results_df)

    # Step 8: Best model selection. `results_df` has a default RangeIndex, so
    # the label returned by idxmax is also the position inside `results`.
    best_idx = results_df["Score"].idxmax()
    best_name, best_score, best_model, best_y_pred = results[best_idx]
    st.success(f"Best Model: {best_name} with Score: {best_score:.2f}%")
    st.write("Best Model Details:", results_df.loc[best_idx])

    # Step 9: Detailed metrics for the best model.
    st.subheader("Detailed Metrics for Best Model")
    if problem_type == "classification":
        st.write("**Accuracy Score:**", accuracy_score(y_test, best_y_pred))
        # st.write renders Markdown; the report itself stays in st.text so its
        # column alignment is preserved.
        st.write("**Classification Report:**")
        st.text(classification_report(y_test, best_y_pred))

        # Confusion matrix.
        cm = confusion_matrix(y_test, best_y_pred)
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
        # Release the figure; otherwise pyplot keeps every figure created by
        # earlier executions of this page alive.
        plt.close(fig)
    else:
        mse = mean_squared_error(y_test, best_y_pred)
        st.write("**Mean Squared Error:**", mse)
        st.write("**Root Mean Squared Error:**", np.sqrt(mse))
        st.write("**R² Score:**", r2_score(y_test, best_y_pred))

    # Step 10: Save & download best model.
    if st.button("Save Best Model"):
        st.write("Saving the best model...")
        st.write(f"Model Name: {best_name}")
        os.makedirs("saved_models", exist_ok=True)
        model_path = os.path.join(
            "saved_models", f"{best_name.replace(' ', '_')}.pkl"
        )
        joblib.dump(best_model, model_path)
        # Read the bytes inside a context manager so the handle is closed.
        with open(model_path, "rb") as model_file:
            model_bytes = model_file.read()
        st.download_button(
            label="Download Model",
            data=model_bytes,
            file_name=os.path.basename(model_path),
        )
|