No_code_ML_builder / src /model_training.py
Shubham 10000
update to main file
cb74654
import streamlit as st
import pandas as pd
import numpy as np
import os
import joblib
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
accuracy_score, mean_squared_error, r2_score,
classification_report, confusion_matrix
)
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression
import warnings
# Suppress every Python warning for the whole process.
# NOTE(review): this blanket filter also hides warnings raised by the
# libraries imported above — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# Integer/boolean targets with at most this many distinct values are treated
# as class labels rather than continuous quantities.
_MAX_CLASSES = 20


def _detect_problem_type(y):
    """Return ``"classification"`` or ``"regression"`` for target series *y*.

    Non-numeric targets (e.g. string labels) are always classification.
    Integer or boolean targets are classification when they have at most
    ``_MAX_CLASSES`` distinct values. Everything else is regression.
    """
    if not pd.api.types.is_numeric_dtype(y):
        return "classification"
    is_discrete = (
        pd.api.types.is_integer_dtype(y) or pd.api.types.is_bool_dtype(y)
    )
    if is_discrete and y.nunique() <= _MAX_CLASSES:
        return "classification"
    return "regression"


def _candidate_models(problem_type):
    """Return a ``{display name: unfitted estimator}`` dict for *problem_type*."""
    if problem_type == "classification":
        return {
            "Random Forest Classifier": RandomForestClassifier(),
            "Decision Tree Classifier": DecisionTreeClassifier(),
            "KNN Classifier": KNeighborsClassifier(),
        }
    return {
        "Random Forest Regressor": RandomForestRegressor(),
        "Decision Tree Regressor": DecisionTreeRegressor(),
        "KNN Regressor": KNeighborsRegressor(),
        "Linear Regression": LinearRegression(),
    }


def model_training():
    """Render the Streamlit "Model Training & Evaluation" page.

    Workflow:
      1. Let the user pick a CSV from the ``preprocessed_data`` directory.
      2. Let the user pick the target column; all other columns are features.
      3. Detect whether the task is classification or regression.
      4. Split the data 80/20 (``random_state=42``) and fit every candidate
         model, scoring it with accuracy (classification) or R²
         (regression), both expressed as a percentage.
      5. Show the score table, the best model and its detailed metrics.
      6. Optionally save the best model under ``saved_models`` and offer it
         for download.

    Returns:
        None. All output is rendered through Streamlit; the function returns
        early when there is no data or nothing can be trained.
    """
    st.title("Model Training & Evaluation")

    # Step 1: Load preprocessed data
    files = os.listdir('preprocessed_data') if os.path.exists('preprocessed_data') else []
    if not files:
        st.warning("No preprocessed data found. Please preprocess a file first.")
        return

    selected_file = st.selectbox("Select a preprocessed CSV file", files)
    df = pd.read_csv(os.path.join('preprocessed_data', selected_file))
    st.write("Data Preview:")
    st.dataframe(df.head())

    # A target plus at least one feature, and enough rows to split, are
    # required; otherwise drop()/train_test_split() would raise below.
    if df.shape[1] < 2 or len(df) < 2:
        st.error("The dataset needs at least two columns and two rows to train a model.")
        return

    # Step 2: Select target column
    target_column = st.selectbox("Select Target Column", df.columns)
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Step 3: Detect problem type
    problem_type = _detect_problem_type(y)
    is_classification = problem_type == "classification"
    if is_classification:
        st.info("Detected as Classification Problem")
    else:
        st.info("Detected as Regression Problem")

    # Step 4: Train/Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Step 5: Define models
    models = _candidate_models(problem_type)
    score_fn = accuracy_score if is_classification else r2_score

    # Step 6: Train, Evaluate, and Display Metrics
    results = []
    for name, model in models.items():
        try:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
        except ValueError as exc:
            # One estimator rejecting the data (e.g. non-numeric features)
            # should not prevent the remaining models from being evaluated.
            st.warning(f"Skipping {name}: {exc}")
            continue
        score = score_fn(y_test, y_pred) * 100
        results.append((name, score, model, y_pred))

    if not results:
        st.error(
            "No model could be trained on this data. Check that all feature "
            "columns are numeric and free of missing values."
        )
        return

    # Step 7: Show model performance table
    results_df = pd.DataFrame(
        results, columns=["Model", "Score", "Trained_Model", "Predictions"]
    )
    st.write("Model Performance:")
    st.dataframe(results_df[["Model", "Score"]])

    # Step 8: Best Model Selection
    best_model_row = results_df.loc[results_df["Score"].idxmax()]
    best_name = best_model_row["Model"]
    best_model = best_model_row["Trained_Model"]
    st.success(f"Best Model: {best_name} with Score: {best_model_row['Score']:.4f}")
    st.write("Best Model Details:", best_model_row)

    # Step 9: Detailed Metrics for Best Model
    st.subheader("Detailed Metrics for Best Model")
    best_y_pred = best_model_row["Predictions"]
    if is_classification:
        st.write("**Accuracy Score:**", accuracy_score(y_test, best_y_pred))
        # st.write renders markdown; st.text would show the asterisks literally.
        st.write("**Classification Report:**")
        st.text(classification_report(y_test, best_y_pred))

        # Confusion Matrix
        cm = confusion_matrix(y_test, best_y_pred)
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
    else:
        # Report MSE and RMSE under their real names; both are in the
        # target's units (squared for MSE), not percentages.
        mse = mean_squared_error(y_test, best_y_pred)
        st.write("**Mean Squared Error:**", mse)
        st.write("**Root Mean Squared Error:**", np.sqrt(mse))
        st.write("**R² Score:**", r2_score(y_test, best_y_pred))

    # Step 10: Save & Download Best Model
    if st.button("Save Best Model"):
        st.write("Saving the best model...")
        st.write(f"Model Name: {best_name}")
        os.makedirs("saved_models", exist_ok=True)
        model_path = os.path.join(
            "saved_models", f"{best_name.replace(' ', '_')}.pkl"
        )
        joblib.dump(best_model, model_path)

        # Read the bytes inside a context manager so the handle is closed.
        with open(model_path, "rb") as model_file:
            model_bytes = model_file.read()
        st.download_button(
            label="Download Model",
            data=model_bytes,
            file_name=os.path.basename(model_path),
        )