import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import streamlit as st
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

warnings.filterwarnings("ignore")

# Directory the page reads preprocessed CSV files from.
_PREPROCESSED_DIR = "preprocessed_data"
# Directory the best model is pickled into before being offered for download.
_SAVED_MODELS_DIR = "saved_models"
# An integer target with at most this many distinct values is treated as labels.
_MAX_CLASSES = 20


def _list_preprocessed_files(directory):
    """Return the sorted names of regular files directly inside *directory*.

    Returns an empty list when *directory* does not exist or is not a
    directory. Sub-directories are skipped because ``pd.read_csv`` cannot
    open them.
    """
    if not os.path.isdir(directory):
        return []
    return sorted(
        entry
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    )


def _detect_problem_type(y, max_classes=_MAX_CLASSES):
    """Return ``"classification"`` or ``"regression"`` for the target series *y*.

    Non-numeric and boolean targets are always classification (regressors
    cannot be fitted on string labels). Integer targets of any width are
    classification when they have at most *max_classes* distinct values.
    Everything else is regression.
    """
    if pd.api.types.is_bool_dtype(y) or not pd.api.types.is_numeric_dtype(y):
        return "classification"
    if pd.api.types.is_integer_dtype(y) and y.nunique() <= max_classes:
        return "classification"
    return "regression"


def _build_models(problem_type):
    """Return a ``{display name: unfitted estimator}`` dict for *problem_type*."""
    if problem_type == "classification":
        return {
            "Random Forest Classifier": RandomForestClassifier(),
            "Decision Tree Classifier": DecisionTreeClassifier(),
            "KNN Classifier": KNeighborsClassifier(),
        }
    return {
        "Random Forest Regressor": RandomForestRegressor(),
        "Decision Tree Regressor": DecisionTreeRegressor(),
        "KNN Regressor": KNeighborsRegressor(),
        "Linear Regression": LinearRegression(),
    }


def _train_and_score(models, X_train, X_test, y_train, y_test, problem_type):
    """Fit every model and return ``(name, score, fitted_model, y_pred)`` tuples.

    The score is accuracy * 100 for classification and R² * 100 for
    regression, both computed on the test split.
    """
    metric = accuracy_score if problem_type == "classification" else r2_score
    results = []
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results.append((name, metric(y_test, y_pred) * 100, model, y_pred))
    return results


def _show_best_model_metrics(problem_type, y_test, best_y_pred):
    """Render the detailed test-set metrics of the best model."""
    st.subheader("Detailed Metrics for Best Model")
    if problem_type == "classification":
        st.write("**Accuracy Score:**", accuracy_score(y_test, best_y_pred))
        # st.text does not render Markdown, so the bold header needs st.markdown.
        st.markdown("**Classification Report:**")
        st.text(classification_report(y_test, best_y_pred))

        # Confusion Matrix
        cm = confusion_matrix(y_test, best_y_pred)
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
        # Streamlit re-executes the script on every interaction; close the
        # figure so pyplot does not keep one alive per rerun.
        plt.close(fig)
    else:
        # MSE and RMSE are reported in the target's own units; neither is a
        # percentage, so they are shown unscaled and labelled separately.
        mse = mean_squared_error(y_test, best_y_pred)
        st.write("**Mean Squared Error:**", mse)
        st.write("**Root Mean Squared Error:**", np.sqrt(mse))
        st.write("**R² Score:**", r2_score(y_test, best_y_pred))


def _save_and_offer_download(model_name, model):
    """Pickle *model* under the saved-models directory and show a download button."""
    st.write("Saving the best model...")
    st.write(f"Model Name: {model_name}")
    os.makedirs(_SAVED_MODELS_DIR, exist_ok=True)
    model_path = os.path.join(
        _SAVED_MODELS_DIR, f"{model_name.replace(' ', '_')}.pkl"
    )
    joblib.dump(model, model_path)
    # Read the bytes inside a context manager so the handle is closed at once.
    with open(model_path, "rb") as model_file:
        model_bytes = model_file.read()
    st.download_button(
        label="Download Model",
        data=model_bytes,
        file_name=os.path.basename(model_path),
    )


def model_training() -> None:
    """Render the "Model Training & Evaluation" Streamlit page.

    Lets the user pick a file from ``preprocessed_data`` and a target column,
    decides between classification and regression from the target, trains a
    fixed set of scikit-learn models on an 80/20 split, shows their scores,
    shows detailed metrics for the best one, and offers to save and download
    it. Returns early with a warning when there is no input file or no
    feature column to train on.
    """
    st.title("Model Training & Evaluation")

    # Step 1: Load preprocessed data
    files = _list_preprocessed_files(_PREPROCESSED_DIR)
    if not files:
        st.warning("No preprocessed data found. Please preprocess a file first.")
        return

    selected_file = st.selectbox("Select a preprocessed CSV file", files)
    df = pd.read_csv(os.path.join(_PREPROCESSED_DIR, selected_file))
    st.write("Data Preview:")
    st.dataframe(df.head())

    # Step 2: Select target column
    target_column = st.selectbox("Select Target Column", df.columns)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    if X.shape[1] == 0:
        # Every estimator would fail on an empty feature matrix.
        st.warning(
            "The selected file needs at least one feature column besides the target."
        )
        return

    # Step 3: Detect problem type
    problem_type = _detect_problem_type(y)
    if problem_type == "classification":
        st.info("Detected as Classification Problem")
    else:
        st.info("Detected as Regression Problem")

    # Step 4: Train/Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Steps 5-6: Define models, then train and evaluate each of them
    results = _train_and_score(
        _build_models(problem_type), X_train, X_test, y_train, y_test, problem_type
    )

    # Step 7: Show model performance table (names and scores only; the fitted
    # estimators and prediction arrays stay in `results`).
    scores_df = pd.DataFrame(
        [(name, score) for name, score, _, _ in results],
        columns=["Model", "Score"],
    )
    st.write("Model Performance:")
    st.dataframe(scores_df)

    # Step 8: Best Model Selection
    if scores_df["Score"].isna().all():
        # idxmax has no valid answer when every score is NaN.
        st.error("Could not compute a valid score for any model.")
        return
    best_idx = int(scores_df["Score"].idxmax())
    best_name, best_score, best_model, best_y_pred = results[best_idx]
    st.success(f"Best Model: {best_name} with Score: {best_score:.4f}")
    # SHOW NAME OF BEST MODEL
    st.write("Best Model Details:", scores_df.loc[best_idx])

    # Step 9: Detailed Metrics for Best Model
    _show_best_model_metrics(problem_type, y_test, best_y_pred)

    # Step 10: Save & Download Best Model
    if st.button("Save Best Model"):
        _save_and_offer_download(best_name, best_model)