Spaces:

saherPervaiz
/

ModelTrain

Running

App Files Community

saherPervaiz committed on Jan 13, 2025

Commit

5775758

verified ·

1 Parent(s): 9aa6cc4

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -208

app.py CHANGED Viewed

@@ -2,13 +2,13 @@ import streamlit as st
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.linear_model import LogisticRegression
-from sklearn.svm import SVC
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.tree import DecisionTreeClassifier
 from sklearn.naive_bayes import GaussianNB
-from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
@@ -24,7 +24,7 @@ if uploaded_file is not None:
     st.write("Dataset:")
     st.dataframe(df)
-        # Convert categorical (str) data to numerical
     st.write("Converting Categorical Columns to Numerical Values:")
     label_encoder = LabelEncoder()
@@ -37,209 +37,125 @@ if uploaded_file is not None:
     st.write("Dataset After Conversion:")
     st.dataframe(df)
-    # Model Training Section
-    st.subheader("Model Training")
-    if df.empty:
-        st.warning("The dataset is empty. Please upload a valid CSV file.")
-    else:
-        # Handle Null Values (Missing Data)
-        st.write("Handling Missing (Null) Values:")
-        # Option to drop rows with null values or fill them
-        fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
-        if fill_method == "Drop rows":
-            df = df.dropna()
-        elif fill_method == "Fill with mean/median":
-            for col in df.columns:
-                if df[col].dtype in ['float64', 'int64']:
-                    df[col].fillna(df[col].mean(), inplace=True)  # For numeric columns, fill with mean
-                else:
-                    df[col].fillna(df[col].mode()[0], inplace=True)  # For categorical columns, fill with mode
-        # Handle Outliers using IQR method
-        st.write("Handling Outliers:")
-        # Define function to remove outliers using IQR
-        def remove_outliers_iqr(dataframe):
-            Q1 = dataframe.quantile(0.25)
-            Q3 = dataframe.quantile(0.75)
-            IQR = Q3 - Q1
-            # Filter out rows that are outside the IQR range
-            return dataframe[~((dataframe < (Q1 - 1.5 * IQR)) | (dataframe > (Q3 + 1.5 * IQR))).any(axis=1)]
-        # Remove outliers from the numerical columns
-        df = remove_outliers_iqr(df)
-        # Handle Extreme Values by Capping (Winsorization)
-        st.write("Handling Extreme Values (Capping):")
-        def cap_extreme_values(dataframe):
-            for col in dataframe.select_dtypes(include=[np.number]).columns:
-                # Define the thresholds for extreme values (95th percentile and 5th percentile)
-                lower_limit = dataframe[col].quantile(0.05)
-                upper_limit = dataframe[col].quantile(0.95)
-                # Cap the extreme values
-                dataframe[col] = np.clip(dataframe[col], lower_limit, upper_limit)
-            return dataframe
-        df = cap_extreme_values(df)
-        # Show cleaned dataset
-        st.write("Cleaned Dataset:")
-        st.dataframe(df)
-        # Add clean data download option
-        st.subheader("Download Cleaned Dataset")
-        # Provide a download button for the cleaned dataset
         st.download_button(
-        label="Download Cleaned Dataset (CSV)",
-        data=df.to_csv(index=False),  # Convert the cleaned dataset to CSV
-        file_name="cleaned_dataset.csv",  # Specify the file name
-        mime="text/csv"  # Specify the MIME type for CSV
         )
-        target = st.selectbox("Select Target Variable", df.columns)
-        features = [col for col in df.columns if col != target]
-        X = df[features]
-        y = df[target]
-        # Label Encoding for categorical columns
-        label_encoder = LabelEncoder()
-        # Encode the target variable (if it's categorical)
-        if y.dtype == 'object' or len(y.unique()) <= 10:  # If the target variable is categorical
-            y = label_encoder.fit_transform(y)
-        # Encode categorical feature columns (if any)
-        for col in X.columns:
-            if X[col].dtype == 'object' or len(X[col].unique()) <= 10:  # If the column is categorical
-                X[col] = label_encoder.fit_transform(X[col])
-        # Ensure there is enough data before proceeding with train-test split
-        if len(X) == 0 or len(y) == 0:
-            st.warning("Insufficient data. Please ensure there are valid feature and target columns.")
-        else:
-            # Split the data into training and test sets with customizable training size
-            train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
-            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
-            # List of classifiers to evaluate
-            classifiers = {
-                'Logistic Regression': LogisticRegression(max_iter=5000, solver='saga', penalty='l1'),
-                'Decision Tree': DecisionTreeClassifier(),
-                'Random Forest': RandomForestClassifier(),
-                'Support Vector Machine (SVM)': SVC(),
-                'K-Nearest Neighbors (k-NN)': KNeighborsClassifier(),
-                'Naive Bayes': GaussianNB()
-            }
-            # Initialize results storage
-            metrics = []
-            # Train and evaluate each model
-            for name, classifier in classifiers.items():
-                # Train the model
-                classifier.fit(X_train, y_train)
-                # Make predictions
-                y_pred = classifier.predict(X_test)
-                # Evaluate metrics
-                accuracy = accuracy_score(y_test, y_pred)
-                precision = precision_score(y_test, y_pred, zero_division=1, average='macro')
-                recall = recall_score(y_test, y_pred, zero_division=1, average='macro')
-                f1 = f1_score(y_test, y_pred, zero_division=1, average='macro')
-                metrics.append({
-                    'Model': name,
-                    'Accuracy': round(accuracy, 2),
-                    'Precision': round(precision, 2),
-                    'Recall': round(recall, 2),
-                    'F1-Score': round(f1, 2)
-                })
-            # Create a metrics DataFrame
-            metrics_df = pd.DataFrame(metrics)
-            # Display results in a table using st.dataframe
-            st.subheader("Model Performance Metrics")
-            st.dataframe(metrics_df)
-            # Download options
-            st.subheader("Download Model Performance Report in Different Formats")
-            # CSV
-            st.download_button(
-                label="Download as CSV",
-                data=metrics_df.to_csv(index=False),
-                file_name="model_report.csv",
-                mime="text/csv"
-            )
-            # JSON
-            st.download_button(
-                label="Download as JSON",
-                data=metrics_df.to_json(orient='records'),
-                file_name="model_report.json",
-                mime="application/json"
-            )
-            # PDF (using `fpdf` library)
-            from fpdf import FPDF
-            def generate_pdf(df):
-                pdf = FPDF()
-                pdf.add_page()
-                pdf.set_font("Arial", size=12)
-                pdf.cell(200, 10, txt="Model Performance Report", ln=True, align="C")
-                pdf.ln(10)
-                # Add table header
-                pdf.set_font("Arial", style='B', size=10)
-                for header in df.columns:
-                    pdf.cell(40, 10, header, border=1)
-                pdf.ln()
-                # Add table rows
-                pdf.set_font("Arial", size=10)
-                for row in df.values:
-                    for value in row:
-                        pdf.cell(40, 10, str(value), border=1)
-                    pdf.ln()
-                return pdf.output(dest='S').encode('latin1')
-            # PDF download
-            st.download_button(
-                label="Download as PDF",
-                data=generate_pdf(metrics_df),
-                file_name="model_report.pdf",
-                mime="application/pdf"
-            )
-            # Generate and download PNG report
-            st.subheader("Download Report as PNG")
-            # Create table plot using matplotlib
-            fig, ax = plt.subplots(figsize=(12, 4))  # Adjust the figure size to match the table's layout
-            ax.axis('tight')
-            ax.axis('off')
-            table_data = metrics_df.values
-            table_columns = metrics_df.columns.tolist()
-            table = ax.table(cellText=table_data, colLabels=table_columns, loc='center', cellLoc='center', colLoc='center')
-            table.auto_set_font_size(False)
-            table.set_fontsize(10)
-            table.scale(1.2, 1.2)  # Adjust the scale for better appearance
-            # Save the table as a PNG file
-            png_file = "model_report.png"
-            fig.savefig(png_file, bbox_inches='tight', dpi=300)
-            # Provide a download button for the PNG file
-            with open(png_file, "rb") as file:
-                st.download_button(
-                    label="Download as PNG",
-                    data=file,
-                    file_name="model_report.png",
-                    mime="image/png"
-                )

 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+from sklearn.linear_model import LogisticRegression, LinearRegression
+from sklearn.svm import SVC, SVR
+from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.naive_bayes import GaussianNB
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, r2_score
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
     st.write("Dataset:")
     st.dataframe(df)
+    # Convert categorical (str) data to numerical
     st.write("Converting Categorical Columns to Numerical Values:")
     label_encoder = LabelEncoder()
     st.write("Dataset After Conversion:")
     st.dataframe(df)
+    # Handle Null Values (Missing Data)
+    st.write("Handling Missing (Null) Values:")
+    fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
+    if fill_method == "Drop rows":
+        df = df.dropna()
+    elif fill_method == "Fill with mean/median":
+        for col in df.columns:
+            if df[col].dtype in ['float64', 'int64']:
+                df[col].fillna(df[col].mean(), inplace=True)
+            else:
+                df[col].fillna(df[col].mode()[0], inplace=True)
+    # Handle Outliers using IQR method
+    st.write("Handling Outliers:")
+    def remove_outliers_iqr(dataframe):
+        Q1 = dataframe.quantile(0.25)
+        Q3 = dataframe.quantile(0.75)
+        IQR = Q3 - Q1
+        return dataframe[~((dataframe < (Q1 - 1.5 * IQR)) | (dataframe > (Q3 + 1.5 * IQR))).any(axis=1)]
+    df = remove_outliers_iqr(df)
+    # Cap Extreme Values
+    st.write("Handling Extreme Values (Capping):")
+    def cap_extreme_values(dataframe):
+        for col in dataframe.select_dtypes(include=[np.number]).columns:
+            lower_limit = dataframe[col].quantile(0.05)
+            upper_limit = dataframe[col].quantile(0.95)
+            dataframe[col] = np.clip(dataframe[col], lower_limit, upper_limit)
+        return dataframe
+    df = cap_extreme_values(df)
+    # Show cleaned dataset
+    st.write("Cleaned Dataset:")
+    st.dataframe(df)
+    # Add clean data download option
+    st.subheader("Download Cleaned Dataset")
+    st.download_button(
+        label="Download Cleaned Dataset (CSV)",
+        data=df.to_csv(index=False),
+        file_name="cleaned_dataset.csv",
+        mime="text/csv"
+    )
+    target = st.selectbox("Select Target Variable", df.columns)
+    features = [col for col in df.columns if col != target]
+    X = df[features]
+    y = df[target]
+    if y.dtype == 'object' or len(y.unique()) <= 10:  # Categorical target (classification)
+        st.subheader("Classification Model Training")
+        classifiers = {
+            'Logistic Regression': LogisticRegression(max_iter=5000, solver='saga', penalty='l1'),
+            'Decision Tree': DecisionTreeClassifier(),
+            'Random Forest': RandomForestClassifier(),
+            'Support Vector Machine (SVM)': SVC(),
+            'K-Nearest Neighbors (k-NN)': KNeighborsClassifier(),
+            'Naive Bayes': GaussianNB()
+        }
+        metrics = []
+        train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
+        for name, classifier in classifiers.items():
+            classifier.fit(X_train, y_train)
+            y_pred = classifier.predict(X_test)
+            metrics.append({
+                'Model': name,
+                'Accuracy': round(accuracy_score(y_test, y_pred), 2),
+                'Precision': round(precision_score(y_test, y_pred, zero_division=1, average='macro'), 2),
+                'Recall': round(recall_score(y_test, y_pred, zero_division=1, average='macro'), 2),
+                'F1-Score': round(f1_score(y_test, y_pred, zero_division=1, average='macro'), 2)
+            })
+        metrics_df = pd.DataFrame(metrics)
+        st.subheader("Classification Model Performance Metrics")
+        st.dataframe(metrics_df)
         st.download_button(
+            label="Download Classification Report as CSV",
+            data=metrics_df.to_csv(index=False),
+            file_name="classification_report.csv",
+            mime="text/csv"
         )
+    else:  # Continuous target (regression)
+        st.subheader("Regression Model Training")
+        regressors = {
+            'Linear Regression': LinearRegression(),
+            'Decision Tree Regressor': DecisionTreeRegressor(),
+            'Random Forest Regressor': RandomForestRegressor(),
+            'Support Vector Regressor (SVR)': SVR(),
+            'K-Nearest Neighbors Regressor (k-NN)': KNeighborsRegressor()
+        }
+        regression_metrics = []
+        train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
+        for name, regressor in regressors.items():
+            regressor.fit(X_train, y_train)
+            y_pred = regressor.predict(X_test)
+            regression_metrics.append({
+                'Model': name,
+                'Mean Squared Error (MSE)': round(mean_squared_error(y_test, y_pred), 2),
+                'Mean Absolute Error (MAE)': round(mean_absolute_error(y_test, y_pred), 2),
+                'R² Score': round(r2_score(y_test, y_pred), 2)
+            })
+        regression_metrics_df = pd.DataFrame(regression_metrics)
+        st.subheader("Regression Model Performance Metrics")
+        st.dataframe(regression_metrics_df)
+        st.download_button(
+            label="Download Regression Report as CSV",
+            data=regression_metrics_df.to_csv(index=False),
+            file_name="regression_report.csv",
+            mime="text/csv"
+        )