Spaces:

saherPervaiz
/

ModelTrain

Running

App Files Community

saherPervaiz committed on Jan 12, 2025

Commit

40166c0

verified ·

1 Parent(s): 5d2751f

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -141

app.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import streamlit as st
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 from sklearn.ensemble import RandomForestClassifier
@@ -9,9 +11,34 @@ from sklearn.neighbors import KNeighborsClassifier
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.naive_bayes import GaussianNB
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
-import numpy as np
-import matplotlib.pyplot as plt
-import seaborn as sns
 # File uploader
 st.title("Model Training with Metrics")
@@ -29,65 +56,13 @@ if uploaded_file is not None:
     if df.empty:
         st.warning("The dataset is empty. Please upload a valid CSV file.")
     else:
-        # Handle Null Values (Missing Data)
-        st.write("Handling Missing (Null) Values:")
-        # Option to drop rows with null values or fill them
-        fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
-        if fill_method == "Drop rows":
-            df = df.dropna()
-        elif fill_method == "Fill with mean/median":
-            for col in df.columns:
-                if df[col].dtype in ['float64', 'int64']:
-                    df[col].fillna(df[col].mean(), inplace=True)  # For numeric columns, fill with mean
-                else:
-                    df[col].fillna(df[col].mode()[0], inplace=True)  # For categorical columns, fill with mode
-        # Handle Outliers using IQR method
-        st.write("Handling Outliers:")
-        # Define function to remove outliers using IQR
-        def remove_outliers_iqr(dataframe):
-            Q1 = dataframe.quantile(0.25)
-            Q3 = dataframe.quantile(0.75)
-            IQR = Q3 - Q1
-            # Filter out rows that are outside the IQR range
-            return dataframe[~((dataframe < (Q1 - 1.5 * IQR)) | (dataframe > (Q3 + 1.5 * IQR))).any(axis=1)]
-        # Remove outliers from the numerical columns
-        df = remove_outliers_iqr(df)
-        # Handle Extreme Values by Capping (Winsorization)
-        st.write("Handling Extreme Values (Capping):")
-        def cap_extreme_values(dataframe):
-            for col in dataframe.select_dtypes(include=[np.number]).columns:
-                # Define the thresholds for extreme values (95th percentile and 5th percentile)
-                lower_limit = dataframe[col].quantile(0.05)
-                upper_limit = dataframe[col].quantile(0.95)
-                # Cap the extreme values
-                dataframe[col] = np.clip(dataframe[col], lower_limit, upper_limit)
-            return dataframe
-        df = cap_extreme_values(df)
-        # Show cleaned dataset
-        st.write("Cleaned Dataset:")
-        st.dataframe(df)
         target = st.selectbox("Select Target Variable", df.columns)
         features = [col for col in df.columns if col != target]
         X = df[features]
         y = df[target]
-        # Label Encoding for categorical columns
-        label_encoder = LabelEncoder()
-        # Encode the target variable (if it's categorical)
-        if y.dtype == 'object' or len(y.unique()) <= 10:  # If the target variable is categorical
-            y = label_encoder.fit_transform(y)
-        # Encode categorical feature columns (if any)
-        for col in X.columns:
-            if X[col].dtype == 'object' or len(X[col].unique()) <= 10:  # If the column is categorical
-                X[col] = label_encoder.fit_transform(X[col])
         # Ensure there is enough data before proceeding with train-test split
         if len(X) == 0 or len(y) == 0:
@@ -135,101 +110,45 @@ if uploaded_file is not None:
             # Create a metrics DataFrame
             metrics_df = pd.DataFrame(metrics)
-            # Display results in a table using st.dataframe
-            st.subheader("Model Performance Metrics")
-            st.dataframe(metrics_df)
-            # Download options
-            st.subheader("Download Model Performance Report in Different Formats")
-            # CSV
-            st.download_button(
-                label="Download as CSV",
-                data=metrics_df.to_csv(index=False),
-                file_name="model_report.csv",
-                mime="text/csv"
             )
-            # Excel
             st.download_button(
-                label="Download as Excel",
-                data=metrics_df.to_excel(index=False, engine='openpyxl'),
                 file_name="model_report.xlsx",
                 mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
             )
-            # JSON
-            st.download_button(
-                label="Download as JSON",
-                data=metrics_df.to_json(orient='records'),
-                file_name="model_report.json",
-                mime="application/json"
-            )
-            # PDF (using `fpdf` library)
-            from fpdf import FPDF
-            def generate_pdf(df):
-                pdf = FPDF()
-                pdf.add_page()
-                pdf.set_font("Arial", size=12)
-                pdf.cell(200, 10, txt="Model Performance Report", ln=True, align="C")
-                pdf.ln(10)
-                # Add table header
-                pdf.set_font("Arial", style='B', size=10)
-                for header in df.columns:
-                    pdf.cell(40, 10, header, border=1)
-                pdf.ln()
-                # Add table rows
-                pdf.set_font("Arial", size=10)
-                for row in df.values:
-                    for value in row:
-                        pdf.cell(40, 10, str(value), border=1)
-                    pdf.ln()
-                return pdf.output(dest='S').encode('latin1')
-            # PDF download
             st.download_button(
-                label="Download as PDF",
-                data=generate_pdf(metrics_df),
-                file_name="model_report.pdf",
-                mime="application/pdf"
-            )
-            # Option to download the dataset
-            st.download_button(
-                label="Download Dataset",
-                data=df.to_csv(index=False),
-                file_name="dataset.csv",
-                mime="text/csv"
             )
-            # Generate and download PNG report
-            st.subheader("Download Report as PNG")
-            # Create table plot using matplotlib
-            fig, ax = plt.subplots(figsize=(12, 4))  # Adjust the figure size to match the table's layout
-            ax.axis('tight')
-            ax.axis('off')
-            table_data = metrics_df.values
-            table_columns = metrics_df.columns.tolist()
-            table = ax.table(cellText=table_data, colLabels=table_columns, loc='center', cellLoc='center', colLoc='center')
-            table.auto_set_font_size(False)
-            table.set_fontsize(10)
-            table.scale(1.2, 1.2)  # Adjust the scale for better appearance
-            # Save the table as a PNG file
-            png_file = "model_report.png"
-            fig.savefig(png_file, bbox_inches='tight', dpi=300)
-            # Provide a download button for the PNG file
-            with open(png_file, "rb") as file:
                 st.download_button(
-                    label="Download as PNG",
                     data=file,
                     file_name="model_report.png",
                     mime="image/png"

 import streamlit as st
 import pandas as pd
+import matplotlib.pyplot as plt
+import io
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.naive_bayes import GaussianNB
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+from tabulate import tabulate
+# Function to convert DataFrame to Excel format
def to_excel(df, sheet_name='Cleaned Dataset'):
    """Serialize a DataFrame to an in-memory ``.xlsx`` workbook.

    Args:
        df: DataFrame to export. It is written without its index.
        sheet_name: Name of the single worksheet. Defaults to
            ``'Cleaned Dataset'`` (the value that used to be hard-coded),
            so existing one-argument calls behave exactly as before; pass a
            different name when exporting something other than the dataset,
            e.g. a metrics report.

    Returns:
        An ``io.BytesIO`` holding the workbook bytes, rewound to offset 0.
    """
    output = io.BytesIO()
    # The writer must be closed (end of the ``with``) before the buffer
    # contains a complete workbook.
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name=sheet_name)
    # Rewind so consumers read the workbook from its first byte.
    output.seek(0)
    return output
+# Function to save table as PNG
def save_table_as_png(df, img_path="/tmp/model_report.png"):
    """Render a DataFrame as a table image and write it to disk as a PNG.

    Args:
        df: DataFrame whose values become the table cells and whose column
            labels become the header row.
        img_path: Destination file path. Defaults to
            ``/tmp/model_report.png`` (the value that used to be
            hard-coded), so existing one-argument calls are unaffected.

    Returns:
        The path the PNG was written to (``img_path``).
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        # Hide the axes so only the table itself is drawn.
        ax.axis('tight')
        ax.axis('off')
        # Create a table from the DataFrame
        table = ax.table(cellText=df.values, colLabels=df.columns, loc='center', cellLoc='center')
        # Use a fixed font size instead of matplotlib's automatic sizing.
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.scale(1.2, 1.2)
        # Save through the figure object rather than pyplot's implicit
        # "current figure", so the image is always the one built here.
        fig.savefig(img_path, format="png", bbox_inches="tight")
    finally:
        # Always release the figure, even if rendering or saving failed;
        # pyplot keeps figures alive until they are explicitly closed.
        plt.close(fig)
    return img_path
 # File uploader
 st.title("Model Training with Metrics")
     if df.empty:
         st.warning("The dataset is empty. Please upload a valid CSV file.")
     else:
         target = st.selectbox("Select Target Variable", df.columns)
         features = [col for col in df.columns if col != target]
         X = df[features]
         y = df[target]
+        # Determine if the target is continuous or categorical
+        is_classification = y.dtype == 'object' or len(y.unique()) <= 10  # If target is categorical or has few unique values, treat as classification
         # Ensure there is enough data before proceeding with train-test split
         if len(X) == 0 or len(y) == 0:
             # Create a metrics DataFrame
             metrics_df = pd.DataFrame(metrics)
+            # Add bold formatting to the headers for tabulate
+            bold_headers = [f"\033[1m{header}\033[0m" for header in metrics_df.columns]
+            # Format table with tabulate
+            table = tabulate(
+                metrics_df,
+                headers=bold_headers,
+                tablefmt="fancy_grid",
+                showindex=False,
+                numalign="center",
+                stralign="center"
             )
+            # Display results in Streamlit
+            st.subheader("Model Performance Metrics")
+            st.markdown(f"**Model Performance Metrics**")
+            st.text(table)
+            # Option to download the model performance metrics (Results Table)
             st.download_button(
+                label="Download Model Report (Excel)",
+                data=to_excel(metrics_df),  # The metrics dataframe
                 file_name="model_report.xlsx",
                 mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
             )
+            # Option to download the cleaned dataset
             st.download_button(
+                label="Download Cleaned Dataset (Excel)",
+                data=to_excel(df),  # The cleaned dataset is 'df'
+                file_name="cleaned_dataset.xlsx",
+                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
             )
+            # Option to download the report as PNG
+            img_path = save_table_as_png(metrics_df)
+            with open(img_path, "rb") as file:
                 st.download_button(
+                    label="Download Model Report (PNG)",
                     data=file,
                     file_name="model_report.png",
                     mime="image/png"