Spaces:

saherPervaiz
/

Depression

Sleeping

App Files Community

saherPervaiz committed on Jan 14, 2025

Commit

43d6671

verified ·

1 Parent(s): d6bf5be

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -190

app.py CHANGED Viewed

@@ -14,31 +14,33 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 from io import BytesIO
-# File uploader
 st.title("Model Training with Metrics and Correlation Heatmap")
 uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
 if uploaded_file is not None:
     df = pd.read_csv(uploaded_file)
-    # Show the dataset
     st.write("Dataset:")
     st.dataframe(df)
     # Label-encode object-dtype columns and any column with 10 or fewer unique values
     st.write("Converting Categorical Columns to Numerical Values:")
     label_encoder = LabelEncoder()
     for col in df.columns:
         if df[col].dtype == 'object' or len(df[col].unique()) <= 10:
             st.write(f"Encoding Column: **{col}**")
             df[col] = label_encoder.fit_transform(df[col])
     # Display the dataset after conversion
     st.write("Dataset After Conversion:")
     st.dataframe(df)
-    # Handle Null Values (Missing Data)
     st.write("Handling Missing (Null) Values:")
     fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
     if fill_method == "Drop rows":
@@ -49,201 +51,85 @@ if uploaded_file is not None:
                 df[col].fillna(df[col].mean(), inplace=True)
             else:
                 df[col].fillna(df[col].mode()[0], inplace=True)
-    # Handle Outliers using IQR method
-    st.write("Handling Outliers:")
-    def remove_outliers_iqr(dataframe):
-        Q1 = dataframe.quantile(0.25)
-        Q3 = dataframe.quantile(0.75)
-        IQR = Q3 - Q1
-        return dataframe[~((dataframe < (Q1 - 1.5 * IQR)) | (dataframe > (Q3 + 1.5 * IQR))).any(axis=1)]
-    df = remove_outliers_iqr(df)
-    # Cap Extreme Values
-    st.write("Handling Extreme Values (Capping):")
-    def cap_extreme_values(dataframe):
-        for col in dataframe.select_dtypes(include=[np.number]).columns:
-            lower_limit = dataframe[col].quantile(0.05)
-            upper_limit = dataframe[col].quantile(0.95)
-            dataframe[col] = np.clip(dataframe[col], lower_limit, upper_limit)
-        return dataframe
-    df = cap_extreme_values(df)
     # Show cleaned dataset
     st.write("Cleaned Dataset:")
     st.dataframe(df)
-    # Add clean data download option
-    st.subheader("Download Cleaned Dataset")
-    st.download_button(
-        label="Download Cleaned Dataset (CSV)",
-        data=df.to_csv(index=False),
-        file_name="cleaned_dataset.csv",
-        mime="text/csv"
-    )
     # Correlation Heatmap
     st.subheader("Correlation Heatmap")
     corr = df.corr()
     plt.figure(figsize=(10, 8))
     sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", cbar=True)
     st.pyplot(plt)
-    # Save heatmap as PNG
-    buf = BytesIO()
-    plt.savefig(buf, format="png")
-    buf.seek(0)
-    st.download_button(
-        label="Download Correlation Heatmap as PNG",
-        data=buf,
-        file_name="correlation_heatmap.png",
-        mime="image/png"
-    )
-    # Highlight highly correlated pairs
-    st.subheader("Highly Correlated Features")
-    high_corr = corr.abs().unstack().sort_values(ascending=False).drop_duplicates()
-    high_corr = high_corr[high_corr.index.get_level_values(0) != high_corr.index.get_level_values(1)]
-    high_corr_df = pd.DataFrame(high_corr)
-    st.write(high_corr_df)
     target = st.selectbox("Select Target Variable", df.columns)
     features = [col for col in df.columns if col != target]
     X = df[features]
     y = df[target]
-    if y.dtype == 'object' or len(y.unique()) <= 10:  # Categorical target (classification)
-        st.subheader("Classification Model Training")
-        classifiers = {
-            'Logistic Regression': LogisticRegression(max_iter=5000, solver='saga', penalty='l1'),
-            'Decision Tree': DecisionTreeClassifier(),
-            'Random Forest': RandomForestClassifier(),
-            'Support Vector Machine (SVM)': SVC(),
-            'K-Nearest Neighbors (k-NN)': KNeighborsClassifier(),
-            'Naive Bayes': GaussianNB()
-        }
-        metrics = []
-        train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
-        for name, classifier in classifiers.items():
-            classifier.fit(X_train, y_train)
-            y_pred = classifier.predict(X_test)
-            metrics.append({
-                'Model': name,
-                'Accuracy': round(accuracy_score(y_test, y_pred), 2),
-                'Precision': round(precision_score(y_test, y_pred, zero_division=1, average='macro'), 2),
-                'Recall': round(recall_score(y_test, y_pred, zero_division=1, average='macro'), 2),
-                'F1-Score': round(f1_score(y_test, y_pred, zero_division=1, average='macro'), 2)
-            })
-        metrics_df = pd.DataFrame(metrics)
-        st.subheader("Classification Model Performance Metrics")
-        st.dataframe(metrics_df)
-        # Save metrics as PNG (table form)
-        fig, ax = plt.subplots(figsize=(8, 4))
-        ax.axis('tight')
-        ax.axis('off')
-        table = plt.table(cellText=metrics_df.values, colLabels=metrics_df.columns, cellLoc='center', loc='center')
-        table.auto_set_font_size(False)
-        table.set_fontsize(10)
-        table.auto_set_column_width(col=list(range(len(metrics_df.columns))))
-        buf = BytesIO()
-        fig.savefig(buf, format="png")
-        buf.seek(0)
-        st.download_button(
-            label="Download Classification Metrics Table as PNG",
-            data=buf,
-            file_name="classification_metrics_table.png",
-            mime="image/png"
-        )
-        # Visualization (Bar Graphs for Classification)
-        st.subheader("Classification Model Performance Metrics Graph")
-        metrics_df.set_index('Model', inplace=True)
-        ax = metrics_df.plot(kind='bar', figsize=(10, 6), colormap='coolwarm', rot=45)
-        plt.title("Classification Models - Performance Metrics")
-        plt.ylabel("Scores")
-        plt.xlabel("Models")
-        st.pyplot(plt)
-        # Download button for the bar graph
-        buf = BytesIO()
-        ax.figure.savefig(buf, format="png")
-        buf.seek(0)
-        st.download_button(
-            label="Download Classification Performance Graph as PNG",
-            data=buf,
-            file_name="classification_performance_graph.png",
-            mime="image/png"
-        )
-    else:  # Continuous target (regression)
-        st.subheader("Regression Model Training")
-        regressors = {
-            'Linear Regression': LinearRegression(),
-            'Decision Tree Regressor': DecisionTreeRegressor(),
-            'Random Forest Regressor': RandomForestRegressor(),
-            'Support Vector Regressor (SVR)': SVR(),
-            'K-Nearest Neighbors Regressor (k-NN)': KNeighborsRegressor()
-        }
-        regression_metrics = []
-        train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
-        for name, regressor in regressors.items():
-            regressor.fit(X_train, y_train)
-            y_pred = regressor.predict(X_test)
-            regression_metrics.append({
-                'Model': name,
-                'Mean Squared Error (MSE)': round(mean_squared_error(y_test, y_pred), 2),
-                'Mean Absolute Error (MAE)': round(mean_absolute_error(y_test, y_pred), 2),
-                'R² Score': round(r2_score(y_test, y_pred), 2)
-            })
-        regression_metrics_df = pd.DataFrame(regression_metrics)
-        st.subheader("Regression Model Performance Metrics")
-        st.dataframe(regression_metrics_df)
-        # Save metrics as PNG (table form)
-        fig, ax = plt.subplots(figsize=(8, 4))
-        ax.axis('tight')
-        ax.axis('off')
-        table = plt.table(cellText=regression_metrics_df.values, colLabels=regression_metrics_df.columns, cellLoc='center', loc='center')
-        table.auto_set_font_size(False)
-        table.set_fontsize(10)
-        table.auto_set_column_width(col=list(range(len(regression_metrics_df.columns))))
-        buf = BytesIO()
-        fig.savefig(buf, format="png")
-        buf.seek(0)
-        st.download_button(
-            label="Download Regression Metrics Table as PNG",
-            data=buf,
-            file_name="regression_metrics_table.png",
-            mime="image/png"
-        )
-        # Visualization (Bar Graphs for Regression)
-        st.subheader("Regression Model Performance Metrics Graph")
-        regression_metrics_df.set_index('Model', inplace=True)
-        regression_metrics_df.plot(kind='bar', figsize=(10, 6), colormap='coolwarm', rot=45)
-        plt.title("Regression Models - Performance Metrics")
-        plt.ylabel("Scores")
-        plt.xlabel("Models")
-        st.pyplot(plt)
-        # Download button for the bar graph
-        buf = BytesIO()
-        plt.savefig(buf, format="png")
-        buf.seek(0)
-        st.download_button(
-            label="Download Regression Performance Graph as PNG",
-            data=buf,
-            file_name="regression_performance_graph.png",
-            mime="image/png"
-        )

 import seaborn as sns
 from io import BytesIO
+# Streamlit app title
 st.title("Model Training with Metrics and Correlation Heatmap")
+# File uploader
 uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
 if uploaded_file is not None:
+    # Read the uploaded CSV file
     df = pd.read_csv(uploaded_file)
+    # Display the dataset
     st.write("Dataset:")
     st.dataframe(df)
     # Label-encode object-dtype columns and any column with 10 or fewer unique values
     st.write("Converting Categorical Columns to Numerical Values:")
     label_encoder = LabelEncoder()
     for col in df.columns:
         if df[col].dtype == 'object' or len(df[col].unique()) <= 10:
             st.write(f"Encoding Column: **{col}**")
             df[col] = label_encoder.fit_transform(df[col])
     # Display the dataset after conversion
     st.write("Dataset After Conversion:")
     st.dataframe(df)
+    # Handle missing values
     st.write("Handling Missing (Null) Values:")
     fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
     if fill_method == "Drop rows":
                 df[col].fillna(df[col].mean(), inplace=True)
             else:
                 df[col].fillna(df[col].mode()[0], inplace=True)
     # Show cleaned dataset
     st.write("Cleaned Dataset:")
     st.dataframe(df)
     # Correlation Heatmap
     st.subheader("Correlation Heatmap")
     corr = df.corr()
     plt.figure(figsize=(10, 8))
     sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", cbar=True)
     st.pyplot(plt)
+    # Select target variable
     target = st.selectbox("Select Target Variable", df.columns)
     features = [col for col in df.columns if col != target]
     X = df[features]
     y = df[target]
+    if len(y.unique()) > 1:  # Ensure the target variable has at least two unique classes/values
+        if y.dtype == 'object' or len(y.unique()) <= 10:  # Classification
+            st.subheader("Classification Model Training")
+            classifiers = {
+                'Logistic Regression': LogisticRegression(max_iter=5000),
+                'Decision Tree': DecisionTreeClassifier(),
+                'Random Forest': RandomForestClassifier(),
+                'Support Vector Machine (SVM)': SVC(),
+                'K-Nearest Neighbors (k-NN)': KNeighborsClassifier(),
+                'Naive Bayes': GaussianNB()
+            }
+            metrics = []
+            train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y, test_size=1-train_size, stratify=y, random_state=42
+            )
+            for name, classifier in classifiers.items():
+                classifier.fit(X_train, y_train)
+                y_pred = classifier.predict(X_test)
+                metrics.append({
+                    'Model': name,
+                    'Accuracy': round(accuracy_score(y_test, y_pred), 2),
+                    'Precision': round(precision_score(y_test, y_pred, zero_division=1, average='macro'), 2),
+                    'Recall': round(recall_score(y_test, y_pred, zero_division=1, average='macro'), 2),
+                    'F1-Score': round(f1_score(y_test, y_pred, zero_division=1, average='macro'), 2)
+                })
+            metrics_df = pd.DataFrame(metrics)
+            st.subheader("Classification Model Performance Metrics")
+            st.dataframe(metrics_df)
+        else:  # Regression
+            st.subheader("Regression Model Training")
+            regressors = {
+                'Linear Regression': LinearRegression(),
+                'Decision Tree Regressor': DecisionTreeRegressor(),
+                'Random Forest Regressor': RandomForestRegressor(),
+                'Support Vector Regressor (SVR)': SVR(),
+                'K-Nearest Neighbors Regressor (k-NN)': KNeighborsRegressor()
+            }
+            regression_metrics = []
+            train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y, test_size=1-train_size, random_state=42
+            )
+            for name, regressor in regressors.items():
+                regressor.fit(X_train, y_train)
+                y_pred = regressor.predict(X_test)
+                regression_metrics.append({
+                    'Model': name,
+                    'Mean Squared Error (MSE)': round(mean_squared_error(y_test, y_pred), 2),
+                    'Mean Absolute Error (MAE)': round(mean_absolute_error(y_test, y_pred), 2),
+                    'R² Score': round(r2_score(y_test, y_pred), 2)
+                })
+            regression_metrics_df = pd.DataFrame(regression_metrics)
+            st.subheader("Regression Model Performance Metrics")
+            st.dataframe(regression_metrics_df)
+    else:
+        st.error("The target variable must contain at least two unique values for classification or regression. Please check your dataset.")