Spaces:

saherPervaiz
/

ModelTrain

Running

App Files Community

saherPervaiz committed on Jan 12, 2025

Commit

97a2e91

verified ·

1 Parent(s): 4b535f6

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -89

app.py CHANGED Viewed

@@ -1,10 +1,6 @@
 import streamlit as st
 import pandas as pd
-import numpy as np
-import seaborn as sns
-import matplotlib.pyplot as plt
 from sklearn.model_selection import train_test_split
-from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import LabelEncoder, StandardScaler
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 from sklearn.linear_model import LogisticRegression, LinearRegression
@@ -12,76 +8,35 @@ from sklearn.svm import SVC, SVR
 from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.naive_bayes import GaussianNB
-from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
-from scipy import stats
 # File uploader
-st.title("Data Analysis and Model Training")
 uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
 if uploaded_file is not None:
     df = pd.read_csv(uploaded_file)
-    # Data Cleaning Function
-    def clean_dataset(df):
-        # Convert categorical data to numeric using Label Encoding (only for categorical with few unique values)
-        le = LabelEncoder()
-        for column in df.select_dtypes(include=['object']).columns:
-            if df[column].nunique() < 10:  # If the column has fewer unique values, encode it
-                df[column] = le.fit_transform(df[column].astype(str))
-        # Handle missing values (impute numerical columns with median and categorical columns with mode)
-        categorical_columns = df.select_dtypes(include=['object']).columns
-        if len(categorical_columns) > 0:
-            imputer = SimpleImputer(strategy='most_frequent')
-            df[categorical_columns] = imputer.fit_transform(df[categorical_columns])
-        # Handle numerical columns
-        numerical_columns = df.select_dtypes(include=['number']).columns
-        if len(numerical_columns) > 0:
-            imputer = SimpleImputer(strategy='median')
-            df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
-        # Remove outliers (using z-score method)
-        z_scores = np.abs(stats.zscore(df.select_dtypes(include=['number'])))
-        df = df[(z_scores < 3).all(axis=1)]
-        # Normalize numerical data
-        scaler = StandardScaler()
-        df[df.select_dtypes(include=['number']).columns] = scaler.fit_transform(df.select_dtypes(include=['number']))
-        # Drop rows with any null values
-        df = df.dropna()
-        # Ensure that all columns are numeric before using in models
-        for column in df.select_dtypes(include=['object']).columns:
-            df[column] = pd.to_numeric(df[column], errors='coerce')
-        return df
-    # Apply the clean_dataset function
-    df_cleaned = clean_dataset(df)
-    # Show the cleaned dataset
-    st.write("Cleaned Dataset:")
-    st.dataframe(df_cleaned)
     # Model Training Section
     st.subheader("Model Training")
-    if df_cleaned.empty:
-        st.warning("The dataset is empty after cleaning. Please adjust your cleaning settings.")
     else:
-        target = st.selectbox("Select Target Variable", df_cleaned.columns)
-        features = [col for col in df_cleaned.columns if col != target]
-        X = df_cleaned[features]
-        y = df_cleaned[target]
         # Determine if the target is continuous or categorical
         is_classification = y.dtype == 'object' or len(y.unique()) <= 10  # If target is categorical or has few unique values, treat as classification
         # Ensure there is enough data before proceeding with train-test split
         if len(X) == 0 or len(y) == 0:
-            st.warning("Insufficient data after cleaning. Please adjust the cleaning parameters.")
         else:
             # Split the data into training and test sets with customizable training size
             train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
@@ -143,38 +98,10 @@ if uploaded_file is not None:
                 mime="text/csv"
             )
-            # Option to download the cleaned dataset
             st.download_button(
-                label="Download Cleaned Dataset",
-                data=df_cleaned.to_csv(index=False),
-                file_name="cleaned_dataset.csv",
                 mime="text/csv"
             )
-            # Download correlation heatmap
-            st.subheader("Correlation Heatmap")
-            correlation_matrix = df_cleaned.select_dtypes(include=['number']).corr()
-            fig, ax = plt.subplots(figsize=(8, 6))
-            sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
-            st.pyplot(fig)
-            fig.savefig("/tmp/correlation_heatmap.png")
-            with open("/tmp/correlation_heatmap.png", "rb") as f:
-                st.download_button(
-                    label="Download Correlation Heatmap",
-                    data=f,
-                    file_name="correlation_heatmap.png",
-                    mime="image/png"
-                )
-            # Pair plot of numerical columns to visualize relationships
-            st.subheader("Pair Plot of Numerical Columns")
-            pair_plot = sns.pairplot(df_cleaned[features])  # Generate pair plot for features
-            st.pyplot(pair_plot)
-            pair_plot.savefig("/tmp/pair_plot.png")
-            with open("/tmp/pair_plot.png", "rb") as f:
-                st.download_button(
-                    label="Download Pair Plot",
-                    data=f,
-                    file_name="pair_plot.png",
-                    mime="image/png"
-                )

 import streamlit as st
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder, StandardScaler
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 from sklearn.linear_model import LogisticRegression, LinearRegression
 from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.naive_bayes import GaussianNB
+from sklearn.metrics import accuracy_score, mean_squared_error
 # File uploader
+st.title("Model Training")
 uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
 if uploaded_file is not None:
     df = pd.read_csv(uploaded_file)
+    # Show the dataset
+    st.write("Dataset:")
+    st.dataframe(df)
     # Model Training Section
     st.subheader("Model Training")
+    if df.empty:
+        st.warning("The dataset is empty. Please upload a valid CSV file.")
     else:
+        target = st.selectbox("Select Target Variable", df.columns)
+        features = [col for col in df.columns if col != target]
+        X = df[features]
+        y = df[target]
         # Determine if the target is continuous or categorical
         is_classification = y.dtype == 'object' or len(y.unique()) <= 10  # If target is categorical or has few unique values, treat as classification
         # Ensure there is enough data before proceeding with train-test split
         if len(X) == 0 or len(y) == 0:
+            st.warning("Insufficient data. Please ensure there are valid feature and target columns.")
         else:
             # Split the data into training and test sets with customizable training size
             train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
                 mime="text/csv"
             )
+            # Option to download the dataset
             st.download_button(
+                label="Download Dataset",
+                data=df.to_csv(index=False),
+                file_name="dataset.csv",
                 mime="text/csv"
             )