Spaces:
Sleeping
Sleeping
Create 2_Data_CLeaning_and_Preprocessing.py
Browse files
pages/2_Data_CLeaning_and_Preprocessing.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --- Page setup -------------------------------------------------------------
# Standard library
from io import StringIO

# Third-party
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import streamlit as st

# Centered page heading; white so it stays readable over the dark background
# image applied further down via injected CSS.
st.markdown("<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>", unsafe_allow_html=True)

# The Home page is expected to have stored the uploaded dataframe under this
# key; .get() returns None when nothing has been uploaded yet, which the
# page body checks before rendering anything.
df = st.session_state.get("dataset")
+
# Exclude 'ProductID' from the dataset: it is a row identifier with no
# statistical value for cleaning or visualization.
if df is not None:
    df = df.drop(columns=['ProductID'], errors='ignore')  # no-op if the column is absent

    st.subheader("Dataset Preview:")
    st.write(df.head())

    st.subheader("Info of the Dataset:")
    # df.info() prints to stdout by default, so capture it in a string buffer.
    buffer = StringIO()
    df.info(buf=buffer)
    # st.text preserves the fixed-width layout of df.info(); st.write would
    # render the string as markdown and collapse the column alignment.
    st.text(buffer.getvalue())

    st.subheader("Dataset Description:")
    st.write(df.describe())

    st.subheader("Shape of the Dataset:")
    st.write(df.shape)

    # Display (not execute) the import list used by the modelling pages.
    st.markdown("### Import Necessary Libraries:")
    st.code("""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss

import optuna
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

import pickle
""", language="python")

    # --- Numeric columns: histograms and boxplots ---------------------------
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_columns) > 0:
        st.subheader("Histograms for Numeric Columns:")
        num_plots = len(numeric_columns)
        rows = (num_plots + 1) // 2  # 2-column grid, rounded up
        fig, axs = plt.subplots(rows, 2, figsize=(12, 12))
        axs = axs.flatten()  # flatten so one index walks the whole grid

        # Rotate through several palettes so adjacent plots look distinct.
        color_palettes_hist = ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1']
        for i, col in enumerate(numeric_columns):
            palette = sns.color_palette(color_palettes_hist[i % len(color_palettes_hist)])
            sns.histplot(df[col], bins=30, kde=True, color=palette[0], ax=axs[i])
            axs[i].set_title(f'Histogram of {col}')
        # Hide the trailing axes left empty when the column count is odd.
        for ax in axs[num_plots:]:
            ax.set_visible(False)
        st.pyplot(fig)
        plt.close(fig)  # release the figure instead of clearing global pyplot state

        st.subheader("Boxplots for Numeric Columns:")
        fig, axs = plt.subplots(rows, 2, figsize=(12, 12))
        axs = axs.flatten()

        color_palettes_box = ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest']
        for i, col in enumerate(numeric_columns):
            palette = sns.color_palette(color_palettes_box[i % len(color_palettes_box)])
            # A single boxplot has one artist, so pass a single color;
            # `palette=` without `hue=` is deprecated in seaborn >= 0.13.
            sns.boxplot(x=df[col], ax=axs[i], color=palette[0])
            axs[i].set_title(f'Boxplot of {col}')
        for ax in axs[num_plots:]:
            ax.set_visible(False)
        st.pyplot(fig)
        plt.close(fig)
    else:
        st.warning("No numeric columns available for visualization.")

    # --- Categorical columns: value counts and bar plot ---------------------
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0:
        st.subheader("Bar Plots for Categorical Columns:")
        selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)

        st.write(f"Value Counts for '{selected_cat_col}':")
        st.write(df[selected_cat_col].value_counts())

        fig = plt.figure(figsize=(12, 6))
        # Assign the x variable to `hue` (and drop the redundant legend):
        # `palette=` without `hue=` is deprecated in seaborn >= 0.13.
        sns.countplot(x=selected_cat_col, data=df, hue=selected_cat_col,
                      palette='coolwarm', legend=False)
        plt.title(f'Bar Plot of {selected_cat_col}')
        st.pyplot(fig)
        plt.close(fig)
    else:
        st.warning("No categorical columns available for visualization.")

    # --- Cleaning: drop exact duplicate rows ---------------------------------
    st.subheader("Cleaned Dataset:")
    cleaned_data = df.drop_duplicates()
    st.write(cleaned_data)

    # Store cleaned data in session state for use on the next page.
    st.session_state.cleaned_data = cleaned_data

    # Offer the cleaned dataset as a CSV download.
    cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv"
    )

else:
    st.warning("No dataset found. Please upload a dataset on the Home page.")
| 127 |
+
|
| 128 |
+
# Page background: a tiled image with a dark overlay, injected as raw CSS.
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/JUxgk4Z7jvSNM7OnB4nOw.jpeg"

# Build the stylesheet once, then hand it to st.markdown. Double braces
# escape literal CSS braces inside the f-string.
_page_css = f"""
<style>
.stApp {{
    background-image: url("{background_image_url}");
    background-size: auto; /* Ensures the image retains its original size */
    background-repeat: repeat; /* Makes the image repeat to cover the entire background */
    background-position: top left; /* Starts repeating from the top-left corner */
    background-attachment: fixed; /* Keeps the background fixed as you scroll */
}}

/* Semi-transparent overlay */
.stApp::before {{
    content: "";
    position: absolute;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
    z-index: -1;
}}

/* Container to center elements and limit width */
.content-container {{
    max-width: 70%; /* Limit content width to 70% */
    margin: 0 auto; /* Center the container horizontally */
    padding: 50px; /* Add padding for spacing */
}}

/* Styling the markdown content */
.stMarkdown {{
    color: white; /* White text for better visibility */
    font-size: 100px; /* Adjust font size for readability */
}}
</style>
"""
# NOTE(review): the .stMarkdown font-size of 100px looks unusually large for
# body text — confirm this is intentional before shipping.

st.markdown(_page_css, unsafe_allow_html=True)
+
|
| 173 |
+
|
| 174 |
+
# Sequential page navigation. st.switch_page stops the current script run
# and loads the target page immediately.
_NAV_LINKS = (
    ("Previous ⏮️", "pages/1_Data_Card_and_Data_collection.py"),
    ("Next ⏭️", "pages/3_EDA_and_Feature_Engineering.py"),
)
for _label, _target in _NAV_LINKS:
    if st.button(_label):
        st.switch_page(_target)