# Hugging Face Space: Gowthamvemula/chrun (status: Sleeping) - File size: 6,999 Bytes

import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO

# Page heading: centered white <h1>, rendered as raw HTML
page_title_html = "<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>"
st.markdown(page_title_html, unsafe_allow_html=True)

# Access the dataset stored in session state (None when nothing has been stored yet)
df = st.session_state.get("dataset")

if df is not None:

    st.subheader("Dataset Preview:")
    st.write(df.head())

    st.subheader("Info of the Dataset:")
    # df.info() writes to a stream instead of returning text, so capture it in a buffer
    buffer = StringIO()
    df.info(buf=buffer)

    # st.text shows the report verbatim in a fixed-width font; st.write would
    # interpret it as Markdown and collapse its line breaks and column alignment.
    st.text(buffer.getvalue())

    st.subheader("Dataset Description:")
    st.write(df.describe())

    st.subheader("Shape of the Dataset:")
    st.write(df.shape)

    # Normalise the column names and drop the listed columns when present.
    # rename()/drop() return new frames, so the DataFrame kept in
    # st.session_state["dataset"] is left untouched (in-place edits would leak
    # into every other page and into later reruns of this page).
    df = df.rename(columns=lambda col: col.lower().replace(' ', '_'))
    df = df.drop(columns=['unnamed:_0', 'rownumber', 'customerid', 'surname'], errors='ignore')

    df = df.select_dtypes(include=['int64', 'float64', 'object'])

    st.markdown("### Import Necessary Libraries:")
    st.code("""
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    import plotly.express as px
    import warnings
    warnings.filterwarnings('ignore')
    
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss
    
    import optuna
    import imblearn
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.over_sampling import RandomOverSampler, SMOTE
    
    import pickle
    """, language="python")

    # Visualize numeric data: one histogram grid and one boxplot grid, two plots per row
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_columns) > 0:
        num_plots = len(numeric_columns)
        rows = (num_plots + 1) // 2  # ceil(num_plots / 2) rows for a 2-column grid
        grid_size = (12, 4 * rows)  # height grows with the row count so subplots stay readable

        st.subheader("Histograms for Numeric Columns:")
        # squeeze=False always yields a 2D axes array, whatever the grid shape
        fig, axs = plt.subplots(rows, 2, figsize=grid_size, squeeze=False)
        axs = axs.flatten()

        color_palettes_hist = ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1']  # Cycled so neighbouring plots differ
        for i, col in enumerate(numeric_columns):
            palette = sns.color_palette(color_palettes_hist[i % len(color_palettes_hist)])
            sns.histplot(df[col], bins=30, kde=True, color=palette[0], ax=axs[i])
            axs[i].set_title(f'Histogram of {col}')
        # An odd number of columns leaves one unused cell; hide its empty axes
        for unused_ax in axs[num_plots:]:
            unused_ax.set_visible(False)
        fig.tight_layout()
        st.pyplot(fig)
        # Close (not just clear) the figure so it is released; Streamlit reruns
        # this script on every interaction and open figures would pile up.
        plt.close(fig)

        st.subheader("Boxplots for Numeric Columns:")
        fig, axs = plt.subplots(rows, 2, figsize=grid_size, squeeze=False)
        axs = axs.flatten()

        color_palettes_box = ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest']  # Cycled so neighbouring plots differ
        for i, col in enumerate(numeric_columns):
            palette = sns.color_palette(color_palettes_box[i % len(color_palettes_box)])
            # Each plot has a single box, so pass one colour; `palette=` without
            # `hue` is deprecated in recent seaborn releases.
            sns.boxplot(x=df[col], ax=axs[i], color=palette[0])
            axs[i].set_title(f'Boxplot of {col}')
        for unused_ax in axs[num_plots:]:
            unused_ax.set_visible(False)
        fig.tight_layout()
        st.pyplot(fig)
        plt.close(fig)
    else:
        st.warning("No numeric columns available for visualization.")

    # Visualize categorical data: value counts and a bar plot for one user-selected column
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0:
        st.subheader("Bar Plots for Categorical Columns:")
        selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)

        st.write(f"Value Counts for '{selected_cat_col}':")
        st.write(df[selected_cat_col].value_counts())

        # Draw on an explicit Figure/Axes and hand that Figure to Streamlit,
        # rather than passing the pyplot module and relying on global state.
        fig, ax = plt.subplots(figsize=(12, 6))
        # NOTE(review): recent seaborn versions warn about `palette` without `hue`;
        # kept as-is because the per-bar colouring replacement (hue + legend=False)
        # is not available on older seaborn releases.
        sns.countplot(x=selected_cat_col, data=df, palette='coolwarm', ax=ax)
        ax.set_title(f'Bar Plot of {selected_cat_col}')
        st.pyplot(fig)
        plt.close(fig)
    else:
        st.warning("No categorical columns available for visualization.")

    st.subheader("Cleaned Dataset:")
    df = df.drop_duplicates()
    st.write(df)

    # Store the cleaned data in session state for use on the next page
    st.session_state.cleaned_data = df

    # Convert the cleaned data to CSV and provide a download button
    cleaned_csv = df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv"
    )

else:
    st.warning("No dataset found. Please upload a dataset on the Home page.")


# URL of the image tiled behind the app (replace with your own image URL)
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67445925102349e867c92342/r2cTyHH3xpUiszvjBkcsL.png"

# Custom CSS injected into the page: a repeating, fixed background image on
# .stApp, a dark semi-transparent ::before overlay, a .content-container rule,
# and text styling for .stMarkdown. Doubled braces are literal CSS braces.
custom_css = f"""
    <style>
        .stApp {{
            background-image: url("{background_image_url}");
            background-size: auto;  /* Ensures the image retains its original size */
            background-repeat: repeat;  /* Makes the image repeat to cover the entire background */
            background-position: top left;  /* Starts repeating from the top-left corner */
            background-attachment: fixed;  /* Keeps the background fixed as you scroll */
        }}
        
        /* Semi-transparent overlay */
        .stApp::before {{
            content: "";
            position: absolute;
            top: 0;
            left: 0;
            width: 100%;
            height: 100%;
            background: rgba(0, 0, 0, 0.4);  /* Adjust transparency here (0.4 for 40% transparency) */
            z-index: -1;
        }}
        
        /* Container to center elements and limit width */
        .content-container {{
            max-width: 70%;  /* Limit content width to 70% */
            margin: 0 auto;  /* Center the container horizontally */
            padding: 50px;  /* Add padding for spacing */
        }}
        
        /* Styling the markdown content */
        .stMarkdown {{
            color: white;  /* White text for better visibility */
            font-size: 100px;  /* Adjust font size for readability */
        }}
    </style>
    """

# unsafe_allow_html is required for Streamlit to emit the raw <style> tag
st.markdown(custom_css, unsafe_allow_html=True)



# Page navigation: render each button in order and jump to its page when clicked
navigation_targets = (
    ("Previous ⏮️", "pages/1_Data_Card_and_Data_collection.py"),
    ("Next ⏭️", "pages/3_EDA_and_Feature_Engineering.py"),
)
for button_label, page_path in navigation_targets:
    if st.button(button_label):
        st.switch_page(page_path)