"""Streamlit page: preview, visualise and de-duplicate the uploaded dataset.

The page reads the DataFrame stored under ``st.session_state["dataset"]``,
shows summary information and plots for it, removes duplicate rows, stores
the result under ``st.session_state.cleaned_data`` and offers it as a CSV
download.
"""

from io import StringIO

import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Snippet shown to the reader verbatim; it is displayed, never executed here.
IMPORTS_SNIPPET = """
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss
import optuna
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
import pickle
"""

# One palette name per subplot, cycled when there are more columns than names.
HIST_PALETTES = ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1']
BOX_PALETTES = ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest']


def _render_overview(df: pd.DataFrame) -> None:
    """Show the head, ``info()``, ``describe()`` and shape of *df*."""
    st.subheader("Dataset Preview:")
    st.write(df.head())

    st.subheader("Info of the Dataset:")
    # df.info() prints instead of returning, so capture it in a buffer.
    buffer = StringIO()
    df.info(buf=buffer)
    # st.text keeps the fixed-width table layout; st.write would interpret
    # the text as Markdown and collapse its line breaks and alignment.
    st.text(buffer.getvalue())

    st.subheader("Dataset Description:")
    st.write(df.describe())

    st.subheader("Shape of the Dataset:")
    st.write(df.shape)


def _draw_histogram(series: pd.Series, color, ax) -> None:
    """Draw a 30-bin histogram with a KDE overlay for *series* on *ax*."""
    sns.histplot(series, bins=30, kde=True, color=color, ax=ax)


def _draw_boxplot(series: pd.Series, color, ax) -> None:
    """Draw a horizontal boxplot for *series* on *ax*."""
    # A single box only needs one colour; passing ``palette`` without ``hue``
    # is a deprecated seaborn code path.
    sns.boxplot(x=series, ax=ax, color=color)


def _render_plot_grid(df, columns, palette_names, draw, title_prefix) -> None:
    """Render one subplot per column in a two-column grid.

    Args:
        df: DataFrame holding the data to plot.
        columns: Names of the columns to plot, one subplot each.
        palette_names: Seaborn palette names, cycled across the subplots;
            the first colour of each palette is used.
        draw: Callable ``draw(series, color, ax)`` that draws one subplot.
        title_prefix: Text placed before ``" of <column>"`` in each title.
    """
    num_plots = len(columns)
    rows = (num_plots + 1) // 2
    # Never smaller than the original 12x12 canvas, but grow with the number
    # of rows so that many columns do not get squeezed into it.
    fig, axs = plt.subplots(
        rows, 2, figsize=(12, max(12, 4 * rows)), squeeze=False
    )
    axs = axs.flatten()

    for i, col in enumerate(columns):
        palette = sns.color_palette(palette_names[i % len(palette_names)])
        draw(df[col], palette[0], axs[i])
        axs[i].set_title(f'{title_prefix} of {col}')

    # With an odd number of columns the last grid cell has nothing to show.
    for spare_ax in axs[num_plots:]:
        spare_ax.set_visible(False)

    fig.tight_layout()
    st.pyplot(fig)
    # Close (not just clear) the figure so it does not pile up in pyplot's
    # figure registry across Streamlit reruns.
    plt.close(fig)


def _render_numeric_section(df: pd.DataFrame) -> None:
    """Show histogram and boxplot grids for every numeric column of *df*."""
    # "number" covers every integer/float width, not only the 64-bit ones.
    numeric_columns = df.select_dtypes(include='number').columns
    if len(numeric_columns) == 0:
        st.warning("No numeric columns available for visualization.")
        return

    st.subheader("Histograms for Numeric Columns:")
    _render_plot_grid(
        df, numeric_columns, HIST_PALETTES, _draw_histogram, 'Histogram'
    )

    st.subheader("Boxplots for Numeric Columns:")
    _render_plot_grid(
        df, numeric_columns, BOX_PALETTES, _draw_boxplot, 'Boxplot'
    )


def _render_categorical_section(df: pd.DataFrame) -> None:
    """Show value counts and a bar plot for a user-selected categorical column."""
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) == 0:
        st.warning("No categorical columns available for visualization.")
        return

    st.subheader("Bar Plots for Categorical Columns:")
    selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)

    st.write(f"Value Counts for '{selected_cat_col}':")
    st.write(df[selected_cat_col].value_counts())

    # Build an explicit Figure: st.pyplot expects a figure object, not the
    # pyplot module.
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(x=selected_cat_col, data=df, palette='coolwarm', ax=ax)
    ax.set_title(f'Bar Plot of {selected_cat_col}')
    st.pyplot(fig)
    plt.close(fig)


def _render_cleaned_section(df: pd.DataFrame) -> None:
    """Drop duplicate rows, display the result, store it and offer a CSV download."""
    st.subheader("Cleaned Dataset:")
    cleaned_data = df.drop_duplicates()
    st.write(cleaned_data)

    # Kept in session state so that other pages can read it.
    st.session_state.cleaned_data = cleaned_data

    cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv"
    )


# Page Title
# NOTE(review): unsafe_allow_html suggests this text was once wrapped in HTML
# markup that is no longer present in the literal - confirm against the
# deployed page.
st.markdown("""

Data Cleaning and Processing

""", unsafe_allow_html=True)

# Access dataset from session state
df = st.session_state.get("dataset")

if df is not None:
    # 'ProductID' is excluded from everything on this page; errors='ignore'
    # makes this a no-op when the column does not exist.
    df = df.drop(columns=['ProductID'], errors='ignore')

    _render_overview(df)

    st.markdown("### Import Necessary Libraries:")
    st.code(IMPORTS_SNIPPET, language="python")

    _render_numeric_section(df)
    _render_categorical_section(df)
    _render_cleaned_section(df)
else:
    st.warning("No dataset found. Please upload a dataset on the Home page.")

# Define the URL of the background image (use your own image URL)
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/JUxgk4Z7jvSNM7OnB4nOw.jpeg"

# Apply custom CSS for the background image and overlay
# NOTE(review): the markup inside this f-string appears to be empty, so
# background_image_url is currently not referenced - confirm whether a
# <style> block is supposed to live here.
st.markdown(
    f""" """,
    unsafe_allow_html=True
)

if st.button("Previous ⏮️"):
    st.switch_page("pages/1_Data_Card_and_Data_collection.py")

if st.button("Next ⏭️"):
    st.switch_page("pages/3_EDA_and_Feature_Engineering.py")