Electronics-Sales-Classification / pages /2_Data_CLeaning_and_Preprocessing.py
trohith89's picture
Update pages/2_Data_CLeaning_and_Preprocessing.py
88f0a0a verified
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO
# Page heading: centred, white <h1>, injected as raw HTML so the inline style applies.
page_heading_html = "<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>"
st.markdown(page_heading_html, unsafe_allow_html=True)
def _new_axes_grid(num_plots, ncols=2):
    """Create a figure with room for ``num_plots`` axes arranged in ``ncols`` columns.

    Returns ``(fig, axes)`` where ``axes`` is a flat array. Any axes beyond
    ``num_plots`` (the spare cell left by an odd count) are hidden so that no
    empty frame is drawn.
    """
    rows = (num_plots + ncols - 1) // ncols
    # Height scales with the number of rows so plots are not squeezed when
    # there are many columns; squeeze=False keeps the result 2-D for flatten().
    fig, axes = plt.subplots(rows, ncols, figsize=(12, 4 * rows), squeeze=False)
    axes = axes.flatten()
    for spare_ax in axes[num_plots:]:
        spare_ax.set_visible(False)
    return fig, axes


def _render_and_close(fig):
    """Show ``fig`` in the Streamlit page, then close it.

    Closing (rather than only clearing) releases the figure from pyplot's
    registry, so figures do not accumulate each time the script reruns.
    """
    fig.tight_layout()
    st.pyplot(fig)
    plt.close(fig)


# Access dataset from session state (None when nothing was stored under "dataset")
df = st.session_state.get("dataset")

if df is not None:
    # Exclude 'ProductID' from everything shown below; errors='ignore' tolerates its absence.
    df = df.drop(columns=['ProductID'], errors='ignore')

    st.subheader("Dataset Preview:")
    st.write(df.head())

    st.subheader("Info of the Dataset:")
    # df.info() prints instead of returning, so capture its output in a buffer.
    buffer = StringIO()
    df.info(buf=buffer)
    # st.text keeps the fixed-width layout; st.write would render the string as
    # Markdown, collapsing the column alignment and mangling "<class ...>".
    st.text(buffer.getvalue())

    st.subheader("Dataset Description:")
    st.write(df.describe())

    st.subheader("Shape of the Dataset:")
    st.write(df.shape)

    st.markdown("### Import Necessary Libraries:")
    st.code("""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss
import optuna
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
import pickle
""", language="python")

    # Visualize numeric data: one histogram grid and one boxplot grid.
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_columns) > 0:
        num_plots = len(numeric_columns)

        st.subheader("Histograms for Numeric Columns:")
        fig, axs = _new_axes_grid(num_plots)
        # Cycle through palettes so neighbouring plots get different colours.
        color_palettes_hist = ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1']
        for i, col in enumerate(numeric_columns):
            palette = sns.color_palette(color_palettes_hist[i % len(color_palettes_hist)])
            sns.histplot(df[col], bins=30, kde=True, color=palette[0], ax=axs[i])
            axs[i].set_title(f'Histogram of {col}')
        _render_and_close(fig)

        st.subheader("Boxplots for Numeric Columns:")
        fig, axs = _new_axes_grid(num_plots)
        color_palettes_box = ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest']
        for i, col in enumerate(numeric_columns):
            palette = sns.color_palette(color_palettes_box[i % len(color_palettes_box)])
            # Each plot draws a single box, so one colour is enough. Passing
            # `palette` without `hue` is deprecated in recent seaborn releases.
            sns.boxplot(x=df[col], ax=axs[i], color=palette[0])
            axs[i].set_title(f'Boxplot of {col}')
        _render_and_close(fig)
    else:
        st.warning("No numeric columns available for visualization.")

    # Visualize categorical data: value counts and a bar plot for one chosen column.
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0:
        st.subheader("Bar Plots for Categorical Columns:")
        selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)
        st.write(f"Value Counts for '{selected_cat_col}':")
        st.write(df[selected_cat_col].value_counts())

        # Use an explicit figure instead of pyplot's global state, and hand
        # st.pyplot that figure rather than the pyplot module.
        fig, ax = plt.subplots(figsize=(12, 6))
        # NOTE(review): `palette` without `hue` is also deprecated here in recent
        # seaborn; the replacement (hue=..., legend=False) depends on the
        # installed seaborn version, so the call is left unchanged - confirm.
        sns.countplot(x=selected_cat_col, data=df, palette='coolwarm', ax=ax)
        ax.set_title(f'Bar Plot of {selected_cat_col}')
        _render_and_close(fig)
    else:
        st.warning("No categorical columns available for visualization.")

    st.subheader("Cleaned Dataset:")
    cleaned_data = df.drop_duplicates()
    st.write(cleaned_data)

    # Store cleaned data in session state for use in next page
    st.session_state.cleaned_data = cleaned_data

    # Convert cleaned data to CSV and provide a download button
    cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv"
    )
else:
    st.warning("No dataset found. Please upload a dataset on the Home page.")
# Background image used for the whole app (swap in your own image URL).
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/JUxgk4Z7jvSNM7OnB4nOw.jpeg"

# Page-wide stylesheet: tiled fixed background, dark overlay, centred content
# container and white Markdown text. Doubled braces are literal CSS braces
# inside the f-string; only the image URL is interpolated.
page_css = f"""
<style>
.stApp {{
background-image: url("{background_image_url}");
background-size: auto; /* Ensures the image retains its original size */
background-repeat: repeat; /* Makes the image repeat to cover the entire background */
background-position: top left; /* Starts repeating from the top-left corner */
background-attachment: fixed; /* Keeps the background fixed as you scroll */
}}
/* Semi-transparent overlay */
.stApp::before {{
content: "";
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
z-index: -1;
}}
/* Container to center elements and limit width */
.content-container {{
max-width: 70%; /* Limit content width to 70% */
margin: 0 auto; /* Center the container horizontally */
padding: 50px; /* Add padding for spacing */
}}
/* Styling the markdown content */
.stMarkdown {{
color: white; /* White text for better visibility */
font-size: 100px; /* Adjust font size for readability */
}}
</style>
"""

# unsafe_allow_html is required for the <style> tag to be injected as markup.
st.markdown(page_css, unsafe_allow_html=True)
# Page navigation: each button, when clicked, switches to its target page.
# Buttons are rendered in this order, exactly one st.button call per entry.
navigation_targets = (
    ("Previous ⏮️", "pages/1_Data_Card_and_Data_collection.py"),
    ("Next ⏭️", "pages/3_EDA_and_Feature_Engineering.py"),
)
for button_label, target_page in navigation_targets:
    if st.button(button_label):
        st.switch_page(target_page)