Spaces:

trohith89
/

Electronics-Sales-Classification

Sleeping

App Files Files Community

Electronics-Sales-Classification / pages /2_Data_CLeaning_and_Preprocessing.py

trohith89

Update pages/2_Data_CLeaning_and_Preprocessing.py

88f0a0a verified 12 months ago

raw

history blame contribute delete

6.88 kB

	import streamlit as st
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt
	from io import StringIO

	# Heading for this page: centered, white, injected as raw HTML
	page_heading_html = "<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>"
	st.markdown(page_heading_html, unsafe_allow_html=True)

	# Look up the dataset stored in session state under "dataset";
	# the result is checked against None before it is used.
	df = st.session_state.get("dataset")

	# Exclude 'ProductID' from the dataset
	def _make_plot_grid(num_plots):
	    """Create a two-column grid of axes with room for ``num_plots`` charts.

	    Returns the figure and a flat array of its axes. Grid cells beyond
	    ``num_plots`` (the spare cell left by an odd count) are hidden so that
	    no empty frame is drawn.
	    """
	    rows = (num_plots + 1) // 2  # ceil(num_plots / 2) for a 2-column layout
	    # squeeze=False always yields a 2D array, so flatten() is safe for one row
	    fig, axs = plt.subplots(rows, 2, figsize=(12, 12), squeeze=False)
	    axs = axs.flatten()
	    for spare_ax in axs[num_plots:]:
	        spare_ax.set_visible(False)
	    return fig, axs


	def _render_figure(fig):
	    """Show ``fig`` on the Streamlit page, then close it.

	    pyplot keeps every figure it creates registered until it is closed;
	    ``plt.clf()`` only empties the current figure. Closing here stops
	    figures from accumulating across script runs.
	    """
	    fig.tight_layout()  # keep subplot titles from overlapping neighbours
	    st.pyplot(fig)
	    plt.close(fig)


	# Everything on this page needs a dataset; without one only a warning is shown.
	if df is not None:
	    # Exclude 'ProductID' if it exists (errors='ignore' makes a missing column a no-op)
	    df = df.drop(columns=['ProductID'], errors='ignore')

	    st.subheader("Dataset Preview:")
	    st.write(df.head())

	    st.subheader("Info of the Dataset:")
	    # df.info() writes its report to a buffer instead of returning it
	    buffer = StringIO()
	    df.info(buf=buffer)
	    # st.text keeps the fixed-width layout; st.write would render the report
	    # as Markdown and collapse its column alignment.
	    st.text(buffer.getvalue())

	    st.subheader("Dataset Description:")
	    st.write(df.describe())

	    st.subheader("Shape of the Dataset:")
	    st.write(df.shape)

	    # Displayed as a code listing only; these imports are not executed here.
	    st.markdown("### Import Necessary Libraries:")
	    st.code("""
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns
	import plotly.express as px
	import warnings
	warnings.filterwarnings('ignore')

	from sklearn.linear_model import LogisticRegression
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.model_selection import train_test_split, cross_val_score
	from sklearn.preprocessing import StandardScaler, LabelEncoder
	from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss

	import optuna
	import imblearn
	from imblearn.under_sampling import RandomUnderSampler
	from imblearn.over_sampling import RandomOverSampler, SMOTE

	import pickle
	""", language="python")

	    # Visualize numeric data: one histogram grid and one boxplot grid
	    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
	    if len(numeric_columns) > 0:
	        num_plots = len(numeric_columns)

	        st.subheader("Histograms for Numeric Columns:")
	        color_palettes_hist = ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1']
	        fig, axs = _make_plot_grid(num_plots)
	        for i, col in enumerate(numeric_columns):
	            # Cycle through the palettes so neighbouring plots differ in color
	            palette = sns.color_palette(color_palettes_hist[i % len(color_palettes_hist)])
	            sns.histplot(df[col], bins=30, kde=True, color=palette[0], ax=axs[i])
	            axs[i].set_title(f'Histogram of {col}')
	        _render_figure(fig)

	        st.subheader("Boxplots for Numeric Columns:")
	        color_palettes_box = ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest']
	        fig, axs = _make_plot_grid(num_plots)
	        for i, col in enumerate(numeric_columns):
	            palette = sns.color_palette(color_palettes_box[i % len(color_palettes_box)])
	            # One box needs one color; passing `palette=` without `hue` is
	            # deprecated in seaborn 0.13+.
	            sns.boxplot(x=df[col], ax=axs[i], color=palette[0])
	            axs[i].set_title(f'Boxplot of {col}')
	        _render_figure(fig)
	    else:
	        st.warning("No numeric columns available for visualization.")

	    # Visualize categorical data: value counts and a bar plot for one chosen column
	    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
	    if len(categorical_columns) > 0:
	        st.subheader("Bar Plots for Categorical Columns:")
	        selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)

	        st.write(f"Value Counts for '{selected_cat_col}':")
	        st.write(df[selected_cat_col].value_counts())

	        # Draw on an explicit figure and pass that to Streamlit instead of the
	        # pyplot module, so the figure can be closed afterwards.
	        fig, ax = plt.subplots(figsize=(12, 6))
	        # NOTE(review): `palette=` without `hue` also warns on seaborn 0.13+;
	        # left as-is because the replacement depends on the installed version.
	        sns.countplot(x=selected_cat_col, data=df, palette='coolwarm', ax=ax)
	        ax.set_title(f'Bar Plot of {selected_cat_col}')
	        _render_figure(fig)
	    else:
	        st.warning("No categorical columns available for visualization.")

	    st.subheader("Cleaned Dataset:")
	    cleaned_data = df.drop_duplicates()
	    st.write(cleaned_data)

	    # Store cleaned data in session state for use in the next page
	    st.session_state.cleaned_data = cleaned_data

	    # Convert cleaned data to CSV bytes and provide a download button
	    cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
	    st.download_button(
	        label="Download Cleaned Dataset",
	        data=cleaned_csv,
	        file_name="cleaned_dataset.csv",
	        mime="text/csv",
	    )
	else:
	    st.warning("No dataset found. Please upload a dataset on the Home page.")


	# URL of the image used as the app background (use your own image URL)
	background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/JUxgk4Z7jvSNM7OnB4nOw.jpeg"

	# The stylesheet is kept as a plain template so its braces need no escaping;
	# the image URL is substituted into the placeholder afterwards.
	page_css = """
	<style>
	.stApp {
	background-image: url("__BACKGROUND_IMAGE_URL__");
	background-size: auto; /* Ensures the image retains its original size */
	background-repeat: repeat; /* Makes the image repeat to cover the entire background */
	background-position: top left; /* Starts repeating from the top-left corner */
	background-attachment: fixed; /* Keeps the background fixed as you scroll */
	}

	/* Semi-transparent overlay */
	.stApp::before {
	content: "";
	position: absolute;
	top: 0;
	left: 0;
	width: 100%;
	height: 100%;
	background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
	z-index: -1;
	}

	/* Container to center elements and limit width */
	.content-container {
	max-width: 70%; /* Limit content width to 70% */
	margin: 0 auto; /* Center the container horizontally */
	padding: 50px; /* Add padding for spacing */
	}

	/* Styling the markdown content */
	.stMarkdown {
	color: white; /* White text for better visibility */
	font-size: 100px; /* Adjust font size for readability */
	}
	</style>
	""".replace("__BACKGROUND_IMAGE_URL__", background_image_url)

	# Inject the stylesheet into the page as raw HTML
	st.markdown(page_css, unsafe_allow_html=True)



	# Navigation buttons: each one, when clicked, switches to its paired page.
	page_links = (
	    ("Previous ⏮️", "pages/1_Data_Card_and_Data_collection.py"),
	    ("Next ⏭️", "pages/3_EDA_and_Feature_Engineering.py"),
	)
	for button_label, target_page in page_links:
	    if st.button(button_label):
	        st.switch_page(target_page)