Spaces:

Gowthamvemula
/

chrun

Sleeping

App Files Files Community

chrun / pages /2_Data_CLeaning_and_Preprocessing.py

Gowthamvemula

Update pages/2_Data_CLeaning_and_Preprocessing.py

1386d69 verified 12 months ago

raw

history blame contribute delete

7 kB

	import streamlit as st
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt
	from io import StringIO

	def _render_numeric_grid(data, columns, draw, title_prefix, palette_names):
	    """Render one subplot per column in a two-column grid and show it in Streamlit.

	    Args:
	        data: DataFrame holding the columns to plot.
	        columns: Labels of the columns to plot, one subplot each.
	        draw: Callable invoked as ``draw(series, ax, color)`` to draw one plot.
	        title_prefix: Text placed before " of <column>" in each subplot title.
	        palette_names: Seaborn palette names cycled through per column; only
	            the first colour of each palette is used.
	    """
	    n_rows = (len(columns) + 1) // 2  # two plots per row, rounded up
	    fig, axes = plt.subplots(n_rows, 2, figsize=(12, 12))
	    axes = axes.flatten()  # iterate the grid as a flat sequence

	    for idx, column in enumerate(columns):
	        # Cycle through the palettes so neighbouring plots get different colours.
	        palette = sns.color_palette(palette_names[idx % len(palette_names)])
	        draw(data[column], axes[idx], palette[0])
	        axes[idx].set_title(f'{title_prefix} of {column}')

	    # An odd number of columns leaves one empty cell in the grid; hide it.
	    for unused_ax in axes[len(columns):]:
	        unused_ax.set_visible(False)

	    fig.tight_layout()  # keep subplot titles from overlapping neighbouring rows
	    st.pyplot(fig)
	    # pyplot keeps a figure alive until it is closed; plt.clf() would only clear
	    # it, so figures would pile up on every Streamlit rerun.
	    plt.close(fig)


	# Page Title
	st.markdown("<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>", unsafe_allow_html=True)

	# Access dataset from session state
	df = st.session_state.get("dataset")

	if df is not None:
	    # Work on a copy: the renaming and column drops below must not alter the
	    # object stored under st.session_state["dataset"], otherwise every rerun
	    # of this page would preview an already-modified "raw" dataset.
	    # The cleaned result is published separately as st.session_state.cleaned_data.
	    df = df.copy()

	    st.subheader("Dataset Preview:")
	    st.write(df.head())

	    st.subheader("Info of the Dataset:")
	    # df.info() writes to a stream instead of returning text, so capture it.
	    buffer = StringIO()
	    df.info(buf=buffer)

	    # st.text shows the report as preformatted fixed-width text; st.write would
	    # treat it as Markdown and collapse the column alignment.
	    st.text(buffer.getvalue())

	    st.subheader("Dataset Description:")
	    st.write(df.describe())

	    st.subheader("Shape of the Dataset:")
	    st.write(df.shape)

	    # Normalise column labels to lower snake_case; str() guards against
	    # non-string labels (e.g. integer headers).
	    df.columns = [str(col).lower().replace(' ', '_') for col in df.columns]

	    # Drop these columns when present; errors='ignore' skips the missing ones.
	    df = df.drop(columns=['unnamed:_0', 'rownumber', 'customerid', 'surname'], errors='ignore')

	    # Keep only int64, float64 and object columns.
	    df = df.select_dtypes(include=['int64', 'float64', 'object'])

	    st.markdown("### Import Necessary Libraries:")
	    st.code("""
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns
	import plotly.express as px
	import warnings
	warnings.filterwarnings('ignore')

	from sklearn.linear_model import LogisticRegression
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.model_selection import train_test_split, cross_val_score
	from sklearn.preprocessing import StandardScaler, LabelEncoder
	from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss

	import optuna
	import imblearn
	from imblearn.under_sampling import RandomUnderSampler
	from imblearn.over_sampling import RandomOverSampler, SMOTE

	import pickle
	""", language="python")

	    # Visualize Numeric Data (histograms and boxplots, each in a 2-column grid)
	    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
	    if len(numeric_columns) > 0:
	        st.subheader("Histograms for Numeric Columns:")
	        _render_numeric_grid(
	            df,
	            numeric_columns,
	            lambda series, ax, color: sns.histplot(series, bins=30, kde=True, color=color, ax=ax),
	            'Histogram',
	            ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1'],
	        )

	        st.subheader("Boxplots for Numeric Columns:")
	        # A single box has no hue grouping, so pass one colour instead of a
	        # palette (palette without hue is deprecated in recent seaborn).
	        _render_numeric_grid(
	            df,
	            numeric_columns,
	            lambda series, ax, color: sns.boxplot(x=series, ax=ax, color=color),
	            'Boxplot',
	            ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest'],
	        )
	    else:
	        st.warning("No numeric columns available for visualization.")

	    # Visualize Categorical Data
	    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
	    if len(categorical_columns) > 0:
	        st.subheader("Bar Plots for Categorical Columns:")
	        selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)

	        st.write(f"Value Counts for '{selected_cat_col}':")
	        st.write(df[selected_cat_col].value_counts())

	        # Use an explicit Figure rather than pyplot's implicit global one.
	        cat_fig, cat_ax = plt.subplots(figsize=(12, 6))
	        # NOTE(review): seaborn >= 0.13 warns about `palette` without `hue` here;
	        # the replacement (hue=..., legend=False) needs that version - confirm
	        # which seaborn release this Space pins before switching.
	        sns.countplot(x=selected_cat_col, data=df, palette='coolwarm', ax=cat_ax)
	        cat_ax.set_title(f'Bar Plot of {selected_cat_col}')
	        st.pyplot(cat_fig)
	        plt.close(cat_fig)
	    else:
	        st.warning("No categorical columns available for visualization.")

	    st.subheader("Cleaned Dataset:")
	    df = df.drop_duplicates()
	    st.write(df)

	    # Store cleaned data in session state for use in next page
	    st.session_state.cleaned_data = df

	    # Convert cleaned data to CSV and provide a download button
	    cleaned_csv = df.to_csv(index=False).encode('utf-8')
	    st.download_button(
	        label="Download Cleaned Dataset",
	        data=cleaned_csv,
	        file_name="cleaned_dataset.csv",
	        mime="text/csv",
	    )

	else:
	    st.warning("No dataset found. Please upload a dataset on the Home page.")


	# Background image tiled behind the whole app (swap in your own image URL).
	background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67445925102349e867c92342/r2cTyHH3xpUiszvjBkcsL.png"

	# The stylesheet is kept as a plain string so CSS braces need no escaping;
	# the marker is the only part substituted, and it is replaced by the image URL.
	_BG_URL_MARKER = "__BACKGROUND_IMAGE_URL__"
	_page_css = """
	<style>
	.stApp {
	background-image: url("__BACKGROUND_IMAGE_URL__");
	background-size: auto; /* Ensures the image retains its original size */
	background-repeat: repeat; /* Makes the image repeat to cover the entire background */
	background-position: top left; /* Starts repeating from the top-left corner */
	background-attachment: fixed; /* Keeps the background fixed as you scroll */
	}

	/* Semi-transparent overlay */
	.stApp::before {
	content: "";
	position: absolute;
	top: 0;
	left: 0;
	width: 100%;
	height: 100%;
	background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
	z-index: -1;
	}

	/* Container to center elements and limit width */
	.content-container {
	max-width: 70%; /* Limit content width to 70% */
	margin: 0 auto; /* Center the container horizontally */
	padding: 50px; /* Add padding for spacing */
	}

	/* Styling the markdown content */
	.stMarkdown {
	color: white; /* White text for better visibility */
	font-size: 100px; /* Adjust font size for readability */
	}
	</style>
	""".replace(_BG_URL_MARKER, background_image_url)

	# Inject the stylesheet into the page; raw HTML must be explicitly allowed.
	st.markdown(_page_css, unsafe_allow_html=True)



	# Navigation buttons as (label, target page script) pairs, rendered in this order.
	_NAV_BUTTONS = (
	    ("Previous ⏮️", "pages/1_Data_Card_and_Data_collection.py"),
	    ("Next ⏭️", "pages/3_EDA_and_Feature_Engineering.py"),
	)

	for _nav_label, _nav_target in _NAV_BUTTONS:
	    # st.button is truthy on the run triggered by its click; switch pages then.
	    if st.button(_nav_label):
	        st.switch_page(_nav_target)