Spaces:
Sleeping
Sleeping
File size: 6,999 Bytes
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO
# Data Cleaning and Processing page.
#
# Reads the DataFrame stored under st.session_state["dataset"], shows a
# preview and summary of it, normalises the column names, drops a fixed set of
# columns, plots numeric and categorical columns, removes duplicate rows and
# finally stores the result under st.session_state.cleaned_data and offers it
# as a CSV download.

# Import listing displayed verbatim to the user via st.code (never executed).
_IMPORTS_SNIPPET = """
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss
import optuna
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
import pickle
"""

# Columns removed from the working copy; names are matched after the
# lower-case / underscore normalisation, and missing ones are ignored.
_COLUMNS_TO_DROP = ['unnamed:_0', 'rownumber', 'customerid', 'surname']

# Palettes are cycled so neighbouring subplots get different colours.
_HIST_PALETTES = ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1']
_BOX_PALETTES = ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest']


def _normalise_column_name(name):
    """Return *name* lower-cased with spaces replaced by underscores."""
    # str() keeps non-string labels (e.g. integer headers) from raising.
    return str(name).lower().replace(' ', '_')


def _draw_histogram(series, ax, index):
    """Draw a 30-bin histogram with KDE of *series* on *ax*."""
    palette = sns.color_palette(_HIST_PALETTES[index % len(_HIST_PALETTES)])
    sns.histplot(series, bins=30, kde=True, color=palette[0], ax=ax)


def _draw_boxplot(series, ax, index):
    """Draw a horizontal boxplot of *series* on *ax*."""
    palette = sns.color_palette(_BOX_PALETTES[index % len(_BOX_PALETTES)])
    # A single box only ever uses the first palette colour; passing it as
    # `color` avoids seaborn's "palette without hue" deprecation.
    sns.boxplot(x=series, ax=ax, color=palette[0])


def _render_numeric_grid(frame, columns, draw_one, label):
    """Render one subplot per entry of *columns* in a two-column grid.

    *draw_one(series, ax, index)* draws a single column; *label* is the
    plot-kind prefix used in each subplot title.
    """
    grid_rows = (len(columns) + 1) // 2
    fig, axes = plt.subplots(grid_rows, 2, figsize=(12, 12))
    axes = axes.flatten()
    for index, column in enumerate(columns):
        draw_one(frame[column], axes[index], index)
        axes[index].set_title(f'{label} of {column}')
    # With an odd number of columns the last grid cell is unused; hide it
    # instead of showing an empty set of axes.
    for spare_ax in axes[len(columns):]:
        spare_ax.set_visible(False)
    st.pyplot(fig)
    # Close (not just clear) the figure so figures do not pile up across
    # Streamlit reruns.
    plt.close(fig)


# Page Title
st.markdown("<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>", unsafe_allow_html=True)

# Access dataset from session state
df = st.session_state.get("dataset")

if df is not None:
    st.subheader("Dataset Preview:")
    st.write(df.head())

    st.subheader("Info of the Dataset:")
    # df.info() prints instead of returning, so capture it in a buffer.
    buffer = StringIO()
    df.info(buf=buffer)
    # st.text keeps the fixed-width layout; st.write would treat the text as
    # Markdown and collapse the column alignment.
    st.text(buffer.getvalue())

    st.subheader("Dataset Description:")
    st.write(df.describe())

    st.subheader("Shape of the Dataset:")
    st.write(df.shape)

    # Work on new objects from here on: renaming/dropping in place would
    # modify the frame held in st.session_state["dataset"], so the preview
    # above would change on every rerun.
    df = df.rename(columns=_normalise_column_name)
    df = df.drop(columns=_COLUMNS_TO_DROP, errors='ignore')
    df = df.select_dtypes(include=['int64', 'float64', 'object'])

    st.markdown("### Import Necessary Libraries:")
    st.code(_IMPORTS_SNIPPET, language="python")

    # Visualize Numeric Data (histograms and boxplots in subplot grids)
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_columns) > 0:
        st.subheader("Histograms for Numeric Columns:")
        _render_numeric_grid(df, numeric_columns, _draw_histogram, 'Histogram')

        st.subheader("Boxplots for Numeric Columns:")
        _render_numeric_grid(df, numeric_columns, _draw_boxplot, 'Boxplot')
    else:
        st.warning("No numeric columns available for visualization.")

    # Visualize Categorical Data
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0:
        st.subheader("Bar Plots for Categorical Columns:")
        selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)
        st.write(f"Value Counts for '{selected_cat_col}':")
        st.write(df[selected_cat_col].value_counts())

        # Use an explicit Figure/Axes and hand the Figure to st.pyplot rather
        # than the pyplot module.
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.countplot(x=selected_cat_col, data=df, palette='coolwarm', ax=ax)
        ax.set_title(f'Bar Plot of {selected_cat_col}')
        st.pyplot(fig)
        plt.close(fig)
    else:
        st.warning("No categorical columns available for visualization.")

    st.subheader("Cleaned Dataset:")
    df = df.drop_duplicates()
    st.write(df)

    # Store cleaned data in session state for use in next page
    st.session_state.cleaned_data = df

    # Convert cleaned data to CSV and provide a download button
    cleaned_csv = df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv",
    )
else:
    st.warning("No dataset found. Please upload a dataset on the Home page.")
# Background image tiled behind the whole app (swap in your own image URL).
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67445925102349e867c92342/r2cTyHH3xpUiszvjBkcsL.png"

# Stylesheet injected into the page: the tiled background, a dark
# semi-transparent overlay, a centred content container and the markdown
# text styling. Doubled braces are literal CSS braces inside the f-string.
_page_css = f"""
<style>
.stApp {{
background-image: url("{background_image_url}");
background-size: auto; /* Ensures the image retains its original size */
background-repeat: repeat; /* Makes the image repeat to cover the entire background */
background-position: top left; /* Starts repeating from the top-left corner */
background-attachment: fixed; /* Keeps the background fixed as you scroll */
}}
/* Semi-transparent overlay */
.stApp::before {{
content: "";
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
z-index: -1;
}}
/* Container to center elements and limit width */
.content-container {{
max-width: 70%; /* Limit content width to 70% */
margin: 0 auto; /* Center the container horizontally */
padding: 50px; /* Add padding for spacing */
}}
/* Styling the markdown content */
.stMarkdown {{
color: white; /* White text for better visibility */
font-size: 100px; /* Adjust font size for readability */
}}
</style>
"""

# unsafe_allow_html is needed for Streamlit to emit the raw <style> tag.
st.markdown(_page_css, unsafe_allow_html=True)
# Navigation footer: each button, when clicked, switches to the neighbouring
# page of the app. Buttons are rendered in the order listed.
_NAV_TARGETS = (
    ("Previous ⏮️", "pages/1_Data_Card_and_Data_collection.py"),
    ("Next ⏭️", "pages/3_EDA_and_Feature_Engineering.py"),
)
for _nav_label, _nav_page in _NAV_TARGETS:
    if st.button(_nav_label):
        st.switch_page(_nav_page)
|