|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import seaborn as sns |
|
|
import matplotlib.pyplot as plt |
|
|
from io import StringIO |
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Data cleaning and processing page.
#
# Reads the DataFrame stored under st.session_state["dataset"], shows summary
# views and plots for it, removes duplicate rows, and publishes the result as
# st.session_state.cleaned_data (also offered as a CSV download).
# ---------------------------------------------------------------------------
st.markdown(
    "<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>",
    unsafe_allow_html=True,
)


def _make_plot_grid(n_plots):
    """Return ``(fig, axes)`` for a two-column grid holding ``n_plots`` plots.

    ``axes`` is a flat array; any axis beyond ``n_plots`` is hidden so an odd
    number of plots does not leave an empty frame in the last row.
    """
    rows = (n_plots + 1) // 2
    # Scale the height with the row count so plots keep a usable aspect ratio
    # whether there is one row or many.
    fig, axes = plt.subplots(rows, 2, figsize=(12, 4 * rows))
    axes = axes.flatten()
    for spare_axis in axes[n_plots:]:
        spare_axis.set_visible(False)
    return fig, axes


def _render_and_close(fig):
    """Send ``fig`` to the page, then release it.

    pyplot keeps every figure it creates until it is explicitly closed, and a
    Streamlit script is re-executed on each interaction, so clearing alone
    would accumulate figures across reruns.
    """
    fig.tight_layout()
    st.pyplot(fig)
    plt.close(fig)


df = st.session_state.get("dataset")

if df is not None:
    # errors='ignore' makes this a no-op when the column is absent.
    df = df.drop(columns=['ProductID'], errors='ignore')

    st.subheader("Dataset Preview:")
    st.write(df.head())

    st.subheader("Info of the Dataset:")
    # DataFrame.info() writes to a stream instead of returning its text.
    buffer = StringIO()
    df.info(buf=buffer)
    # st.text preserves the fixed-width, line-oriented layout of the report;
    # st.write would interpret the string as Markdown and collapse it.
    st.text(buffer.getvalue())

    st.subheader("Dataset Description:")
    st.write(df.describe())

    st.subheader("Shape of the Dataset:")
    st.write(df.shape)

    # Displayed for the reader only; this snippet is not executed here.
    st.markdown("### Import Necessary Libraries:")
    st.code("""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss

import optuna
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

import pickle
""", language="python")

    # --- Numeric columns: one histogram and one boxplot per column ---------
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_columns) > 0:
        num_plots = len(numeric_columns)

        st.subheader("Histograms for Numeric Columns:")
        fig, axs = _make_plot_grid(num_plots)
        color_palettes_hist = ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1']
        for i, col in enumerate(numeric_columns):
            # Cycle through the palettes; only the first colour of each is used.
            palette = sns.color_palette(color_palettes_hist[i % len(color_palettes_hist)])
            sns.histplot(df[col], bins=30, kde=True, color=palette[0], ax=axs[i])
            axs[i].set_title(f'Histogram of {col}')
        _render_and_close(fig)

        st.subheader("Boxplots for Numeric Columns:")
        fig, axs = _make_plot_grid(num_plots)
        color_palettes_box = ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest']
        for i, col in enumerate(numeric_columns):
            palette = sns.color_palette(color_palettes_box[i % len(color_palettes_box)])
            # Each axis holds a single box, so pass one colour (as the
            # histogram loop does) rather than a palette without `hue`.
            sns.boxplot(x=df[col], ax=axs[i], color=palette[0])
            axs[i].set_title(f'Boxplot of {col}')
        _render_and_close(fig)
    else:
        st.warning("No numeric columns available for visualization.")

    # --- Categorical columns: counts for one user-selected column ----------
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0:
        st.subheader("Bar Plots for Categorical Columns:")
        selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)

        st.write(f"Value Counts for '{selected_cat_col}':")
        st.write(df[selected_cat_col].value_counts())

        # Draw on an explicit figure so st.pyplot receives a Figure object
        # instead of the pyplot module.
        fig, ax = plt.subplots(figsize=(12, 6))
        # NOTE(review): newer seaborn releases warn when `palette` is given
        # without `hue`; left as-is to keep the multi-coloured bars on any
        # installed version - confirm against the pinned seaborn.
        sns.countplot(x=selected_cat_col, data=df, palette='coolwarm', ax=ax)
        ax.set_title(f'Bar Plot of {selected_cat_col}')
        _render_and_close(fig)
    else:
        st.warning("No categorical columns available for visualization.")

    # --- Cleaning step: remove exact duplicate rows ------------------------
    st.subheader("Cleaned Dataset:")
    cleaned_data = df.drop_duplicates()
    st.write(cleaned_data)

    # Stored in session state so it outlives this script run.
    st.session_state.cleaned_data = cleaned_data

    cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv",
    )
else:
    st.warning("No dataset found. Please upload a dataset on the Home page.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Page-wide styling: tiled background image, dark overlay, white markdown text.
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/JUxgk4Z7jvSNM7OnB4nOw.jpeg"

# The stylesheet is kept as a plain template so CSS braces stay single; the
# placeholder is swapped for the image URL just before injection.
_URL_PLACEHOLDER = "__BACKGROUND_IMAGE_URL__"
_page_css_template = """
<style>
.stApp {
    background-image: url("__BACKGROUND_IMAGE_URL__");
    background-size: auto; /* Ensures the image retains its original size */
    background-repeat: repeat; /* Makes the image repeat to cover the entire background */
    background-position: top left; /* Starts repeating from the top-left corner */
    background-attachment: fixed; /* Keeps the background fixed as you scroll */
}

/* Semi-transparent overlay */
.stApp::before {
    content: "";
    position: absolute;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
    z-index: -1;
}

/* Container to center elements and limit width */
.content-container {
    max-width: 70%; /* Limit content width to 70% */
    margin: 0 auto; /* Center the container horizontally */
    padding: 50px; /* Add padding for spacing */
}

/* Styling the markdown content */
.stMarkdown {
    color: white; /* White text for better visibility */
    font-size: 100px; /* Adjust font size for readability */
}
</style>
"""

page_css = _page_css_template.replace(_URL_PLACEHOLDER, background_image_url)
st.markdown(page_css, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
# Page navigation: buttons are rendered in order, and a click switches to the
# paired page script.
for button_label, target_page in (
    ("Previous ⏮️", "pages/1_Data_Card_and_Data_collection.py"),
    ("Next ⏭️", "pages/3_EDA_and_Feature_Engineering.py"),
):
    if st.button(button_label):
        st.switch_page(target_page)
|
|
|