File size: 6,879 Bytes
419bb0b 9f93e54 bb9c90b 419bb0b b8f183f fcc29a0 b8f183f d6c24b0 b8f183f a858e41 81da568 fcc29a0 a04898b 81da568 a858e41 46c9577 a858e41 fcc29a0 7723572 81da568 7723572 81da568 a858e41 fcc29a0 a858e41 7d15fee 7723572 e3ff19f 1428269 88f0a0a 1428269 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO
# Page heading: centered white <h1>. Raw HTML is used because the heading
# carries inline styling, hence unsafe_allow_html=True.
page_title_html = "<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>"
st.markdown(page_title_html, unsafe_allow_html=True)
# ---------------------------------------------------------------------------
# Data cleaning / processing page body.
#
# Reads the DataFrame stored under st.session_state["dataset"], shows summary
# tables and plots for it, and stores the de-duplicated frame in
# st.session_state.cleaned_data.
# ---------------------------------------------------------------------------

# Code sample shown verbatim to the reader via st.code; it is displayed only,
# never executed by this page.
_LIBRARY_IMPORTS_SNIPPET = """
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss
import optuna
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
import pickle
"""

# Palette names cycled through so that neighbouring subplots differ in colour.
_HIST_PALETTES = ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1']
_BOX_PALETTES = ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest']


def _new_grid(n_plots, n_cols=2):
    """Create a subplot grid with room for *n_plots* axes.

    Returns ``(fig, axes)`` where *axes* is a flat array. Cells beyond
    *n_plots* are hidden so an odd plot count does not leave an empty frame.
    The figure height grows with the row count (4 inches per row) so plots
    are not squeezed when there are many columns.
    """
    n_rows = max(1, (n_plots + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(12, 4 * n_rows), squeeze=False
    )
    flat_axes = axes.ravel()
    for spare_ax in flat_axes[n_plots:]:
        spare_ax.set_visible(False)
    return fig, flat_axes


def _show_figure(fig):
    """Render *fig* in the page and then close it.

    The script is re-executed on every interaction; closing the figure
    (rather than only clearing it) keeps matplotlib from accumulating one
    open figure per rerun.
    """
    fig.tight_layout()
    st.pyplot(fig)
    plt.close(fig)


def _render_overview(df: pd.DataFrame) -> None:
    """Show the head, ``info()``, ``describe()`` and shape of *df*."""
    st.subheader("Dataset Preview:")
    st.write(df.head())

    st.subheader("Info of the Dataset:")
    # df.info() prints instead of returning, so capture it in a buffer.
    info_buffer = StringIO()
    df.info(buf=info_buffer)
    # st.text keeps the fixed-width layout; st.write would treat the string
    # as Markdown and break the column alignment of the info table.
    st.text(info_buffer.getvalue())

    st.subheader("Dataset Description:")
    st.write(df.describe())

    st.subheader("Shape of the Dataset:")
    st.write(df.shape)


def _render_numeric_plots(df: pd.DataFrame) -> None:
    """Draw a histogram grid and a boxplot grid for float64/int64 columns.

    Shows a warning instead when *df* has no such column.
    """
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_columns) == 0:
        st.warning("No numeric columns available for visualization.")
        return

    st.subheader("Histograms for Numeric Columns:")
    fig, axes = _new_grid(len(numeric_columns))
    for i, (ax, col) in enumerate(zip(axes, numeric_columns)):
        palette = sns.color_palette(_HIST_PALETTES[i % len(_HIST_PALETTES)])
        sns.histplot(df[col], bins=30, kde=True, color=palette[0], ax=ax)
        ax.set_title(f'Histogram of {col}')
    _show_figure(fig)

    st.subheader("Boxplots for Numeric Columns:")
    fig, axes = _new_grid(len(numeric_columns))
    for i, (ax, col) in enumerate(zip(axes, numeric_columns)):
        palette = sns.color_palette(_BOX_PALETTES[i % len(_BOX_PALETTES)])
        # Each subplot holds a single box, so one colour is enough; passing
        # `palette` without `hue` is deprecated in recent seaborn releases.
        sns.boxplot(x=df[col], ax=ax, color=palette[0])
        ax.set_title(f'Boxplot of {col}')
    _show_figure(fig)


def _render_categorical_plots(df: pd.DataFrame) -> None:
    """Show value counts and a count plot for a user-selected column.

    Candidate columns are those of dtype ``object`` or ``category``. Shows a
    warning instead when *df* has none.
    """
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) == 0:
        st.warning("No categorical columns available for visualization.")
        return

    st.subheader("Bar Plots for Categorical Columns:")
    selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)
    st.write(f"Value Counts for '{selected_cat_col}':")
    st.write(df[selected_cat_col].value_counts())

    # Draw on an explicit figure/axes instead of pyplot's implicit global one.
    fig, ax = plt.subplots(figsize=(12, 6))
    # NOTE(review): seaborn >= 0.13 warns about `palette` without `hue`; kept
    # as-is because the suggested replacement needs seaborn >= 0.13.
    sns.countplot(x=selected_cat_col, data=df, palette='coolwarm', ax=ax)
    ax.set_title(f'Bar Plot of {selected_cat_col}')
    _show_figure(fig)


def _render_cleaned_dataset(df: pd.DataFrame) -> None:
    """Drop duplicate rows, display the result, store it and offer a download.

    The de-duplicated frame is written to ``st.session_state.cleaned_data``.
    """
    st.subheader("Cleaned Dataset:")
    cleaned_data = df.drop_duplicates()
    st.write(cleaned_data)

    # Stored in session state for use in the next page.
    st.session_state.cleaned_data = cleaned_data

    cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv",
    )


# Access dataset from session state
df = st.session_state.get("dataset")

if df is None:
    st.warning("No dataset found. Please upload a dataset on the Home page.")
else:
    # Exclude 'ProductID' if it exists; errors='ignore' makes this a no-op
    # when the column is absent.
    df = df.drop(columns=['ProductID'], errors='ignore')

    _render_overview(df)

    st.markdown("### Import Necessary Libraries:")
    st.code(_LIBRARY_IMPORTS_SNIPPET, language="python")

    _render_numeric_plots(df)
    _render_categorical_plots(df)
    _render_cleaned_dataset(df)
# URL of the background image (replace with your own image URL if desired).
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/JUxgk4Z7jvSNM7OnB4nOw.jpeg"

# Custom CSS for the page: tiled background image, a semi-transparent dark
# overlay, a centered content container and white markdown text.
# Doubled braces are literal CSS braces inside the f-string; the image URL is
# the only interpolated value.
_page_css = f"""
<style>
.stApp {{
background-image: url("{background_image_url}");
background-size: auto; /* Ensures the image retains its original size */
background-repeat: repeat; /* Makes the image repeat to cover the entire background */
background-position: top left; /* Starts repeating from the top-left corner */
background-attachment: fixed; /* Keeps the background fixed as you scroll */
}}
/* Semi-transparent overlay */
.stApp::before {{
content: "";
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
z-index: -1;
}}
/* Container to center elements and limit width */
.content-container {{
max-width: 70%; /* Limit content width to 70% */
margin: 0 auto; /* Center the container horizontally */
padding: 50px; /* Add padding for spacing */
}}
/* Styling the markdown content */
.stMarkdown {{
color: white; /* White text for better visibility */
font-size: 100px; /* Adjust font size for readability */
}}
</style>
"""

# The <style> element only takes effect when raw HTML rendering is enabled.
st.markdown(_page_css, unsafe_allow_html=True)
# Page-to-page navigation: each entry is (button label, target page script).
# Buttons are rendered in this order; a click switches to the paired page.
_NAV_TARGETS = (
    ("Previous ⏮️", "pages/1_Data_Card_and_Data_collection.py"),
    ("Next ⏭️", "pages/3_EDA_and_Feature_Engineering.py"),
)
for _nav_label, _nav_page in _NAV_TARGETS:
    if st.button(_nav_label):
        st.switch_page(_nav_page)
|