Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| from io import StringIO | |
# ---------------------------------------------------------------------------
# Data Cleaning and Processing page
#
# Reads the uploaded dataset from Streamlit session state, shows a summary of
# it, builds a cleaned copy (normalized column names, identifier columns
# removed, duplicates dropped), visualizes the cleaned columns, and stores the
# result in ``st.session_state.cleaned_data``.
# ---------------------------------------------------------------------------

# Identifier-like columns (spelled as they appear after name normalization)
# that are removed during cleaning. Missing ones are ignored.
_ID_COLUMNS = ['unnamed:_0', 'rownumber', 'customerid', 'surname']

# Dtypes kept in the cleaned dataset.
_KEPT_DTYPES = ['int64', 'float64', 'object']

# Palettes cycled through so that neighbouring plots get different colors.
_HIST_PALETTES = ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1']
_BOX_PALETTES = ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest']

# Snippet displayed under "Import Necessary Libraries". It is only shown to
# the user; this page never executes it.
_LIBRARY_SNIPPET = """
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss
import optuna
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
import pickle
"""


def _prepare_dataset(raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *raw* without modifying *raw* itself.

    Column names are lower-cased with spaces replaced by underscores, the
    identifier columns in ``_ID_COLUMNS`` are dropped when present, and only
    columns whose dtype is listed in ``_KEPT_DTYPES`` are kept.
    """
    # Work on a copy: *raw* is the object stored in session state, and
    # renaming/dropping in place would change it for every later rerun.
    cleaned = raw.copy()
    # str() guards against non-string labels (e.g. integer column names).
    cleaned.columns = [str(col).lower().replace(' ', '_') for col in cleaned.columns]
    cleaned = cleaned.drop(columns=_ID_COLUMNS, errors='ignore')
    return cleaned.select_dtypes(include=_KEPT_DTYPES)


def _draw_histogram(series, color, ax):
    """Draw a 30-bin histogram with a KDE curve for *series* on *ax*."""
    sns.histplot(series, bins=30, kde=True, color=color, ax=ax)


def _draw_boxplot(series, color, ax):
    """Draw a horizontal boxplot for *series* on *ax*."""
    # A single color is passed instead of `palette=`: seaborn deprecates
    # `palette` without `hue`, and one box only ever used the first color.
    sns.boxplot(x=series, color=color, ax=ax)


def _render_plot_grid(data, columns, draw_one, palette_names, title_prefix):
    """Render one plot per column in a two-column grid and show it in Streamlit.

    *draw_one* is called as ``draw_one(series, color, ax)`` for each column.
    Each plot is titled ``"<title_prefix> of <column>"``.
    """
    n_rows = (len(columns) + 1) // 2
    # Height grows with the number of rows so plots stay readable.
    fig, axes = plt.subplots(n_rows, 2, figsize=(12, 4 * n_rows), squeeze=False)
    axes = axes.flatten()
    for i, col in enumerate(columns):
        color = sns.color_palette(palette_names[i % len(palette_names)])[0]
        draw_one(data[col], color, axes[i])
        axes[i].set_title(f'{title_prefix} of {col}')
    # With an odd number of columns the last grid cell is unused; hide it.
    for unused_ax in axes[len(columns):]:
        unused_ax.set_visible(False)
    fig.tight_layout()
    st.pyplot(fig)
    # Close (not just clear) the figure so figures do not pile up on reruns.
    plt.close(fig)


def _show_dataset_summary(data: pd.DataFrame) -> None:
    """Show preview, ``info()``, ``describe()`` and shape of *data*."""
    st.subheader("Dataset Preview:")
    st.write(data.head())
    st.subheader("Info of the Dataset:")
    # DataFrame.info() prints to a stream, so capture it in a string buffer.
    info_buffer = StringIO()
    data.info(buf=info_buffer)
    # st.text keeps the fixed-width layout; st.write would treat the output
    # as Markdown and collapse its alignment.
    st.text(info_buffer.getvalue())
    st.subheader("Dataset Description:")
    st.write(data.describe())
    st.subheader("Shape of the Dataset:")
    st.write(data.shape)


def _show_numeric_plots(data: pd.DataFrame) -> None:
    """Show histogram and boxplot grids for the numeric columns of *data*."""
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
    if not numeric_columns:
        st.warning("No numeric columns available for visualization.")
        return
    st.subheader("Histograms for Numeric Columns:")
    _render_plot_grid(data, numeric_columns, _draw_histogram, _HIST_PALETTES, 'Histogram')
    st.subheader("Boxplots for Numeric Columns:")
    _render_plot_grid(data, numeric_columns, _draw_boxplot, _BOX_PALETTES, 'Boxplot')


def _show_categorical_plot(data: pd.DataFrame) -> None:
    """Let the user pick a categorical column and show its counts and bar plot."""
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
    if not categorical_columns:
        st.warning("No categorical columns available for visualization.")
        return
    st.subheader("Bar Plots for Categorical Columns:")
    selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)
    st.write(f"Value Counts for '{selected_cat_col}':")
    st.write(data[selected_cat_col].value_counts())
    fig, ax = plt.subplots(figsize=(12, 6))
    # Assign the column to `hue` as well so `palette` is applied without the
    # seaborn deprecation; dodge=False keeps one full-width bar per category.
    sns.countplot(
        x=selected_cat_col,
        hue=selected_cat_col,
        data=data,
        palette='coolwarm',
        dodge=False,
        ax=ax,
    )
    # The hue legend would only repeat the x-axis labels; drop it if drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set_title(f'Bar Plot of {selected_cat_col}')
    # Pass the Figure explicitly rather than the global pyplot module.
    st.pyplot(fig)
    plt.close(fig)


# Page Title
st.markdown("<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>", unsafe_allow_html=True)

# Access dataset from session state
df = st.session_state.get("dataset")

if df is not None:
    # Summary of the dataset exactly as it is stored in session state.
    _show_dataset_summary(df)

    # From here on `df` is an independent, cleaned copy.
    df = _prepare_dataset(df)

    st.markdown("### Import Necessary Libraries:")
    st.code(_LIBRARY_SNIPPET, language="python")

    # Visualize Numeric Data (Histograms and Boxplots in subplots)
    _show_numeric_plots(df)

    # Visualize Categorical Data
    _show_categorical_plot(df)

    st.subheader("Cleaned Dataset:")
    df = df.drop_duplicates()
    st.write(df)

    # Store cleaned data in session state for use in next page
    st.session_state.cleaned_data = df

    # Convert cleaned data to CSV and provide a download button
    cleaned_csv = df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv",
    )
else:
    st.warning("No dataset found. Please upload a dataset on the Home page.")
# Background styling for the page.
# `background_image_url` is the image tiled behind the app; replace it with
# your own image URL if needed.
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67445925102349e867c92342/r2cTyHH3xpUiszvjBkcsL.png"

# Build the stylesheet first, then inject it in a single call. Doubled braces
# are literal CSS braces inside the f-string; only the image URL is
# interpolated.
page_stylesheet = f"""
<style>
.stApp {{
background-image: url("{background_image_url}");
background-size: auto; /* Ensures the image retains its original size */
background-repeat: repeat; /* Makes the image repeat to cover the entire background */
background-position: top left; /* Starts repeating from the top-left corner */
background-attachment: fixed; /* Keeps the background fixed as you scroll */
}}
/* Semi-transparent overlay */
.stApp::before {{
content: "";
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
z-index: -1;
}}
/* Container to center elements and limit width */
.content-container {{
max-width: 70%; /* Limit content width to 70% */
margin: 0 auto; /* Center the container horizontally */
padding: 50px; /* Add padding for spacing */
}}
/* Styling the markdown content */
.stMarkdown {{
color: white; /* White text for better visibility */
font-size: 100px; /* Adjust font size for readability */
}}
</style>
"""

# unsafe_allow_html is required for the raw <style> tag to be emitted.
st.markdown(page_stylesheet, unsafe_allow_html=True)
# Page navigation: render the "previous" button, then the "next" button, and
# switch to the matching page as soon as one of them is clicked.
navigation_targets = (
    ("Previous ⏮️", "pages/1_Data_Card_and_Data_collection.py"),
    ("Next ⏭️", "pages/3_EDA_and_Feature_Engineering.py"),
)
for button_label, target_page in navigation_targets:
    if st.button(button_label):
        st.switch_page(target_page)