File size: 6,879 Bytes
419bb0b
 
9f93e54
 
bb9c90b
419bb0b
b8f183f
 
fcc29a0
b8f183f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6c24b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8f183f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a858e41
81da568
 
fcc29a0
a04898b
81da568
a858e41
 
 
 
 
46c9577
 
 
 
a858e41
fcc29a0
 
 
 
 
 
 
 
 
 
 
 
7723572
 
 
 
 
 
 
 
 
81da568
7723572
 
81da568
a858e41
fcc29a0
a858e41
7d15fee
7723572
e3ff19f
 
 
1428269
88f0a0a
1428269
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO

# Page heading: centered white <h1>, passed to Streamlit as raw HTML
st.markdown(
    "<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>",
    unsafe_allow_html=True,
)

# Look up the dataset in session state; .get() gives None when the
# "dataset" key has not been set, which the code below checks for.
df = st.session_state.get("dataset")

# Main page body: summarise the dataset, plot its numeric and categorical
# columns, then publish a de-duplicated copy for the next page and as a CSV
# download. Without a dataset only a warning is shown.
if df is not None:
    # Drop the identifier column (silently skipped when it does not exist).
    # `drop` returns a new frame, so the object held in session state under
    # "dataset" is not modified.
    df = df.drop(columns=['ProductID'], errors='ignore')

    st.subheader("Dataset Preview:")
    st.write(df.head())

    st.subheader("Info of the Dataset:")
    # df.info() writes its report to a buffer instead of returning it
    buffer = StringIO()
    df.info(buf=buffer)

    # Show the report as plain monospaced text. st.write would parse the
    # string as Markdown and lose the column alignment of the table.
    st.code(buffer.getvalue(), language=None)

    st.subheader("Dataset Description:")
    st.write(df.describe())

    st.subheader("Shape of the Dataset:")
    st.write(df.shape)

    # Display-only snippet: this code is shown to the reader, not executed here.
    st.markdown("### Import Necessary Libraries:")
    st.code("""
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    import plotly.express as px
    import warnings
    warnings.filterwarnings('ignore')
    
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss
    
    import optuna
    import imblearn
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.over_sampling import RandomOverSampler, SMOTE
    
    import pickle
    """, language="python")

    # Visualize Numeric Data (Histograms and Boxplots in subplots)
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_columns) > 0:
        num_plots = len(numeric_columns)
        rows = (num_plots + 1) // 2  # two plots per row, rounded up
        # Grow the figure with the number of rows so plots are not squashed
        grid_size = (12, 4 * rows)

        st.subheader("Histograms for Numeric Columns:")
        fig, axs = plt.subplots(rows, 2, figsize=grid_size)
        axs = axs.flatten()  # 1-D view so axes can be indexed by plot number

        # One palette per plot, cycling when there are more columns than palettes
        color_palettes_hist = ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1']
        for i, col in enumerate(numeric_columns):
            palette = sns.color_palette(color_palettes_hist[i % len(color_palettes_hist)])
            sns.histplot(df[col], bins=30, kde=True, color=palette[0], ax=axs[i])
            axs[i].set_title(f'Histogram of {col}')
        # An odd column count leaves one grid cell unused; hide its empty axes
        for spare_ax in axs[num_plots:]:
            spare_ax.set_visible(False)
        fig.tight_layout()
        st.pyplot(fig)
        # Close (not just clear) the figure so figures do not pile up in
        # pyplot's registry on every rerun of the script.
        plt.close(fig)

        st.subheader("Boxplots for Numeric Columns:")
        fig, axs = plt.subplots(rows, 2, figsize=grid_size)
        axs = axs.flatten()

        color_palettes_box = ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest']
        for i, col in enumerate(numeric_columns):
            palette = sns.color_palette(color_palettes_box[i % len(color_palettes_box)])
            # Each axes holds a single box, so colour it with the palette's
            # first entry; passing `palette` without `hue` is deprecated in
            # newer seaborn releases.
            sns.boxplot(x=df[col], ax=axs[i], color=palette[0])
            axs[i].set_title(f'Boxplot of {col}')
        for spare_ax in axs[num_plots:]:
            spare_ax.set_visible(False)
        fig.tight_layout()
        st.pyplot(fig)
        plt.close(fig)
    else:
        st.warning("No numeric columns available for visualization.")

    # Visualize Categorical Data
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0:
        st.subheader("Bar Plots for Categorical Columns:")
        selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)

        st.write(f"Value Counts for '{selected_cat_col}':")
        st.write(df[selected_cat_col].value_counts())

        # Draw on an explicit figure rather than pyplot's implicit global one
        fig, ax = plt.subplots(figsize=(12, 6))
        # NOTE(review): `palette` without `hue` triggers a deprecation warning
        # on recent seaborn; kept because the per-bar colours depend on it.
        sns.countplot(x=selected_cat_col, data=df, palette='coolwarm', ax=ax)
        ax.set_title(f'Bar Plot of {selected_cat_col}')
        st.pyplot(fig)
        plt.close(fig)
    else:
        st.warning("No categorical columns available for visualization.")

    st.subheader("Cleaned Dataset:")
    # "Cleaning" here means: ProductID dropped above, exact duplicate rows removed
    cleaned_data = df.drop_duplicates()
    st.write(cleaned_data)

    # Store cleaned data in session state for use in next page
    st.session_state.cleaned_data = cleaned_data

    # Convert cleaned data to CSV and provide a download button
    cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv"
    )

else:
    st.warning("No dataset found. Please upload a dataset on the Home page.")


# URL of the image tiled behind the app (replace with your own image URL)
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/JUxgk4Z7jvSNM7OnB4nOw.jpeg"

# Page-wide stylesheet: tiled fixed background, a dark semi-transparent
# overlay, a centered content container, and white markdown text.
# Doubled braces are literal CSS braces inside the f-string.
page_css = f"""
    <style>
        .stApp {{
            background-image: url("{background_image_url}");
            background-size: auto;  /* Ensures the image retains its original size */
            background-repeat: repeat;  /* Makes the image repeat to cover the entire background */
            background-position: top left;  /* Starts repeating from the top-left corner */
            background-attachment: fixed;  /* Keeps the background fixed as you scroll */
        }}
        
        /* Semi-transparent overlay */
        .stApp::before {{
            content: "";
            position: absolute;
            top: 0;
            left: 0;
            width: 100%;
            height: 100%;
            background: rgba(0, 0, 0, 0.4);  /* Adjust transparency here (0.4 for 40% transparency) */
            z-index: -1;
        }}
        
        /* Container to center elements and limit width */
        .content-container {{
            max-width: 70%;  /* Limit content width to 70% */
            margin: 0 auto;  /* Center the container horizontally */
            padding: 50px;  /* Add padding for spacing */
        }}
        
        /* Styling the markdown content */
        .stMarkdown {{
            color: white;  /* White text for better visibility */
            font-size: 100px;  /* Adjust font size for readability */
        }}
    </style>
    """

# Inject the stylesheet; unsafe_allow_html lets the <style> tag through as HTML
st.markdown(page_css, unsafe_allow_html=True)



# Navigation buttons, rendered in order; clicking one switches to its page.
nav_targets = (
    ("Previous ⏮️", "pages/1_Data_Card_and_Data_collection.py"),
    ("Next ⏭️", "pages/3_EDA_and_Feature_Engineering.py"),
)
for button_label, page_path in nav_targets:
    if st.button(button_label):
        st.switch_page(page_path)