File size: 10,142 Bytes
c79f7ff
67cadf7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c79f7ff
67cadf7
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
import plotly.express as px
import io

# Metadata
# Shown in the page header and footer; edit here to rebrand the app.
AUTHOR = "Eduardo Nacimiento García"
EMAIL = "enacimie@ull.edu.es"
LICENSE = "Apache 2.0"

# Page config
# Streamlit requires set_page_config to be the first st.* call in the script.
st.set_page_config(
    page_title="SimpleClean",
    page_icon="🧹",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Title
st.title("🧹 SimpleClean")
st.markdown(f"**Author:** {AUTHOR} | **Email:** {EMAIL} | **License:** {LICENSE}")
st.write("""
Upload a CSV or use the demo dataset to interactively clean your data: handle missing values, encode categories, scale features, and more.
""")

# === GENERATE DEMO DATASET ===
@st.cache_data
def create_demo_data():
    """Build a 300-row synthetic dataset with deliberate missing values.

    Returns a DataFrame with numeric columns (Age, Income, Satisfaction),
    categorical columns (City, Gender) and a binary column (Has_Children).
    Cached by Streamlit so every rerun gets the same frame.
    """
    np.random.seed(42)  # deterministic demo data across sessions
    n = 300
    data = {
        "Age": np.random.normal(35, 12, n).astype(int),
        "Income": np.random.normal(45000, 15000, n),
        "City": np.random.choice(["Madrid", "Barcelona", "Valencia", "Seville", None], n, p=[0.25, 0.25, 0.25, 0.2, 0.05]),
        "Gender": np.random.choice(["M", "F", None], n, p=[0.45, 0.45, 0.10]),
        "Has_Children": np.random.choice([0, 1, None], n, p=[0.4, 0.4, 0.2]),
        "Satisfaction": np.random.randint(1, 11, n)
    }
    df = pd.DataFrame(data)
    # Introduce nulls in the numeric columns. replace=False guarantees the
    # stated number of *distinct* rows is hit; the original sampled with
    # replacement, so duplicates could yield fewer NaNs than intended.
    df.loc[np.random.choice(df.index, 15, replace=False), "Age"] = np.nan
    df.loc[np.random.choice(df.index, 20, replace=False), "Income"] = np.nan
    df.loc[np.random.choice(df.index, 10, replace=False), "Satisfaction"] = np.nan
    return df

# === LOAD DATA ===
# Both entry points seed two session-state frames: 'df_original' (pristine
# reference used by the reset button) and 'df_clean' (the working copy).
if st.button("🧪 Load Demo Dataset"):
    st.session_state['df_original'] = create_demo_data()
    st.session_state['df_clean'] = st.session_state['df_original'].copy()
    st.success("✅ Demo dataset loaded!")

uploaded_file = st.file_uploader("📂 Upload your CSV file", type=["csv"])

if uploaded_file:
    # Only (re)load when a *different* file arrives. The original re-read the
    # CSV on every Streamlit rerun while the file stayed attached, which reset
    # 'df_clean' and silently discarded any cleaning already applied (every
    # widget interaction triggers a full script rerun).
    file_sig = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get('_last_upload_sig') != file_sig:
        df = pd.read_csv(uploaded_file)
        st.session_state['df_original'] = df
        st.session_state['df_clean'] = df.copy()
        st.session_state['_last_upload_sig'] = file_sig
        st.success("✅ File uploaded successfully.")

# Nothing to render until a dataset is loaded; halt the script here.
if 'df_clean' not in st.session_state:
    st.info("👆 Upload a CSV or click 'Load Demo Dataset' to begin.")
    st.stop()

# Local references to the two session-state frames for the rest of the run.
df_original = st.session_state['df_original']
df_clean = st.session_state['df_clean']

# Show data preview
st.subheader("🔍 Data Preview")
with st.expander("Original Data (first 10 rows)"):
    st.dataframe(df_original.head(10))

with st.expander("Current Cleaned Data (first 10 rows)"):
    st.dataframe(df_clean.head(10))

# === DATA QUALITY REPORT ===
# High-level health metrics for the current working frame.
st.header("📊 Data Quality Report")

col1, col2, col3, col4 = st.columns(4)
col1.metric("Rows", df_clean.shape[0])
col2.metric("Columns", df_clean.shape[1])
col3.metric("Missing Cells", df_clean.isnull().sum().sum())
col4.metric("Duplicate Rows", df_clean.duplicated().sum())

# Missing values by column
st.subheader("🕳️ Missing Values by Column")
missing_data = df_clean.isnull().sum()  # Series: column name -> NaN count
fig_missing = px.bar(
    missing_data,
    title="Missing Values per Column",
    labels={'value': 'Count', 'index': 'Column'},
    color=missing_data  # shade each bar by the same count it displays
)
st.plotly_chart(fig_missing, use_container_width=True)

# Data types
st.subheader("🔤 Column Data Types")
dtypes_df = pd.DataFrame(df_clean.dtypes, columns=['Data Type']).reset_index()
dtypes_df.columns = ['Column', 'Data Type']
st.dataframe(dtypes_df)

# === CLEANING OPTIONS ===
st.header("🧼 Cleaning Actions")

# Each cleaning action lives in its own tab and writes the mutated frame back
# to st.session_state['df_clean'] so the result survives Streamlit reruns.
tab1, tab2, tab3, tab4 = st.tabs([
    "🧹 Remove Duplicates",
    "🩹 Handle Missing Values",
    "🔠 Encode Categorical Variables",
    "📏 Scale Numeric Variables"
])

# Tab 1: Remove Duplicates
with tab1:
    st.subheader("Remove Duplicate Rows")
    if st.button("Remove All Duplicates"):
        original_count = len(df_clean)
        # reset_index keeps row labels contiguous after rows are dropped
        df_clean = df_clean.drop_duplicates().reset_index(drop=True)
        st.session_state['df_clean'] = df_clean
        removed = original_count - len(df_clean)
        st.success(f"✅ Removed {removed} duplicate rows.")

# Tab 2: Handle Missing Values
with tab2:
    st.subheader("Impute Missing Values")

    # Only columns that actually contain NaNs are offered.
    cols_with_missing = df_clean.columns[df_clean.isnull().any()].tolist()
    if not cols_with_missing:
        st.success("✅ No missing values to impute.")
    else:
        col_to_impute = st.selectbox("Select column to impute:", cols_with_missing)

        # Offer strategies appropriate to the column's dtype.
        col_dtype = df_clean[col_to_impute].dtype
        if col_dtype in ['object', 'category']:
            strategy = st.selectbox(
                f"Imputation strategy for {col_to_impute} (categorical):",
                ["Most Frequent", "Constant"]
            )
            if strategy == "Constant":
                fill_value = st.text_input("Constant value:", value="Unknown")
            else:
                fill_value = None
        else:
            strategy = st.selectbox(
                f"Imputation strategy for {col_to_impute} (numeric):",
                ["Mean", "Median", "Most Frequent", "Constant", "KNN Imputer"]
            )
            if strategy == "Constant":
                fill_value = st.number_input("Constant value:", value=0.0)
            else:
                fill_value = None

        if st.button(f"Apply Imputation to '{col_to_impute}'"):
            # Build the imputer BEFORE the try block. The original called
            # st.stop() inside the try; st.stop() works by raising an
            # exception, so the broad `except Exception` below swallowed it
            # and misreported the intentional halt as an imputation error.
            if strategy == "Mean":
                imputer = SimpleImputer(strategy='mean')
            elif strategy == "Median":
                imputer = SimpleImputer(strategy='median')
            elif strategy == "Most Frequent":
                imputer = SimpleImputer(strategy='most_frequent')
            elif strategy == "Constant":
                imputer = SimpleImputer(strategy='constant', fill_value=fill_value)
            elif strategy == "KNN Imputer" and pd.api.types.is_numeric_dtype(col_dtype):
                # KNN imputation is numeric-only. is_numeric_dtype covers all
                # numeric dtypes (int32, float32, ...); the original compared
                # against a hard-coded [np.number, 'float64', 'int64'] list.
                imputer = KNNImputer(n_neighbors=5)
            else:
                st.error("Invalid strategy for this column type.")
                st.stop()

            try:
                # Apply imputation
                if strategy == "KNN Imputer":
                    # KNN infers a value from neighbouring rows, so it runs
                    # across every numeric column, not just the selected one.
                    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
                    df_clean[numeric_cols] = imputer.fit_transform(df_clean[numeric_cols])
                else:
                    # fit_transform returns a 2-D array; ravel() flattens it
                    # back into a single column.
                    df_clean[col_to_impute] = imputer.fit_transform(df_clean[[col_to_impute]]).ravel()

                st.session_state['df_clean'] = df_clean
                st.success(f"✅ Missing values in '{col_to_impute}' imputed using '{strategy}'.")
            except Exception as e:
                st.error(f"❌ Error during imputation: {e}")

# Tab 3: Encode Categorical Variables
with tab3:
    st.subheader("Encode Categorical Variables")

    categorical_cols = df_clean.select_dtypes(include=['object', 'category']).columns.tolist()
    if not categorical_cols:
        st.info("ℹ️ No categorical columns to encode.")
    else:
        col_to_encode = st.selectbox("Select categorical column to encode:", categorical_cols)
        encoding_method = st.radio(
            "Encoding method:",
            ["Label Encoding", "One-Hot Encoding"]
        )

        if st.button(f"Apply {encoding_method} to '{col_to_encode}'"):
            try:
                if encoding_method == "Label Encoding":
                    le = LabelEncoder()
                    # astype(str) lets LabelEncoder cope with NaN, but note
                    # NaN then becomes the literal category "nan".
                    df_clean[col_to_encode] = le.fit_transform(df_clean[col_to_encode].astype(str))
                    st.session_state['df_clean'] = df_clean
                    st.success(f"✅ '{col_to_encode}' label encoded.")
                else:  # One-Hot Encoding
                    # The source column is replaced by one indicator column
                    # per category (NaN rows get all-zero indicators, since
                    # get_dummies defaults to dummy_na=False).
                    df_encoded = pd.get_dummies(df_clean[col_to_encode], prefix=col_to_encode)
                    df_clean = df_clean.drop(columns=[col_to_encode])
                    df_clean = pd.concat([df_clean, df_encoded], axis=1)
                    st.session_state['df_clean'] = df_clean
                    st.success(f"✅ '{col_to_encode}' one-hot encoded. {df_encoded.shape[1]} new columns added.")
            except Exception as e:
                st.error(f"❌ Error during encoding: {e}")

# Tab 4: Scale Numeric Variables
with tab4:
    st.subheader("Scale Numeric Variables")

    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        st.info("ℹ️ No numeric columns to scale.")
    else:
        # Pre-select the first two numeric columns as a sensible default.
        cols_to_scale = st.multiselect("Select numeric columns to scale:", numeric_cols, default=numeric_cols[:2] if len(numeric_cols) >= 2 else numeric_cols)
        scaling_method = st.radio("Scaling method:", ["StandardScaler (Z-score)", "MinMaxScaler (0-1)"])

        if st.button("Apply Scaling"):
            try:
                if scaling_method == "StandardScaler (Z-score)":
                    scaler = StandardScaler()  # zero mean, unit variance
                else:
                    scaler = MinMaxScaler()  # rescale into [0, 1]

                # Any sklearn error (e.g. an empty selection) is caught
                # below and surfaced to the user instead of crashing the app.
                df_clean[cols_to_scale] = scaler.fit_transform(df_clean[cols_to_scale])
                st.session_state['df_clean'] = df_clean
                st.success(f"✅ Columns {cols_to_scale} scaled using {scaling_method}.")
            except Exception as e:
                st.error(f"❌ Error during scaling: {e}")

# === DOWNLOAD CLEANED DATA ===
st.header("📥 Download Cleaned Data")

# Re-read from session state so any action applied earlier in this same
# rerun is reflected in the download.
df_clean_final = st.session_state['df_clean']

# Show final preview
with st.expander("Final Cleaned Data Preview"):
    st.dataframe(df_clean_final.head(10))

# Download button
# CSV bytes are built once per rerun; Streamlit serves them on click.
csv = df_clean_final.to_csv(index=False).encode('utf-8')
st.download_button(
    label="💾 Download Cleaned CSV",
    data=csv,
    file_name="cleaned_data.csv",
    mime="text/csv",
)

# Reset button
# Restores the working copy from the pristine original kept at load time.
if st.button("🔄 Reset to Original Data"):
    st.session_state['df_clean'] = st.session_state['df_original'].copy()
    st.success("✅ Data reset to original state.")

# Footer
st.markdown("---")
st.caption(f"© {AUTHOR} | License {LICENSE} | Contact: {EMAIL}")