enacimie committed on
Commit
67cadf7
·
verified ·
1 Parent(s): 3877afb

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +264 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,266 @@
1
# Previous version of the app (removed by this commit): an interactive
# Altair spiral demo driven by two sliders.
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st


"""
# Welcome to Streamlit!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

# Slider-driven spiral parameters: how many points to draw and how many
# full turns the spiral winds.
num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)

# Parameter t evenly spaced in [0, 1]; radius grows linearly with t,
# so the curve is an Archimedean spiral.
indices = np.linspace(0, 1, num_points)
theta = 2 * np.pi * num_turns * indices
radius = indices

# Polar -> Cartesian coordinates.
x = radius * np.cos(theta)
y = radius * np.sin(theta)

df = pd.DataFrame({
    "x": x,
    "y": y,
    "idx": indices,
    "rand": np.random.randn(num_points),
})

# Scatter plot: color encodes progress along the curve, point size is
# taken from the random column.
st.altair_chart(alt.Chart(df, height=700, width=700)
    .mark_point(filled=True)
    .encode(
        x=alt.X("x", axis=None),
        y=alt.Y("y", axis=None),
        color=alt.Color("idx", legend=None, scale=alt.Scale()),
        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
    ))
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.impute import SimpleImputer, KNNImputer
5
+ from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
6
+ from sklearn.compose import ColumnTransformer
7
+ import plotly.express as px
8
+ import io
9
+
10
# App metadata, shown in the header and the footer.
AUTHOR = "Eduardo Nacimiento García"
EMAIL = "enacimie@ull.edu.es"
LICENSE = "Apache 2.0"

# Page configuration — must be the first Streamlit call in the script.
_PAGE_SETTINGS = dict(
    page_title="SimpleClean",
    page_icon="🧹",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.set_page_config(**_PAGE_SETTINGS)

# Header: title, author line, and a short description of the tool.
st.title("🧹 SimpleClean")
st.markdown(f"**Author:** {AUTHOR} | **Email:** {EMAIL} | **License:** {LICENSE}")
st.write("""
Upload a CSV or use the demo dataset to interactively clean your data: handle missing values, encode categories, scale features, and more.
""")
29
+
30
# === GENERATE DEMO DATASET ===
@st.cache_data
def create_demo_data():
    """Build a small synthetic customer dataset with deliberate gaps.

    Returns a 300-row DataFrame mixing numeric ("Age", "Income",
    "Satisfaction"), categorical ("City", "Gender") and binary
    ("Has_Children") columns, with missing values injected so the
    imputation tools have something to work on. Seeded for
    reproducibility across reruns.
    """
    np.random.seed(42)
    n = 300
    data = {
        "Age": np.random.normal(35, 12, n).astype(int),
        "Income": np.random.normal(45000, 15000, n),
        "City": np.random.choice(["Madrid", "Barcelona", "Valencia", "Seville", None], n, p=[0.25, 0.25, 0.25, 0.2, 0.05]),
        "Gender": np.random.choice(["M", "F", None], n, p=[0.45, 0.45, 0.10]),
        "Has_Children": np.random.choice([0, 1, None], n, p=[0.4, 0.4, 0.2]),
        "Satisfaction": np.random.randint(1, 11, n)
    }
    df = pd.DataFrame(data)
    # Introduce some nulls. replace=False guarantees the stated number of
    # distinct missing cells; sampling with replacement (the previous
    # behavior) could pick duplicate indices and silently inject fewer NaNs.
    df.loc[np.random.choice(df.index, 15, replace=False), "Age"] = np.nan
    df.loc[np.random.choice(df.index, 20, replace=False), "Income"] = np.nan
    df.loc[np.random.choice(df.index, 10, replace=False), "Satisfaction"] = np.nan
    return df
49
+
50
# === LOAD DATA ===
# Two entry points: a demo-dataset button and a CSV upload. Either one
# stores an untouched original plus a working copy in session_state.
if st.button("🧪 Load Demo Dataset"):
    demo = create_demo_data()
    st.session_state['df_original'] = demo
    st.session_state['df_clean'] = demo.copy()
    st.success("✅ Demo dataset loaded!")

uploaded_file = st.file_uploader("📂 Upload your CSV file", type=["csv"])

if uploaded_file:
    loaded = pd.read_csv(uploaded_file)
    st.session_state['df_original'] = loaded
    st.session_state['df_clean'] = loaded.copy()
    st.success("✅ File uploaded successfully.")

# Nothing loaded yet: explain how to start and halt this script run.
if 'df_clean' not in st.session_state:
    st.info("👆 Upload a CSV or click 'Load Demo Dataset' to begin.")
    st.stop()

df_original = st.session_state['df_original']
df_clean = st.session_state['df_clean']
70
+
71
# Side-by-side previews of the original upload and the working copy.
st.subheader("🔍 Data Preview")
for preview_label, preview_frame in (
    ("Original Data (first 10 rows)", df_original),
    ("Current Cleaned Data (first 10 rows)", df_clean),
):
    with st.expander(preview_label):
        st.dataframe(preview_frame.head(10))

# === DATA QUALITY REPORT ===
st.header("📊 Data Quality Report")

# Headline metrics, one per column slot.
headline_metrics = (
    ("Rows", df_clean.shape[0]),
    ("Columns", df_clean.shape[1]),
    ("Missing Cells", df_clean.isnull().sum().sum()),
    ("Duplicate Rows", df_clean.duplicated().sum()),
)
for slot, (metric_label, metric_value) in zip(st.columns(4), headline_metrics):
    slot.metric(metric_label, metric_value)

# Per-column missing-value counts as a bar chart.
st.subheader("🕳️ Missing Values by Column")
missing_data = df_clean.isnull().sum()
fig_missing = px.bar(
    missing_data,
    title="Missing Values per Column",
    labels={'value': 'Count', 'index': 'Column'},
    color=missing_data,
)
st.plotly_chart(fig_missing, use_container_width=True)

# Column/dtype table.
st.subheader("🔤 Column Data Types")
dtypes_df = pd.DataFrame(df_clean.dtypes, columns=['Data Type']).reset_index()
dtypes_df.columns = ['Column', 'Data Type']
st.dataframe(dtypes_df)
104
+
105
# === CLEANING OPTIONS ===
st.header("🧼 Cleaning Actions")

# One tab per cleaning action; the later sections attach to these handles.
tab1, tab2, tab3, tab4 = st.tabs([
    "🧹 Remove Duplicates",
    "🩹 Handle Missing Values",
    "🔠 Encode Categorical Variables",
    "📏 Scale Numeric Variables"
])

# Tab 1: drop exact duplicate rows from the working copy.
with tab1:
    st.subheader("Remove Duplicate Rows")
    if st.button("Remove All Duplicates"):
        rows_before = len(df_clean)
        df_clean = df_clean.drop_duplicates().reset_index(drop=True)
        st.session_state['df_clean'] = df_clean
        removed = rows_before - len(df_clean)
        st.success(f"✅ Removed {removed} duplicate rows.")
124
+
125
# Tab 2: Handle Missing Values
# Widget flow: pick a column with NaNs, pick a dtype-appropriate strategy
# (and a constant when relevant), then apply on button press. The widgets
# below are order-sensitive: the button label embeds col_to_impute, so it
# can only exist inside the else-branch where that selectbox ran.
with tab2:
    st.subheader("Impute Missing Values")

    # Select column with missing values
    cols_with_missing = df_clean.columns[df_clean.isnull().any()].tolist()
    if not cols_with_missing:
        st.success("✅ No missing values to impute.")
    else:
        col_to_impute = st.selectbox("Select column to impute:", cols_with_missing)

        # Detect column type; object/category columns get the categorical
        # strategies, everything else is treated as numeric.
        col_dtype = df_clean[col_to_impute].dtype
        if col_dtype in ['object', 'category']:
            strategy = st.selectbox(
                f"Imputation strategy for {col_to_impute} (categorical):",
                ["Most Frequent", "Constant"]
            )
            if strategy == "Constant":
                fill_value = st.text_input("Constant value:", value="Unknown")
            else:
                fill_value = None
        else:
            strategy = st.selectbox(
                f"Imputation strategy for {col_to_impute} (numeric):",
                ["Mean", "Median", "Most Frequent", "Constant", "KNN Imputer"]
            )
            if strategy == "Constant":
                fill_value = st.number_input("Constant value:", value=0.0)
            else:
                fill_value = None

        if st.button(f"Apply Imputation to '{col_to_impute}'"):
            try:
                if strategy == "Mean":
                    imputer = SimpleImputer(strategy='mean')
                elif strategy == "Median":
                    imputer = SimpleImputer(strategy='median')
                elif strategy == "Most Frequent":
                    imputer = SimpleImputer(strategy='most_frequent')
                elif strategy == "Constant":
                    imputer = SimpleImputer(strategy='constant', fill_value=fill_value)
                elif strategy == "KNN Imputer" and col_dtype in [np.number, 'float64', 'int64']:
                    # Only for numeric
                    # NOTE(review): `col_dtype == np.number` is generally False
                    # for concrete dtypes; this guard effectively matches via
                    # the 'float64'/'int64' string comparisons — confirm it
                    # covers all numeric dtypes you expect (e.g. int32).
                    imputer = KNNImputer(n_neighbors=5)
                else:
                    st.error("Invalid strategy for this column type.")
                    st.stop()

                # Apply imputation
                if strategy == "KNN Imputer":
                    # Only apply to numeric columns for KNN — note this fits
                    # and rewrites EVERY numeric column, not just the one
                    # selected above.
                    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
                    df_clean[numeric_cols] = imputer.fit_transform(df_clean[numeric_cols])
                else:
                    # SimpleImputer expects 2-D input; ravel() flattens the
                    # single-column result back to a Series-shaped array.
                    df_clean[col_to_impute] = imputer.fit_transform(df_clean[[col_to_impute]]).ravel()

                st.session_state['df_clean'] = df_clean
                st.success(f"✅ Missing values in '{col_to_impute}' imputed using '{strategy}'.")
            except Exception as e:
                st.error(f"❌ Error during imputation: {e}")
186
+
187
# Tab 3: Encode Categorical Variables
with tab3:
    st.subheader("Encode Categorical Variables")

    categorical_cols = df_clean.select_dtypes(include=['object', 'category']).columns.tolist()
    if not categorical_cols:
        st.info("ℹ️ No categorical columns to encode.")
    else:
        col_to_encode = st.selectbox("Select categorical column to encode:", categorical_cols)
        encoding_method = st.radio(
            "Encoding method:",
            ["Label Encoding", "One-Hot Encoding"]
        )

        if st.button(f"Apply {encoding_method} to '{col_to_encode}'"):
            try:
                if encoding_method == "Label Encoding":
                    # Map each distinct value (stringified, so NaN becomes
                    # the literal 'nan' category) to an integer code.
                    df_clean[col_to_encode] = LabelEncoder().fit_transform(df_clean[col_to_encode].astype(str))
                    st.session_state['df_clean'] = df_clean
                    st.success(f"✅ '{col_to_encode}' label encoded.")
                else:  # One-Hot Encoding
                    # Replace the column with one indicator column per category.
                    df_encoded = pd.get_dummies(df_clean[col_to_encode], prefix=col_to_encode)
                    df_clean = pd.concat([df_clean.drop(columns=[col_to_encode]), df_encoded], axis=1)
                    st.session_state['df_clean'] = df_clean
                    st.success(f"✅ '{col_to_encode}' one-hot encoded. {df_encoded.shape[1]} new columns added.")
            except Exception as e:
                st.error(f"❌ Error during encoding: {e}")
216
+
217
# Tab 4: Scale Numeric Variables
with tab4:
    st.subheader("Scale Numeric Variables")

    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        st.info("ℹ️ No numeric columns to scale.")
    else:
        # Preselect the first two numeric columns when at least two exist.
        preselected = numeric_cols[:2] if len(numeric_cols) >= 2 else numeric_cols
        cols_to_scale = st.multiselect("Select numeric columns to scale:", numeric_cols, default=preselected)
        scaling_method = st.radio("Scaling method:", ["StandardScaler (Z-score)", "MinMaxScaler (0-1)"])

        if st.button("Apply Scaling"):
            try:
                # Z-score standardization or 0-1 min-max rescaling.
                if scaling_method == "StandardScaler (Z-score)":
                    scaler = StandardScaler()
                else:
                    scaler = MinMaxScaler()
                df_clean[cols_to_scale] = scaler.fit_transform(df_clean[cols_to_scale])
                st.session_state['df_clean'] = df_clean
                st.success(f"✅ Columns {cols_to_scale} scaled using {scaling_method}.")
            except Exception as e:
                st.error(f"❌ Error during scaling: {e}")
240
+
241
# === DOWNLOAD CLEANED DATA ===
st.header("📥 Download Cleaned Data")

df_clean_final = st.session_state['df_clean']

# Final look at the data before exporting it.
with st.expander("Final Cleaned Data Preview"):
    st.dataframe(df_clean_final.head(10))

# Offer the working copy as a UTF-8 encoded CSV file.
csv_payload = df_clean_final.to_csv(index=False).encode('utf-8')
st.download_button(
    label="💾 Download Cleaned CSV",
    data=csv_payload,
    file_name="cleaned_data.csv",
    mime="text/csv",
)

# Discard every cleaning action and restore the untouched original.
if st.button("🔄 Reset to Original Data"):
    st.session_state['df_clean'] = st.session_state['df_original'].copy()
    st.success("✅ Data reset to original state.")

# Footer
st.markdown("---")
st.caption(f"© {AUTHOR} | License {LICENSE} | Contact: {EMAIL}")