# Spaces: Haseeb-001 / Smart_Data_Cleaner
# Runtime error
# File size: 5,920 Bytes

import pandas as pd
import numpy as np
import streamlit as st
import os
import matplotlib.pyplot as plt
import seaborn as sns

# tabula is an optional dependency: when it cannot be imported, ``read_pdf``
# is set to None and PDF uploads are reported as unsupported by file_upload().
# Only ImportError is caught so that unrelated interruptions
# (KeyboardInterrupt, SystemExit, ...) are not silently swallowed.
try:
    import tabula
    from tabula import read_pdf
except ImportError:
    read_pdf = None

# ----------- File Upload Handler ----------- #
def file_upload(file):
    """Read an uploaded file into a DataFrame, choosing the reader by extension.

    Args:
        file: File-like object exposing a ``name`` attribute; the extension of
            that name (case-insensitive) selects the reader.

    Returns:
        The loaded DataFrame, or ``None`` when the extension is unsupported,
        PDF support is unavailable, or the reader raised. In the ``None``
        cases an error message is shown through Streamlit.
    """
    _, suffix = os.path.splitext(file.name)
    suffix = suffix.lower()
    try:
        if suffix == '.csv':
            return pd.read_csv(file)
        if suffix in ('.xls', '.xlsx'):
            return pd.read_excel(file)
        if suffix == '.json':
            return pd.read_json(file)
        if suffix == '.pdf' and read_pdf:
            # read_pdf is indexed here, so its result is treated as a
            # sequence of tables; only the first one is used.
            tables = read_pdf(file, pages='all', multiple_tables=False)
            return tables[0]
        st.error("❌ Unsupported file type or missing dependencies for PDF.")
        return None
    except Exception as err:
        # Any reader failure is reported in the UI instead of crashing the app.
        st.error(f"⚠️ Error loading file: {err}")
        return None

# ----------- Cleaning Functions ----------- #
def remove_empty_rows(df):
    """Display per-column null counts, then drop rows containing nulls.

    Despite the name, a row is dropped as soon as it has at least one null
    value (the default ``dropna`` behaviour), not only when it is entirely
    empty.

    Args:
        df: DataFrame to clean; it is not modified.

    Returns:
        A new DataFrame without the rows that contained null values.
    """
    st.info("🔍 Null values before cleaning:")
    st.write(df.isna().sum())
    rows_without_nulls = df.dropna(axis=0, how='any')
    st.success("✅ Null values removed.")
    return rows_without_nulls

def replace_nulls(df, value):
    """Display per-column null counts, then fill every null with *value*.

    Args:
        df: DataFrame to process; it is not modified.
        value: Replacement handed to ``DataFrame.fillna``.

    Returns:
        A new DataFrame in which null entries are replaced by *value*.
    """
    st.info("🔍 Null values before replacement:")
    st.write(df.isna().sum())
    filled = df.fillna(value=value)
    st.success("✅ Null values replaced.")
    return filled

def remove_noise(df):
    """Strip a fixed set of common filler words from every string cell.

    Words are compared case-insensitively after splitting on whitespace, so
    the remaining words are re-joined with single spaces. Non-string cells are
    returned unchanged.

    Args:
        df: DataFrame to clean; it is not modified.

    Returns:
        A new DataFrame with the noise words removed from its string cells.
    """
    noise_words = frozenset({'the', 'is', 'an', 'a', 'in', 'of', 'to'})

    def clean_text(val):
        if not isinstance(val, str):
            return val
        return ' '.join(word for word in val.split() if word.lower() not in noise_words)

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. The check is made on the class (not the instance) so a
    # column that happens to be called "map" cannot be mistaken for the method.
    if hasattr(pd.DataFrame, "map"):
        df_cleaned = df.map(clean_text)
    else:
        df_cleaned = df.applymap(clean_text)
    st.success("✅ Noise words removed.")
    return df_cleaned

def remove_duplicates(df):
    """Drop repeated rows, keeping the first occurrence of each.

    Args:
        df: DataFrame to clean; it is not modified.

    Returns:
        A new DataFrame without duplicate rows.
    """
    unique_rows = df.drop_duplicates(keep='first')
    st.success("✅ Duplicate rows removed.")
    return unique_rows

def convert_column_dtype(df, column, dtype):
    """Return *df* with *column* cast to *dtype*, reporting the outcome in the UI.

    The caller's DataFrame is never modified: the conversion is applied to a
    copy, matching the other cleaning helpers, which all return new frames.

    Args:
        df: Source DataFrame.
        column: Label of the column to convert.
        dtype: Target type accepted by ``Series.astype`` (e.g. ``"int"``).

    Returns:
        A new DataFrame with the converted column on success; the original
        *df*, unchanged, when the conversion fails (an error is displayed).
    """
    try:
        converted = df[column].astype(dtype)
        result = df.copy()
        result[column] = converted
    except Exception as e:
        # Invalid casts (e.g. text to int) are reported instead of raised.
        st.error(f"⚠️ Conversion error: {e}")
        return df
    st.success(f"✅ Converted '{column}' to {dtype}")
    return result

def detect_outliers(df, column):
    """Find rows whose *column* value lies outside the 1.5 * IQR fences.

    Args:
        df: DataFrame to inspect; it is not modified.
        column: Label of the column to test. It must be numeric.

    Returns:
        The rows of *df* whose value is below ``Q1 - 1.5 * IQR`` or above
        ``Q3 + 1.5 * IQR``. An empty DataFrame is returned (and a warning is
        displayed) when *column* is not a numeric column of *df*.
    """
    # 'number' selects every numeric dtype, rather than only the ones matched
    # by the 'float' / 'int' aliases.
    numeric_columns = df.select_dtypes(include='number').columns
    if column not in numeric_columns:
        st.warning("⚠️ Column must be numeric to detect outliers.")
        return pd.DataFrame()

    values = df[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outliers = df[(values < lower) | (values > upper)]
    st.write(f"🔍 Found {len(outliers)} outliers in column '{column}'")
    return outliers

def plot_distributions(df):
    """Render one histogram (with KDE curve) per float/int column of *df*.

    Null values are dropped from each column before plotting.

    Args:
        df: DataFrame whose numeric columns are plotted; it is not modified.
    """
    st.subheader("📊 Data Distributions")
    numeric_cols = df.select_dtypes(include=['float', 'int']).columns
    for col in numeric_cols:
        fig, ax = plt.subplots()
        try:
            sns.histplot(df[col].dropna(), kde=True, ax=ax)
            ax.set_title(f"Distribution of {col}")
            st.pyplot(fig)
        finally:
            # Figures made with plt.subplots stay registered in pyplot until
            # closed; without this they pile up on every Streamlit rerun.
            plt.close(fig)

def plot_missing_data(df):
    """Render a heatmap marking which cells of *df* are null.

    Args:
        df: DataFrame to visualise; it is not modified.
    """
    st.subheader("📉 Missing Data Heatmap")
    fig, ax = plt.subplots()
    try:
        # Draw on the axes created above instead of relying on pyplot's
        # implicit "current axes".
        sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
        st.pyplot(fig)
    finally:
        # Release the figure so it does not accumulate across reruns.
        plt.close(fig)

def main():
    """Run the Streamlit UI: upload a dataset, clean it, explore it, download it.

    Streamlit re-executes this function from the top on every widget
    interaction. The working DataFrame is therefore kept in
    ``st.session_state``; otherwise each rerun would reload the uploaded file
    and discard the result of any earlier cleaning step, so the downloaded CSV
    would always contain the uncleaned data.
    """
    st.set_page_config(page_title="🧹 Smart Dataset Cleaner", layout="wide")
    st.title("🧹 Smart Dataset Cleaner")
    st.caption("✨ Clean, analyze, and preprocess your dataset with ease")

    state = st.session_state
    state_keys = ("cleaner_source", "cleaner_original", "cleaner_df")

    uploaded_file = st.file_uploader("📂 Upload your dataset", type=["csv", "xlsx", "xls", "json", "pdf"])
    if not uploaded_file:
        # Forget the previous dataset so that uploading the same file again
        # starts from its original content.
        for key in state_keys:
            if key in state:
                del state[key]
        return

    # Load the file only when a different upload is seen; later reruns reuse
    # the stored working copy together with the cleaning already applied.
    source_id = (uploaded_file.name, getattr(uploaded_file, "size", None))
    if state.get("cleaner_source") != source_id or "cleaner_df" not in state:
        loaded = file_upload(uploaded_file)
        if loaded is None:
            st.warning("⚠️ Please upload a supported file to begin.")
            return
        state["cleaner_source"] = source_id
        state["cleaner_original"] = loaded
        # Separate copy so that cleaning never alters the original preview.
        state["cleaner_df"] = loaded.copy()
    df = state["cleaner_df"]

    st.subheader("📋 Original Dataset Preview")
    st.dataframe(state["cleaner_original"].head())

    st.markdown("## 🧰 Data Cleaning Tools")
    with st.expander("➕ Replace Null Values"):
        fill_value = st.text_input("Enter value to replace nulls with:")
        if st.button("Replace Nulls"):
            df = replace_nulls(df, fill_value)
            state["cleaner_df"] = df
            st.dataframe(df)

    if st.button("🧼 Remove Empty Rows"):
        df = remove_empty_rows(df)
        state["cleaner_df"] = df
        st.dataframe(df)

    if st.button("🧹 Remove Duplicate Rows"):
        df = remove_duplicates(df)
        state["cleaner_df"] = df
        st.dataframe(df)

    if st.button("📉 Remove Noise Words from Text"):
        df = remove_noise(df)
        state["cleaner_df"] = df
        st.dataframe(df)

    with st.expander("🔁 Convert Column DataType"):
        selected_col = st.selectbox("Select column", df.columns)
        dtype = st.selectbox("Select target type", ["int", "float", "str", "bool"])
        if st.button("Convert"):
            df = convert_column_dtype(df, selected_col, dtype)
            state["cleaner_df"] = df
            st.dataframe(df)

    st.markdown("## 📊 Data Visualizations")
    if st.checkbox("📈 Show Summary Stats"):
        st.write(df.describe(include='all'))

    if st.checkbox("📌 Plot Column Distributions"):
        plot_distributions(df)

    if st.checkbox("📍 Show Missing Data Heatmap"):
        plot_missing_data(df)

    st.markdown("## 🚨 Outlier Detection")
    outlier_col = st.selectbox("Select numeric column", df.select_dtypes(include=['float', 'int']).columns)
    if st.button("Detect Outliers"):
        outliers = detect_outliers(df, outlier_col)
        if not outliers.empty:
            st.write(outliers)

    st.markdown("## 💾 Download Cleaned Dataset")
    file_name = st.text_input("Filename:", "cleaned_dataset.csv")
    if st.button("Download CSV"):
        # df is the persisted working copy, so the export reflects every
        # cleaning step applied so far.
        st.download_button("📄 Download", df.to_csv(index=False), file_name, mime="text/csv")

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()