File size: 5,920 Bytes
5464c58
ac98ac9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5464c58
ac98ac9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import pandas as pd
import numpy as np
import streamlit as st
import os
import matplotlib.pyplot as plt
import seaborn as sns

# tabula is an optional dependency: when it cannot be imported, ``read_pdf``
# is set to None so callers can test its truthiness before handling PDFs.
# Only ImportError is caught so that unrelated failures (and interpreter
# signals such as KeyboardInterrupt) are not silently swallowed.
try:
    import tabula
    from tabula import read_pdf
except ImportError:
    read_pdf = None

# ----------- File Upload Handler ----------- #
def file_upload(file):
    """Load an uploaded file into a pandas DataFrame.

    The reader is picked from the extension of ``file.name`` (compared
    case-insensitively): ``.csv``, ``.xls``/``.xlsx``, ``.json``, or ``.pdf``
    when the optional ``read_pdf`` function is available.

    Args:
        file: File-like object exposing a ``name`` attribute.

    Returns:
        The loaded DataFrame, or None when the extension is unsupported, PDF
        support is missing, the PDF yields no tables, or the reader raises.
        Every failure is reported to the user through ``st.error``.
    """
    file_ext = os.path.splitext(file.name)[1].lower()
    try:
        if file_ext == '.csv':
            return pd.read_csv(file)
        if file_ext in ('.xls', '.xlsx'):
            return pd.read_excel(file)
        if file_ext == '.json':
            return pd.read_json(file)
        if file_ext == '.pdf' and read_pdf:
            tables = read_pdf(file, pages='all', multiple_tables=False)
            # An empty result would otherwise surface as the unhelpful
            # "list index out of range" message from the handler below.
            if len(tables) == 0:
                st.error("❌ No tables could be extracted from the PDF.")
                return None
            return tables[0]
    except Exception as e:
        # UI boundary: show any reader failure instead of crashing the app.
        st.error(f"⚠️ Error loading file: {e}")
        return None

    st.error("❌ Unsupported file type or missing dependencies for PDF.")
    return None

# ----------- Cleaning Functions ----------- #
def remove_empty_rows(df):
    """Return a copy of ``df`` without any row that contains a null.

    The per-column null counts of the input are displayed first, followed by
    a success message once the rows have been dropped.
    """
    st.info("πŸ” Null values before cleaning:")
    nulls_per_column = df.isnull().sum()
    st.write(nulls_per_column)

    without_nulls = df.dropna()
    st.success("βœ… Null values removed.")
    return without_nulls

def replace_nulls(df, value):
    """Return a copy of ``df`` in which every null is replaced by ``value``.

    The per-column null counts of the input are displayed first, followed by
    a success message once the replacement has been made.
    """
    st.info("πŸ” Null values before replacement:")
    nulls_per_column = df.isnull().sum()
    st.write(nulls_per_column)

    filled = df.fillna(value)
    st.success("βœ… Null values replaced.")
    return filled

def remove_noise(df):
    """Return a copy of ``df`` with common noise words stripped from strings.

    Each string cell is split on whitespace, words matching the noise list
    (compared case-insensitively) are dropped, and the remaining words are
    re-joined with single spaces. Non-string cells are returned unchanged.
    """
    noise_words = {'the', 'is', 'an', 'a', 'in', 'of', 'to'}

    def clean_text(val):
        if not isinstance(val, str):
            return val
        return ' '.join(word for word in val.split() if word.lower() not in noise_words)

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Check the class rather than the instance so a column
    # that happens to be named "map" cannot be mistaken for the method.
    if hasattr(pd.DataFrame, "map"):
        df_cleaned = df.map(clean_text)
    else:
        df_cleaned = df.applymap(clean_text)

    st.success("βœ… Noise words removed.")
    return df_cleaned

def remove_duplicates(df):
    """Return ``df`` without duplicated rows and show a success message."""
    unique_rows = df.drop_duplicates()
    st.success("βœ… Duplicate rows removed.")
    return unique_rows

def convert_column_dtype(df, column, dtype):
    """Cast ``df[column]`` to ``dtype`` in place and return ``df``.

    On success a confirmation is shown; if the cast raises, the error is
    shown instead and ``df`` is returned with the column left as it was.
    """
    try:
        converted = df[column].astype(dtype)
        df[column] = converted
        st.success(f"βœ… Converted '{column}' to {dtype}")
    except Exception as err:
        st.error(f"⚠️ Conversion error: {err}")
    return df

def detect_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th percentiles of the
    column and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to inspect.
        column: Name of the column to test.

    Returns:
        A DataFrame with the outlier rows, or an empty DataFrame (after a
        warning) when ``column`` is not a numeric column of ``df``.
    """
    # 'number' matches every numeric dtype; the narrower ['float', 'int']
    # selection rejects valid numeric columns such as small or unsigned ints.
    numeric_columns = df.select_dtypes(include='number').columns
    if column not in numeric_columns:
        st.warning("⚠️ Column must be numeric to detect outliers.")
        return pd.DataFrame()

    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    outliers = df[(values < lower) | (values > upper)]
    st.write(f"πŸ” Found {len(outliers)} outliers in column '{column}'")
    return outliers

def plot_distributions(df):
    """Render a histogram with a KDE overlay for each float/int column.

    Nulls are dropped from each column before plotting. One figure is created
    per column and closed again once it has been rendered.
    """
    st.subheader("πŸ“Š Data Distributions")
    numeric_cols = df.select_dtypes(include=['float', 'int']).columns
    for col in numeric_cols:
        fig, ax = plt.subplots()
        try:
            sns.histplot(df[col].dropna(), kde=True, ax=ax)
            ax.set_title(f"Distribution of {col}")
            st.pyplot(fig)
        finally:
            # pyplot keeps every figure alive until it is explicitly closed;
            # without this, figures pile up in memory on each call.
            plt.close(fig)

def plot_missing_data(df):
    """Render a heatmap of ``df.isnull()`` showing where values are missing."""
    st.subheader("πŸ“‰ Missing Data Heatmap")
    fig, ax = plt.subplots()
    try:
        # Draw on the axes created above instead of relying on pyplot's
        # implicit "current axes" state.
        sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)

def main():
    """Render the Streamlit UI: upload, clean, visualize and download a dataset.

    Streamlit re-executes the script on every widget interaction. To let
    cleaning steps accumulate (and to make the download contain the cleaned
    data rather than the freshly reloaded file), the loaded DataFrame and the
    working copy are kept in ``st.session_state`` and only reloaded when a
    different file is uploaded. A reset button restores the working copy from
    the originally loaded data.
    """
    st.set_page_config(page_title="🧹 Smart Dataset Cleaner", layout="wide")
    st.title("🧹 Smart Dataset Cleaner")
    st.caption("✨ Clean, analyze, and preprocess your dataset with ease")

    state_keys = ("dataset_key", "original_df", "working_df")

    uploaded_file = st.file_uploader("πŸ“‚ Upload your dataset", type=["csv", "xlsx", "xls", "json", "pdf"])
    if not uploaded_file:
        # Forget any previous dataset so a later upload starts from scratch.
        for key in state_keys:
            if key in st.session_state:
                del st.session_state[key]
        return

    # Identify the upload so the file is parsed once, not on every rerun.
    dataset_key = (
        uploaded_file.name,
        uploaded_file.size,
        getattr(uploaded_file, "file_id", None),
    )
    if "dataset_key" not in st.session_state or st.session_state["dataset_key"] != dataset_key:
        loaded = file_upload(uploaded_file)
        if loaded is None:
            st.warning("⚠️ Please upload a supported file to begin.")
            return
        st.session_state["dataset_key"] = dataset_key
        st.session_state["original_df"] = loaded
        # Separate copy: in-place edits must not alter the original preview.
        st.session_state["working_df"] = loaded.copy()

    df = st.session_state["working_df"]

    st.subheader("πŸ“‹ Original Dataset Preview")
    st.dataframe(st.session_state["original_df"].head())

    st.markdown("## 🧰 Data Cleaning Tools")
    with st.expander("βž• Replace Null Values"):
        fill_value = st.text_input("Enter value to replace nulls with:")
        if st.button("Replace Nulls"):
            df = replace_nulls(df, fill_value)
            st.dataframe(df)

    if st.button("🧼 Remove Empty Rows"):
        df = remove_empty_rows(df)
        st.dataframe(df)

    if st.button("🧹 Remove Duplicate Rows"):
        df = remove_duplicates(df)
        st.dataframe(df)

    if st.button("πŸ“‰ Remove Noise Words from Text"):
        df = remove_noise(df)
        st.dataframe(df)

    with st.expander("πŸ” Convert Column DataType"):
        selected_col = st.selectbox("Select column", df.columns)
        dtype = st.selectbox("Select target type", ["int", "float", "str", "bool"])
        if st.button("Convert"):
            df = convert_column_dtype(df, selected_col, dtype)
            st.dataframe(df)

    # Cleaning steps now persist, so offer a way back to the loaded data.
    if st.button("Reset to Original Dataset"):
        df = st.session_state["original_df"].copy()
        st.dataframe(df)

    # Persist the result so the next rerun continues from the cleaned data.
    st.session_state["working_df"] = df

    st.markdown("## πŸ“Š Data Visualizations")
    if st.checkbox("πŸ“ˆ Show Summary Stats"):
        st.write(df.describe(include='all'))

    if st.checkbox("πŸ“Œ Plot Column Distributions"):
        plot_distributions(df)

    if st.checkbox("πŸ“ Show Missing Data Heatmap"):
        plot_missing_data(df)

    st.markdown("## 🚨 Outlier Detection")
    outlier_col = st.selectbox("Select numeric column", df.select_dtypes(include=['float', 'int']).columns)
    if st.button("Detect Outliers"):
        outliers = detect_outliers(df, outlier_col)
        if not outliers.empty:
            st.write(outliers)

    st.markdown("## πŸ’Ύ Download Cleaned Dataset")
    file_name = st.text_input("Filename:", "cleaned_dataset.csv")
    # Rendered directly: a download button nested inside another button's
    # branch needs two clicks, and the outer click triggers a rerun first.
    st.download_button("πŸ“„ Download", df.to_csv(index=False), file_name, mime="text/csv")

# Build the UI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()