Spaces:

Haseeb-001
/

Smart_Data_Cleaner

Runtime error

App Files Files Community

Haseeb-001 committed on Apr 22, 2025

Commit

ac98ac9

verified ·

1 Parent(s): 0bf34ea

Update app.py

Browse files

Files changed (1) hide show

app.py +162 -130

app.py CHANGED Viewed

@@ -1,132 +1,164 @@
-import os
 import pandas as pd
-import re
-from groq import Groq
-import gradio as gr
-from nltk.corpus import stopwords
-import nltk
-# Download stopwords for text cleaning
-nltk.download('stopwords')
-STOPWORDS = set(stopwords.words('english'))
-# Set Groq API Key
-GROQ_API_KEY = "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"
-client = Groq(api_key=GROQ_API_KEY)
-# Function: Generate Missing Data Report
-def missing_data_report(data):
-    missing_report = data.isnull().sum()
-    total_missing = missing_report.sum()
-    return f"Missing Data Report:\n\n{missing_report}\n\nTotal Missing Values: {total_missing}"
-# Function: Auto-label Columns
-def auto_label_columns(data):
-    if not all(data.columns):
-        data.columns = [f"column_{i}" if not col else col for i, col in enumerate(data.columns)]
-    return data
-# Function: Clean Dataset
-def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=False):
-    # Auto-label columns if missing
-    data = auto_label_columns(data)
-    # Fill missing values
-    data.fillna(method='ffill', inplace=True)
-    data.fillna(method='bfill', inplace=True)
-    # Remove duplicates
-    data = data.drop_duplicates()
-    # Normalize and clean text columns
-    for col in data.select_dtypes(include=['object']).columns:
-        if lowercase:
-            data[col] = data[col].str.lower()
-        if remove_punctuation:
-            data[col] = data[col].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
-        if remove_stopwords:
-            data[col] = data[col].apply(lambda x: ' '.join([word for word in str(x).split() if word not in STOPWORDS]))
-    return data
-# Function: Chunk Text
-def chunk_text(text, max_length=100):
-    words = text.split()
-    return [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
-# Function: Generate Embeddings
-def generate_embeddings(chunk):
-    chat_completion = client.chat.completions.create(
-        messages=[{"role": "user", "content": chunk}],
-        model="llama3-8b-8192",
-        stream=False,
-    )
-    return chat_completion.choices[0].message.content
-# Main Function: Process Data
-def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
-    # Load data
-    data = pd.read_csv(file)
-    # Generate missing data report
-    missing_report = missing_data_report(data)
-    # Step 1: Clean data
-    cleaned_data = clean_data(data, lowercase, remove_punctuation, remove_stopwords)
-    # Step 2: Create chunks
-    cleaned_data['chunks'] = cleaned_data['text_column'].apply(lambda x: chunk_text(x, max_length=chunk_size))
-    # Step 3: Generate embeddings
-    cleaned_data['embeddings'] = cleaned_data['chunks'].apply(
-        lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
-    )
-    # Save cleaned data with embeddings
-    output_file = 'processed_data.csv'
-    cleaned_data.to_csv(output_file, index=False)
-    # Display sample embeddings
-    embedding_sample = cleaned_data['embeddings'].head(5)
-    return missing_report, embedding_sample, output_file
-# Gradio UI
-def gradio_interface(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
-    missing_report, embedding_sample, output_file = process_dataset(
-        file, chunk_size, lowercase, remove_punctuation, remove_stopwords
-    )
-    return (
-        missing_report,
-        f"Sample Embeddings:\n{embedding_sample}",
-        output_file
-    )
-# Gradio App
-ui = gr.Interface(
-    fn=gradio_interface,
-    inputs=[
-        gr.File(label="📁 Upload CSV Dataset"),
-        gr.Slider(50, 500, step=50, value=100, label="🔢 Chunk Size (words)"),
-        gr.Checkbox(label="🔠 Convert Text to Lowercase", value=True),
-        gr.Checkbox(label="❌ Remove Punctuation", value=True),
-        gr.Checkbox(label="📝 Remove Stopwords", value=False),
-    ],
-    outputs=[
-        gr.Textbox(label="📊 Missing Data Report"),
-        gr.Textbox(label="🧩 Embedding Sample"),
-        gr.File(label="📥 Download Processed Dataset"),
-    ],
-    title="✨ Professional Data Cleaning & Embedding Tool",
-    description=(
-        "Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API. "
-        "Customize text cleaning options and chunk size to suit your needs, or use the default settings. "
-        "Missing column labels will be auto-generated."
-    ),
-    theme="huggingface",
-    live=True,
-)
-# Launch App
 if __name__ == "__main__":
-    ui.launch()

 import pandas as pd
+import numpy as np
+import streamlit as st
+import os
+import matplotlib.pyplot as plt
+import seaborn as sns
+try:
+    import tabula
+    from tabula import read_pdf
+except:
+    read_pdf = None
+# ----------- File Upload Handler ----------- #
+def file_upload(file):
+    file_ext = os.path.splitext(file.name)[1].lower()
+    try:
+        if file_ext == '.csv':
+            df = pd.read_csv(file)
+        elif file_ext in ['.xls', '.xlsx']:
+            df = pd.read_excel(file)
+        elif file_ext == '.json':
+            df = pd.read_json(file)
+        elif file_ext == '.pdf' and read_pdf:
+            df = read_pdf(file, pages='all', multiple_tables=False)[0]
+        else:
+            st.error("❌ Unsupported file type or missing dependencies for PDF.")
+            return None
+        return df
+    except Exception as e:
+        st.error(f"⚠️ Error loading file: {e}")
+        return None
+# ----------- Cleaning Functions ----------- #
+def remove_empty_rows(df):
+    st.info("🔍 Null values before cleaning:")
+    st.write(df.isnull().sum())
+    df_cleaned = df.dropna()
+    st.success("✅ Null values removed.")
+    return df_cleaned
+def replace_nulls(df, value):
+    st.info("🔍 Null values before replacement:")
+    st.write(df.isnull().sum())
+    df_filled = df.fillna(value)
+    st.success("✅ Null values replaced.")
+    return df_filled
+def remove_noise(df):
+    noise_words = {'the', 'is', 'an', 'a', 'in', 'of', 'to'}
+    def clean_text(val):
+        if isinstance(val, str):
+            return ' '.join(word for word in val.split() if word.lower() not in noise_words)
+        return val
+    df_cleaned = df.applymap(clean_text)
+    st.success("✅ Noise words removed.")
+    return df_cleaned
+def remove_duplicates(df):
+    df_deduped = df.drop_duplicates()
+    st.success("✅ Duplicate rows removed.")
+    return df_deduped
+def convert_column_dtype(df, column, dtype):
+    try:
+        df[column] = df[column].astype(dtype)
+        st.success(f"✅ Converted '{column}' to {dtype}")
+    except Exception as e:
+        st.error(f"⚠️ Conversion error: {e}")
+    return df
+def detect_outliers(df, column):
+    if column in df.select_dtypes(include=['float', 'int']).columns:
+        Q1 = df[column].quantile(0.25)
+        Q3 = df[column].quantile(0.75)
+        IQR = Q3 - Q1
+        lower = Q1 - 1.5 * IQR
+        upper = Q3 + 1.5 * IQR
+        outliers = df[(df[column] < lower) | (df[column] > upper)]
+        st.write(f"🔍 Found {len(outliers)} outliers in column '{column}'")
+        return outliers
+    else:
+        st.warning("⚠️ Column must be numeric to detect outliers.")
+        return pd.DataFrame()
+def plot_distributions(df):
+    st.subheader("📊 Data Distributions")
+    numeric_cols = df.select_dtypes(include=['float', 'int']).columns
+    for col in numeric_cols:
+        fig, ax = plt.subplots()
+        sns.histplot(df[col].dropna(), kde=True, ax=ax)
+        ax.set_title(f"Distribution of {col}")
+        st.pyplot(fig)
+def plot_missing_data(df):
+    st.subheader("📉 Missing Data Heatmap")
+    fig, ax = plt.subplots()
+    sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
+    st.pyplot(fig)
+def main():
+    st.set_page_config(page_title="🧹 Smart Dataset Cleaner", layout="wide")
+    st.title("🧹 Smart Dataset Cleaner")
+    st.caption("✨ Clean, analyze, and preprocess your dataset with ease")
+    uploaded_file = st.file_uploader("📂 Upload your dataset", type=["csv", "xlsx", "xls", "json", "pdf"])
+    if uploaded_file:
+        df = file_upload(uploaded_file)
+        if df is not None:
+            st.subheader("📋 Original Dataset Preview")
+            st.dataframe(df.head())
+            st.markdown("## 🧰 Data Cleaning Tools")
+            with st.expander("➕ Replace Null Values"):
+                fill_value = st.text_input("Enter value to replace nulls with:")
+                if st.button("Replace Nulls"):
+                    df = replace_nulls(df, fill_value)
+                    st.dataframe(df)
+            if st.button("🧼 Remove Empty Rows"):
+                df = remove_empty_rows(df)
+                st.dataframe(df)
+            if st.button("🧹 Remove Duplicate Rows"):
+                df = remove_duplicates(df)
+                st.dataframe(df)
+            if st.button("📉 Remove Noise Words from Text"):
+                df = remove_noise(df)
+                st.dataframe(df)
+            with st.expander("🔁 Convert Column DataType"):
+                selected_col = st.selectbox("Select column", df.columns)
+                dtype = st.selectbox("Select target type", ["int", "float", "str", "bool"])
+                if st.button("Convert"):
+                    df = convert_column_dtype(df, selected_col, dtype)
+                    st.dataframe(df)
+            st.markdown("## 📊 Data Visualizations")
+            if st.checkbox("📈 Show Summary Stats"):
+                st.write(df.describe(include='all'))
+            if st.checkbox("📌 Plot Column Distributions"):
+                plot_distributions(df)
+            if st.checkbox("📍 Show Missing Data Heatmap"):
+                plot_missing_data(df)
+            st.markdown("## 🚨 Outlier Detection")
+            outlier_col = st.selectbox("Select numeric column", df.select_dtypes(include=['float', 'int']).columns)
+            if st.button("Detect Outliers"):
+                outliers = detect_outliers(df, outlier_col)
+                if not outliers.empty:
+                    st.write(outliers)
+            st.markdown("## 💾 Download Cleaned Dataset")
+            file_name = st.text_input("Filename:", "cleaned_dataset.csv")
+            if st.button("Download CSV"):
+                st.download_button("📄 Download", df.to_csv(index=False), file_name, mime="text/csv")
+        else:
+            st.warning("⚠️ Please upload a supported file to begin.")
 if __name__ == "__main__":  # entry point: build the UI only when run as a script, not on import
+    main()