Spaces:

Haseeb-001
/

Smart_Data_Cleaner

Runtime error

App Files Files Community

Haseeb-001 committed on Jan 16, 2025

Commit

0bf34ea

verified ·

1 Parent(s): c409139

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -23

app.py CHANGED Viewed

@@ -1,12 +1,12 @@
 import os
 import pandas as pd
 import re
-import gradio as gr
 from groq import Groq
 from nltk.corpus import stopwords
 import nltk
-# Download stopwords
 nltk.download('stopwords')
 STOPWORDS = set(stopwords.words('english'))
@@ -20,19 +20,24 @@ def missing_data_report(data):
     total_missing = missing_report.sum()
     return f"Missing Data Report:\n\n{missing_report}\n\nTotal Missing Values: {total_missing}"
 # Function: Clean Dataset
 def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     # Fill missing values
     data.fillna(method='ffill', inplace=True)
     data.fillna(method='bfill', inplace=True)
-    # Auto-generate column labels if missing
-    if data.columns.isnull().any():
-        data.columns = [f"Column_{i + 1}" for i in range(data.shape[1])]
     # Remove duplicates
     data = data.drop_duplicates()
     # Normalize and clean text columns
     for col in data.select_dtypes(include=['object']).columns:
         if lowercase:
@@ -41,7 +46,7 @@ def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=F
             data[col] = data[col].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
         if remove_stopwords:
             data[col] = data[col].apply(lambda x: ' '.join([word for word in str(x).split() if word not in STOPWORDS]))
     return data
 # Function: Chunk Text
@@ -68,25 +73,25 @@ def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=Tru
     # Step 1: Clean data
     cleaned_data = clean_data(data, lowercase, remove_punctuation, remove_stopwords)
     # Step 2: Create chunks
     cleaned_data['chunks'] = cleaned_data['text_column'].apply(lambda x: chunk_text(x, max_length=chunk_size))
     # Step 3: Generate embeddings
     cleaned_data['embeddings'] = cleaned_data['chunks'].apply(
         lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
     )
     # Save cleaned data with embeddings
     output_file = 'processed_data.csv'
     cleaned_data.to_csv(output_file, index=False)
     # Display sample embeddings
     embedding_sample = cleaned_data['embeddings'].head(5)
     return missing_report, embedding_sample, output_file
-# Gradio Interface Function
 def gradio_interface(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     missing_report, embedding_sample, output_file = process_dataset(
         file, chunk_size, lowercase, remove_punctuation, remove_stopwords
@@ -102,21 +107,23 @@ ui = gr.Interface(
     fn=gradio_interface,
     inputs=[
         gr.File(label="📁 Upload CSV Dataset"),
-        gr.Slider(50, 500, step=50, default=100, label="📝 Chunk Size (words)"),
-        gr.Checkbox(label="🔠 Convert Text to Lowercase", default=True),
-        gr.Checkbox(label="❌ Remove Punctuation", default=True),
-        gr.Checkbox(label="🗑️ Remove Stopwords", default=False),
     ],
     outputs=[
         gr.Textbox(label="📊 Missing Data Report"),
-        gr.Textbox(label="✨ Embedding Sample"),
-        gr.File(label="⬇️ Download Processed Dataset"),
     ],
-    title="🔍 Advanced Data Cleaning & Embedding Tool",
     description=(
         "Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API. "
-        "Customize text cleaning options, chunk size, and more. Automatically adds column labels if missing."
     ),
     live=True,
 )

 import os
 import pandas as pd
 import re
 from groq import Groq
+import gradio as gr
 from nltk.corpus import stopwords
 import nltk
+# Download stopwords for text cleaning
 nltk.download('stopwords')
 STOPWORDS = set(stopwords.words('english'))
     total_missing = missing_report.sum()
     return f"Missing Data Report:\n\n{missing_report}\n\nTotal Missing Values: {total_missing}"
+# Function: Auto-label Columns
def auto_label_columns(data):
    """Replace missing column labels with generated ``column_<position>`` names.

    A label is treated as missing when it is NaN/NA/None, a blank or
    whitespace-only string, or otherwise falsy (for example ``0``).  Labels
    that are present are left untouched.

    Args:
        data: pandas DataFrame whose column labels are inspected.  The labels
            are rewritten in place on this object.

    Returns:
        The same DataFrame object that was passed in, so callers can keep
        writing ``data = auto_label_columns(data)``.
    """

    def is_missing(label):
        # NaN is truthy and pd.NA raises on bool(), so a plain truthiness
        # test never catches them; check for them explicitly first.
        if pd.api.types.is_scalar(label) and pd.isna(label):
            return True
        if isinstance(label, str):
            return not label.strip()
        return not label

    labels = list(data.columns)
    missing = [is_missing(label) for label in labels]
    if not any(missing):
        return data

    # Names already in use; a generated name must not duplicate one of them,
    # otherwise data[name] would select several columns at once.
    taken = {label for label, gone in zip(labels, missing) if not gone}

    relabelled = []
    for position, (label, gone) in enumerate(zip(labels, missing)):
        if gone:
            candidate = f"column_{position}"
            suffix = 1
            while candidate in taken:
                candidate = f"column_{position}_{suffix}"
                suffix += 1
            taken.add(candidate)
            label = candidate
        relabelled.append(label)

    data.columns = relabelled
    return data
 # Function: Clean Dataset
 def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=False):
+    # Auto-label columns if missing
+    data = auto_label_columns(data)
     # Fill missing values
     data.fillna(method='ffill', inplace=True)
     data.fillna(method='bfill', inplace=True)
     # Remove duplicates
     data = data.drop_duplicates()
     # Normalize and clean text columns
     for col in data.select_dtypes(include=['object']).columns:
         if lowercase:
             data[col] = data[col].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
         if remove_stopwords:
             data[col] = data[col].apply(lambda x: ' '.join([word for word in str(x).split() if word not in STOPWORDS]))
     return data
 # Function: Chunk Text
     # Step 1: Clean data
     cleaned_data = clean_data(data, lowercase, remove_punctuation, remove_stopwords)
     # Step 2: Create chunks
     cleaned_data['chunks'] = cleaned_data['text_column'].apply(lambda x: chunk_text(x, max_length=chunk_size))
     # Step 3: Generate embeddings
     cleaned_data['embeddings'] = cleaned_data['chunks'].apply(
         lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
     )
     # Save cleaned data with embeddings
     output_file = 'processed_data.csv'
     cleaned_data.to_csv(output_file, index=False)
     # Display sample embeddings
     embedding_sample = cleaned_data['embeddings'].head(5)
     return missing_report, embedding_sample, output_file
+# Gradio UI
 def gradio_interface(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     missing_report, embedding_sample, output_file = process_dataset(
         file, chunk_size, lowercase, remove_punctuation, remove_stopwords
     fn=gradio_interface,
     inputs=[
         gr.File(label="📁 Upload CSV Dataset"),
+        gr.Slider(50, 500, step=50, value=100, label="🔢 Chunk Size (words)"),
+        gr.Checkbox(label="🔠 Convert Text to Lowercase", value=True),
+        gr.Checkbox(label="❌ Remove Punctuation", value=True),
+        gr.Checkbox(label="📝 Remove Stopwords", value=False),
     ],
     outputs=[
         gr.Textbox(label="📊 Missing Data Report"),
+        gr.Textbox(label="🧩 Embedding Sample"),
+        gr.File(label="📥 Download Processed Dataset"),
     ],
+    title="✨ Professional Data Cleaning & Embedding Tool",
     description=(
         "Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API. "
+        "Customize text cleaning options and chunk size to suit your needs, or use the default settings. "
+        "Missing column labels will be auto-generated."
     ),
+    theme="huggingface",
     live=True,
 )