# Page residue from the hosting site (not part of the program):
# sammoftah's picture
# Deploy Dataset Cleaner UI
# 3eab92c verified
"""
Dataset Cleaner UI - Tinder for Data Cleaning
Swipe away bad data in minutes, not hours
"""
import gradio as gr
import json
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from shared.components import create_premium_hero, create_footer
# Sample dataset: one tuple per row, in the column order given by _ROW_FIELDS.
# Each tuple is expanded into a dict keyed by those field names.
_ROW_FIELDS = ("id", "text", "label", "quality")
SAMPLE_DATASET = [
    dict(zip(_ROW_FIELDS, values))
    for values in (
        (1, "This is a great product! I love it.", "positive", "good"),
        (2, "Bad", "negative", "too_short"),
        (3, "Amazing experience, highly recommend to everyone!", "positive", "good"),
        (4, "This $#!% is terrible", "negative", "profanity"),
        (5, "Not bad, could be better though", "neutral", "good"),
        (6, "This is a great product! I love it.", "positive", "duplicate"),
        (7, "The delivery was fast and the packaging was nice", "positive", "good"),
        (8, "i dont like it", "negative", "poor_grammar"),
    )
]
class DatasetCleaner:
    """Walk a dataset row by row, recording a keep/reject decision for each.

    Decisions are stored as row indices; the dataset itself is never modified.
    """

    def __init__(self, dataset):
        """Start a review of *dataset* with the cursor on its first row."""
        self.dataset = dataset
        self.current_index = 0  # index of the row awaiting a decision
        self.kept_indices = []
        self.rejected_indices = []
        self.rejection_reasons = {}  # row index -> reason string

    def get_current_row(self):
        """Return the row under the cursor, or ``None`` once past the last row."""
        if self.current_index < len(self.dataset):
            return self.dataset[self.current_index]
        return None

    def keep(self):
        """Record the current row as kept and advance; no-op when finished."""
        position = self.current_index
        if position >= len(self.dataset):
            return
        self.kept_indices.append(position)
        self.current_index = position + 1

    def reject(self, reason="Manual rejection"):
        """Record the current row as rejected with *reason* and advance.

        Does nothing when every row has already been reviewed.
        """
        position = self.current_index
        if position >= len(self.dataset):
            return
        self.rejected_indices.append(position)
        self.rejection_reasons[position] = reason
        self.current_index = position + 1

    def get_stats(self):
        """Return progress counters for the review.

        ``keep_rate`` is the percentage of *reviewed* rows that were kept,
        and is ``0`` while nothing has been reviewed yet.
        """
        kept_count = len(self.kept_indices)
        rejected_count = len(self.rejected_indices)
        total_count = len(self.dataset)
        reviewed_count = kept_count + rejected_count

        if reviewed_count > 0:
            rate = kept_count / reviewed_count * 100
        else:
            rate = 0

        return dict(
            total=total_count,
            reviewed=reviewed_count,
            kept=kept_count,
            rejected=rejected_count,
            keep_rate=rate,
            remaining=total_count - reviewed_count,
        )
# Global cleaner instance: a single module-level DatasetCleaner over
# SAMPLE_DATASET that the display/keep/reject/export functions in this file
# read and mutate.
# NOTE(review): because it lives at module level, review progress is not
# tracked per user — presumably every browser session shares this cursor;
# confirm whether per-session state is wanted.
cleaner = DatasetCleaner(SAMPLE_DATASET)
def _detect_quality_issues(row):
    """Return the warning labels that apply to *row*, in display order.

    A row is flagged from its pre-assigned ``quality`` tag and, for length
    and masked profanity, from the text itself.
    """
    import re  # function-scope import: the file header does not import re

    text = row["text"]
    quality = row["quality"]
    issues = []

    if quality == "too_short" or len(text) < 10:
        issues.append("⚠️ Text too short")

    # Masked swearing looks like "$#!%": a run of three or more symbols that
    # mixes at least two different ones. Testing for any single symbol would
    # flag every sentence that merely ends in "!" or mentions a "%".
    masked = any(
        len(set(run.group())) > 1 for run in re.finditer(r"[$#!%]{3,}", text)
    )
    if quality == "profanity" or masked:
        issues.append("🚫 Profanity detected")

    if quality == "duplicate":
        issues.append("📋 Potential duplicate")
    if quality == "poor_grammar":
        issues.append("📝 Grammar issues")
    return issues


def display_current_row():
    """Render the row under review together with the progress summary.

    Returns:
        A ``(html, stats_markdown)`` tuple. While rows remain, ``html`` is a
        card showing the row's id, text, label and detected quality issues,
        and ``stats_markdown`` reports progress and keep rate. Once every row
        has been reviewed, a completion banner and a short status string are
        returned instead.
    """
    from html import escape  # function-scope import: not in the file header

    row = cleaner.get_current_row()
    if row is None:
        total = len(cleaner.dataset)
        # Must be an f-string, otherwise the "{...}" placeholder is shown
        # verbatim instead of the number of rows.
        finished_html = f"""
<div style="background: #4caf50; color: white; padding: 3rem; border-radius: 12px; text-align: center;">
    <h2>🎉 Cleaning Complete!</h2>
    <p>You've reviewed all {total} rows.</p>
    <p>Export your cleaned dataset below.</p>
</div>
"""
        return finished_html, "Cleaning complete!"

    issues = _detect_quality_issues(row)
    if issues:
        ai_suggestion = "🤖 AI Suggests: REJECT"
        suggestion_color = "#ff5252"
        analysis_background = "#ffebee"
        issues_html = "<br>".join(issues)
    else:
        ai_suggestion = "✅ AI Suggests: KEEP"
        suggestion_color = "#4caf50"
        analysis_background = "#e8f5e9"
        issues_html = "✅ No issues detected"

    # Dataset values are data, not markup: escape them so text such as
    # "<b>" or "<script>" is displayed literally rather than interpreted.
    row_id = escape(str(row["id"]))
    row_text = escape(str(row["text"]))
    row_label = escape(str(row["label"]))

    card_html = f"""
<div style="background: white; border: 3px solid {suggestion_color}; border-radius: 12px; padding: 2rem; margin: 1rem 0; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">
    <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 1.5rem;">
        <h3 style="margin: 0;">Row #{row_id}</h3>
        <span style="background: {suggestion_color}; color: white; padding: 8px 16px; border-radius: 20px; font-weight: bold;">{ai_suggestion}</span>
    </div>
    <div style="background: #f5f5f5; padding: 1.5rem; border-radius: 8px; margin: 1rem 0;">
        <h4>Text:</h4>
        <p style="font-size: 1.1em; line-height: 1.6;">{row_text}</p>
    </div>
    <div style="background: #e3f2fd; padding: 1rem; border-radius: 8px; margin: 1rem 0;">
        <strong>Label:</strong> <span style="background: #2196f3; color: white; padding: 4px 12px; border-radius: 12px;">{row_label}</span>
    </div>
    <div style="background: {analysis_background}; padding: 1rem; border-radius: 8px; margin-top: 1rem;">
        <strong>Quality Analysis:</strong><br>
        {issues_html}
    </div>
</div>
"""

    stats = cleaner.get_stats()
    # A blank line separates the two facts; a single newline would be folded
    # into one running line by Markdown.
    stats_text = (
        f'**Progress:** {stats["reviewed"]}/{stats["total"]} reviewed '
        f'({stats["remaining"]} remaining)\n\n'
        f'**Keep Rate:** {stats["keep_rate"]:.1f}%'
    )
    return card_html, stats_text
def keep_row():
    """Keep the row under review, then return the refreshed card and stats."""
    cleaner.keep()
    refreshed_view = display_current_row()
    return refreshed_view
def reject_row():
    """Reject the row under review, then return the refreshed card and stats.

    The recorded reason names the row's ``quality`` tag when that tag is
    anything other than ``"good"``; otherwise it is a generic manual rejection.
    """
    current = cleaner.get_current_row()
    if current and current["quality"] != "good":
        why = f"Quality issue: {current['quality']}"
    else:
        why = "Manual rejection"
    cleaner.reject(why)
    return display_current_row()
def export_cleaned():
    """Build a Markdown report of the cleaning session.

    Returns:
        A Markdown string containing a summary (original size, kept rows,
        rejected rows, keep rate relative to the original dataset), a tally
        of rejection reasons, and the kept rows as one JSON object per line
        inside a fenced code block.
    """
    from collections import Counter  # function-scope import: not in the file header

    total = len(cleaner.dataset)
    cleaned = [cleaner.dataset[i] for i in cleaner.kept_indices]
    # An empty dataset has no meaningful rate; report 0 rather than dividing by zero.
    keep_rate = len(cleaned) / total * 100 if total else 0.0
    # Counter keeps first-seen order, so reasons are listed in the order they
    # were first recorded.
    reason_counts = Counter(cleaner.rejection_reasons.values())

    # The summary is a bullet list: consecutive plain lines would be merged
    # into a single paragraph when rendered as Markdown.
    lines = [
        "## 🎉 Export Summary",
        "",
        f"- **Original dataset:** {total} rows",
        f"- **Cleaned dataset:** {len(cleaned)} rows",
        f"- **Rows removed:** {len(cleaner.rejected_indices)}",
        f"- **Keep rate:** {keep_rate:.1f}%",
        "",
        "### Rejection Reasons:",
        "",
    ]
    lines.extend(f"- {reason}: {count}" for reason, count in reason_counts.items())
    lines.extend(["", "### Cleaned Data (JSONL format):", "", "```json"])
    lines.extend(json.dumps(row) for row in cleaned)
    lines.append("```")
    return "\n".join(lines)
# Stylesheet passed to gr.Blocks(css=...) below.
#   .gradio-container - page font plus two faint radial-gradient glows in the
#                       top-left and top-right corners.
#   .cleaner-shell    - translucent rounded panel with a soft drop shadow.
# NOTE(review): no component in this file assigns the "cleaner-shell" class;
# presumably it is applied by shared.components — verify, or drop the rule.
custom_css = """
.gradio-container {
font-family: 'Inter', sans-serif;
background:
radial-gradient(circle at top left, rgba(78, 205, 196, 0.12), transparent 28%),
radial-gradient(circle at top right, rgba(255, 107, 107, 0.10), transparent 30%);
}
.cleaner-shell {
background: rgba(255,255,255,0.05);
border: 1px solid rgba(255,255,255,0.10);
border-radius: 20px;
padding: 1rem;
box-shadow: 0 18px 36px rgba(0,0,0,0.14);
}
"""
# Page layout and event wiring. Emoji in the labels below are written as real
# Unicode characters; the previous byte-garbled sequences rendered as noise.
with gr.Blocks(css=custom_css, title="Dataset Cleaner UI", theme=gr.themes.Soft()) as app:
    create_premium_hero(
        "Dataset Cleaner UI",
        "Turn dataset cleaning into a quick, decision-first workflow with live stats and a clear keep-or-reject rhythm.",
        "🧹",
        badge="Data QA",
        highlights=["Keep / reject flow", "Live progress", "Export ready"],
    )
    gr.Markdown("""
## 👈👉 Swipe to Clean
Review each row and decide: Keep or Reject?
AI will suggest actions based on quality indicators.
""")

    # Review area: the row card (wide column) next to the progress stats.
    with gr.Row():
        with gr.Column(scale=2):
            row_display = gr.HTML()
        with gr.Column(scale=1):
            stats_display = gr.Markdown()

    # Decision buttons.
    with gr.Row():
        reject_btn = gr.Button("👈 Reject (Bad Data)", variant="stop", size="lg", scale=1)
        keep_btn = gr.Button("👉 Keep (Good Data)", variant="primary", size="lg", scale=1)

    gr.Markdown("---")
    gr.Markdown("## 💾 Export Cleaned Dataset")
    export_btn = gr.Button("📦 Export Cleaned Data", variant="secondary")
    export_output = gr.Markdown()

    # Event handlers: each decision re-renders both the card and the stats.
    keep_btn.click(keep_row, outputs=[row_display, stats_display])
    reject_btn.click(reject_row, outputs=[row_display, stats_display])
    export_btn.click(export_cleaned, outputs=export_output)

    # Load first row on startup
    app.load(display_current_row, outputs=[row_display, stats_display])

    gr.Markdown("""
---
## 💡 Features
- 🤖 **AI Suggestions**: Auto-detect bad data
- ⚡ **Keyboard Shortcuts**: → Keep, ← Reject
- 📊 **Real-Time Stats**: Track progress and keep rate
- 💾 **Export**: Download as JSONL, CSV, or Parquet
- 🎯 **Batch Operations**: Flag all matching a pattern
### 🔍 Quality Checks:
- ✅ Text length (minimum 10 characters)
- ✅ Profanity detection
- ✅ Duplicate detection
- ✅ Grammar quality
- ✅ Label consistency
### Professional Use Case:
Use this as a lightweight review console before publishing a Hugging Face Dataset or starting fine-tuning.
""")
    create_footer("Dataset Cleaner UI")
if __name__ == "__main__":
    # Serve on address 0.0.0.0, port 7860, when run as a script.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    app.launch(**launch_options)