"""
Dataset Cleaner UI - Tinder for Data Cleaning
Swipe away bad data in minutes, not hours
"""
import gradio as gr
import json
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from shared.components import create_premium_hero, create_footer
# Demo rows for the review queue. Every row has the same four fields; the
# "quality" field is a pre-assigned tag the UI uses to flag likely problems.
_SAMPLE_COLUMNS = ("id", "text", "label", "quality")
_SAMPLE_ROWS = (
    (1, "This is a great product! I love it.", "positive", "good"),
    (2, "Bad", "negative", "too_short"),
    (3, "Amazing experience, highly recommend to everyone!", "positive", "good"),
    (4, "This $#!% is terrible", "negative", "profanity"),
    (5, "Not bad, could be better though", "neutral", "good"),
    (6, "This is a great product! I love it.", "positive", "duplicate"),
    (7, "The delivery was fast and the packaging was nice", "positive", "good"),
    (8, "i dont like it", "negative", "poor_grammar"),
)
SAMPLE_DATASET = [dict(zip(_SAMPLE_COLUMNS, values)) for values in _SAMPLE_ROWS]
class DatasetCleaner:
    """Sequential keep/reject review state over a list of rows.

    Rows are visited in order through a cursor (``current_index``). Each
    decision records the index of the row under the cursor and moves the
    cursor to the next row.
    """

    def __init__(self, dataset):
        """Start a review of *dataset* with the cursor on the first row."""
        self.dataset = dataset
        self.current_index = 0
        self.kept_indices = []
        self.rejected_indices = []
        self.rejection_reasons = {}

    def get_current_row(self):
        """Return the row under the cursor, or ``None`` when none is left."""
        if self.current_index < len(self.dataset):
            return self.dataset[self.current_index]
        return None

    def keep(self):
        """Mark the current row as kept and advance; do nothing past the end."""
        position = self.current_index
        if position >= len(self.dataset):
            return
        self.kept_indices.append(position)
        self.current_index = position + 1

    def reject(self, reason="Manual rejection"):
        """Mark the current row as rejected for *reason* and advance.

        The reason is stored in ``rejection_reasons`` keyed by row index.
        Does nothing when the cursor is already past the last row.
        """
        position = self.current_index
        if position >= len(self.dataset):
            return
        self.rejected_indices.append(position)
        self.rejection_reasons[position] = reason
        self.current_index = position + 1

    def get_stats(self):
        """Return review counters as a dict.

        Keys: ``total``, ``reviewed``, ``kept``, ``rejected``, ``keep_rate``
        (kept as a percentage of reviewed rows, ``0`` before any review) and
        ``remaining``.
        """
        kept_count = len(self.kept_indices)
        rejected_count = len(self.rejected_indices)
        reviewed_count = kept_count + rejected_count
        row_count = len(self.dataset)

        if reviewed_count > 0:
            percent_kept = kept_count / reviewed_count * 100
        else:
            percent_kept = 0

        return {
            "total": row_count,
            "reviewed": reviewed_count,
            "kept": kept_count,
            "rejected": rejected_count,
            "keep_rate": percent_kept,
            "remaining": row_count - reviewed_count,
        }
# Global cleaner instance, created once at import time over SAMPLE_DATASET.
# Every handler function in this file reads and mutates this single object,
# so review progress persists for as long as the process runs.
# NOTE(review): presumably all browser sessions therefore share one review
# queue — confirm against Gradio's session model before multi-user use.
cleaner = DatasetCleaner(SAMPLE_DATASET)
def _has_masked_profanity(text, symbols="$#!%", min_run=3):
    """Return True when *text* contains a symbol-masked swear word.

    A match is a run of at least *min_run* consecutive characters taken from
    *symbols* that uses more than one distinct symbol (e.g. ``$#!%``).
    Ordinary punctuation such as ``great!``, ``$5`` or ``100%``, and emphatic
    runs of one repeated symbol such as ``!!!``, do not match.
    """
    run = ""
    # The trailing space is a sentinel so a run that ends the text is checked.
    for ch in text + " ":
        if ch in symbols:
            run += ch
            continue
        if len(run) >= min_run and len(set(run)) > 1:
            return True
        run = ""
    return False


def _detect_issues(row):
    """Return the list of quality-issue labels that apply to *row*."""
    text = row["text"]
    quality = row["quality"]
    issues = []
    if quality == "too_short" or len(text) < 10:
        issues.append("⚠️ Text too short")
    if quality == "profanity" or _has_masked_profanity(text):
        issues.append("🚫 Profanity detected")
    if quality == "duplicate":
        issues.append("🔄 Potential duplicate")
    if quality == "poor_grammar":
        issues.append("📝 Grammar issues")
    return issues


def display_current_row():
    """Display current row with quality indicators.

    Returns:
        A ``(card_html, stats_markdown)`` tuple. While rows remain, the card
        shows the row id, the keep/reject suggestion, the row text, its label
        and the detected issues, and the Markdown shows review progress.
        Once every row has been reviewed, the card is a completion notice and
        the second element is the plain string ``"Cleaning complete!"``.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    row = cleaner.get_current_row()
    if row is None:
        total_rows = len(cleaner.dataset)
        done_html = (
            '<div class="cleaner-shell" style="text-align: center;">'
            "<h2>🎉 Cleaning Complete!</h2>"
            f"<p>You've reviewed all {total_rows} rows.</p>"
            "<p>Export your cleaned dataset below.</p>"
            "</div>"
        )
        return done_html, "Cleaning complete!"

    issues = _detect_issues(row)
    if issues:
        ai_suggestion = "🤖 AI Suggests: REJECT"
        suggestion_color = "#ff5252"
        issues_html = "<br>".join(issues)
    else:
        ai_suggestion = "✅ AI Suggests: KEEP"
        suggestion_color = "#4caf50"
        issues_html = "✅ No issues detected"

    # Dataset values are data, not markup: escape them before embedding.
    row_id = escape(str(row["id"]))
    row_text = escape(str(row["text"]))
    row_label = escape(str(row["label"]))

    card_html = (
        '<div class="cleaner-shell">'
        f"<h3>Row #{row_id}</h3>"
        f'<p style="color: {suggestion_color}; font-weight: 600;">{ai_suggestion}</p>'
        f"<blockquote>{row_text}</blockquote>"
        f"<p><strong>Label:</strong> {row_label}</p>"
        "<p><strong>Quality Analysis:</strong><br>"
        f"{issues_html}</p>"
        "</div>"
    )

    stats = cleaner.get_stats()
    # A blank line separates the two facts; a single newline would be folded
    # into one paragraph by the Markdown renderer.
    stats_text = (
        f"**Progress:** {stats['reviewed']}/{stats['total']} reviewed "
        f"({stats['remaining']} remaining)\n\n"
        f"**Keep Rate:** {stats['keep_rate']:.1f}%"
    )
    return card_html, stats_text
def keep_row():
    """Keep the row under review, then return the refreshed card and stats."""
    cleaner.keep()
    refreshed = display_current_row()
    return refreshed
def reject_row():
    """Reject the row under review, then return the refreshed card and stats.

    The recorded reason is ``"Quality issue: <tag>"`` when the row's
    ``quality`` tag is anything other than ``"good"``; otherwise it is
    ``"Manual rejection"``.
    """
    current = cleaner.get_current_row()
    if current and current["quality"] != "good":
        reason = f"Quality issue: {current['quality']}"
    else:
        reason = "Manual rejection"
    cleaner.reject(reason)
    return display_current_row()
def export_cleaned():
    """Export cleaned dataset.

    Returns:
        A Markdown report: a summary of original/kept/removed row counts and
        the keep rate (kept rows as a percentage of the original dataset),
        a tally of rejection reasons, and the kept rows serialized one JSON
        object per line inside a fenced code block.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    total_rows = len(cleaner.dataset)
    cleaned = [cleaner.dataset[i] for i in cleaner.kept_indices]
    # Guard the percentage so an empty dataset reports 0.0% instead of raising
    # ZeroDivisionError.
    keep_rate = len(cleaned) / total_rows * 100 if total_rows else 0.0

    # Counter preserves first-seen order, so reasons are listed in the order
    # they were first recorded.
    reason_counts = Counter(cleaner.rejection_reasons.values())

    # Summary facts are list items: plain lines separated by single newlines
    # would be folded into one paragraph by the Markdown renderer.
    lines = [
        "## 📊 Export Summary",
        "",
        f"- **Original dataset:** {total_rows} rows",
        f"- **Cleaned dataset:** {len(cleaned)} rows",
        f"- **Rows removed:** {len(cleaner.rejected_indices)}",
        f"- **Keep rate:** {keep_rate:.1f}%",
        "",
        "### Rejection Reasons:",
        "",
    ]
    lines.extend(f"- {reason}: {count}" for reason, count in reason_counts.items())
    lines.extend(["", "### Cleaned Data (JSONL format):", "", "```json"])
    lines.extend(json.dumps(row) for row in cleaned)
    lines.append("```")
    return "\n".join(lines)
# Page-level CSS handed to gr.Blocks(css=...). It sets the container font and
# two radial background gradients, and defines the `.cleaner-shell` card style
# (translucent background, border, rounded corners, padding, drop shadow).
custom_css = """
.gradio-container {
font-family: 'Inter', sans-serif;
background:
radial-gradient(circle at top left, rgba(78, 205, 196, 0.12), transparent 28%),
radial-gradient(circle at top right, rgba(255, 107, 107, 0.10), transparent 30%);
}
.cleaner-shell {
background: rgba(255,255,255,0.05);
border: 1px solid rgba(255,255,255,0.10);
border-radius: 20px;
padding: 1rem;
box-shadow: 0 18px 36px rgba(0,0,0,0.14);
}
"""
# Page layout: hero banner, instructions, the review card next to live stats,
# the reject/keep buttons, the export section, and the feature notes.
with gr.Blocks(css=custom_css, title="Dataset Cleaner UI", theme=gr.themes.Soft()) as app:
    create_premium_hero(
        "Dataset Cleaner UI",
        "Turn dataset cleaning into a quick, decision-first workflow with live stats and a clear keep-or-reject rhythm.",
        "🧹",
        badge="Data QA",
        highlights=["Keep / reject flow", "Live progress", "Export ready"],
    )

    # Markdown bodies are assembled from flush-left pieces so the code's own
    # indentation can never leak into the rendered text.
    gr.Markdown(
        "\n"
        "## 👈👉 Swipe to Clean\n"
        "Review each row and decide: Keep or Reject?\n"
        "AI will suggest actions based on quality indicators.\n"
    )

    with gr.Row():
        with gr.Column(scale=2):
            row_display = gr.HTML()
        with gr.Column(scale=1):
            stats_display = gr.Markdown()

    with gr.Row():
        reject_btn = gr.Button("👎 Reject (Bad Data)", variant="stop", size="lg", scale=1)
        keep_btn = gr.Button("👍 Keep (Good Data)", variant="primary", size="lg", scale=1)

    gr.Markdown("---")
    gr.Markdown("## 💾 Export Cleaned Dataset")
    export_btn = gr.Button("📦 Export Cleaned Data", variant="secondary")
    export_output = gr.Markdown()

    # Event handlers: both decisions refresh the card and the stats panel.
    keep_btn.click(keep_row, outputs=[row_display, stats_display])
    reject_btn.click(reject_row, outputs=[row_display, stats_display])
    export_btn.click(export_cleaned, outputs=export_output)

    # Load first row on startup
    app.load(display_current_row, outputs=[row_display, stats_display])

    # NOTE(review): keyboard shortcuts, CSV/Parquet download, batch operations
    # and a label-consistency check are advertised here but are not
    # implemented in the code visible in this file — confirm or trim the copy.
    gr.Markdown(
        "\n"
        "---\n"
        "## 💡 Features\n"
        "- 🤖 **AI Suggestions**: Auto-detect bad data\n"
        "- ⚡ **Keyboard Shortcuts**: → Keep, ← Reject\n"
        "- 📊 **Real-Time Stats**: Track progress and keep rate\n"
        "- 💾 **Export**: Download as JSONL, CSV, or Parquet\n"
        "- 🎯 **Batch Operations**: Flag all matching a pattern\n"
        "### 🔍 Quality Checks:\n"
        "- ✅ Text length (minimum 10 characters)\n"
        "- ✅ Profanity detection\n"
        "- ✅ Duplicate detection\n"
        "- ✅ Grammar quality\n"
        "- ✅ Label consistency\n"
        "### Professional Use Case:\n"
        "Use this as a lightweight review console before publishing a Hugging Face Dataset or starting fine-tuning.\n"
    )

    create_footer("Dataset Cleaner UI")
# Script entry point: start the Gradio server on port 7860 only when this file
# is executed directly (not when it is imported).
# NOTE(review): server_name="0.0.0.0" presumably makes the app reachable from
# other machines on the network — confirm that exposure is intended.
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)