Spaces:
Running
Running
| """ | |
| Dataset Cleaner UI - Tinder for Data Cleaning | |
| Swipe away bad data in minutes, not hours | |
| """ | |
| import gradio as gr | |
| import json | |
| import sys, os | |
| sys.path.append(os.path.join(os.path.dirname(__file__), '..')) | |
| from shared.components import create_premium_hero, create_footer | |
# Sample dataset: eight short review snippets used to demo the cleaner.
# Every row has an "id", the raw "text", a sentiment "label", and a
# pre-assigned "quality" tag ("good", "too_short", "profanity", "duplicate",
# "poor_grammar") that the display logic reads to build its suggestion.
SAMPLE_DATASET = [
    {"id": 1, "text": "This is a great product! I love it.", "label": "positive", "quality": "good"},
    {"id": 2, "text": "Bad", "label": "negative", "quality": "too_short"},
    {"id": 3, "text": "Amazing experience, highly recommend to everyone!", "label": "positive", "quality": "good"},
    {"id": 4, "text": "This $#!% is terrible", "label": "negative", "quality": "profanity"},
    {"id": 5, "text": "Not bad, could be better though", "label": "neutral", "quality": "good"},
    # Same text as row 1; tagged as the duplicate example.
    {"id": 6, "text": "This is a great product! I love it.", "label": "positive", "quality": "duplicate"},
    {"id": 7, "text": "The delivery was fast and the packaging was nice", "label": "positive", "quality": "good"},
    {"id": 8, "text": "i dont like it", "label": "negative", "quality": "poor_grammar"},
]
class DatasetCleaner:
    """Walk a dataset front to back, recording a keep/reject verdict per row.

    Attributes are plain and public:
      - ``dataset``: the sequence of rows under review (never mutated here).
      - ``current_index``: position of the next row awaiting a verdict.
      - ``kept_indices`` / ``rejected_indices``: positions already decided.
      - ``rejection_reasons``: maps a rejected position to its reason string.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.current_index = 0
        self.kept_indices = []
        self.rejected_indices = []
        self.rejection_reasons = {}

    def get_current_row(self):
        """Return the row awaiting a verdict, or ``None`` when none is left."""
        position = self.current_index
        if position < len(self.dataset):
            return self.dataset[position]
        return None

    def keep(self):
        """Mark the current row as kept and advance; no-op past the end."""
        position = self.current_index
        if position >= len(self.dataset):
            return
        self.kept_indices.append(position)
        self.current_index = position + 1

    def reject(self, reason="Manual rejection"):
        """Mark the current row as rejected with *reason* and advance.

        Does nothing once every row has been reviewed.
        """
        position = self.current_index
        if position >= len(self.dataset):
            return
        self.rejected_indices.append(position)
        self.rejection_reasons[position] = reason
        self.current_index = position + 1

    def get_stats(self):
        """Return progress counters as a dict.

        Keys: ``total``, ``reviewed``, ``kept``, ``rejected``, ``keep_rate``
        (percentage of reviewed rows that were kept, ``0`` before any
        review) and ``remaining``.
        """
        n_kept = len(self.kept_indices)
        n_rejected = len(self.rejected_indices)
        n_total = len(self.dataset)
        n_reviewed = n_kept + n_rejected

        if n_reviewed > 0:
            rate = n_kept / n_reviewed * 100
        else:
            rate = 0

        return dict(
            total=n_total,
            reviewed=n_reviewed,
            kept=n_kept,
            rejected=n_rejected,
            keep_rate=rate,
            remaining=n_total - n_reviewed,
        )
# Global cleaner instance, read and mutated by every UI callback below.
# NOTE(review): because this lives at module level, all callback invocations
# share one review cursor — presumably that includes concurrent browser
# sessions; confirm whether per-session state (e.g. gr.State) is wanted.
cleaner = DatasetCleaner(SAMPLE_DATASET)
def _has_masked_profanity(text):
    """Return True when *text* contains a symbol-masked word such as ``$#!%``.

    Heuristic: a run of three or more characters drawn from ``$#!%`` that
    uses at least two different symbols. Ordinary punctuation ("Great!",
    "50%", "Wow!!!") therefore does not count as profanity.
    """
    import re  # function-scope import keeps this block self-contained

    return any(
        len(set(run.group())) >= 2
        for run in re.finditer(r"[$#!%]{3,}", text)
    )


def _detect_quality_issues(row):
    """Return the list of human-readable quality warnings for *row*.

    Reads ``row["text"]`` and ``row["quality"]``; an empty list means the
    row looks clean.
    """
    # NOTE(review): the leading glyphs in these labels look like mis-decoded
    # emoji; they are reproduced unchanged from the existing source.
    text = row["text"]
    quality = row["quality"]

    issues = []
    if quality == "too_short" or len(text) < 10:
        issues.append("β οΈ Text too short")
    if quality == "profanity" or _has_masked_profanity(text):
        issues.append("π« Profanity detected")
    if quality == "duplicate":
        issues.append("π Potential duplicate")
    if quality == "poor_grammar":
        issues.append("π Grammar issues")
    return issues


def display_current_row():
    """Render the row under review as an HTML card plus a progress summary.

    Returns:
        A ``(html, stats_markdown)`` tuple. When every row has been
        reviewed, the HTML is a completion banner and the second element is
        the fixed string ``"Cleaning complete!"``.
    """
    from html import escape  # row fields are data, not markup: escape them

    row = cleaner.get_current_row()
    if row is None:
        # This must be an f-string, otherwise the row-count placeholder is
        # shown to the user verbatim.
        done_html = f"""
        <div style="background: #4caf50; color: white; padding: 3rem; border-radius: 12px; text-align: center;">
            <h2>π Cleaning Complete!</h2>
            <p>You've reviewed all {len(cleaner.dataset)} rows.</p>
            <p>Export your cleaned dataset below.</p>
        </div>
        """
        return done_html, "Cleaning complete!"

    issues = _detect_quality_issues(row)
    if issues:
        ai_suggestion = "π€ AI Suggests: REJECT"
        suggestion_color = "#ff5252"
        analysis_background = "#ffebee"
        issues_html = "<br>".join(issues)
    else:
        ai_suggestion = "β AI Suggests: KEEP"
        suggestion_color = "#4caf50"
        analysis_background = "#e8f5e9"
        issues_html = "β No issues detected"

    # Escape everything that originates from the dataset so a row containing
    # "<" or "&" cannot break (or inject into) the card markup.
    row_id = escape(str(row["id"]))
    row_text = escape(str(row["text"]))
    row_label = escape(str(row["label"]))

    card_html = f"""
    <div style="background: white; border: 3px solid {suggestion_color}; border-radius: 12px; padding: 2rem; margin: 1rem 0; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">
        <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 1.5rem;">
            <h3 style="margin: 0;">Row #{row_id}</h3>
            <span style="background: {suggestion_color}; color: white; padding: 8px 16px; border-radius: 20px; font-weight: bold;">{ai_suggestion}</span>
        </div>
        <div style="background: #f5f5f5; padding: 1.5rem; border-radius: 8px; margin: 1rem 0;">
            <h4>Text:</h4>
            <p style="font-size: 1.1em; line-height: 1.6;">{row_text}</p>
        </div>
        <div style="background: #e3f2fd; padding: 1rem; border-radius: 8px; margin: 1rem 0;">
            <strong>Label:</strong> <span style="background: #2196f3; color: white; padding: 4px 12px; border-radius: 12px;">{row_label}</span>
        </div>
        <div style="background: {analysis_background}; padding: 1rem; border-radius: 8px; margin-top: 1rem;">
            <strong>Quality Analysis:</strong><br>
            {issues_html}
        </div>
    </div>
    """

    stats = cleaner.get_stats()
    stats_text = (
        f'**Progress:** {stats["reviewed"]}/{stats["total"]} reviewed '
        f'({stats["remaining"]} remaining)\n'
        f'**Keep Rate:** {stats["keep_rate"]:.1f}%'
    )
    return card_html, stats_text
def keep_row():
    """Keep the row under review, then return the refreshed card and stats."""
    cleaner.keep()
    refreshed_view = display_current_row()
    return refreshed_view
def reject_row():
    """Reject the row under review, then return the refreshed card and stats.

    The recorded reason cites the row's ``quality`` tag when that tag is
    anything other than ``"good"``; otherwise it is a generic manual
    rejection.
    """
    current = cleaner.get_current_row()
    if current and current["quality"] != "good":
        cleaner.reject(f"Quality issue: {current['quality']}")
    else:
        cleaner.reject("Manual rejection")
    return display_current_row()
def export_cleaned():
    """Build a Markdown export report for the rows kept so far.

    Returns:
        A Markdown string with the row counts, the keep rate relative to
        the whole dataset, a tally of rejection reasons, and the kept rows
        serialized one JSON object per line inside a fenced block.
    """
    from collections import Counter  # function-scope: no file-level import needed

    total_rows = len(cleaner.dataset)
    cleaned = [cleaner.dataset[i] for i in cleaner.kept_indices]
    # Guard the percentage so exporting an empty dataset reports 0.0%
    # instead of raising ZeroDivisionError.
    keep_rate = len(cleaned) / total_rows * 100 if total_rows else 0.0
    # Counter keeps first-seen order, so reasons are listed in the order
    # they were first recorded.
    reason_counts = Counter(cleaner.rejection_reasons.values())

    parts = [
        "## π Export Summary\n",
        f"**Original dataset:** {total_rows} rows\n",
        f"**Cleaned dataset:** {len(cleaned)} rows\n",
        f"**Rows removed:** {len(cleaner.rejected_indices)}\n",
        f"**Keep rate:** {keep_rate:.1f}%\n",
        "### Rejection Reasons:\n",
    ]
    parts.extend(
        f"- {reason}: {count}\n" for reason, count in reason_counts.items()
    )
    parts.append("\n### Cleaned Data (JSONL format):\n\n```json\n")
    parts.extend(json.dumps(row) + "\n" for row in cleaned)
    parts.append("```")
    return "".join(parts)
# Stylesheet handed to gr.Blocks(css=...) below.
# NOTE(review): no component in the visible layout sets the "cleaner-shell"
# class; it may be applied by the shared hero/footer helpers — confirm or drop.
custom_css = """
.gradio-container {
font-family: 'Inter', sans-serif;
background:
radial-gradient(circle at top left, rgba(78, 205, 196, 0.12), transparent 28%),
radial-gradient(circle at top right, rgba(255, 107, 107, 0.10), transparent 30%);
}
.cleaner-shell {
background: rgba(255,255,255,0.05);
border: 1px solid rgba(255,255,255,0.10);
border-radius: 20px;
padding: 1rem;
box-shadow: 0 18px 36px rgba(0,0,0,0.14);
}
"""
# Page layout. Components are created in display order, so the statement
# order inside this block is significant.
with gr.Blocks(css=custom_css, title="Dataset Cleaner UI", theme=gr.themes.Soft()) as app:
    # Header banner from the shared component library (presumably renders
    # the title, tagline, badge and highlight chips — see shared.components).
    create_premium_hero(
        "Dataset Cleaner UI",
        "Turn dataset cleaning into a quick, decision-first workflow with live stats and a clear keep-or-reject rhythm.",
        "π§Ή",
        badge="Data QA",
        highlights=["Keep / reject flow", "Live progress", "Export ready"],
    )
    gr.Markdown("""
## ππ Swipe to Clean
Review each row and decide: Keep or Reject?
AI will suggest actions based on quality indicators.
""")
    # Row card (wide column) next to the progress summary (narrow column).
    with gr.Row():
        with gr.Column(scale=2):
            row_display = gr.HTML()
        with gr.Column(scale=1):
            stats_display = gr.Markdown()
    # Verdict buttons: reject is created first, keep second.
    with gr.Row():
        reject_btn = gr.Button("π Reject (Bad Data)", variant="stop", size="lg", scale=1)
        keep_btn = gr.Button("π Keep (Good Data)", variant="primary", size="lg", scale=1)
    gr.Markdown("---")
    gr.Markdown("## πΎ Export Cleaned Dataset")
    export_btn = gr.Button("π¦ Export Cleaned Data", variant="secondary")
    export_output = gr.Markdown()
    # Event handlers: both verdict callbacks return (html, stats_markdown),
    # which map onto the card and the stats panel respectively.
    keep_btn.click(keep_row, outputs=[row_display, stats_display])
    reject_btn.click(reject_row, outputs=[row_display, stats_display])
    export_btn.click(export_cleaned, outputs=export_output)
    # Load first row on startup
    app.load(display_current_row, outputs=[row_display, stats_display])
    # NOTE(review): the list below advertises keyboard shortcuts, CSV/Parquet
    # export and batch operations; no handler for any of them is wired up in
    # this block — confirm whether they are implemented elsewhere.
    gr.Markdown("""
---
## π‘ Features
- π€ **AI Suggestions**: Auto-detect bad data
- β‘ **Keyboard Shortcuts**: β Keep, β Reject
- π **Real-Time Stats**: Track progress and keep rate
- πΎ **Export**: Download as JSONL, CSV, or Parquet
- π― **Batch Operations**: Flag all matching a pattern
### π Quality Checks:
- β Text length (minimum 10 characters)
- β Profanity detection
- β Duplicate detection
- β Grammar quality
- β Label consistency
### Professional Use Case:
Use this as a lightweight review console before publishing a Hugging Face Dataset or starting fine-tuning.
""")
    # Footer from the shared component library.
    create_footer("Dataset Cleaner UI")
# Script entry point: serve the app on all interfaces at port 7860.
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)