Spaces:

sammoftah
/

dataset-cleaner-ui

Running

App Files Files Community

dataset-cleaner-ui / app.py

sammoftah

Deploy Dataset Cleaner UI

3eab92c verified about 1 month ago

raw

history blame contribute delete

8.82 kB

	"""
	Dataset Cleaner UI - Tinder for Data Cleaning
	Swipe away bad data in minutes, not hours
	"""

	import html
	import json
	import re
	import sys, os
	from collections import Counter

	import gradio as gr

	sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
	from shared.components import create_premium_hero, create_footer

	# Demo rows as (text, label, quality) triples. "quality" is a pre-assigned tag;
	# every value other than "good" names the problem the row is meant to show.
	_SAMPLE_ROWS = (
	    ("This is a great product! I love it.", "positive", "good"),
	    ("Bad", "negative", "too_short"),
	    ("Amazing experience, highly recommend to everyone!", "positive", "good"),
	    ("This $#!% is terrible", "negative", "profanity"),
	    ("Not bad, could be better though", "neutral", "good"),
	    ("This is a great product! I love it.", "positive", "duplicate"),
	    ("The delivery was fast and the packaging was nice", "positive", "good"),
	    ("i dont like it", "negative", "poor_grammar"),
	)

	# Sample dataset: one dict per row, ids numbered from 1 in listing order.
	SAMPLE_DATASET = [
	    {"id": row_id, "text": text, "label": label, "quality": quality}
	    for row_id, (text, label, quality) in enumerate(_SAMPLE_ROWS, start=1)
	]

	class DatasetCleaner:
	    """Forward-only keep/reject cursor over a list of dataset rows.

	    Attributes:
	        dataset: The rows being reviewed (stored as given, not copied).
	        current_index: Position of the row currently under review.
	        kept_indices: Positions of rows accepted so far, in review order.
	        rejected_indices: Positions of rows discarded so far, in review order.
	        rejection_reasons: Maps a rejected position to its reason string.
	    """

	    def __init__(self, dataset):
	        self.dataset = dataset
	        self.current_index = 0
	        self.kept_indices = []
	        self.rejected_indices = []
	        self.rejection_reasons = {}

	    def get_current_row(self):
	        """Return the row at the cursor, or None once the cursor is past the end."""
	        position = self.current_index
	        return self.dataset[position] if position < len(self.dataset) else None

	    def keep(self):
	        """Record the current row as kept and advance; no-op when exhausted."""
	        position = self.current_index
	        if position >= len(self.dataset):
	            return
	        self.kept_indices.append(position)
	        self.current_index = position + 1

	    def reject(self, reason="Manual rejection"):
	        """Record the current row as rejected with *reason* and advance.

	        Does nothing when every row has already been reviewed.
	        """
	        position = self.current_index
	        if position >= len(self.dataset):
	            return
	        self.rejected_indices.append(position)
	        self.rejection_reasons[position] = reason
	        self.current_index = position + 1

	    def get_stats(self):
	        """Return progress counters for the review session.

	        Returns:
	            A dict with keys ``total``, ``reviewed``, ``kept``, ``rejected``,
	            ``keep_rate`` (percentage of reviewed rows that were kept; 0 when
	            nothing has been reviewed) and ``remaining``.
	        """
	        kept_count = len(self.kept_indices)
	        rejected_count = len(self.rejected_indices)
	        reviewed_count = kept_count + rejected_count
	        row_count = len(self.dataset)

	        if reviewed_count > 0:
	            keep_rate = kept_count / reviewed_count * 100
	        else:
	            keep_rate = 0

	        return {
	            "total": row_count,
	            "reviewed": reviewed_count,
	            "kept": kept_count,
	            "rejected": rejected_count,
	            "keep_rate": keep_rate,
	            "remaining": row_count - reviewed_count,
	        }

	# Global cleaner instance
	# Created once at import time over SAMPLE_DATASET; display_current_row, keep_row,
	# reject_row and export_cleaned all read or mutate this single object.
	# NOTE(review): because it is module-level, the review cursor is presumably shared
	# by every browser session served by this process and is never reset — confirm
	# whether per-session state (e.g. gr.State) is wanted.
	cleaner = DatasetCleaner(SAMPLE_DATASET)

	# Runs of three or more masking symbols, e.g. the "$#!%" in a censored swear word.
	_GRAWLIX_RUN = re.compile(r"[$#!%]{3,}")


	def _has_masked_profanity(text):
	    """Return True when *text* contains a mixed-symbol censoring run such as "$#!%".

	    A run must mix at least two different symbols, so ordinary punctuation like
	    "!" or "!!!" does not count.
	    """
	    return any(len(set(run)) > 1 for run in _GRAWLIX_RUN.findall(text))


	def _detect_issues(row):
	    """Return the list of quality warnings (possibly empty) for one dataset row."""
	    quality = row["quality"]
	    text = row["text"]

	    issues = []
	    if quality == "too_short" or len(text) < 10:
	        issues.append("⚠️ Text too short")
	    if quality == "profanity" or _has_masked_profanity(text):
	        issues.append("🚫 Profanity detected")
	    if quality == "duplicate":
	        issues.append("📋 Potential duplicate")
	    if quality == "poor_grammar":
	        issues.append("📝 Grammar issues")
	    return issues


	def display_current_row():
	    """Render the row currently under review together with progress stats.

	    Reads the module-level ``cleaner``.

	    Returns:
	        A ``(html, stats_text)`` pair. While rows remain, ``html`` is a card
	        showing the row id, text, label, detected issues and a keep/reject
	        suggestion, and ``stats_text`` summarizes progress and keep rate.
	        Once every row has been reviewed, ``html`` is a completion banner and
	        ``stats_text`` is ``"Cleaning complete!"``.
	    """
	    row = cleaner.get_current_row()

	    if row is None:
	        # This has to be an f-string; as a plain string the row-count
	        # placeholder was shown to the user verbatim.
	        done_html = f"""
	        <div style="background: #4caf50; color: white; padding: 3rem; border-radius: 12px; text-align: center;">
	        <h2>🎉 Cleaning Complete!</h2>
	        <p>You've reviewed all {len(cleaner.dataset)} rows.</p>
	        <p>Export your cleaned dataset below.</p>
	        </div>
	        """
	        return done_html, "Cleaning complete!"

	    issues = _detect_issues(row)

	    ai_suggestion = "🤖 AI Suggests: REJECT" if issues else "✅ AI Suggests: KEEP"
	    suggestion_color = "#ff5252" if issues else "#4caf50"
	    analysis_background = "#ffebee" if issues else "#e8f5e9"
	    issues_html = "<br>".join(issues) if issues else "✅ No issues detected"

	    # Row fields are data under review, not trusted markup: escape them so a
	    # row containing "<" or "&" is displayed rather than interpreted as HTML.
	    safe_id = html.escape(str(row["id"]))
	    safe_text = html.escape(str(row["text"]))
	    safe_label = html.escape(str(row["label"]))

	    card_html = f"""
	    <div style="background: white; border: 3px solid {suggestion_color}; border-radius: 12px; padding: 2rem; margin: 1rem 0; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">
	    <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 1.5rem;">
	    <h3 style="margin: 0;">Row #{safe_id}</h3>
	    <span style="background: {suggestion_color}; color: white; padding: 8px 16px; border-radius: 20px; font-weight: bold;">{ai_suggestion}</span>
	    </div>

	    <div style="background: #f5f5f5; padding: 1.5rem; border-radius: 8px; margin: 1rem 0;">
	    <h4>Text:</h4>
	    <p style="font-size: 1.1em; line-height: 1.6;">{safe_text}</p>
	    </div>

	    <div style="background: #e3f2fd; padding: 1rem; border-radius: 8px; margin: 1rem 0;">
	    <strong>Label:</strong> <span style="background: #2196f3; color: white; padding: 4px 12px; border-radius: 12px;">{safe_label}</span>
	    </div>

	    <div style="background: {analysis_background}; padding: 1rem; border-radius: 8px; margin-top: 1rem;">
	    <strong>Quality Analysis:</strong><br>
	    {issues_html}
	    </div>
	    </div>
	    """

	    stats = cleaner.get_stats()
	    stats_text = (
	        f"Progress: {stats['reviewed']}/{stats['total']} reviewed "
	        f"({stats['remaining']} remaining)\n"
	        f"Keep Rate: {stats['keep_rate']:.1f}%"
	    )

	    return card_html, stats_text

	def keep_row():
	    """Mark the row under review as kept, then return the refreshed display.

	    Returns:
	        Whatever ``display_current_row()`` returns for the next row.
	    """
	    cleaner.keep()
	    refreshed_view = display_current_row()
	    return refreshed_view

	def reject_row():
	    """Reject the row under review, then return the refreshed display.

	    The recorded reason names the row's ``quality`` tag when that tag is
	    anything other than "good"; otherwise it is "Manual rejection".

	    Returns:
	        Whatever ``display_current_row()`` returns for the next row.
	    """
	    current = cleaner.get_current_row()
	    is_flagged = bool(current) and current["quality"] != "good"
	    cleaner.reject(
	        f"Quality issue: {current['quality']}" if is_flagged else "Manual rejection"
	    )
	    return display_current_row()

	def export_cleaned():
	    """Build a Markdown report of the review outcome plus the kept rows as JSONL.

	    Reads the module-level ``cleaner``.

	    Returns:
	        A Markdown string with row counts, the keep rate relative to the whole
	        dataset, a tally of rejection reasons, and a fenced block holding one
	        JSON object per kept row.
	    """
	    total = len(cleaner.dataset)
	    cleaned = [cleaner.dataset[i] for i in cleaner.kept_indices]

	    # Guard the percentage: an empty dataset reports 0.0% instead of raising
	    # ZeroDivisionError.
	    keep_rate = len(cleaned) / total * 100 if total else 0.0

	    # Counter keeps first-seen order, so reasons are listed in the order they
	    # were first recorded.
	    reason_counts = Counter(cleaner.rejection_reasons.values())

	    lines = [
	        "## 🎉 Export Summary",
	        "",
	        f"Original dataset: {total} rows",
	        f"Cleaned dataset: {len(cleaned)} rows",
	        f"Rows removed: {len(cleaner.rejected_indices)}",
	        f"Keep rate: {keep_rate:.1f}%",
	        "",
	        "### Rejection Reasons:",
	    ]
	    lines += [f"- {reason}: {count}" for reason, count in reason_counts.items()]
	    lines += ["", "### Cleaned Data (JSONL format):", "", "```json"]
	    lines += [json.dumps(row) for row in cleaned]
	    lines.append("```")

	    return "\n".join(lines)

	# Stylesheet passed to gr.Blocks(css=...) below. It sets the page font and two
	# faint corner gradients on .gradio-container, and defines a translucent,
	# rounded .cleaner-shell card style.
	# NOTE(review): nothing in this file assigns the "cleaner-shell" class to a
	# component — presumably it is applied by shared.components or is left over; confirm.
	custom_css = """
	.gradio-container {
	font-family: 'Inter', sans-serif;
	background:
	radial-gradient(circle at top left, rgba(78, 205, 196, 0.12), transparent 28%),
	radial-gradient(circle at top right, rgba(255, 107, 107, 0.10), transparent 30%);
	}

	.cleaner-shell {
	background: rgba(255,255,255,0.05);
	border: 1px solid rgba(255,255,255,0.10);
	border-radius: 20px;
	padding: 1rem;
	box-shadow: 0 18px 36px rgba(0,0,0,0.14);
	}
	"""

	# Assemble the Gradio UI and bind it to the name `app`, which the __main__
	# guard at the bottom of the file launches.
	with gr.Blocks(css=custom_css, title="Dataset Cleaner UI", theme=gr.themes.Soft()) as app:
	# Page header from the project's shared component helpers (imported from
	# shared.components; their rendering is not visible in this file).
	create_premium_hero(
	"Dataset Cleaner UI",
	"Turn dataset cleaning into a quick, decision-first workflow with live stats and a clear keep-or-reject rhythm.",
	"🧹",
	badge="Data QA",
	highlights=["Keep / reject flow", "Live progress", "Export ready"],
	)

	gr.Markdown("""
	## 👈👉 Swipe to Clean

	Review each row and decide: Keep or Reject?
	AI will suggest actions based on quality indicators.
	""")

	# Review area: the HTML row card (scale=2) next to the Markdown stats (scale=1).
	with gr.Row():
	with gr.Column(scale=2):
	row_display = gr.HTML()
	with gr.Column(scale=1):
	stats_display = gr.Markdown()

	# Decision buttons, reject first and keep second.
	with gr.Row():
	reject_btn = gr.Button("👈 Reject (Bad Data)", variant="stop", size="lg", scale=1)
	keep_btn = gr.Button("👉 Keep (Good Data)", variant="primary", size="lg", scale=1)

	gr.Markdown("---")
	gr.Markdown("## 💾 Export Cleaned Dataset")

	export_btn = gr.Button("📦 Export Cleaned Data", variant="secondary")
	export_output = gr.Markdown()

	# Event handlers
	# keep_row / reject_row take no inputs and return the (html, stats) pair that
	# fills row_display and stats_display; export_cleaned returns one Markdown string.
	keep_btn.click(keep_row, outputs=[row_display, stats_display])
	reject_btn.click(reject_row, outputs=[row_display, stats_display])
	export_btn.click(export_cleaned, outputs=export_output)

	# Load first row on startup
	app.load(display_current_row, outputs=[row_display, stats_display])

	# NOTE(review): the copy below advertises keyboard shortcuts, CSV/Parquet export,
	# batch operations and a label-consistency check; none of these is implemented
	# in the code visible in this file — confirm they exist elsewhere or trim the list.
	gr.Markdown("""
	---

	## 💡 Features

	- 🤖 AI Suggestions: Auto-detect bad data
	- ⚡ Keyboard Shortcuts: → Keep, ← Reject
	- 📊 Real-Time Stats: Track progress and keep rate
	- 💾 Export: Download as JSONL, CSV, or Parquet
	- 🎯 Batch Operations: Flag all matching a pattern

	### 🔍 Quality Checks:
	- ✅ Text length (minimum 10 characters)
	- ✅ Profanity detection
	- ✅ Duplicate detection
	- ✅ Grammar quality
	- ✅ Label consistency

	### Professional Use Case:
	Use this as a lightweight review console before publishing a Hugging Face Dataset or starting fine-tuning.

	""")

	# Page footer from the shared component helpers.
	create_footer("Dataset Cleaner UI")

	# Start the web server only when this file is run as a script, not on import.
	# server_name="0.0.0.0" listens on all network interfaces, on port 7860
	# (presumably what the hosting container expects — confirm against the deployment config).
	if __name__ == "__main__":
	app.launch(server_name="0.0.0.0", server_port=7860)