Spaces:

axelsirota
/

chunking-visualizer

Build error

App Files Files Community

chunking-visualizer / app.py

axelsirota

Upload folder using huggingface_hub

d49e945 verified about 1 month ago

raw

history blame contribute delete

9.49 kB

	import gradio as gr
	import re

	# Built-in sample documents, keyed by the label shown in the
	# "Load Sample Document" dropdown. Each value is the raw text that gets
	# copied into the document textbox when that sample is selected.
	SAMPLE_DOCS = {
	"FAQ Document": """Q: What is your return policy?
	A: You can return most items within 30 days of purchase for a full refund. Items must be in original condition with tags attached.

	Q: How long does shipping take?
	A: Standard shipping takes 5-7 business days. Express shipping takes 2-3 business days.

	Q: Do you offer international shipping?
	A: Yes, we ship to over 50 countries. International shipping typically takes 10-14 business days.

	Q: How do I track my order?
	A: Once your order ships, you'll receive an email with tracking information. You can also check order status in your account.

	Q: What payment methods do you accept?
	A: We accept Visa, Mastercard, American Express, PayPal, and Apple Pay.""",

	"Product Documentation": """Smart Thermostat Pro - User Guide

	Installation:
	Turn off power at the circuit breaker before beginning installation. Remove your old thermostat and take a photo of the wiring. The Smart Thermostat Pro is compatible with most 24V heating and cooling systems.

	Setup:
	Download the SmartHome app and create an account. The thermostat will automatically enter pairing mode when powered on. Follow the in-app instructions to connect to your WiFi network.

	Daily Use:
	The touchscreen displays current temperature and humidity. Swipe left or right to adjust target temperature. Tap the calendar icon to view and edit your schedule.

	Energy Saving Features:
	Auto-Away detects when you leave and adjusts temperature to save energy. The monthly energy report shows your usage patterns and savings. Eco mode reduces heating/cooling by 2 degrees to save up to 15% on energy bills.

	Troubleshooting:
	If the display is blank, check that power is connected at the circuit breaker. If WiFi won't connect, ensure your network is 2.4GHz (5GHz is not supported). For heating/cooling issues, verify the system wires match the terminal labels.""",

	"Policy Document": """Employee Remote Work Policy

	1. Eligibility
	All full-time employees who have completed their probationary period are eligible for remote work. Certain roles requiring physical presence are exempt from this policy.

	2. Core Hours
	Remote employees must be available from 10am to 3pm in their local timezone. This ensures overlap for team collaboration and meetings.

	3. Equipment
	The company provides a laptop and external monitor. Employees are responsible for maintaining reliable internet connectivity with minimum 25 Mbps speed.

	4. Communication
	Employees must respond to messages within 2 hours during core hours. All meetings should be attended with camera on unless otherwise specified.

	5. Performance
	Remote work privileges are contingent on meeting performance expectations. Managers will review remote work arrangements quarterly.

	6. Expenses
	Home office setup stipend: $500 one-time. Monthly internet reimbursement: up to $50. Coworking space usage requires pre-approval."""
	}


	def chunk_fixed_size(text, size, overlap_pct):
	    """Split *text* into fixed-size character windows with optional overlap.

	    Args:
	        text: The string to split.
	        size: Window length in characters. Floats (e.g. slider values) are
	            truncated to an int.
	        overlap_pct: Percentage of ``size`` that each window shares with the
	            previous one. Values that would leave no forward progress are
	            clamped so the window always advances by at least one character.

	    Returns:
	        A list of substrings in document order. Empty text yields ``[]``.

	    Raises:
	        ValueError: If ``size`` is smaller than 1.
	    """
	    size = int(size)
	    if size < 1:
	        raise ValueError("size must be at least 1")

	    # Keep the overlap inside [0, size - 1] so every step moves forward.
	    overlap = int(size * overlap_pct / 100)
	    overlap = min(max(overlap, 0), size - 1)
	    step = size - overlap

	    text_len = len(text)
	    chunks = []
	    start = 0
	    while start < text_len:
	        end = min(start + size, text_len)
	        chunks.append(text[start:end])
	        # Stop once the window reaches the end of the text; stepping back by
	        # the overlap here would re-emit the same tail chunk forever.
	        if end == text_len:
	            break
	        start += step
	    return chunks


	def chunk_sentence(text, sentences_per_chunk):
	    """Group consecutive sentences of *text* into chunks.

	    Sentences are split on whitespace that follows ``.``, ``!`` or ``?``.

	    Args:
	        text: The string to split.
	        sentences_per_chunk: How many sentences go into each chunk. Floats
	            are truncated to an int and values below 1 are treated as 1.

	    Returns:
	        A list of stripped, non-empty chunks; sentences inside a chunk are
	        joined with a single space.
	    """
	    # Coerce and clamp so range() never sees a float, zero or negative step.
	    per_chunk = max(1, int(sentences_per_chunk))
	    sentences = re.split(r'(?<=[.!?])\s+', text)
	    grouped = (
	        ' '.join(sentences[i:i + per_chunk]).strip()
	        for i in range(0, len(sentences), per_chunk)
	    )
	    return [chunk for chunk in grouped if chunk]


	def chunk_paragraph(text):
	    """Split *text* into paragraphs separated by blank lines.

	    A blank line is any run of line breaks that contains only whitespace, so
	    Windows (CRLF) line endings and lines holding stray spaces or tabs
	    still count as paragraph separators.

	    Args:
	        text: The string to split.

	    Returns:
	        A list of stripped, non-empty paragraphs in document order.
	    """
	    paragraphs = re.split(r'\n\s*\n', text)
	    return [p.strip() for p in paragraphs if p.strip()]


	def chunk_qa_pairs(text):
	    """Split an FAQ-style document into one chunk per question/answer pair.

	    A pair starts at ``Q:``, must contain an ``A:``, and runs until the next
	    ``Q:`` or the end of the text.

	    Args:
	        text: The string to split.

	    Returns:
	        A list of stripped ``Q: ... A: ...`` chunks; ``[]`` when the text
	        contains no such pair.
	    """
	    # Lazy ``.*?`` keeps each match to a single pair; DOTALL lets questions
	    # and answers span line breaks. The lookahead stops before the next pair.
	    pattern = r'(Q:.*?A:.*?)(?=Q:|$)'
	    matches = re.findall(pattern, text, re.DOTALL)
	    return [m.strip() for m in matches if m.strip()]


	def _ends_mid_sentence(chunk, index, total):
	    """Return True when a non-final chunk lacks closing sentence punctuation."""
	    is_last = index >= total - 1
	    return not is_last and not chunk.rstrip().endswith(('.', '!', '?', '"'))


	def visualize_chunks(text, strategy, chunk_size, overlap_pct, sentences_per_chunk):
	    """Chunk *text* with the chosen strategy and build Markdown reports.

	    Args:
	        text: Document text to chunk. ``None``, empty or whitespace-only
	            text returns a prompt message instead of a report.
	        strategy: One of "Fixed Size", "Sentence-Based", "Paragraph-Based"
	            or "Q&A Pairs". Any other value keeps the text as one chunk.
	        chunk_size: Window size passed to the fixed-size strategy.
	        overlap_pct: Overlap percentage passed to the fixed-size strategy.
	        sentences_per_chunk: Group size passed to the sentence strategy.

	    Returns:
	        A ``(stats, chunk_display, summary)`` tuple of strings: a Markdown
	        statistics table with detected issues, a Markdown preview of the
	        first ten chunks, and a one-line summary.
	    """
	    if not text or not text.strip():
	        return "Please provide text to chunk.", "", ""

	    # Apply chunking strategy; unknown strategies keep the text whole.
	    if strategy == "Fixed Size":
	        chunks = chunk_fixed_size(text, chunk_size, overlap_pct)
	    elif strategy == "Sentence-Based":
	        chunks = chunk_sentence(text, sentences_per_chunk)
	    elif strategy == "Paragraph-Based":
	        chunks = chunk_paragraph(text)
	    elif strategy == "Q&A Pairs":
	        chunks = chunk_qa_pairs(text)
	    else:
	        chunks = [text]

	    if not chunks:
	        return "No chunks generated. Try a different strategy.", "", ""

	    # Calculate stats
	    total = len(chunks)
	    sizes = [len(c) for c in chunks]
	    total_chars = sum(sizes)
	    avg_size = total_chars / total
	    min_size = min(sizes)
	    max_size = max(sizes)

	    # Flag every chunk (except the last) that stops without sentence
	    # punctuation; reused below for the per-chunk warning labels.
	    mid_sentence = [_ends_mid_sentence(c, i, total) for i, c in enumerate(chunks)]
	    split_sentences = sum(mid_sentence)

	    problems = []
	    if split_sentences > 0:
	        problems.append(f"⚠️ {split_sentences} chunks end mid-sentence")
	    if min_size < 50:
	        problems.append(f"⚠️ Some chunks are very small ({min_size} chars)")
	    if max_size > 2000:
	        problems.append(f"⚠️ Some chunks are very large ({max_size} chars)")

	    # Stats display. Plain "|" separators are required for Markdown to
	    # render this as a table.
	    stats_lines = [
	        "### Chunking Statistics",
	        "",
	        "| Metric | Value |",
	        "|--------|-------|",
	        f"| Total Chunks | {total} |",
	        f"| Average Size | {avg_size:.0f} characters |",
	        f"| Min Size | {min_size} characters |",
	        f"| Max Size | {max_size} characters |",
	        f"| Total Characters | {total_chars} |",
	        "",
	        "",
	    ]
	    stats = "\n".join(stats_lines)
	    if problems:
	        stats += "### ⚠️ Potential Issues\n" + "\n".join(problems)
	    else:
	        stats += "### ✅ No obvious issues detected"

	    # Chunk preview: first ten chunks, each truncated to 200 characters.
	    sections = ["### Chunk Preview\n\n"]
	    for i, chunk in enumerate(chunks[:10]):
	        warning = " ⚠️ ends mid-sentence" if mid_sentence[i] else ""
	        preview = chunk[:200] + "..." if len(chunk) > 200 else chunk
	        sections.append(
	            f"Chunk {i+1} ({len(chunk)} chars){warning}\n```\n{preview}\n```\n\n"
	        )
	    if total > 10:
	        sections.append(f"... and {total - 10} more chunks")
	    chunk_display = "".join(sections)

	    return stats, chunk_display, f"Strategy: {strategy} | Chunks: {total}"


	def load_sample(sample_name):
	    """Look up a bundled sample document by its dropdown label.

	    Returns the document text, or an empty string when the label is unknown.
	    """
	    if sample_name in SAMPLE_DOCS:
	        return SAMPLE_DOCS[sample_name]
	    return ""


	# Assemble the Gradio UI: controls in the left column, results in the right.
	with gr.Blocks(title="Chunking Visualizer", theme=gr.themes.Soft()) as demo:
	    # Page header and framing text.
	    gr.Markdown(
	        "# Chunking Visualizer\n\n"
	        "PM Decision: Your engineering team says they'll 'chunk the documents.' "
	        "This tool shows you exactly what that means and helps you spot potential problems "
	        "before they affect retrieval quality.\n\n"
	        "Try different strategies and see how they split your documents."
	    )

	    with gr.Row():
	        # Left column: document source and chunking controls.
	        with gr.Column(scale=1):
	            sample_dropdown = gr.Dropdown(
	                label="Load Sample Document",
	                choices=list(SAMPLE_DOCS),
	                value="FAQ Document",
	            )
	            text_input = gr.Textbox(
	                label="Document Text",
	                value=SAMPLE_DOCS["FAQ Document"],
	                placeholder="Paste your document here...",
	                lines=12,
	            )

	            strategy = gr.Radio(
	                label="Chunking Strategy",
	                choices=["Fixed Size", "Sentence-Based", "Paragraph-Based", "Q&A Pairs"],
	                value="Fixed Size",
	            )

	            with gr.Row():
	                chunk_size = gr.Slider(
	                    minimum=100, maximum=1000, value=300, step=50,
	                    label="Chunk Size (chars)",
	                )
	                overlap = gr.Slider(
	                    minimum=0, maximum=50, value=10, step=5,
	                    label="Overlap (%)",
	                )

	            sentences = gr.Slider(
	                minimum=1, maximum=10, value=3, step=1,
	                label="Sentences per Chunk",
	            )

	            visualize_btn = gr.Button("Visualize Chunks", variant="primary")

	        # Right column: summary line, statistics table and chunk preview.
	        with gr.Column(scale=1):
	            summary_output = gr.Textbox(label="Summary", interactive=False)
	            stats_output = gr.Markdown(label="Statistics")
	            chunks_output = gr.Markdown(label="Chunks")

	    # Picking a sample replaces the document text.
	    sample_dropdown.change(fn=load_sample, inputs=sample_dropdown, outputs=text_input)

	    # The button click and a strategy change both re-run the visualization
	    # with the same inputs and outputs (click is registered first).
	    for _register in (visualize_btn.click, strategy.change):
	        _register(
	            visualize_chunks,
	            inputs=[text_input, strategy, chunk_size, overlap, sentences],
	            outputs=[stats_output, chunks_output, summary_output],
	        )

	    # Page footer.
	    gr.Markdown(
	        "---\n"
	        "PM Takeaway: The right chunking strategy depends on your document type. "
	        "FAQs work best with Q&A pair chunking. Product docs work with paragraph or sentence-based. "
	        "Always test with real queries to verify retrieval quality.\n\n"
	        "AI for Product Managers"
	    )

	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()