""" LAB 2: Document Ingestion & Chunking Strategy Comparison ========================================================= Compare Fixed-Size vs Recursive chunking with adjustable parameters. FREE — No API key needed! """ import gradio as gr from langchain_text_splitters import ( CharacterTextSplitter, RecursiveCharacterTextSplitter, ) SAMPLE_DOC = """ Artificial Intelligence is transforming every industry. Machine Learning is a subset of AI that enables systems to learn from data. Deep Learning uses neural networks with many layers to model complex patterns. Natural Language Processing (NLP) allows computers to understand human language. Large Language Models like GPT-4 are trained on billions of tokens of text. Retrieval Augmented Generation (RAG) combines retrieval systems with LLMs. RAG grounds the LLM's responses in real documents, reducing hallucinations. """ def compare_chunking(chunk_size: int, overlap: int) -> tuple[str, str]: fixed = CharacterTextSplitter( chunk_size=int(chunk_size), chunk_overlap=int(overlap), separator=" " ) recursive = RecursiveCharacterTextSplitter( chunk_size=int(chunk_size), chunk_overlap=int(overlap) ) fixed_chunks = fixed.split_text(SAMPLE_DOC) recursive_chunks = recursive.split_text(SAMPLE_DOC) fixed_out = f"📦 Fixed-Size: {len(fixed_chunks)} chunks\n\n" for i, c in enumerate(fixed_chunks): fixed_out += f"[Chunk {i+1}] ({len(c)} chars)\n{c}\n{'-'*40}\n" recursive_out = f"🔄 Recursive: {len(recursive_chunks)} chunks\n\n" for i, c in enumerate(recursive_chunks): recursive_out += f"[Chunk {i+1}] ({len(c)} chars)\n{c}\n{'-'*40}\n" return fixed_out, recursive_out with gr.Blocks(title="Lab 2: Chunking Strategies", theme=gr.themes.Soft()) as demo: gr.Markdown("## ✂️ Lab 2: Chunking Strategy Comparison") with gr.Row(): chunk_size = gr.Slider(100, 600, value=300, step=50, label="Chunk Size (chars)") overlap = gr.Slider(0, 100, value=30, step=10, label="Overlap (chars)") btn = gr.Button("✂️ Compare Chunking", variant="primary") with gr.Row(): out_fixed = gr.Textbox(label="Fixed-Size Chunking", lines=18) out_recursive = gr.Textbox(label="Recursive Chunking", lines=18) btn.click(fn=compare_chunking, inputs=[chunk_size, overlap], outputs=[out_fixed, out_recursive]) demo.launch()