| """ |
| LAB 2: Document Ingestion & Chunking Strategy Comparison |
| ========================================================= |
| Compare Fixed-Size vs Recursive chunking with adjustable parameters. |
| FREE — No API key needed! |
| """ |
|
|
| import gradio as gr |
| from langchain_text_splitters import ( |
| CharacterTextSplitter, |
| RecursiveCharacterTextSplitter, |
| ) |
|
|
# Demo corpus fed to both splitters: three short paragraphs separated by blank
# lines. The leading and trailing newlines of the original literal are kept so
# the text handed to the splitters is unchanged.
SAMPLE_DOC = (
    "\n"
    "Artificial Intelligence is transforming every industry.\n"
    "Machine Learning is a subset of AI that enables systems to learn from data.\n"
    "Deep Learning uses neural networks with many layers to model complex patterns.\n"
    "\n"
    "Natural Language Processing (NLP) allows computers to understand human language.\n"
    "Large Language Models like GPT-4 are trained on billions of tokens of text.\n"
    "\n"
    "Retrieval Augmented Generation (RAG) combines retrieval systems with LLMs.\n"
    "RAG grounds the LLM's responses in real documents, reducing hallucinations.\n"
)
|
|
def _format_chunks(header: str, chunks: list[str]) -> str:
    """Render *chunks* as a numbered, length-annotated report under *header*."""
    divider = "-" * 40
    entries = [
        f"[Chunk {number}] ({len(chunk)} chars)\n{chunk}\n{divider}\n"
        for number, chunk in enumerate(chunks, start=1)
    ]
    # Join once instead of growing the string with += on every iteration.
    return f"{header}: {len(chunks)} chunks\n\n" + "".join(entries)


def compare_chunking(chunk_size: int, overlap: int) -> tuple[str, str]:
    """Split SAMPLE_DOC with two strategies and return a report for each.

    Args:
        chunk_size: Target chunk length in characters. Coerced with ``int()``
            before use (presumably because the UI slider may deliver a float;
            confirm against the Gradio version in use).
        overlap: Number of characters shared between neighbouring chunks.
            Coerced with ``int()`` in the same way.

    Returns:
        A ``(fixed_report, recursive_report)`` pair. Each report starts with a
        header line giving the chunk count, followed by one entry per chunk
        showing its 1-based number, its length in characters, its text, and a
        40-dash divider.
    """
    size = int(chunk_size)
    shared = int(overlap)

    # "Fixed-size" strategy: split on single spaces only.
    fixed = CharacterTextSplitter(chunk_size=size, chunk_overlap=shared, separator=" ")
    # Recursive strategy: no separators passed, so the splitter's defaults apply.
    recursive = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=shared)

    fixed_out = _format_chunks("📦 Fixed-Size", fixed.split_text(SAMPLE_DOC))
    # NOTE(review): the original glyph here was lost to mis-encoding; 🔄 is a
    # best-guess replacement. Confirm against the intended UI copy.
    recursive_out = _format_chunks("🔄 Recursive", recursive.split_text(SAMPLE_DOC))
    return fixed_out, recursive_out
|
|
# UI layout: a row with the two parameter sliders, a trigger button, and a row
# with the two result panes side by side. Components are rendered in the order
# they are created inside the context managers, so statement order matters.
with gr.Blocks(title="Lab 2: Chunking Strategies", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## ✂️ Lab 2: Chunking Strategy Comparison")
    with gr.Row():
        chunk_size = gr.Slider(100, 600, value=300, step=50, label="Chunk Size (chars)")
        # The overlap maximum (100) equals the chunk-size minimum (100), so the
        # UI can never request an overlap larger than the chunk size.
        overlap = gr.Slider(0, 100, value=30, step=10, label="Overlap (chars)")
    btn = gr.Button("✂️ Compare Chunking", variant="primary")
    with gr.Row():
        out_fixed = gr.Textbox(label="Fixed-Size Chunking", lines=18)
        out_recursive = gr.Textbox(label="Recursive Chunking", lines=18)
    # Wire the button: slider values in, the two text reports out.
    btn.click(
        fn=compare_chunking,
        inputs=[chunk_size, overlap],
        outputs=[out_fixed, out_recursive],
    )
|
|
# Start the web server only when the file is executed as a script, so that
# importing this module (e.g. to reuse compare_chunking) has no side effects.
if __name__ == "__main__":
    demo.launch()