Spaces:

Soha85
/

RAG_Study

Running

App Files Community

Soha85 committed 8 days ago

Commit

0fe3898

verified ·

1 Parent(s): f2f9885

Update index.html

Browse files

Files changed (1) hide show

index.html +126 -52

index.html CHANGED Viewed

@@ -3,7 +3,7 @@
 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>RAG</title>
     <script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/3.9.1/chart.min.js"></script>
     <style>
         * {
@@ -330,15 +330,61 @@
         .chunking-brief .reference a:hover {
             text-decoration: underline;
         }
     </style>
 </head>
 <body>
     <div class="container">
-        <h1>🔍 RAG Dashboard</h1>
         <div class="tabs">
             <button class="tab active" onclick="switchTab('rag')">RAG Types</button>
-            <button class="tab" onclick="switchTab('chunking')">Chunking</button>
         </div>
         <!-- RAG TYPES TAB -->
@@ -405,13 +451,12 @@
                     <div class="chart-container">
                         <canvas id="chunkingChart"></canvas>
                     </div>
                 </div>
                 <div class="card">
-                    <h2>Chunking Evolution Timeline</h2>
-                    <div class="chart-container">
-                        <canvas id="chunkingTimelineChart"></canvas>
-                    </div>
                 </div>
             </div>
@@ -592,6 +637,7 @@
             {
                 type: "Fixed-size Chunking",
                 year: 2001,
                 workflow: "Input Document → Split by Fixed Token/Word Count (200-500 tokens) → Create Chunks → Output Chunks",
                 description: "Splits text into chunks based on a fixed number of tokens/words (e.g., 200-500 tokens). Simple but may break semantic boundaries.",
                 references: "Collobert, R., et al. (2011). Natural language processing (almost) from scratch. Journal of Machine Learning Research 12:2493-2537.",
@@ -600,6 +646,7 @@
             {
                 type: "Sliding Window Chunking",
                 year: 1998,
                 workflow: "Input Document → Define Chunk Size & Overlap → Slide Window (e.g., size 300, overlap 100) → Create Overlapping Chunks → Output Chunks",
                 description: "Uses a fixed chunk size plus overlap (e.g., chunk size 300, overlap 100). Maintains context continuity across chunks.",
                 references: "Carbonell, J. & Goldstein, J. (1998). The use of MMR, diversity-based reranking for reordering documents. SIGIR 1998.",
@@ -608,6 +655,7 @@
             {
                 type: "Semantic Chunking",
                 year: 2024,
                 workflow: "Input Document → Analyze Topic Shifts/Semantic Boundaries → Split at Semantic Breaks → Create Semantically Coherent Chunks → Output Chunks",
                 description: "Chunks based on topic shifts or semantic boundaries rather than size. Preserves meaning and context.",
                 references: "LangChain (2024). Semantic chunker. Accessed 2024-09-14.",
@@ -616,6 +664,7 @@
             {
                 type: "Sentence-based Chunking",
                 year: 2024,
                 workflow: "Input Document → Split by Sentences → Group Sentences (Optional) → Create Sentence Chunks → Output Chunks",
                 description: "Splits text by sentences. Maintains grammatical integrity and natural language boundaries.",
                 references: "Dong, K., et al. (2024). Multi-view content-aware indexing for long document retrieval. arXiv:2404.15103",
@@ -624,6 +673,7 @@
             {
                 type: "Paragraph-based Chunking",
                 year: 2014,
                 workflow: "Input Document → Identify Paragraph Boundaries → Split at Paragraphs → Create Paragraph Chunks → Output Chunks",
                 description: "Uses natural paragraph boundaries for chunking. Preserves document structure and thematic grouping.",
                 references: "Dudhabaware, R. S., et al. (2014). Review on natural language processing tasks for text documents. IEEE ICCIC 2014.",
@@ -632,22 +682,25 @@
             {
                 type: "Recursive/Hierarchical Chunking",
                 year: 2023,
                 workflow: "Input Document → Break into Sections → Split Sections into Paragraphs → Split Paragraphs into Sentences → Split Sentences into Tokens → Output Multi-level Chunks",
                 description: "Breaks document into progressively smaller units (section → paragraph → sentence → tokens). Enables multi-level retrieval.",
-                references: "LangChain (2023). Recursively split by character. Accessed 2024-09-14. | Latif, S., et al. (2025). The Chunking Paradigm: Recursive Semantic for RAG. ICNLSP 2025.",
                 paperUrl: "https://aclanthology.org/2025.icnlsp-1.16/"
             },
             {
                 type: "Hybrid Chunking",
                 year: 2024,
                 workflow: "Input Document → Apply Semantic Analysis → Apply Fixed-size Constraint → Apply Overlap Strategy → Create Hybrid Chunks → Output Chunks",
                 description: "Combines semantic + fixed-size + overlap strategies. Balances context preservation with practical constraints.",
-                references: "Kamradt, G. (2024). 5 levels of text splitting. LangChain blog.",
-                paperUrl: "https://python.langchain.com/docs/how_to/semantic-chunker/"
             },
             {
                 type: "Discourse-aware Chunking",
                 year: 2005,
                 workflow: "Input Document → Identify Discourse Markers (however, in contrast, therefore) → Detect Conceptual Shifts → Split at Discourse Boundaries → Output Discourse Chunks",
                 description: "Uses discourse markers (e.g., 'however', 'in contrast', 'therefore') to detect conceptual shifts and chunk accordingly.",
                 references: "Sporleder, C. & Lapata, M. (2005). Discourse Chunking and its Application to Sentence Compression. HLT 2005.",
@@ -656,14 +709,16 @@
             {
                 type: "Embedding-similarity-based Chunking",
                 year: 2024,
                 workflow: "Input Document → Generate Sentence Embeddings → Calculate Similarity Scores → Detect Similarity Drops Below Threshold → Split at Low Similarity Points → Output Chunks",
                 description: "Sequentially processes text and opens a new chunk when embedding similarity drops below a threshold. Data-driven approach.",
-                references: "Kamradt, G. (2024). 5 levels of text splitting. LangChain blog.",
-                paperUrl: "https://python.langchain.com/docs/how_to/semantic-chunker/"
             },
             {
                 type: "Metadata-based Chunking",
                 year: 2025,
                 workflow: "Input Document → Extract Metadata (headers, bullets, sections) → Split Using Metadata Structure → Create Metadata-aware Chunks → Output Chunks",
                 description: "Splits text using document metadata (e.g., headers, bullets). Preserves document structure and hierarchy.",
                 references: "Zhao, J., et al. (2025). MoC: Mixtures of Text Chunking Learners for Retrieval-Augmented Generation. ACL 2025.",
@@ -672,14 +727,16 @@
             {
                 type: "Page-based Chunking",
                 year: 2024,
                 workflow: "Input PDF/Scanned Document → Identify Page Boundaries → Split by Page → Create Page Chunks → Output Chunks",
                 description: "Splits by PDF page or scanned document page. Useful for document-level retrieval and citation.",
-                references: "Standard PDF processing practice. No specific paper required.",
-                paperUrl: "#"
             },
             {
                 type: "Domain-specific Chunking",
                 year: 2024,
                 workflow: "Input Domain Document → Apply Domain Rules (legal sections, medical records, code functions) → Split by Domain Logic → Create Domain Chunks → Output Chunks",
                 description: "Tailored to content type (e.g., legal documents by sections, medical records by encounters, code by functions).",
                 references: "Allamraju, A., et al. (2024). Breaking It Down: Domain-Aware Semantic Segmentation for Retrieval Augmented Generation. arXiv:2512.00367",
@@ -687,6 +744,14 @@
             }
         ];
         const mainCategoryDefinitions = {
             'Foundational': 'The baseline RAG approach establishing core retrieval-augmented generation principles with simple, fixed pipelines.',
             'Modular': 'Focuses on pipeline flexibility through specialized, interchangeable components and intermediate operations between query and generation.',
@@ -889,8 +954,16 @@
                 if (selected) {
                     createFlowchart(selected.workflow, 'chunkingFlowchart');
                     document.getElementById('chunkingDescription').textContent = selected.description;
-                    document.getElementById('chunkingReference').innerHTML =
-                        `📄 <a href="${selected.paperUrl}" target="_blank">${selected.references}</a>`;
                     details.style.display = 'block';
                 } else {
                     details.style.display = 'none';
@@ -900,12 +973,7 @@
             // Chunking methods chart
             const chunkingTypes = {};
             chunkingData.forEach(chunk => {
-                const category = chunk.type.includes('Semantic') || chunk.type.includes('Embedding') ? 'Semantic-based' :
-                                chunk.type.includes('Fixed') || chunk.type.includes('Sliding') ? 'Size-based' :
-                                chunk.type.includes('Sentence') || chunk.type.includes('Paragraph') ? 'Structure-based' :
-                                chunk.type.includes('Recursive') || chunk.type.includes('Hierarchical') ? 'Hierarchical' :
-                                'Domain-specific';
-                chunkingTypes[category] = (chunkingTypes[category] || 0) + 1;
             });
             new Chart(document.getElementById('chunkingChart'), {
@@ -928,37 +996,35 @@
                 }
             });
-            // Chunking timeline
-            const chunkingYearCount = {};
-            chunkingData.forEach(chunk => {
-                chunkingYearCount[chunk.year] = (chunkingYearCount[chunk.year] || 0) + 1;
             });
-            new Chart(document.getElementById('chunkingTimelineChart'), {
-                type: 'line',
-                data: {
-                    labels: Object.keys(chunkingYearCount).sort(),
-                    datasets: [{
-                        label: 'Chunking Methods',
-                        data: Object.keys(chunkingYearCount).sort().map(year => chunkingYearCount[year]),
-                        borderColor: '#667eea',
-                        backgroundColor: 'rgba(102, 126, 234, 0.1)',
-                        tension: 0.4,
-                        fill: true
-                    }]
-                },
-                options: {
-                    responsive: true,
-                    maintainAspectRatio: false,
-                    scales: {
-                        y: {
-                            beginAtZero: true,
-                            ticks: {
-                                stepSize: 1
-                            }
-                        }
-                    }
-                }
             });
             // Chunking year timeline
@@ -966,11 +1032,19 @@
             chunkingData.forEach(chunk => {
                 const item = document.createElement('div');
                 item.className = 'timeline-item';
                 item.innerHTML = `
                     <div class="timeline-year">${chunk.year}</div>
                     <div class="timeline-content">
                         <div class="timeline-type">${chunk.type}</div>
-                        <div class="timeline-reference">📄 <a href="${chunk.paperUrl}" target="_blank">${chunk.references}</a></div>
                     </div>
                 `;
                 chunkingYearTimeline.appendChild(item);

 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>RAG & Chunking Visualization</title>
     <script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/3.9.1/chart.min.js"></script>
     <style>
         * {
         .chunking-brief .reference a:hover {
             text-decoration: underline;
         }
+        .category-tree { /* Wrapper style for the #categoryHierarchy container that holds the per-category cards. */
+            background: #f8f9fa;
+            padding: 20px;
+            border-radius: 10px;
+            margin-top: 20px;
+        }
+        .category-node { /* One card per chunking category; the script assigns this class to each generated node. */
+            margin: 15px 0;
+            padding: 15px;
+            background: white;
+            border-radius: 8px;
+            border-left: 4px solid #667eea;
+            box-shadow: 0 2px 5px rgba(0,0,0,0.1);
+        }
+        .category-node h4 { /* Category name heading inside a card. */
+            color: #667eea;
+            margin-bottom: 10px;
+            font-size: 1.1em;
+        }
+        .category-node .brief { /* Italic one-line category summary rendered under the heading. */
+            color: #666;
+            font-size: 0.9em;
+            font-style: italic;
+            margin-bottom: 10px;
+            line-height: 1.5;
+        }
+        .category-methods { /* Wrapping flex row that lays out the method tags of one category. */
+            display: flex;
+            flex-wrap: wrap;
+            gap: 8px;
+            margin-top: 10px;
+        }
+        .method-tag { /* Rounded gradient label showing a single chunking method name. */
+            background: linear-gradient(135deg, #667eea, #764ba2);
+            color: white;
+            padding: 6px 12px;
+            border-radius: 15px;
+            font-size: 0.85em;
+            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+        }
     </style>
 </head>
 <body>
     <div class="container">
+        <h1>🔍 RAG & Chunking Visualization Dashboard</h1>
         <div class="tabs">
             <button class="tab active" onclick="switchTab('rag')">RAG Types</button>
+            <button class="tab" onclick="switchTab('chunking')">Chunking Methods</button>
         </div>
         <!-- RAG TYPES TAB -->
                     <div class="chart-container">
                         <canvas id="chunkingChart"></canvas>
                     </div>
+                    <div id="chunkingCategoryDefs"></div>
                 </div>
                 <div class="card">
+                    <h2>Chunking Categories & Methods</h2>
+                    <div id="categoryHierarchy" class="category-tree"></div>
                 </div>
             </div>
             {
                 type: "Fixed-size Chunking",
                 year: 2001,
+                category: "Size-based",
                 workflow: "Input Document → Split by Fixed Token/Word Count (200-500 tokens) → Create Chunks → Output Chunks",
                 description: "Splits text into chunks based on a fixed number of tokens/words (e.g., 200-500 tokens). Simple but may break semantic boundaries.",
                 references: "Collobert, R., et al. (2011). Natural language processing (almost) from scratch. Journal of Machine Learning Research 12:2493-2537.",
             {
                 type: "Sliding Window Chunking",
                 year: 1998,
+                category: "Size-based",
                 workflow: "Input Document → Define Chunk Size & Overlap → Slide Window (e.g., size 300, overlap 100) → Create Overlapping Chunks → Output Chunks",
                 description: "Uses a fixed chunk size plus overlap (e.g., chunk size 300, overlap 100). Maintains context continuity across chunks.",
                 references: "Carbonell, J. & Goldstein, J. (1998). The use of MMR, diversity-based reranking for reordering documents. SIGIR 1998.",
             {
                 type: "Semantic Chunking",
                 year: 2024,
+                category: "Semantic-based",
                 workflow: "Input Document → Analyze Topic Shifts/Semantic Boundaries → Split at Semantic Breaks → Create Semantically Coherent Chunks → Output Chunks",
                 description: "Chunks based on topic shifts or semantic boundaries rather than size. Preserves meaning and context.",
                 references: "LangChain (2024). Semantic chunker. Accessed 2024-09-14.",
             {
                 type: "Sentence-based Chunking",
                 year: 2024,
+                category: "Structure-based",
                 workflow: "Input Document → Split by Sentences → Group Sentences (Optional) → Create Sentence Chunks → Output Chunks",
                 description: "Splits text by sentences. Maintains grammatical integrity and natural language boundaries.",
                 references: "Dong, K., et al. (2024). Multi-view content-aware indexing for long document retrieval. arXiv:2404.15103",
             {
                 type: "Paragraph-based Chunking",
                 year: 2014,
+                category: "Structure-based",
                 workflow: "Input Document → Identify Paragraph Boundaries → Split at Paragraphs → Create Paragraph Chunks → Output Chunks",
                 description: "Uses natural paragraph boundaries for chunking. Preserves document structure and thematic grouping.",
                 references: "Dudhabaware, R. S., et al. (2014). Review on natural language processing tasks for text documents. IEEE ICCIC 2014.",
             {
                 type: "Recursive/Hierarchical Chunking",
                 year: 2023,
+                category: "Hierarchical",
                 workflow: "Input Document → Break into Sections → Split Sections into Paragraphs → Split Paragraphs into Sentences → Split Sentences into Tokens → Output Multi-level Chunks",
                 description: "Breaks document into progressively smaller units (section → paragraph → sentence → tokens). Enables multi-level retrieval.",
+                references: "Latif, S., et al. (2025). The Chunking Paradigm: Recursive Semantic for RAG. ICNLSP 2025.",
                 paperUrl: "https://aclanthology.org/2025.icnlsp-1.16/"
             },
             {
                 type: "Hybrid Chunking",
                 year: 2024,
+                category: "Semantic-based",
                 workflow: "Input Document → Apply Semantic Analysis → Apply Fixed-size Constraint → Apply Overlap Strategy → Create Hybrid Chunks → Output Chunks",
                 description: "Combines semantic + fixed-size + overlap strategies. Balances context preservation with practical constraints.",
+                references: "Kamradt, G. (2024). 5 levels of text splitting.",
+                paperUrl: ""
             },
             {
                 type: "Discourse-aware Chunking",
                 year: 2005,
+                category: "Semantic-based",
                 workflow: "Input Document → Identify Discourse Markers (however, in contrast, therefore) → Detect Conceptual Shifts → Split at Discourse Boundaries → Output Discourse Chunks",
                 description: "Uses discourse markers (e.g., 'however', 'in contrast', 'therefore') to detect conceptual shifts and chunk accordingly.",
                 references: "Sporleder, C. & Lapata, M. (2005). Discourse Chunking and its Application to Sentence Compression. HLT 2005.",
             {
                 type: "Embedding-similarity-based Chunking",
                 year: 2024,
+                category: "Semantic-based",
                 workflow: "Input Document → Generate Sentence Embeddings → Calculate Similarity Scores → Detect Similarity Drops Below Threshold → Split at Low Similarity Points → Output Chunks",
                 description: "Sequentially processes text and opens a new chunk when embedding similarity drops below a threshold. Data-driven approach.",
+                references: "Kamradt, G. (2024). 5 levels of text splitting.",
+                paperUrl: ""
             },
             {
                 type: "Metadata-based Chunking",
                 year: 2025,
+                category: "Structure-based",
                 workflow: "Input Document → Extract Metadata (headers, bullets, sections) → Split Using Metadata Structure → Create Metadata-aware Chunks → Output Chunks",
                 description: "Splits text using document metadata (e.g., headers, bullets). Preserves document structure and hierarchy.",
                 references: "Zhao, J., et al. (2025). MoC: Mixtures of Text Chunking Learners for Retrieval-Augmented Generation. ACL 2025.",
             {
                 type: "Page-based Chunking",
                 year: 2024,
+                category: "Structure-based",
                 workflow: "Input PDF/Scanned Document → Identify Page Boundaries → Split by Page → Create Page Chunks → Output Chunks",
                 description: "Splits by PDF page or scanned document page. Useful for document-level retrieval and citation.",
+                references: "",
+                paperUrl: ""
             },
             {
                 type: "Domain-specific Chunking",
                 year: 2024,
+                category: "Domain-specific",
                 workflow: "Input Domain Document → Apply Domain Rules (legal sections, medical records, code functions) → Split by Domain Logic → Create Domain Chunks → Output Chunks",
                 description: "Tailored to content type (e.g., legal documents by sections, medical records by encounters, code by functions).",
                 references: "Allamraju, A., et al. (2024). Breaking It Down: Domain-Aware Semantic Segmentation for Retrieval Augmented Generation. arXiv:2512.00367",
             }
         ];
+        const chunkingCategoryBriefs = { // Maps each chunking category name to its one-sentence summary.
+            'Size-based': 'Chunks text by fixed sizes or overlapping windows. Simple, predictable, but may break context.', // Keys are compared against the `category` field of chunkingData entries.
+            'Semantic-based': 'Identifies meaning, topics, or discourse patterns to create contextually coherent chunks.',
+            'Structure-based': 'Uses natural document structure (sentences, paragraphs, metadata) for boundaries.',
+            'Hierarchical': 'Creates multi-level chunks (sections → paragraphs → sentences) for flexible retrieval.',
+            'Domain-specific': 'Applies domain knowledge and rules tailored to specific content types (legal, medical, code).'
+        }; // Key order matters: the rendering loops walk Object.keys() of this object, so cards appear in this order.
         const mainCategoryDefinitions = {
             'Foundational': 'The baseline RAG approach establishing core retrieval-augmented generation principles with simple, fixed pipelines.',
             'Modular': 'Focuses on pipeline flexibility through specialized, interchangeable components and intermediate operations between query and generation.',
                 if (selected) {
                     createFlowchart(selected.workflow, 'chunkingFlowchart');
                     document.getElementById('chunkingDescription').textContent = selected.description;
+                    const refElement = document.getElementById('chunkingReference');
+                    if (selected.references && selected.paperUrl) {
+                        refElement.innerHTML = `📄 <a href="${selected.paperUrl}" target="_blank">${selected.references}</a>`;
+                    } else if (selected.references) {
+                        refElement.textContent = `📄 ${selected.references}`;
+                    } else {
+                        refElement.textContent = '';
+                    }
                     details.style.display = 'block';
                 } else {
                     details.style.display = 'none';
             // Chunking methods chart
             const chunkingTypes = {};
             chunkingData.forEach(chunk => {
+                chunkingTypes[chunk.category] = (chunkingTypes[chunk.category] || 0) + 1;
             });
             new Chart(document.getElementById('chunkingChart'), {
                 }
             });
+            // Add category definitions
+            const chunkingCategoryDefsContainer = document.getElementById('chunkingCategoryDefs');
+            Object.keys(chunkingCategoryBriefs).forEach(category => {
+                const div = document.createElement('div');
+                div.className = 'category-definition def-modular';
+                div.innerHTML = `
+                    <strong>${category}</strong>
+                    <p>${chunkingCategoryBriefs[category]}</p>
+                `;
+                chunkingCategoryDefsContainer.appendChild(div);
             });
+            // Create hierarchical category view
+            const categoryHierarchy = document.getElementById('categoryHierarchy');
+            Object.keys(chunkingCategoryBriefs).forEach(category => {
+                const methods = chunkingData.filter(c => c.category === category);
+                const node = document.createElement('div');
+                node.className = 'category-node';
+                const methodTags = methods.map(m =>
+                    `<span class="method-tag">${m.type}</span>`
+                ).join('');
+                node.innerHTML = `
+                    <h4>${category}</h4>
+                    <div class="brief">${chunkingCategoryBriefs[category]}</div>
+                    <div class="category-methods">${methodTags}</div>
+                `;
+                categoryHierarchy.appendChild(node);
             });
             // Chunking year timeline
             chunkingData.forEach(chunk => {
                 const item = document.createElement('div');
                 item.className = 'timeline-item';
+                let referenceHTML = '';
+                if (chunk.references && chunk.paperUrl) {
+                    referenceHTML = `<div class="timeline-reference">📄 <a href="${chunk.paperUrl}" target="_blank">${chunk.references}</a></div>`;
+                } else if (chunk.references) {
+                    referenceHTML = `<div class="timeline-reference">📄 ${chunk.references}</div>`;
+                }
                 item.innerHTML = `
                     <div class="timeline-year">${chunk.year}</div>
                     <div class="timeline-content">
                         <div class="timeline-type">${chunk.type}</div>
+                        ${referenceHTML}
                     </div>
                 `;
                 chunkingYearTimeline.appendChild(item);