Spaces:

Soha85
/

RAG_Study

Running

App Files Files Community

Soha85 committed 2 days ago

Commit

e6b91d3

verified ·

1 Parent(s): 4f6ff3a

Update index.html

Browse files

Files changed (1) hide show

index.html +312 -71

index.html CHANGED Viewed

@@ -376,6 +376,44 @@
             font-size: 0.85em;
             box-shadow: 0 2px 4px rgba(0,0,0,0.1);
         }
     </style>
 </head>
 <body>
@@ -385,6 +423,7 @@
         <div class="tabs">
             <button class="tab active" onclick="switchTab('rag')">RAG</button>
             <button class="tab" onclick="switchTab('chunking')">Chunking</button>
         </div>
         <!-- RAG TYPES TAB -->
@@ -484,6 +523,205 @@
                 </div>
             </div>
         </div>
     </div>
     <script>
@@ -522,15 +760,15 @@
                 paperUrl: "https://arxiv.org/abs/2407.21059"
             },
             {
-                type: "Graph RAG",
                 year: 2024,
-                category: "Structural & Modular",
-                indexing: "Preprocessing → Entity/Relation Extraction → Store in Knowledge Graph (KG) & Vector DB",
-                inference: "User Query → Embedding/KG Query → Simultaneous Retrieval (Vector + KG Path) → Concatenation → LLM Generate",
-                benefits: "Resolves complex, multi-hop queries by leveraging factual relationships; Improves interpretability and fact consistency.",
-                challenges: "High indexing complexity (KG construction); Expensive maintenance for rapidly changing data; Retrieval latency can be high.",
-                references: "Barry, M., et al. (2025). GraphRAG: Leveraging Graph-Based Efficiency to Minimize Hallucinations in LLM-Driven RAG. GenAIK Workshop.",
-                paperUrl: "https://aclanthology.org/2025.genaik-1.6/"
             },
             {
                 type: "MultiModal RAG",
@@ -543,6 +781,17 @@
                 references: "Gao, Y., et al. (2024). Retrieval-Augmented Multimodal Language Modeling. CVPR 2024.",
                 paperUrl: "https://arxiv.org/abs/2211.12561"
             },
             {
                 type: "Recursive RAG",
                 year: 2024,
@@ -555,15 +804,15 @@
                 paperUrl: "https://arxiv.org/abs/2405.10627"
             },
             {
-                type: "Cache RAG",
                 year: 2024,
-                category: "Modular",
-                indexing: "Preprocessing → Standard Chunking → Embedding → Vector DB Storage",
-                inference: "User Query → Cache Lookup → If Hit: Return Cached Answer → If Miss: Standard Retrieval → LLM Generate → Cache Store",
-                benefits: "Dramatically improves latency and reduces LLM cost for repeated or highly similar queries.",
-                challenges: "Complex cache invalidation logic; Requires robust query similarity and hashing functions.",
-                references: "Jin, C., et al. (2024). RAGCache: Efficient Knowledge Caching for Retrieval-Augmented Generation. ACM TOCS.",
-                paperUrl: "https://arxiv.org/abs/2404.12457"
             },
             {
                 type: "Corrective RAG",
@@ -576,17 +825,6 @@
                 references: "Yan, S.-Q., et al. (2024). Corrective Retrieval Augmented Generation. arXiv:2401.15884",
                 paperUrl: "https://arxiv.org/abs/2401.15884"
             },
-            {
-                type: "Multi-Hop RAG",
-                year: 2024,
-                category: "Structural & Modular",
-                indexing: "Preprocessing → Chunking/Entity Extraction → Embedding → Structured Storage (Vector DB + Optional KG)",
-                inference: "User Query → Query Decomposition → Hop 1 Retrieval → Iterative Reasoning → Hop 2 Retrieval → Final Evidence Aggregation → LLM Generate",
-                benefits: "Solves questions requiring reasoning across multiple independent documents or retrieval steps.",
-                challenges: "Prone to error propagation (if one hop fails); Significantly higher latency; Requires generation of accurate intermediate queries.",
-                references: "Tang, B., & Yang, Y. (2024). MultiHop-RAG: Benchmarking Retrieval-Augmented Generation for Multi-Hop Queries. COLM 2024.",
-                paperUrl: "https://arxiv.org/abs/2401.15391"
-            },
             {
                 type: "Agentic RAG",
                 year: 2024,
@@ -634,15 +872,6 @@
         ];
         const chunkingData = [
-            {
-                type: "Fixed-size Chunking",
-                year: 2001,
-                category: "Size-based",
-                workflow: "Input Document → Split by Fixed Token/Word Count (200-500 tokens) → Create Chunks → Output Chunks",
-                description: "Splits text into chunks based on a fixed number of tokens/words (e.g., 200-500 tokens). Simple but may break semantic boundaries.",
-                references: "Collobert, R., et al. (2011). Natural language processing (almost) from scratch. Journal of Machine Learning Research 12:2493-2537.",
-                paperUrl: "https://www.jmlr.org/papers/volume12/collobert11a/collobert11a.pdf"
-            },
             {
                 type: "Sliding Window Chunking",
                 year: 1998,
@@ -651,24 +880,24 @@
                 description: "Uses a fixed chunk size plus overlap (e.g., chunk size 300, overlap 100). Maintains context continuity across chunks.",
                 references: "Carbonell, J. & Goldstein, J. (1998). The use of MMR, diversity-based reranking for reordering documents. SIGIR 1998.",
                 paperUrl: "https://dl.acm.org/doi/10.1145/290941.291025"
-            },
             {
-                type: "Semantic Chunking",
-                year: 2024,
-                category: "Semantic-based",
-                workflow: "Input Document → Analyze Topic Shifts/Semantic Boundaries → Split at Semantic Breaks → Create Semantically Coherent Chunks → Output Chunks",
-                description: "Chunks based on topic shifts or semantic boundaries rather than size. Preserves meaning and context.",
-                references: "LangChain (2024). Semantic chunker. Accessed 2024-09-14.",
-                paperUrl: "https://python.langchain.com/docs/how_to/semantic-chunker/"
             },
             {
-                type: "Sentence-based Chunking",
-                year: 2024,
-                category: "Structure-based",
-                workflow: "Input Document → Split by Sentences → Group Sentences (Optional) → Create Sentence Chunks → Output Chunks",
-                description: "Splits text by sentences. Maintains grammatical integrity and natural language boundaries.",
-                references: "Dong, K., et al. (2024). Multi-view content-aware indexing for long document retrieval. arXiv:2404.15103",
-                paperUrl: "https://arxiv.org/abs/2404.15103"
             },
             {
                 type: "Paragraph-based Chunking",
@@ -688,6 +917,24 @@
                 references: "Latif, S., et al. (2025). The Chunking Paradigm: Recursive Semantic for RAG. ICNLSP 2025.",
                 paperUrl: "https://aclanthology.org/2025.icnlsp-1.16/"
             },
             {
                 type: "Hybrid Chunking",
                 year: 2024,
@@ -697,15 +944,6 @@
                 references: "Kamradt, G. (2024). 5 levels of text splitting.",
                 paperUrl: ""
             },
-            {
-                type: "Discourse-aware Chunking",
-                year: 2005,
-                category: "Semantic-based",
-                workflow: "Input Document → Identify Discourse Markers (however, in contrast, therefore) → Detect Conceptual Shifts → Split at Discourse Boundaries → Output Discourse Chunks",
-                description: "Uses discourse markers (e.g., 'however', 'in contrast', 'therefore') to detect conceptual shifts and chunk accordingly.",
-                references: "Sporleder, C. & Lapata, M. (2005). Discourse Chunking and its Application to Sentence Compression. HLT 2005.",
-                paperUrl: "https://aclanthology.org/H05-1033/"
-            },
             {
                 type: "Embedding-similarity-based Chunking",
                 year: 2024,
@@ -715,15 +953,6 @@
                 references: "Kamradt, G. (2024). 5 levels of text splitting.",
                 paperUrl: ""
             },
-            {
-                type: "Metadata-based Chunking",
-                year: 2025,
-                category: "Structure-based",
-                workflow: "Input Document → Extract Metadata (headers, bullets, sections) → Split Using Metadata Structure → Create Metadata-aware Chunks → Output Chunks",
-                description: "Splits text using document metadata (e.g., headers, bullets). Preserves document structure and hierarchy.",
-                references: "Zhao, J., et al. (2025). MoC: Mixtures of Text Chunking Learners for Retrieval-Augmented Generation. ACL 2025.",
-                paperUrl: "https://aclanthology.org/2025.acl-long.1/"
-            },
             {
                 type: "Page-based Chunking",
                 year: 2024,
@@ -741,6 +970,15 @@
                 description: "Tailored to content type (e.g., legal documents by sections, medical records by encounters, code by functions).",
                 references: "Allamraju, A., et al. (2024). Breaking It Down: Domain-Aware Semantic Segmentation for Retrieval Augmented Generation. arXiv:2512.00367",
                 paperUrl: "https://arxiv.org/abs/2512.00367"
             }
         ];
@@ -769,11 +1007,14 @@
             if (tabName === 'rag') {
                 tabs[0].classList.add('active');
                 document.getElementById('rag-content').classList.add('active');
-            } else {
                 tabs[1].classList.add('active');
                 document.getElementById('chunking-content').classList.add('active');
                 initChunkingVisualizations();
-            }
         }
         function getCategoryBadgeClass(category) {

             font-size: 0.85em;
             box-shadow: 0 2px 4px rgba(0,0,0,0.1);
         }
+        .rag-table {
+            width: 100%;
+            border-collapse: collapse;
+            margin-bottom: 2rem;
+            font-size: 0.95rem;
+        }
+        /* Shared cell styling for header and body cells of .rag-table.
+           No margin is declared here: margin properties do not apply to
+           table cells, so spacing is controlled by `padding` alone. */
+        .rag-table th,
+        .rag-table td {
+            border: 1px solid #ddd;
+            padding: 8px;
+            vertical-align: top;
+            color: #666;
+            font-size: 0.9em;
+            font-style: italic;
+            line-height: 1.5;
+        }
+        .rag-table th {
+            background-color: #5b6ee1;
+            color: #fff;
+            text-align: left;
+        }
+        .rag-table tr:nth-child(even) {
+            background-color: #f5f6fa;
+        }
+        .rag-table a {
+            color: #1a4cff;
+            text-decoration: none;
+            font-weight: 500;
+        }
+        .rag-table a:hover {
+            text-decoration: underline;
+        }
     </style>
 </head>
 <body>
         <div class="tabs">
             <button class="tab active" onclick="switchTab('rag')">RAG</button>
             <button class="tab" onclick="switchTab('chunking')">Chunking</button>
+            <button class="tab" onclick="switchTab('indexing-db')">Indexing & Vector DB</button>
         </div>
         <!-- RAG TYPES TAB -->
                 </div>
             </div>
         </div>
+        <!-- INDEXING TAB -->
+        <div id="indexing-db" class="tab-content">
+        <div class="grid">
+                <div class="card">
+                    <h4>🧮 Indexing Techniques (Algorithmic Level)</h4>
+                    <p>Algorithmic methods for organizing and searching vectors efficiently. These are the core algorithms that determine how vectors are structured and retrieved (e.g., Flat Index, ANN, IVF, HNSW, PQ/OPQ).</p>
+                </div>
+                <div class="card">
+                    <h4>💾 Vector Databases (System Level)</h4>
+                    <p>Complete systems/engines that implement indexing techniques along with additional features like persistence, metadata support, distributed architecture, and APIs. They are the production-grade platforms (e.g., FAISS, Milvus, Pinecone, Weaviate).</p>
+                </div>
+        </div>
+        <!-- ---------- INDEXING TECHNIQUES TABLE ---------- -->
+        <div class="card">
+            <!-- Reference links open in a new tab; rel="noopener noreferrer" prevents the
+                 opened page from accessing window.opener and suppresses the Referer header. -->
+            <table class="rag-table">
+                <thead>
+                <tr>
+                    <th>Indexing Technique</th>
+                    <th>Description</th>
+                    <th>Workflow</th>
+                    <th>Pros</th>
+                    <th>Cons</th>
+                    <th>Peer-Reviewed Reference</th>
+                </tr>
+                </thead>
+                <tbody>
+                <tr>
+                    <td>Flat Index (Brute-Force)</td>
+                    <td>Exact similarity search over all vectors using distance metrics.</td>
+                    <td>
+                    Embed documents → store vectors → compare query with all vectors → return top-k
+                    </td>
+                    <td>Exact results<br>Simple implementation</td>
+                    <td>Not scalable<br>High latency & memory cost</td>
+                    <td class="timeline-reference">
+                    <a href="https://proceedings.neurips.cc/paper_files/paper/2020/file/6b493230205f780e1bc26945df7481e5-Paper.pdf" target="_blank" rel="noopener noreferrer">
+                        Lewis, P., Perez, E., Piktus, A., Petroni, F., Karpukhin, V., Goyal, N., Küttler, H., Lewis, M., Yih, W., Rocktäschel, T., Riedel, S., & Kiela, D. (2020). Retrieval‑augmented generation for knowledge‑intensive NLP tasks. In H. Larochelle, M. Ranzato, R. Hadsell, M. F. Balcan, & H. T. Lin (Eds.), Advances in Neural Information Processing Systems, 33, 9459–9474. Curran Associates, Inc.
+                    </a>
+                    </td>
+                </tr>
+                <tr>
+                    <td>ANN (Approximate Nearest Neighbor)</td>
+                    <td>Fast similarity search with controlled approximation.</td>
+                    <td>
+                    Embed documents → build ANN structure → retrieve approximate neighbors
+                    </td>
+                    <td>Fast retrieval<br>Scales to millions/billions</td>
+                    <td>Approximate (not exact)</td>
+                    <td class="timeline-reference">
+                    <a href="https://simg.baai.ac.cn/paperfile/25a43194-c74c-4cd3-b60f-0a1f27f8b8af.pdf" target="_blank" rel="noopener noreferrer">
+                        Gao, Y., Xiong, Y., Gao, X., Jia, K., Pan, J., Bi, Y., Dai, Y., Sun, J., Wang, M., & Wang, H. (2024). Retrieval‑Augmented Generation for Large Language Models: A Survey (arXiv:2312.10997). arXiv Preprint. https://doi.org/10.48550/arXiv.2312.10997
+                    </a>
+                    </td>
+                </tr>
+                <tr>
+                    <td>IVF (Inverted File Index)</td>
+                    <td>Partitions vector space into clusters; searches only relevant partitions.</td>
+                    <td>
+                    K-means clustering → assign vectors → query nearest clusters → search inside
+                    </td>
+                    <td>Good speed-accuracy trade-off<br>Scalable</td>
+                    <td>Requires tuning<br>Cluster-quality sensitive</td>
+                    <td class="timeline-reference">
+                    <a href="https://arxiv.org/pdf/1702.08734" target="_blank" rel="noopener noreferrer">
+                        Johnson, J., Douze, M., & Jégou, H. (2019). Billion‑scale similarity search with GPUs. IEEE Transactions on Big Data, 7(3), 535–547. https://doi.org/10.1109/TBDATA.2019.2921572
+                    </a>
+                    </td>
+                </tr>
+                <tr>
+                    <td>HNSW</td>
+                    <td>Graph-based ANN using hierarchical proximity graphs.</td>
+                    <td>
+                    Build layered graph → incremental insertion → top-down graph traversal
+                    </td>
+                    <td>Very fast<br>High recall<br>Dynamic updates</td>
+                    <td>High memory usage<br>Complex construction</td>
+                    <td class="timeline-reference">
+                    <a href="https://arxiv.org/abs/1603.09320" target="_blank" rel="noopener noreferrer">
+                        Malkov, Y. A., & Yashunin, D. A. (2020). Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. IEEE Transactions on Pattern Analysis and Machine Intelligence, 42(4), 824–836. https://doi.org/10.1109/TPAMI.2018.2889473
+                    </a>
+                    </td>
+                </tr>
+                <tr>
+                    <td>PQ / OPQ</td>
+                    <td>Compresses vectors into short codes for memory-efficient ANN.</td>
+                    <td>
+                    Subspace split → quantization → store codes → approximate distance
+                    </td>
+                    <td>Memory efficient<br>Enables billion-scale RAG</td>
+                    <td>Lossy compression<br>Lower recall if misconfigured</td>
+                    <td class="timeline-reference">
+                    <a href="https://www.irisa.fr/texmex/people/jegou/papers/jegou_searching_with_quantization.pdf" target="_blank" rel="noopener noreferrer">
+                        Jégou, H., Douze, M., & Schmid, C. (2011). Product quantization for nearest neighbor search. IEEE Transactions on Pattern Analysis and Machine Intelligence, 33(1), 117–128. https://doi.org/10.1109/TPAMI.2010.57
+                    </a>
+                    </td>
+                </tr>
+                </tbody>
+            </table>
+         </div>
+        <!-- ---------- VECTOR DATABASES TABLE ---------- -->
+        <div class="card">
+            <!-- Reference links open in a new tab; rel="noopener noreferrer" prevents the
+                 opened page from accessing window.opener and suppresses the Referer header.
+                 Link targets are canonical URLs without tracking query parameters. -->
+            <table class="rag-table">
+                <thead>
+                <tr>
+                    <th>Vector DB</th>
+                    <th>Description</th>
+                    <th>Underlying Index</th>
+                    <th>Pros</th>
+                    <th>Cons</th>
+                    <th>Peer-Reviewed Reference</th>
+                </tr>
+                </thead>
+                <tbody>
+                <tr>
+                    <td>FAISS</td>
+                    <td>Research-oriented vector similarity library widely used in RAG.</td>
+                    <td>Flat, IVF, HNSW, PQ, OPQ</td>
+                    <td>Highly optimized<br>Flexible ANN<br>Academic standard</td>
+                    <td>Not a full DBMS<br>Limited metadata</td>
+                    <td class="timeline-reference">
+                    <a href="https://simg.baai.ac.cn/paperfile/25a43194-c74c-4cd3-b60f-0a1f27f8b8af.pdf" target="_blank" rel="noopener noreferrer">
+                        Gao, Y., Xiong, Y., Gao, X., Jia, K., Pan, J., Bi, Y., Dai, Y., Sun, J., Wang, M., & Wang, H. (2024). Retrieval‑Augmented Generation for Large Language Models: A Survey (arXiv:2312.10997). arXiv Preprint. https://doi.org/10.48550/arXiv.2312.10997
+                    </a>
+                    </td>
+                </tr>
+                <tr>
+                    <td>Milvus</td>
+                    <td>Distributed open-source vector database for large-scale RAG.</td>
+                    <td>HNSW, IVF-PQ</td>
+                    <td>Scalable<br>Distributed<br>Open-source</td>
+                    <td>Deployment complexity<br>Operational overhead</td>
+                    <td class="timeline-reference">
+                    <a href="https://ijaibdcms.org/index.php/ijaibdcms/article/view/257" target="_blank" rel="noopener noreferrer">
+                        Rusum, G. P., & Anasuri, S. (2025). Vector databases in modern applications: Real‑time search, recommendations, and retrieval‑augmented generation (RAG). International Journal of AI, BigData, Computational and Management Studies, 5(4), Article 113. https://doi.org/10.63282/3050‑9416.IJAIBDCMS‑V5I4P113
+                    </a>
+                    </td>
+                </tr>
+                <tr>
+                    <td>Pinecone</td>
+                    <td>Fully managed vector database for production RAG systems.</td>
+                    <td>Proprietary ANN (HNSW-like)</td>
+                    <td>Fully managed<br>High availability<br>Scalable</td>
+                    <td>Closed-source<br>Opaque indexing</td>
+                    <td class="timeline-reference">
+                    <a href="https://simg.baai.ac.cn/paperfile/25a43194-c74c-4cd3-b60f-0a1f27f8b8af.pdf" target="_blank" rel="noopener noreferrer">
+                        Gao, Y., Xiong, Y., Gao, X., Jia, K., Pan, J., Bi, Y., Dai, Y., Sun, J., Wang, M., & Wang, H. (2024). Retrieval‑Augmented Generation for Large Language Models: A Survey (arXiv:2312.10997). arXiv Preprint. https://doi.org/10.48550/arXiv.2312.10997
+                    </a>
+                    </td>
+                </tr>
+                <tr>
+                    <td>Weaviate</td>
+                    <td>Open-source vector DB with schema & metadata-aware retrieval.</td>
+                    <td>HNSW</td>
+                    <td>Simple API<br>Schema support</td>
+                    <td>High memory usage<br>Limited ANN tuning</td>
+                    <td class="timeline-reference">
+                    <a href="https://ijaibdcms.org/index.php/ijaibdcms/article/view/257" target="_blank" rel="noopener noreferrer">
+                        Rusum, G. P., & Anasuri, S. (2025). Vector databases in modern applications: Real‑time search, recommendations, and retrieval‑augmented generation (RAG). International Journal of AI, BigData, Computational and Management Studies, 5(4), Article 113. https://doi.org/10.63282/3050‑9416.IJAIBDCMS‑V5I4P113
+                    </a>
+                    </td>
+                </tr>
+                <tr>
+                    <td>Elasticsearch (Vector)</td>
+                    <td>Hybrid sparse-dense retrieval using BM25 + vectors.</td>
+                    <td>HNSW + BM25</td>
+                    <td>Hybrid retrieval<br>Mature ecosystem</td>
+                    <td>Higher latency<br>Slower pure vector search</td>
+                    <td class="timeline-reference">
+                    <a href="https://simg.baai.ac.cn/paperfile/25a43194-c74c-4cd3-b60f-0a1f27f8b8af.pdf" target="_blank" rel="noopener noreferrer">
+                        Gao, Y., Xiong, Y., Gao, X., Jia, K., Pan, J., Bi, Y., Dai, Y., Sun, J., Wang, M., & Wang, H. (2024). Retrieval‑Augmented Generation for Large Language Models: A Survey (arXiv:2312.10997). arXiv Preprint. https://doi.org/10.48550/arXiv.2312.10997
+                    </a>
+                    </td>
+                </tr>
+                <tr>
+                    <td>Chroma</td>
+                    <td>Lightweight developer-focused vector store for prototyping.</td>
+                    <td>HNSW (ANN backends)</td>
+                    <td>Easy integration<br>Fast prototyping</td>
+                    <td>Limited scalability<br>Not enterprise-grade</td>
+                    <td class="timeline-reference">
+                    <a href="https://simg.baai.ac.cn/paperfile/25a43194-c74c-4cd3-b60f-0a1f27f8b8af.pdf" target="_blank" rel="noopener noreferrer">
+                        Gao, Y., Xiong, Y., Gao, X., Jia, K., Pan, J., Bi, Y., Dai, Y., Sun, J., Wang, M., & Wang, H. (2024). Retrieval‑Augmented Generation for Large Language Models: A Survey (arXiv:2312.10997). arXiv Preprint. https://doi.org/10.48550/arXiv.2312.10997
+                    </a>
+                    </td>
+                </tr>
+                </tbody>
+            </table>
+        </div>
+        </div>
     </div>
     <script>
                 paperUrl: "https://arxiv.org/abs/2407.21059"
             },
             {
+                type: "Cache RAG",
                 year: 2024,
+                category: "Modular",
+                indexing: "Preprocessing → Standard Chunking → Embedding → Vector DB Storage",
+                inference: "User Query → Cache Lookup → If Hit: Return Cached Answer → If Miss: Standard Retrieval → LLM Generate → Cache Store",
+                benefits: "Dramatically improves latency and reduces LLM cost for repeated or highly similar queries.",
+                challenges: "Complex cache invalidation logic; Requires robust query similarity and hashing functions.",
+                references: "Jin, C., et al. (2024). RAGCache: Efficient Knowledge Caching for Retrieval-Augmented Generation. ACM TOCS.",
+                paperUrl: "https://arxiv.org/abs/2404.12457"
             },
             {
                 type: "MultiModal RAG",
                 references: "Gao, Y., et al. (2024). Retrieval-Augmented Multimodal Language Modeling. CVPR 2024.",
                 paperUrl: "https://arxiv.org/abs/2211.12561"
             },
+            {
+                type: "Graph RAG",
+                year: 2024,
+                category: "Structural & Modular",
+                indexing: "Preprocessing → Entity/Relation Extraction → Store in Knowledge Graph (KG) & Vector DB",
+                inference: "User Query → Embedding/KG Query → Simultaneous Retrieval (Vector + KG Path) → Concatenation → LLM Generate",
+                benefits: "Resolves complex, multi-hop queries by leveraging factual relationships; Improves interpretability and fact consistency.",
+                challenges: "High indexing complexity (KG construction); Expensive maintenance for rapidly changing data; Retrieval latency can be high.",
+                references: "Barry, M., et al. (2025). GraphRAG: Leveraging Graph-Based Efficiency to Minimize Hallucinations in LLM-Driven RAG. GenAIK Workshop.",
+                paperUrl: "https://aclanthology.org/2025.genaik-1.6/"
+            },
             {
                 type: "Recursive RAG",
                 year: 2024,
                 paperUrl: "https://arxiv.org/abs/2405.10627"
             },
             {
+                type: "Multi-Hop RAG",
                 year: 2024,
+                category: "Structural & Modular",
+                indexing: "Preprocessing → Chunking/Entity Extraction → Embedding → Structured Storage (Vector DB + Optional KG)",
+                inference: "User Query → Query Decomposition → Hop 1 Retrieval → Iterative Reasoning → Hop 2 Retrieval → Final Evidence Aggregation → LLM Generate",
+                benefits: "Solves questions requiring reasoning across multiple independent documents or retrieval steps.",
+                challenges: "Prone to error propagation (if one hop fails); Significantly higher latency; Requires generation of accurate intermediate queries.",
+                references: "Tang, B., & Yang, Y. (2024). MultiHop-RAG: Benchmarking Retrieval-Augmented Generation for Multi-Hop Queries. COLM 2024.",
+                paperUrl: "https://arxiv.org/abs/2401.15391"
             },
             {
                 type: "Corrective RAG",
                 references: "Yan, S.-Q., et al. (2024). Corrective Retrieval Augmented Generation. arXiv:2401.15884",
                 paperUrl: "https://arxiv.org/abs/2401.15884"
             },
             {
                 type: "Agentic RAG",
                 year: 2024,
         ];
         const chunkingData = [
             {
                 type: "Sliding Window Chunking",
                 year: 1998,
                 description: "Uses a fixed chunk size plus overlap (e.g., chunk size 300, overlap 100). Maintains context continuity across chunks.",
                 references: "Carbonell, J. & Goldstein, J. (1998). The use of MMR, diversity-based reranking for reordering documents. SIGIR 1998.",
                 paperUrl: "https://dl.acm.org/doi/10.1145/290941.291025"
+            },
             {
+                type: "Fixed-size Chunking",
+                year: 2001,
+                category: "Size-based",
+                workflow: "Input Document → Split by Fixed Token/Word Count (200-500 tokens) → Create Chunks → Output Chunks",
+                description: "Splits text into chunks based on a fixed number of tokens/words (e.g., 200-500 tokens). Simple but may break semantic boundaries.",
+                references: "Collobert, R., et al. (2011). Natural language processing (almost) from scratch. Journal of Machine Learning Research 12:2493-2537.",
+                paperUrl: "https://www.jmlr.org/papers/volume12/collobert11a/collobert11a.pdf"
             },
             {
+                type: "Discourse-aware Chunking",
+                year: 2005,
+                category: "Semantic-based",
+                workflow: "Input Document → Identify Discourse Markers (however, in contrast, therefore) → Detect Conceptual Shifts → Split at Discourse Boundaries → Output Discourse Chunks",
+                description: "Uses discourse markers (e.g., 'however', 'in contrast', 'therefore') to detect conceptual shifts and chunk accordingly.",
+                references: "Sporleder, C. & Lapata, M. (2005). Discourse Chunking and its Application to Sentence Compression. HLT 2005.",
+                paperUrl: "https://aclanthology.org/H05-1033/"
             },
             {
                 type: "Paragraph-based Chunking",
                 references: "Latif, S., et al. (2025). The Chunking Paradigm: Recursive Semantic for RAG. ICNLSP 2025.",
                 paperUrl: "https://aclanthology.org/2025.icnlsp-1.16/"
             },
+            {
+                type: "Semantic Chunking",
+                year: 2024,
+                category: "Semantic-based",
+                workflow: "Input Document → Analyze Topic Shifts/Semantic Boundaries → Split at Semantic Breaks → Create Semantically Coherent Chunks → Output Chunks",
+                description: "Chunks based on topic shifts or semantic boundaries rather than size. Preserves meaning and context.",
+                references: "LangChain (2024). Semantic chunker. Accessed 2024-09-14.",
+                paperUrl: "https://python.langchain.com/docs/how_to/semantic-chunker/"
+            },
+            {
+                type: "Sentence-based Chunking",
+                year: 2024,
+                category: "Structure-based",
+                workflow: "Input Document → Split by Sentences → Group Sentences (Optional) → Create Sentence Chunks → Output Chunks",
+                description: "Splits text by sentences. Maintains grammatical integrity and natural language boundaries.",
+                references: "Dong, K., et al. (2024). Multi-view content-aware indexing for long document retrieval. arXiv:2404.15103",
+                paperUrl: "https://arxiv.org/abs/2404.15103"
+            },
             {
                 type: "Hybrid Chunking",
                 year: 2024,
                 references: "Kamradt, G. (2024). 5 levels of text splitting.",
                 paperUrl: ""
             },
             {
                 type: "Embedding-similarity-based Chunking",
                 year: 2024,
                 references: "Kamradt, G. (2024). 5 levels of text splitting.",
                 paperUrl: ""
             },
             {
                 type: "Page-based Chunking",
                 year: 2024,
                 description: "Tailored to content type (e.g., legal documents by sections, medical records by encounters, code by functions).",
                 references: "Allamraju, A., et al. (2024). Breaking It Down: Domain-Aware Semantic Segmentation for Retrieval Augmented Generation. arXiv:2512.00367",
                 paperUrl: "https://arxiv.org/abs/2512.00367"
+            },
+            {
+                type: "Metadata-based Chunking",
+                year: 2025,
+                category: "Structure-based",
+                workflow: "Input Document → Extract Metadata (headers, bullets, sections) → Split Using Metadata Structure → Create Metadata-aware Chunks → Output Chunks",
+                description: "Splits text using document metadata (e.g., headers, bullets). Preserves document structure and hierarchy.",
+                references: "Zhao, J., et al. (2025). MoC: Mixtures of Text Chunking Learners for Retrieval-Augmented Generation. ACL 2025.",
+                paperUrl: "https://aclanthology.org/2025.acl-long.1/"
             }
         ];
             if (tabName === 'rag') {
                 tabs[0].classList.add('active');
                 document.getElementById('rag-content').classList.add('active');
+            } else if (tabName === 'chunking') {
                 tabs[1].classList.add('active');
                 document.getElementById('chunking-content').classList.add('active');
                 initChunkingVisualizations();
+            } else if (tabName === 'indexing-db') {
+                tabs[2].classList.add('active');
+                document.getElementById('indexing-db').classList.add('active');
+            }
         }
         function getCategoryBadgeClass(category) {