Update index.html
Browse files- index.html +383 -46
index.html
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
<head>
|
| 4 |
<meta charset="UTF-8">
|
| 5 |
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
-
<title>RAG
|
| 7 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/3.9.1/chart.min.js"></script>
|
| 8 |
<style>
|
| 9 |
* {
|
|
@@ -32,6 +32,41 @@
|
|
| 32 |
text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
|
| 33 |
}
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
.grid {
|
| 36 |
display: grid;
|
| 37 |
grid-template-columns: repeat(auto-fit, minmax(450px, 1fr));
|
|
@@ -265,66 +300,141 @@
|
|
| 265 |
.def-foundational { border-left-color: #667eea; }
|
| 266 |
.def-agentic { border-left-color: #764ba2; }
|
| 267 |
.def-modular { border-left-color: #f093fb; }
|
| 268 |
-
.def-structural-modular { border-left-color: #4facfe; }
|
| 269 |
.def-structural { border-left-color: #00d2ff; }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
</style>
|
| 271 |
</head>
|
| 272 |
<body>
|
| 273 |
<div class="container">
|
| 274 |
-
<h1>π RAG
|
| 275 |
-
|
| 276 |
-
<div class="
|
| 277 |
-
|
| 278 |
-
<
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
</div>
|
| 283 |
-
<div id="categoryDefinitions" style="margin-top: 20px;"></div>
|
| 284 |
</div>
|
| 285 |
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
<
|
| 289 |
-
|
| 290 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
</div>
|
| 292 |
</div>
|
| 293 |
</div>
|
| 294 |
|
| 295 |
-
<!--
|
| 296 |
-
<div class="
|
| 297 |
-
<
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
<div class="card timeline-card">
|
| 303 |
-
<h2>Workflow Flowcharts</h2>
|
| 304 |
-
<div class="workflow-selector">
|
| 305 |
-
<select id="ragTypeSelector">
|
| 306 |
-
<option value="">Select a RAG Type to view workflows...</option>
|
| 307 |
-
</select>
|
| 308 |
-
</div>
|
| 309 |
-
<div id="flowchartDetails" style="display: none;">
|
| 310 |
-
<div class="flowchart-container">
|
| 311 |
-
<div class="flowchart">
|
| 312 |
-
<h3>π₯ Indexing Workflow</h3>
|
| 313 |
-
<div id="indexingFlowchart"></div>
|
| 314 |
</div>
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
|
|
|
|
|
|
|
|
|
| 318 |
</div>
|
| 319 |
</div>
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
</div>
|
| 325 |
-
<div class="
|
| 326 |
-
<
|
| 327 |
-
<
|
| 328 |
</div>
|
| 329 |
</div>
|
| 330 |
</div>
|
|
@@ -478,6 +588,105 @@
|
|
| 478 |
}
|
| 479 |
];
|
| 480 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
const mainCategoryDefinitions = {
|
| 482 |
'Foundational': 'The baseline RAG approach establishing core retrieval-augmented generation principles with simple, fixed pipelines.',
|
| 483 |
'Modular': 'Focuses on pipeline flexibility through specialized, interchangeable components and intermediate operations between query and generation.',
|
|
@@ -485,6 +694,23 @@
|
|
| 485 |
'Structural': 'Innovations in how source documents are preprocessed, stored, and indexed for more effective retrieval (e.g., knowledge graphs, hierarchies, multi-modal embeddings).'
|
| 486 |
};
|
| 487 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
function getCategoryBadgeClass(category) {
|
| 489 |
const map = {
|
| 490 |
'Foundational': 'badge-foundational',
|
|
@@ -534,7 +760,7 @@
|
|
| 534 |
yearTimeline.appendChild(item);
|
| 535 |
});
|
| 536 |
|
| 537 |
-
// Populate selector
|
| 538 |
const selector = document.getElementById('ragTypeSelector');
|
| 539 |
ragData.forEach(rag => {
|
| 540 |
const option = document.createElement('option');
|
|
@@ -639,6 +865,117 @@
|
|
| 639 |
}
|
| 640 |
}
|
| 641 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
</script>
|
| 643 |
</body>
|
| 644 |
</html>
|
|
|
|
| 3 |
<head>
|
| 4 |
<meta charset="UTF-8">
|
| 5 |
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>RAG</title>
|
| 7 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/3.9.1/chart.min.js"></script>
|
| 8 |
<style>
|
| 9 |
* {
|
|
|
|
| 32 |
text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
|
| 33 |
}
|
| 34 |
|
| 35 |
+
.tabs {
|
| 36 |
+
display: flex;
|
| 37 |
+
gap: 10px;
|
| 38 |
+
margin-bottom: 25px;
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
.tab {
|
| 42 |
+
padding: 15px 30px;
|
| 43 |
+
background: rgba(255, 255, 255, 0.3);
|
| 44 |
+
color: white;
|
| 45 |
+
border: none;
|
| 46 |
+
border-radius: 10px 10px 0 0;
|
| 47 |
+
cursor: pointer;
|
| 48 |
+
font-size: 1.1em;
|
| 49 |
+
font-weight: bold;
|
| 50 |
+
transition: all 0.3s ease;
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
.tab:hover {
|
| 54 |
+
background: rgba(255, 255, 255, 0.4);
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
.tab.active {
|
| 58 |
+
background: white;
|
| 59 |
+
color: #667eea;
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
.tab-content {
|
| 63 |
+
display: none;
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
.tab-content.active {
|
| 67 |
+
display: block;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
.grid {
|
| 71 |
display: grid;
|
| 72 |
grid-template-columns: repeat(auto-fit, minmax(450px, 1fr));
|
|
|
|
| 300 |
.def-foundational { border-left-color: #667eea; }
|
| 301 |
.def-agentic { border-left-color: #764ba2; }
|
| 302 |
.def-modular { border-left-color: #f093fb; }
|
|
|
|
| 303 |
.def-structural { border-left-color: #00d2ff; }
|
| 304 |
+
|
| 305 |
+
.chunking-brief {
|
| 306 |
+
background: #f8f9fa;
|
| 307 |
+
padding: 15px;
|
| 308 |
+
border-radius: 8px;
|
| 309 |
+
margin-top: 15px;
|
| 310 |
+
border-left: 4px solid #667eea;
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
.chunking-brief p {
|
| 314 |
+
color: #555;
|
| 315 |
+
line-height: 1.6;
|
| 316 |
+
margin-bottom: 10px;
|
| 317 |
+
}
|
| 318 |
+
|
| 319 |
+
.chunking-brief .reference {
|
| 320 |
+
color: #888;
|
| 321 |
+
font-size: 0.9em;
|
| 322 |
+
font-style: italic;
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
.chunking-brief .reference a {
|
| 326 |
+
color: #667eea;
|
| 327 |
+
text-decoration: none;
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
.chunking-brief .reference a:hover {
|
| 331 |
+
text-decoration: underline;
|
| 332 |
+
}
|
| 333 |
</style>
|
| 334 |
</head>
|
| 335 |
<body>
|
| 336 |
<div class="container">
|
| 337 |
+
<h1>π RAG Dashboard</h1>
|
| 338 |
+
|
| 339 |
+
<div class="tabs">
|
| 340 |
+
<button class="tab active" onclick="switchTab('rag')">RAG Types</button>
|
| 341 |
+
<button class="tab" onclick="switchTab('chunking')">Chunking</button>
|
| 342 |
+
</div>
|
| 343 |
+
|
| 344 |
+
<!-- RAG TYPES TAB -->
|
| 345 |
+
<div id="rag-content" class="tab-content active">
|
| 346 |
+
<div class="grid">
|
| 347 |
+
<div class="card">
|
| 348 |
+
<h2>RAG Categories Distribution</h2>
|
| 349 |
+
<div class="chart-container">
|
| 350 |
+
<canvas id="categoryChart"></canvas>
|
| 351 |
+
</div>
|
| 352 |
+
<div id="categoryDefinitions"></div>
|
| 353 |
+
</div>
|
| 354 |
+
|
| 355 |
+
<div class="card">
|
| 356 |
+
<h2>RAG Types by Year</h2>
|
| 357 |
+
<div class="chart-container">
|
| 358 |
+
<canvas id="timelineChart"></canvas>
|
| 359 |
+
</div>
|
| 360 |
</div>
|
|
|
|
| 361 |
</div>
|
| 362 |
|
| 363 |
+
<div class="card timeline-card">
|
| 364 |
+
<h2>📅 Evolution Timeline: Year → RAG Type</h2>
|
| 365 |
+
<div id="yearTimeline"></div>
|
| 366 |
+
</div>
|
| 367 |
+
|
| 368 |
+
<div class="card timeline-card">
|
| 369 |
+
<h2>RAG Workflow Flowcharts</h2>
|
| 370 |
+
<div class="workflow-selector">
|
| 371 |
+
<select id="ragTypeSelector">
|
| 372 |
+
<option value="">Select a RAG Type to view workflows...</option>
|
| 373 |
+
</select>
|
| 374 |
+
</div>
|
| 375 |
+
<div id="flowchartDetails" style="display: none;">
|
| 376 |
+
<div class="flowchart-container">
|
| 377 |
+
<div class="flowchart">
|
| 378 |
+
<h3>π₯ Indexing Workflow</h3>
|
| 379 |
+
<div id="indexingFlowchart"></div>
|
| 380 |
+
</div>
|
| 381 |
+
<div class="flowchart">
|
| 382 |
+
<h3>π Inference Workflow</h3>
|
| 383 |
+
<div id="inferenceFlowchart"></div>
|
| 384 |
+
</div>
|
| 385 |
+
</div>
|
| 386 |
+
<div class="benefits-challenges">
|
| 387 |
+
<div class="info-box">
|
| 388 |
+
<h4>✅ Key Benefits</h4>
|
| 389 |
+
<p id="benefits"></p>
|
| 390 |
+
</div>
|
| 391 |
+
<div class="info-box">
|
| 392 |
+
<h4>⚠️ Challenges</h4>
|
| 393 |
+
<p id="challenges"></p>
|
| 394 |
+
</div>
|
| 395 |
+
</div>
|
| 396 |
</div>
|
| 397 |
</div>
|
| 398 |
</div>
|
| 399 |
|
| 400 |
+
<!-- CHUNKING TAB -->
|
| 401 |
+
<div id="chunking-content" class="tab-content">
|
| 402 |
+
<div class="grid">
|
| 403 |
+
<div class="card">
|
| 404 |
+
<h2>Chunking Methods Distribution</h2>
|
| 405 |
+
<div class="chart-container">
|
| 406 |
+
<canvas id="chunkingChart"></canvas>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
</div>
|
| 408 |
+
</div>
|
| 409 |
+
|
| 410 |
+
<div class="card">
|
| 411 |
+
<h2>Chunking Evolution Timeline</h2>
|
| 412 |
+
<div class="chart-container">
|
| 413 |
+
<canvas id="chunkingTimelineChart"></canvas>
|
| 414 |
</div>
|
| 415 |
</div>
|
| 416 |
+
</div>
|
| 417 |
+
|
| 418 |
+
<div class="card timeline-card">
|
| 419 |
+
<h2>📅 Chunking Methods Timeline</h2>
|
| 420 |
+
<div id="chunkingYearTimeline"></div>
|
| 421 |
+
</div>
|
| 422 |
+
|
| 423 |
+
<div class="card timeline-card">
|
| 424 |
+
<h2>Chunking Method Flowcharts</h2>
|
| 425 |
+
<div class="workflow-selector">
|
| 426 |
+
<select id="chunkingSelector">
|
| 427 |
+
<option value="">Select a Chunking Method...</option>
|
| 428 |
+
</select>
|
| 429 |
+
</div>
|
| 430 |
+
<div id="chunkingFlowchartDetails" style="display: none;">
|
| 431 |
+
<div class="flowchart">
|
| 432 |
+
<h3>π Chunking Process Flow</h3>
|
| 433 |
+
<div id="chunkingFlowchart"></div>
|
| 434 |
</div>
|
| 435 |
+
<div class="chunking-brief">
|
| 436 |
+
<p id="chunkingDescription"></p>
|
| 437 |
+
<div class="reference" id="chunkingReference"></div>
|
| 438 |
</div>
|
| 439 |
</div>
|
| 440 |
</div>
|
|
|
|
| 588 |
}
|
| 589 |
];
|
| 590 |
|
| 591 |
+
/**
 * Catalogue of document-chunking methods shown on the "Chunking" tab.
 *
 * Each entry provides:
 *   - type:        display name; also used as the <option> value and as the
 *                  lookup key when a method is selected.
 *   - year:        year shown next to the name and counted in the timeline chart.
 *   - workflow:    process steps joined by " → ", passed to createFlowchart().
 *   - description: short summary shown under the flowchart.
 *   - references:  citation text used as the link label.
 *   - paperUrl:    link target for the citation ("#" means no real target).
 */
const chunkingData = [
    {
        type: "Fixed-size Chunking",
        // Year aligned with the cited publication (Collobert et al., 2011).
        year: 2011,
        workflow: "Input Document → Split by Fixed Token/Word Count (200-500 tokens) → Create Chunks → Output Chunks",
        description: "Splits text into chunks based on a fixed number of tokens/words (e.g., 200-500 tokens). Simple but may break semantic boundaries.",
        references: "Collobert, R., et al. (2011). Natural language processing (almost) from scratch. Journal of Machine Learning Research 12:2493-2537.",
        paperUrl: "https://www.jmlr.org/papers/volume12/collobert11a/collobert11a.pdf"
    },
    {
        type: "Sliding Window Chunking",
        year: 1998,
        workflow: "Input Document → Define Chunk Size & Overlap → Slide Window (e.g., size 300, overlap 100) → Create Overlapping Chunks → Output Chunks",
        description: "Uses a fixed chunk size plus overlap (e.g., chunk size 300, overlap 100). Maintains context continuity across chunks.",
        references: "Carbonell, J. & Goldstein, J. (1998). The use of MMR, diversity-based reranking for reordering documents. SIGIR 1998.",
        paperUrl: "https://dl.acm.org/doi/10.1145/290941.291025"
    },
    {
        type: "Semantic Chunking",
        year: 2024,
        workflow: "Input Document → Analyze Topic Shifts/Semantic Boundaries → Split at Semantic Breaks → Create Semantically Coherent Chunks → Output Chunks",
        description: "Chunks based on topic shifts or semantic boundaries rather than size. Preserves meaning and context.",
        references: "LangChain (2024). Semantic chunker. Accessed 2024-09-14.",
        paperUrl: "https://python.langchain.com/docs/how_to/semantic-chunker/"
    },
    {
        type: "Sentence-based Chunking",
        year: 2024,
        workflow: "Input Document → Split by Sentences → Group Sentences (Optional) → Create Sentence Chunks → Output Chunks",
        description: "Splits text by sentences. Maintains grammatical integrity and natural language boundaries.",
        references: "Dong, K., et al. (2024). Multi-view content-aware indexing for long document retrieval. arXiv:2404.15103",
        paperUrl: "https://arxiv.org/abs/2404.15103"
    },
    {
        type: "Paragraph-based Chunking",
        year: 2014,
        workflow: "Input Document → Identify Paragraph Boundaries → Split at Paragraphs → Create Paragraph Chunks → Output Chunks",
        description: "Uses natural paragraph boundaries for chunking. Preserves document structure and thematic grouping.",
        references: "Dudhabaware, R. S., et al. (2014). Review on natural language processing tasks for text documents. IEEE ICCIC 2014.",
        paperUrl: "https://ieeexplore.ieee.org/document/7238334"
    },
    {
        type: "Recursive/Hierarchical Chunking",
        year: 2023,
        workflow: "Input Document → Break into Sections → Split Sections into Paragraphs → Split Paragraphs into Sentences → Split Sentences into Tokens → Output Multi-level Chunks",
        description: "Breaks document into progressively smaller units (section → paragraph → sentence → tokens). Enables multi-level retrieval.",
        references: "LangChain (2023). Recursively split by character. Accessed 2024-09-14. | Latif, S., et al. (2025). The Chunking Paradigm: Recursive Semantic for RAG. ICNLSP 2025.",
        paperUrl: "https://aclanthology.org/2025.icnlsp-1.16/"
    },
    {
        type: "Hybrid Chunking",
        year: 2024,
        workflow: "Input Document → Apply Semantic Analysis → Apply Fixed-size Constraint → Apply Overlap Strategy → Create Hybrid Chunks → Output Chunks",
        description: "Combines semantic + fixed-size + overlap strategies. Balances context preservation with practical constraints.",
        references: "Kamradt, G. (2024). 5 levels of text splitting. LangChain blog.",
        paperUrl: "https://python.langchain.com/docs/how_to/semantic-chunker/"
    },
    {
        type: "Discourse-aware Chunking",
        year: 2005,
        workflow: "Input Document → Identify Discourse Markers (however, in contrast, therefore) → Detect Conceptual Shifts → Split at Discourse Boundaries → Output Discourse Chunks",
        description: "Uses discourse markers (e.g., 'however', 'in contrast', 'therefore') to detect conceptual shifts and chunk accordingly.",
        references: "Sporleder, C. & Lapata, M. (2005). Discourse Chunking and its Application to Sentence Compression. HLT 2005.",
        paperUrl: "https://aclanthology.org/H05-1033/"
    },
    {
        type: "Embedding-similarity-based Chunking",
        year: 2024,
        workflow: "Input Document → Generate Sentence Embeddings → Calculate Similarity Scores → Detect Similarity Drops Below Threshold → Split at Low Similarity Points → Output Chunks",
        description: "Sequentially processes text and opens a new chunk when embedding similarity drops below a threshold. Data-driven approach.",
        references: "Kamradt, G. (2024). 5 levels of text splitting. LangChain blog.",
        paperUrl: "https://python.langchain.com/docs/how_to/semantic-chunker/"
    },
    {
        type: "Metadata-based Chunking",
        year: 2025,
        workflow: "Input Document → Extract Metadata (headers, bullets, sections) → Split Using Metadata Structure → Create Metadata-aware Chunks → Output Chunks",
        description: "Splits text using document metadata (e.g., headers, bullets). Preserves document structure and hierarchy.",
        references: "Zhao, J., et al. (2025). MoC: Mixtures of Text Chunking Learners for Retrieval-Augmented Generation. ACL 2025.",
        paperUrl: "https://aclanthology.org/2025.acl-long.1/"
    },
    {
        type: "Page-based Chunking",
        year: 2024,
        workflow: "Input PDF/Scanned Document → Identify Page Boundaries → Split by Page → Create Page Chunks → Output Chunks",
        description: "Splits by PDF page or scanned document page. Useful for document-level retrieval and citation.",
        references: "Standard PDF processing practice. No specific paper required.",
        // "#" is a placeholder: there is no external reference for this entry.
        paperUrl: "#"
    },
    {
        type: "Domain-specific Chunking",
        // NOTE(review): the citation says 2024, but an arXiv id starting with
        // 2512 would normally denote December 2025 — confirm year and citation.
        year: 2024,
        workflow: "Input Domain Document → Apply Domain Rules (legal sections, medical records, code functions) → Split by Domain Logic → Create Domain Chunks → Output Chunks",
        description: "Tailored to content type (e.g., legal documents by sections, medical records by encounters, code by functions).",
        references: "Allamraju, A., et al. (2024). Breaking It Down: Domain-Aware Semantic Segmentation for Retrieval Augmented Generation. arXiv:2512.00367",
        paperUrl: "https://arxiv.org/abs/2512.00367"
    }
];
|
| 689 |
+
|
| 690 |
const mainCategoryDefinitions = {
|
| 691 |
'Foundational': 'The baseline RAG approach establishing core retrieval-augmented generation principles with simple, fixed pipelines.',
|
| 692 |
'Modular': 'Focuses on pipeline flexibility through specialized, interchangeable components and intermediate operations between query and generation.',
|
|
|
|
| 694 |
'Structural': 'Innovations in how source documents are preprocessed, stored, and indexed for more effective retrieval (e.g., knowledge graphs, hierarchies, multi-modal embeddings).'
|
| 695 |
};
|
| 696 |
|
| 697 |
+
/**
 * Shows one of the dashboard's top-level tabs and hides the other.
 *
 * The tab buttons are matched by position among the `.tab` elements
 * (first = RAG Types, second = Chunking); the content panels are matched
 * by id (`rag-content`, `chunking-content`).
 *
 * @param {string} tabName - 'rag' or 'chunking'. Any other value is ignored
 *     and the currently visible tab is left untouched.
 */
function switchTab(tabName) {
    let tabIndex;
    let panelId;
    switch (tabName) {
        case 'rag':
            tabIndex = 0;
            panelId = 'rag-content';
            break;
        case 'chunking':
            tabIndex = 1;
            panelId = 'chunking-content';
            break;
        default:
            console.warn(`switchTab: unknown tab "${tabName}"`);
            return;
    }

    const tabs = document.querySelectorAll('.tab');
    const contents = document.querySelectorAll('.tab-content');
    const targetTab = tabs[tabIndex];
    const targetPanel = document.getElementById(panelId);

    // Resolve both targets before clearing anything, so a missing element
    // cannot leave the page with no visible tab.
    if (!targetTab || !targetPanel) {
        console.warn(`switchTab: elements for tab "${tabName}" were not found`);
        return;
    }

    tabs.forEach((tab) => tab.classList.remove('active'));
    contents.forEach((content) => content.classList.remove('active'));

    targetTab.classList.add('active');
    targetPanel.classList.add('active');

    if (tabName === 'chunking') {
        // The chunking charts are built on first display of the tab.
        initChunkingVisualizations();
    }
}
|
| 713 |
+
|
| 714 |
function getCategoryBadgeClass(category) {
|
| 715 |
const map = {
|
| 716 |
'Foundational': 'badge-foundational',
|
|
|
|
| 760 |
yearTimeline.appendChild(item);
|
| 761 |
});
|
| 762 |
|
| 763 |
+
// Populate RAG selector
|
| 764 |
const selector = document.getElementById('ragTypeSelector');
|
| 765 |
ragData.forEach(rag => {
|
| 766 |
const option = document.createElement('option');
|
|
|
|
| 865 |
}
|
| 866 |
}
|
| 867 |
});
|
| 868 |
+
|
| 869 |
+
// Chunking Visualizations

// Set on the first call so the selector, charts and timeline are built only once.
let chunkingChartsInitialized = false;

/**
 * Maps a chunking method name to the category counted in the pie chart.
 * Matching is by substring, checked in this order; names that match none of
 * the keywords are counted as 'Domain-specific'.
 *
 * @param {string} type - Chunking method name (the `type` field of an entry).
 * @returns {string} Category label.
 */
function categorizeChunkingMethod(type) {
    if (type.includes('Semantic') || type.includes('Embedding')) return 'Semantic-based';
    if (type.includes('Fixed') || type.includes('Sliding')) return 'Size-based';
    if (type.includes('Sentence') || type.includes('Paragraph')) return 'Structure-based';
    if (type.includes('Recursive') || type.includes('Hierarchical')) return 'Hierarchical';
    return 'Domain-specific';
}

/**
 * Replaces the contents of `container` with the citation for `chunk`.
 * The citation is rendered as a link opening in a new tab, unless the entry
 * has no usable URL (empty or the "#" placeholder), in which case it is
 * rendered as plain text.
 *
 * @param {HTMLElement} container - Element that receives the citation.
 * @param {{references: string, paperUrl: string}} chunk - Chunking entry.
 */
function renderChunkingReference(container, chunk) {
    // NOTE(review): the leading icon was garbled in the copy this was restored
    // from; the page-icon emoji is a best guess — confirm against the original.
    const prefix = '📄 ';
    const url = chunk.paperUrl;
    const hasUrl = typeof url === 'string' && url !== '' && url !== '#';

    if (!hasUrl) {
        container.textContent = `${prefix}${chunk.references}`;
        return;
    }

    // Assigning textContent also clears any previously rendered citation.
    container.textContent = prefix;
    const link = document.createElement('a');
    link.href = url;
    link.target = '_blank';
    // Prevent the opened page from reaching back through window.opener.
    link.rel = 'noopener noreferrer';
    link.textContent = chunk.references;
    container.appendChild(link);
}

/**
 * Builds everything on the Chunking tab from `chunkingData`: the method
 * selector (and its change handler), the category pie chart, the per-year
 * line chart and the list of timeline entries.
 *
 * Only the first call does any work; later calls return immediately.
 */
function initChunkingVisualizations() {
    if (chunkingChartsInitialized) return;
    chunkingChartsInitialized = true;

    // Populate chunking selector
    const chunkingSelector = document.getElementById('chunkingSelector');
    chunkingData.forEach((chunk) => {
        const option = document.createElement('option');
        option.value = chunk.type;
        option.textContent = `${chunk.type} (${chunk.year})`;
        chunkingSelector.appendChild(option);
    });

    chunkingSelector.addEventListener('change', (e) => {
        const selected = chunkingData.find((c) => c.type === e.target.value);
        const details = document.getElementById('chunkingFlowchartDetails');

        // The empty placeholder option matches no entry: hide the details.
        if (!selected) {
            details.style.display = 'none';
            return;
        }

        createFlowchart(selected.workflow, 'chunkingFlowchart');
        document.getElementById('chunkingDescription').textContent = selected.description;
        renderChunkingReference(document.getElementById('chunkingReference'), selected);
        details.style.display = 'block';
    });

    // Chunking methods chart
    const chunkingTypes = {};
    chunkingData.forEach((chunk) => {
        const category = categorizeChunkingMethod(chunk.type);
        chunkingTypes[category] = (chunkingTypes[category] || 0) + 1;
    });

    new Chart(document.getElementById('chunkingChart'), {
        type: 'pie',
        data: {
            labels: Object.keys(chunkingTypes),
            datasets: [{
                data: Object.values(chunkingTypes),
                backgroundColor: ['#667eea', '#764ba2', '#f093fb', '#4facfe', '#00d2ff']
            }]
        },
        options: {
            responsive: true,
            maintainAspectRatio: false,
            plugins: {
                legend: {
                    position: 'bottom'
                }
            }
        }
    });

    // Chunking timeline
    const chunkingYearCount = {};
    chunkingData.forEach((chunk) => {
        chunkingYearCount[chunk.year] = (chunkingYearCount[chunk.year] || 0) + 1;
    });
    // Sorted once and shared by labels and data so the two stay aligned.
    const sortedYears = Object.keys(chunkingYearCount).sort();

    new Chart(document.getElementById('chunkingTimelineChart'), {
        type: 'line',
        data: {
            labels: sortedYears,
            datasets: [{
                label: 'Chunking Methods',
                data: sortedYears.map((year) => chunkingYearCount[year]),
                borderColor: '#667eea',
                backgroundColor: 'rgba(102, 126, 234, 0.1)',
                tension: 0.4,
                fill: true
            }]
        },
        options: {
            responsive: true,
            maintainAspectRatio: false,
            scales: {
                y: {
                    beginAtZero: true,
                    ticks: {
                        stepSize: 1
                    }
                }
            }
        }
    });

    // Chunking year timeline
    const chunkingYearTimeline = document.getElementById('chunkingYearTimeline');
    chunkingData.forEach((chunk) => {
        const item = document.createElement('div');
        item.className = 'timeline-item';

        const yearBox = document.createElement('div');
        yearBox.className = 'timeline-year';
        yearBox.textContent = chunk.year;

        const contentBox = document.createElement('div');
        contentBox.className = 'timeline-content';

        const typeBox = document.createElement('div');
        typeBox.className = 'timeline-type';
        typeBox.textContent = chunk.type;

        const referenceBox = document.createElement('div');
        referenceBox.className = 'timeline-reference';
        renderChunkingReference(referenceBox, chunk);

        contentBox.appendChild(typeBox);
        contentBox.appendChild(referenceBox);
        item.appendChild(yearBox);
        item.appendChild(contentBox);
        chunkingYearTimeline.appendChild(item);
    });
}
|
| 979 |
</script>
|
| 980 |
</body>
|
| 981 |
</html>
|