Update index.html
Browse files- index.html +126 -52
index.html
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
<head>
|
| 4 |
<meta charset="UTF-8">
|
| 5 |
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
-
<title>RAG</title>
|
| 7 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/3.9.1/chart.min.js"></script>
|
| 8 |
<style>
|
| 9 |
* {
|
|
@@ -330,15 +330,61 @@
|
|
| 330 |
.chunking-brief .reference a:hover {
|
| 331 |
text-decoration: underline;
|
| 332 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
</style>
|
| 334 |
</head>
|
| 335 |
<body>
|
| 336 |
<div class="container">
|
| 337 |
-
<h1>π RAG Dashboard</h1>
|
| 338 |
|
| 339 |
<div class="tabs">
|
| 340 |
<button class="tab active" onclick="switchTab('rag')">RAG Types</button>
|
| 341 |
-
<button class="tab" onclick="switchTab('chunking')">Chunking</button>
|
| 342 |
</div>
|
| 343 |
|
| 344 |
<!-- RAG TYPES TAB -->
|
|
@@ -405,13 +451,12 @@
|
|
| 405 |
<div class="chart-container">
|
| 406 |
<canvas id="chunkingChart"></canvas>
|
| 407 |
</div>
|
|
|
|
| 408 |
</div>
|
| 409 |
|
| 410 |
<div class="card">
|
| 411 |
-
<h2>Chunking
|
| 412 |
-
<div class="
|
| 413 |
-
<canvas id="chunkingTimelineChart"></canvas>
|
| 414 |
-
</div>
|
| 415 |
</div>
|
| 416 |
</div>
|
| 417 |
|
|
@@ -592,6 +637,7 @@
|
|
| 592 |
{
|
| 593 |
type: "Fixed-size Chunking",
|
| 594 |
year: 2001,
|
|
|
|
| 595 |
workflow: "Input Document → Split by Fixed Token/Word Count (200-500 tokens) → Create Chunks → Output Chunks",
|
| 596 |
description: "Splits text into chunks based on a fixed number of tokens/words (e.g., 200-500 tokens). Simple but may break semantic boundaries.",
|
| 597 |
references: "Collobert, R., et al. (2011). Natural language processing (almost) from scratch. Journal of Machine Learning Research 12:2493-2537.",
|
|
@@ -600,6 +646,7 @@
|
|
| 600 |
{
|
| 601 |
type: "Sliding Window Chunking",
|
| 602 |
year: 1998,
|
|
|
|
| 603 |
workflow: "Input Document → Define Chunk Size & Overlap → Slide Window (e.g., size 300, overlap 100) → Create Overlapping Chunks → Output Chunks",
|
| 604 |
description: "Uses a fixed chunk size plus overlap (e.g., chunk size 300, overlap 100). Maintains context continuity across chunks.",
|
| 605 |
references: "Carbonell, J. & Goldstein, J. (1998). The use of MMR, diversity-based reranking for reordering documents. SIGIR 1998.",
|
|
@@ -608,6 +655,7 @@
|
|
| 608 |
{
|
| 609 |
type: "Semantic Chunking",
|
| 610 |
year: 2024,
|
|
|
|
| 611 |
workflow: "Input Document → Analyze Topic Shifts/Semantic Boundaries → Split at Semantic Breaks → Create Semantically Coherent Chunks → Output Chunks",
|
| 612 |
description: "Chunks based on topic shifts or semantic boundaries rather than size. Preserves meaning and context.",
|
| 613 |
references: "LangChain (2024). Semantic chunker. Accessed 2024-09-14.",
|
|
@@ -616,6 +664,7 @@
|
|
| 616 |
{
|
| 617 |
type: "Sentence-based Chunking",
|
| 618 |
year: 2024,
|
|
|
|
| 619 |
workflow: "Input Document → Split by Sentences → Group Sentences (Optional) → Create Sentence Chunks → Output Chunks",
|
| 620 |
description: "Splits text by sentences. Maintains grammatical integrity and natural language boundaries.",
|
| 621 |
references: "Dong, K., et al. (2024). Multi-view content-aware indexing for long document retrieval. arXiv:2404.15103",
|
|
@@ -624,6 +673,7 @@
|
|
| 624 |
{
|
| 625 |
type: "Paragraph-based Chunking",
|
| 626 |
year: 2014,
|
|
|
|
| 627 |
workflow: "Input Document → Identify Paragraph Boundaries → Split at Paragraphs → Create Paragraph Chunks → Output Chunks",
|
| 628 |
description: "Uses natural paragraph boundaries for chunking. Preserves document structure and thematic grouping.",
|
| 629 |
references: "Dudhabaware, R. S., et al. (2014). Review on natural language processing tasks for text documents. IEEE ICCIC 2014.",
|
|
@@ -632,22 +682,25 @@
|
|
| 632 |
{
|
| 633 |
type: "Recursive/Hierarchical Chunking",
|
| 634 |
year: 2023,
|
|
|
|
| 635 |
workflow: "Input Document → Break into Sections → Split Sections into Paragraphs → Split Paragraphs into Sentences → Split Sentences into Tokens → Output Multi-level Chunks",
|
| 636 |
description: "Breaks document into progressively smaller units (section → paragraph → sentence → tokens). Enables multi-level retrieval.",
|
| 637 |
-
references: "
|
| 638 |
paperUrl: "https://aclanthology.org/2025.icnlsp-1.16/"
|
| 639 |
},
|
| 640 |
{
|
| 641 |
type: "Hybrid Chunking",
|
| 642 |
year: 2024,
|
|
|
|
| 643 |
workflow: "Input Document → Apply Semantic Analysis → Apply Fixed-size Constraint → Apply Overlap Strategy → Create Hybrid Chunks → Output Chunks",
|
| 644 |
description: "Combines semantic + fixed-size + overlap strategies. Balances context preservation with practical constraints.",
|
| 645 |
-
references: "Kamradt, G. (2024). 5 levels of text splitting.
|
| 646 |
-
paperUrl: "
|
| 647 |
},
|
| 648 |
{
|
| 649 |
type: "Discourse-aware Chunking",
|
| 650 |
year: 2005,
|
|
|
|
| 651 |
workflow: "Input Document → Identify Discourse Markers (however, in contrast, therefore) → Detect Conceptual Shifts → Split at Discourse Boundaries → Output Discourse Chunks",
|
| 652 |
description: "Uses discourse markers (e.g., 'however', 'in contrast', 'therefore') to detect conceptual shifts and chunk accordingly.",
|
| 653 |
references: "Sporleder, C. & Lapata, M. (2005). Discourse Chunking and its Application to Sentence Compression. HLT 2005.",
|
|
@@ -656,14 +709,16 @@
|
|
| 656 |
{
|
| 657 |
type: "Embedding-similarity-based Chunking",
|
| 658 |
year: 2024,
|
|
|
|
| 659 |
workflow: "Input Document → Generate Sentence Embeddings → Calculate Similarity Scores → Detect Similarity Drops Below Threshold → Split at Low Similarity Points → Output Chunks",
|
| 660 |
description: "Sequentially processes text and opens a new chunk when embedding similarity drops below a threshold. Data-driven approach.",
|
| 661 |
-
references: "Kamradt, G. (2024). 5 levels of text splitting.
|
| 662 |
-
paperUrl: "
|
| 663 |
},
|
| 664 |
{
|
| 665 |
type: "Metadata-based Chunking",
|
| 666 |
year: 2025,
|
|
|
|
| 667 |
workflow: "Input Document → Extract Metadata (headers, bullets, sections) → Split Using Metadata Structure → Create Metadata-aware Chunks → Output Chunks",
|
| 668 |
description: "Splits text using document metadata (e.g., headers, bullets). Preserves document structure and hierarchy.",
|
| 669 |
references: "Zhao, J., et al. (2025). MoC: Mixtures of Text Chunking Learners for Retrieval-Augmented Generation. ACL 2025.",
|
|
@@ -672,14 +727,16 @@
|
|
| 672 |
{
|
| 673 |
type: "Page-based Chunking",
|
| 674 |
year: 2024,
|
|
|
|
| 675 |
workflow: "Input PDF/Scanned Document → Identify Page Boundaries → Split by Page → Create Page Chunks → Output Chunks",
|
| 676 |
description: "Splits by PDF page or scanned document page. Useful for document-level retrieval and citation.",
|
| 677 |
-
references: "
|
| 678 |
-
paperUrl: "
|
| 679 |
},
|
| 680 |
{
|
| 681 |
type: "Domain-specific Chunking",
|
| 682 |
year: 2024,
|
|
|
|
| 683 |
workflow: "Input Domain Document → Apply Domain Rules (legal sections, medical records, code functions) → Split by Domain Logic → Create Domain Chunks → Output Chunks",
|
| 684 |
description: "Tailored to content type (e.g., legal documents by sections, medical records by encounters, code by functions).",
|
| 685 |
references: "Allamraju, A., et al. (2024). Breaking It Down: Domain-Aware Semantic Segmentation for Retrieval Augmented Generation. arXiv:2512.00367",
|
|
@@ -687,6 +744,14 @@
|
|
| 687 |
}
|
| 688 |
];
|
| 689 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 690 |
const mainCategoryDefinitions = {
|
| 691 |
'Foundational': 'The baseline RAG approach establishing core retrieval-augmented generation principles with simple, fixed pipelines.',
|
| 692 |
'Modular': 'Focuses on pipeline flexibility through specialized, interchangeable components and intermediate operations between query and generation.',
|
|
@@ -889,8 +954,16 @@
|
|
| 889 |
if (selected) {
|
| 890 |
createFlowchart(selected.workflow, 'chunkingFlowchart');
|
| 891 |
document.getElementById('chunkingDescription').textContent = selected.description;
|
| 892 |
-
|
| 893 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 894 |
details.style.display = 'block';
|
| 895 |
} else {
|
| 896 |
details.style.display = 'none';
|
|
@@ -900,12 +973,7 @@
|
|
| 900 |
// Chunking methods chart
|
| 901 |
const chunkingTypes = {};
|
| 902 |
chunkingData.forEach(chunk => {
|
| 903 |
-
|
| 904 |
-
chunk.type.includes('Fixed') || chunk.type.includes('Sliding') ? 'Size-based' :
|
| 905 |
-
chunk.type.includes('Sentence') || chunk.type.includes('Paragraph') ? 'Structure-based' :
|
| 906 |
-
chunk.type.includes('Recursive') || chunk.type.includes('Hierarchical') ? 'Hierarchical' :
|
| 907 |
-
'Domain-specific';
|
| 908 |
-
chunkingTypes[category] = (chunkingTypes[category] || 0) + 1;
|
| 909 |
});
|
| 910 |
|
| 911 |
new Chart(document.getElementById('chunkingChart'), {
|
|
@@ -928,37 +996,35 @@
|
|
| 928 |
}
|
| 929 |
});
|
| 930 |
|
| 931 |
-
//
|
| 932 |
-
const
|
| 933 |
-
|
| 934 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 935 |
});
|
| 936 |
|
| 937 |
-
|
| 938 |
-
|
| 939 |
-
|
| 940 |
-
|
| 941 |
-
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
-
|
| 946 |
-
|
| 947 |
-
|
| 948 |
-
|
| 949 |
-
|
| 950 |
-
|
| 951 |
-
|
| 952 |
-
|
| 953 |
-
|
| 954 |
-
y: {
|
| 955 |
-
beginAtZero: true,
|
| 956 |
-
ticks: {
|
| 957 |
-
stepSize: 1
|
| 958 |
-
}
|
| 959 |
-
}
|
| 960 |
-
}
|
| 961 |
-
}
|
| 962 |
});
|
| 963 |
|
| 964 |
// Chunking year timeline
|
|
@@ -966,11 +1032,19 @@
|
|
| 966 |
chunkingData.forEach(chunk => {
|
| 967 |
const item = document.createElement('div');
|
| 968 |
item.className = 'timeline-item';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 969 |
item.innerHTML = `
|
| 970 |
<div class="timeline-year">${chunk.year}</div>
|
| 971 |
<div class="timeline-content">
|
| 972 |
<div class="timeline-type">${chunk.type}</div>
|
| 973 |
-
|
| 974 |
</div>
|
| 975 |
`;
|
| 976 |
chunkingYearTimeline.appendChild(item);
|
|
|
|
| 3 |
<head>
|
| 4 |
<meta charset="UTF-8">
|
| 5 |
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>RAG & Chunking Visualization</title>
|
| 7 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/3.9.1/chart.min.js"></script>
|
| 8 |
<style>
|
| 9 |
* {
|
|
|
|
| 330 |
.chunking-brief .reference a:hover {
|
| 331 |
text-decoration: underline;
|
| 332 |
}
|
| 333 |
+
|
| 334 |
+
.category-tree {
|
| 335 |
+
background: #f8f9fa;
|
| 336 |
+
padding: 20px;
|
| 337 |
+
border-radius: 10px;
|
| 338 |
+
margin-top: 20px;
|
| 339 |
+
}
|
| 340 |
+
|
| 341 |
+
.category-node {
|
| 342 |
+
margin: 15px 0;
|
| 343 |
+
padding: 15px;
|
| 344 |
+
background: white;
|
| 345 |
+
border-radius: 8px;
|
| 346 |
+
border-left: 4px solid #667eea;
|
| 347 |
+
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
|
| 348 |
+
}
|
| 349 |
+
|
| 350 |
+
.category-node h4 {
|
| 351 |
+
color: #667eea;
|
| 352 |
+
margin-bottom: 10px;
|
| 353 |
+
font-size: 1.1em;
|
| 354 |
+
}
|
| 355 |
+
|
| 356 |
+
.category-node .brief {
|
| 357 |
+
color: #666;
|
| 358 |
+
font-size: 0.9em;
|
| 359 |
+
font-style: italic;
|
| 360 |
+
margin-bottom: 10px;
|
| 361 |
+
line-height: 1.5;
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
.category-methods {
|
| 365 |
+
display: flex;
|
| 366 |
+
flex-wrap: wrap;
|
| 367 |
+
gap: 8px;
|
| 368 |
+
margin-top: 10px;
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
.method-tag {
|
| 372 |
+
background: linear-gradient(135deg, #667eea, #764ba2);
|
| 373 |
+
color: white;
|
| 374 |
+
padding: 6px 12px;
|
| 375 |
+
border-radius: 15px;
|
| 376 |
+
font-size: 0.85em;
|
| 377 |
+
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
| 378 |
+
}
|
| 379 |
</style>
|
| 380 |
</head>
|
| 381 |
<body>
|
| 382 |
<div class="container">
|
| 383 |
+
<h1>π RAG & Chunking Visualization Dashboard</h1>
|
| 384 |
|
| 385 |
<div class="tabs">
|
| 386 |
<button class="tab active" onclick="switchTab('rag')">RAG Types</button>
|
| 387 |
+
<button class="tab" onclick="switchTab('chunking')">Chunking Methods</button>
|
| 388 |
</div>
|
| 389 |
|
| 390 |
<!-- RAG TYPES TAB -->
|
|
|
|
| 451 |
<div class="chart-container">
|
| 452 |
<canvas id="chunkingChart"></canvas>
|
| 453 |
</div>
|
| 454 |
+
<div id="chunkingCategoryDefs"></div>
|
| 455 |
</div>
|
| 456 |
|
| 457 |
<div class="card">
|
| 458 |
+
<h2>Chunking Categories & Methods</h2>
|
| 459 |
+
<div id="categoryHierarchy" class="category-tree"></div>
|
|
|
|
|
|
|
| 460 |
</div>
|
| 461 |
</div>
|
| 462 |
|
|
|
|
| 637 |
{
|
| 638 |
type: "Fixed-size Chunking",
|
| 639 |
year: 2001,
|
| 640 |
+
category: "Size-based",
|
| 641 |
workflow: "Input Document → Split by Fixed Token/Word Count (200-500 tokens) → Create Chunks → Output Chunks",
|
| 642 |
description: "Splits text into chunks based on a fixed number of tokens/words (e.g., 200-500 tokens). Simple but may break semantic boundaries.",
|
| 643 |
references: "Collobert, R., et al. (2011). Natural language processing (almost) from scratch. Journal of Machine Learning Research 12:2493-2537.",
|
|
|
|
| 646 |
{
|
| 647 |
type: "Sliding Window Chunking",
|
| 648 |
year: 1998,
|
| 649 |
+
category: "Size-based",
|
| 650 |
workflow: "Input Document → Define Chunk Size & Overlap → Slide Window (e.g., size 300, overlap 100) → Create Overlapping Chunks → Output Chunks",
|
| 651 |
description: "Uses a fixed chunk size plus overlap (e.g., chunk size 300, overlap 100). Maintains context continuity across chunks.",
|
| 652 |
references: "Carbonell, J. & Goldstein, J. (1998). The use of MMR, diversity-based reranking for reordering documents. SIGIR 1998.",
|
|
|
|
| 655 |
{
|
| 656 |
type: "Semantic Chunking",
|
| 657 |
year: 2024,
|
| 658 |
+
category: "Semantic-based",
|
| 659 |
workflow: "Input Document → Analyze Topic Shifts/Semantic Boundaries → Split at Semantic Breaks → Create Semantically Coherent Chunks → Output Chunks",
|
| 660 |
description: "Chunks based on topic shifts or semantic boundaries rather than size. Preserves meaning and context.",
|
| 661 |
references: "LangChain (2024). Semantic chunker. Accessed 2024-09-14.",
|
|
|
|
| 664 |
{
|
| 665 |
type: "Sentence-based Chunking",
|
| 666 |
year: 2024,
|
| 667 |
+
category: "Structure-based",
|
| 668 |
workflow: "Input Document → Split by Sentences → Group Sentences (Optional) → Create Sentence Chunks → Output Chunks",
|
| 669 |
description: "Splits text by sentences. Maintains grammatical integrity and natural language boundaries.",
|
| 670 |
references: "Dong, K., et al. (2024). Multi-view content-aware indexing for long document retrieval. arXiv:2404.15103",
|
|
|
|
| 673 |
{
|
| 674 |
type: "Paragraph-based Chunking",
|
| 675 |
year: 2014,
|
| 676 |
+
category: "Structure-based",
|
| 677 |
workflow: "Input Document → Identify Paragraph Boundaries → Split at Paragraphs → Create Paragraph Chunks → Output Chunks",
|
| 678 |
description: "Uses natural paragraph boundaries for chunking. Preserves document structure and thematic grouping.",
|
| 679 |
references: "Dudhabaware, R. S., et al. (2014). Review on natural language processing tasks for text documents. IEEE ICCIC 2014.",
|
|
|
|
| 682 |
{
|
| 683 |
type: "Recursive/Hierarchical Chunking",
|
| 684 |
year: 2023,
|
| 685 |
+
category: "Hierarchical",
|
| 686 |
workflow: "Input Document → Break into Sections → Split Sections into Paragraphs → Split Paragraphs into Sentences → Split Sentences into Tokens → Output Multi-level Chunks",
|
| 687 |
description: "Breaks document into progressively smaller units (section → paragraph → sentence → tokens). Enables multi-level retrieval.",
|
| 688 |
+
references: "Latif, S., et al. (2025). The Chunking Paradigm: Recursive Semantic for RAG. ICNLSP 2025.",
|
| 689 |
paperUrl: "https://aclanthology.org/2025.icnlsp-1.16/"
|
| 690 |
},
|
| 691 |
{
|
| 692 |
type: "Hybrid Chunking",
|
| 693 |
year: 2024,
|
| 694 |
+
category: "Semantic-based",
|
| 695 |
workflow: "Input Document → Apply Semantic Analysis → Apply Fixed-size Constraint → Apply Overlap Strategy → Create Hybrid Chunks → Output Chunks",
|
| 696 |
description: "Combines semantic + fixed-size + overlap strategies. Balances context preservation with practical constraints.",
|
| 697 |
+
references: "Kamradt, G. (2024). 5 levels of text splitting.",
|
| 698 |
+
paperUrl: ""
|
| 699 |
},
|
| 700 |
{
|
| 701 |
type: "Discourse-aware Chunking",
|
| 702 |
year: 2005,
|
| 703 |
+
category: "Semantic-based",
|
| 704 |
workflow: "Input Document → Identify Discourse Markers (however, in contrast, therefore) → Detect Conceptual Shifts → Split at Discourse Boundaries → Output Discourse Chunks",
|
| 705 |
description: "Uses discourse markers (e.g., 'however', 'in contrast', 'therefore') to detect conceptual shifts and chunk accordingly.",
|
| 706 |
references: "Sporleder, C. & Lapata, M. (2005). Discourse Chunking and its Application to Sentence Compression. HLT 2005.",
|
|
|
|
| 709 |
{
|
| 710 |
type: "Embedding-similarity-based Chunking",
|
| 711 |
year: 2024,
|
| 712 |
+
category: "Semantic-based",
|
| 713 |
workflow: "Input Document → Generate Sentence Embeddings → Calculate Similarity Scores → Detect Similarity Drops Below Threshold → Split at Low Similarity Points → Output Chunks",
|
| 714 |
description: "Sequentially processes text and opens a new chunk when embedding similarity drops below a threshold. Data-driven approach.",
|
| 715 |
+
references: "Kamradt, G. (2024). 5 levels of text splitting.",
|
| 716 |
+
paperUrl: ""
|
| 717 |
},
|
| 718 |
{
|
| 719 |
type: "Metadata-based Chunking",
|
| 720 |
year: 2025,
|
| 721 |
+
category: "Structure-based",
|
| 722 |
workflow: "Input Document → Extract Metadata (headers, bullets, sections) → Split Using Metadata Structure → Create Metadata-aware Chunks → Output Chunks",
|
| 723 |
description: "Splits text using document metadata (e.g., headers, bullets). Preserves document structure and hierarchy.",
|
| 724 |
references: "Zhao, J., et al. (2025). MoC: Mixtures of Text Chunking Learners for Retrieval-Augmented Generation. ACL 2025.",
|
|
|
|
| 727 |
{
|
| 728 |
type: "Page-based Chunking",
|
| 729 |
year: 2024,
|
| 730 |
+
category: "Structure-based",
|
| 731 |
workflow: "Input PDF/Scanned Document → Identify Page Boundaries → Split by Page → Create Page Chunks → Output Chunks",
|
| 732 |
description: "Splits by PDF page or scanned document page. Useful for document-level retrieval and citation.",
|
| 733 |
+
references: "",
|
| 734 |
+
paperUrl: ""
|
| 735 |
},
|
| 736 |
{
|
| 737 |
type: "Domain-specific Chunking",
|
| 738 |
year: 2024,
|
| 739 |
+
category: "Domain-specific",
|
| 740 |
workflow: "Input Domain Document → Apply Domain Rules (legal sections, medical records, code functions) → Split by Domain Logic → Create Domain Chunks → Output Chunks",
|
| 741 |
description: "Tailored to content type (e.g., legal documents by sections, medical records by encounters, code by functions).",
|
| 742 |
references: "Allamraju, A., et al. (2024). Breaking It Down: Domain-Aware Semantic Segmentation for Retrieval Augmented Generation. arXiv:2512.00367",
|
|
|
|
| 744 |
}
|
| 745 |
];
|
| 746 |
|
| 747 |
+
const chunkingCategoryBriefs = {
|
| 748 |
+
'Size-based': 'Chunks text by fixed sizes or overlapping windows. Simple, predictable, but may break context.',
|
| 749 |
+
'Semantic-based': 'Identifies meaning, topics, or discourse patterns to create contextually coherent chunks.',
|
| 750 |
+
'Structure-based': 'Uses natural document structure (sentences, paragraphs, metadata) for boundaries.',
|
| 751 |
+
'Hierarchical': 'Creates multi-level chunks (sections → paragraphs → sentences) for flexible retrieval.',
|
| 752 |
+
'Domain-specific': 'Applies domain knowledge and rules tailored to specific content types (legal, medical, code).'
|
| 753 |
+
};
|
| 754 |
+
|
| 755 |
const mainCategoryDefinitions = {
|
| 756 |
'Foundational': 'The baseline RAG approach establishing core retrieval-augmented generation principles with simple, fixed pipelines.',
|
| 757 |
'Modular': 'Focuses on pipeline flexibility through specialized, interchangeable components and intermediate operations between query and generation.',
|
|
|
|
| 954 |
if (selected) {
|
| 955 |
createFlowchart(selected.workflow, 'chunkingFlowchart');
|
| 956 |
document.getElementById('chunkingDescription').textContent = selected.description;
|
| 957 |
+
|
| 958 |
+
const refElement = document.getElementById('chunkingReference');
|
| 959 |
+
if (selected.references && selected.paperUrl) {
|
| 960 |
+
refElement.innerHTML = `π <a href="${selected.paperUrl}" target="_blank">${selected.references}</a>`;
|
| 961 |
+
} else if (selected.references) {
|
| 962 |
+
refElement.textContent = `π ${selected.references}`;
|
| 963 |
+
} else {
|
| 964 |
+
refElement.textContent = '';
|
| 965 |
+
}
|
| 966 |
+
|
| 967 |
details.style.display = 'block';
|
| 968 |
} else {
|
| 969 |
details.style.display = 'none';
|
|
|
|
| 973 |
// Chunking methods chart
|
| 974 |
const chunkingTypes = {};
|
| 975 |
chunkingData.forEach(chunk => {
|
| 976 |
+
chunkingTypes[chunk.category] = (chunkingTypes[chunk.category] || 0) + 1;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 977 |
});
|
| 978 |
|
| 979 |
new Chart(document.getElementById('chunkingChart'), {
|
|
|
|
| 996 |
}
|
| 997 |
});
|
| 998 |
|
| 999 |
+
// Add category definitions
|
| 1000 |
+
const chunkingCategoryDefsContainer = document.getElementById('chunkingCategoryDefs');
|
| 1001 |
+
Object.keys(chunkingCategoryBriefs).forEach(category => {
|
| 1002 |
+
const div = document.createElement('div');
|
| 1003 |
+
div.className = 'category-definition def-modular';
|
| 1004 |
+
div.innerHTML = `
|
| 1005 |
+
<strong>${category}</strong>
|
| 1006 |
+
<p>${chunkingCategoryBriefs[category]}</p>
|
| 1007 |
+
`;
|
| 1008 |
+
chunkingCategoryDefsContainer.appendChild(div);
|
| 1009 |
});
|
| 1010 |
|
| 1011 |
+
// Create hierarchical category view
|
| 1012 |
+
const categoryHierarchy = document.getElementById('categoryHierarchy');
|
| 1013 |
+
Object.keys(chunkingCategoryBriefs).forEach(category => {
|
| 1014 |
+
const methods = chunkingData.filter(c => c.category === category);
|
| 1015 |
+
const node = document.createElement('div');
|
| 1016 |
+
node.className = 'category-node';
|
| 1017 |
+
|
| 1018 |
+
const methodTags = methods.map(m =>
|
| 1019 |
+
`<span class="method-tag">${m.type}</span>`
|
| 1020 |
+
).join('');
|
| 1021 |
+
|
| 1022 |
+
node.innerHTML = `
|
| 1023 |
+
<h4>${category}</h4>
|
| 1024 |
+
<div class="brief">${chunkingCategoryBriefs[category]}</div>
|
| 1025 |
+
<div class="category-methods">${methodTags}</div>
|
| 1026 |
+
`;
|
| 1027 |
+
categoryHierarchy.appendChild(node);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1028 |
});
|
| 1029 |
|
| 1030 |
// Chunking year timeline
|
|
|
|
| 1032 |
chunkingData.forEach(chunk => {
|
| 1033 |
const item = document.createElement('div');
|
| 1034 |
item.className = 'timeline-item';
|
| 1035 |
+
|
| 1036 |
+
let referenceHTML = '';
|
| 1037 |
+
if (chunk.references && chunk.paperUrl) {
|
| 1038 |
+
referenceHTML = `<div class="timeline-reference">π <a href="${chunk.paperUrl}" target="_blank">${chunk.references}</a></div>`;
|
| 1039 |
+
} else if (chunk.references) {
|
| 1040 |
+
referenceHTML = `<div class="timeline-reference">π ${chunk.references}</div>`;
|
| 1041 |
+
}
|
| 1042 |
+
|
| 1043 |
item.innerHTML = `
|
| 1044 |
<div class="timeline-year">${chunk.year}</div>
|
| 1045 |
<div class="timeline-content">
|
| 1046 |
<div class="timeline-type">${chunk.type}</div>
|
| 1047 |
+
${referenceHTML}
|
| 1048 |
</div>
|
| 1049 |
`;
|
| 1050 |
chunkingYearTimeline.appendChild(item);
|