# zeroshotGPU/tests/test_chunking.py
# Arjunvir Singh
# Initial commit: zeroshotGPU MVP with full eval surface
# db06ffa
import unittest
from zsgdp.chunking import build_agentic_chunks
from zsgdp.config import load_config
from zsgdp.schema import DocumentProfile, Element, FigureObject, PageProfile, ParsedDocument, QualityReport, TableObject
from zsgdp.verify import verify_chunks
class ChunkingTests(unittest.TestCase):
    """Tests for ``build_agentic_chunks`` and ``verify_chunks`` on small in-memory documents.

    Every test builds a one- or two-page document with doc id ``"d1"``, adds
    the elements/tables/figures it needs, runs the chunker and then asserts on
    the produced chunks, on ``parsed.provenance["chunking"]`` or on the
    metrics of the verification report.
    """

    @staticmethod
    def _make_document(source_path, file_type, extension, *, page_count, chars_per_page, quality_score):
        """Return a matching ``(profile, parsed)`` pair for document ``"d1"``.

        Args:
            source_path: Value used for ``source_path`` on both objects.
            file_type: Value used for ``file_type`` on both objects.
            extension: File extension stored on the profile.
            page_count: Number of pages; pages are numbered from 1.
            chars_per_page: ``digital_text_chars`` given to every page.
            quality_score: Score of the ``QualityReport`` attached to the
                parsed document.

        No elements, tables or figures are supplied here; each test adds
        its own content to the returned ``ParsedDocument``.
        """
        pages = [
            PageProfile(page_num=number, digital_text_chars=chars_per_page, digital_text_quality=1.0)
            for number in range(1, page_count + 1)
        ]
        profile = DocumentProfile(
            doc_id="d1",
            source_path=source_path,
            file_type=file_type,
            page_count=page_count,
            extension=extension,
            pages=pages,
        )
        parsed = ParsedDocument(
            doc_id="d1",
            source_path=source_path,
            file_type=file_type,
            quality_report=QualityReport(score=quality_score),
        )
        return profile, parsed

    @classmethod
    def _markdown_document(cls, page_count=1):
        """Return a ``sample.md`` fixture: 120 chars per page, quality score 0.95."""
        return cls._make_document(
            "sample.md",
            "markdown",
            ".md",
            page_count=page_count,
            chars_per_page=120,
            quality_score=0.95,
        )

    @classmethod
    def _pdf_document(cls, page_count=1, chars_per_page=20, quality_score=0.90):
        """Return a ``sample.pdf`` fixture with the given page size and quality score."""
        return cls._make_document(
            "sample.pdf",
            "pdf",
            ".pdf",
            page_count=page_count,
            chars_per_page=chars_per_page,
            quality_score=quality_score,
        )

    def _first_chunk_with_strategy(self, chunks, strategy):
        """Return the first chunk whose ``strategy`` equals ``strategy``.

        Fails the test with a readable message when no chunk matches, rather
        than letting a bare ``next()`` surface an opaque ``StopIteration``.
        """
        for chunk in chunks:
            if chunk.strategy == strategy:
                return chunk
        self.fail(f"no chunk with strategy {strategy!r} was produced")

    def test_agentic_chunking_builds_parent_child_chunks(self):
        """A title plus a long paragraph yields parent chunks and children linked to a parent."""
        profile, parsed = self._markdown_document()
        parsed.elements.extend(
            [
                Element("e1", "d1", 1, "title", markdown="# Report", reading_order=1, source_parser="text"),
                Element("e2", "d1", 1, "paragraph", text=" ".join(["alpha"] * 80), reading_order=2, source_parser="text"),
            ]
        )

        chunks = build_agentic_chunks(parsed, profile, load_config())

        self.assertTrue(
            any(chunk.content_type == "parent" for chunk in chunks),
            "expected at least one chunk with content_type 'parent'",
        )
        self.assertTrue(
            any(chunk.parent_chunk_id for chunk in chunks),
            "expected at least one chunk carrying a parent_chunk_id",
        )
        self.assertEqual(parsed.provenance["chunking"]["plan"]["target_tokens"], 512)

    def test_chunk_readiness_adds_metrics(self):
        """``verify_chunks`` reports the chunk count and per-strategy counts."""
        profile, parsed = self._markdown_document()
        parsed.elements.append(
            Element("e1", "d1", 1, "paragraph", text=" ".join(["alpha"] * 80), reading_order=1, source_parser="text")
        )

        parsed.chunks = build_agentic_chunks(parsed, profile, load_config())
        report = verify_chunks(parsed, load_config())

        self.assertEqual(report.metrics["chunk_count"], len(parsed.chunks))
        strategy_counts = report.metrics["chunk_strategy_counts"]
        self.assertIn("fixed_token_baseline", strategy_counts)
        self.assertIn("recursive_structure", strategy_counts)

    def test_fixed_token_baseline_chunks_are_emitted_with_provenance(self):
        """Small token targets split two pages into several baseline chunks recorded in provenance."""
        profile, parsed = self._markdown_document(page_count=2)
        parsed.elements.extend(
            [
                Element("e1", "d1", 1, "paragraph", text=" ".join(["alpha"] * 18), reading_order=1, source_parser="text"),
                Element("e2", "d1", 2, "paragraph", text=" ".join(["beta"] * 18), reading_order=1, source_parser="text"),
            ]
        )
        config = load_config(overrides={"chunking": {"target_tokens": 10, "overlap_ratio": 0.2}})

        chunks = build_agentic_chunks(parsed, profile, config)

        baseline_chunks = [chunk for chunk in chunks if chunk.strategy == "fixed_token_baseline"]
        self.assertGreaterEqual(len(baseline_chunks), 4)
        self.assertEqual(baseline_chunks[0].element_ids, ["e1"])
        self.assertEqual(baseline_chunks[-1].page_end, 2)
        self.assertEqual(parsed.provenance["chunking"]["fixed_token_baseline_count"], len(baseline_chunks))

    def test_figure_without_caption_still_gets_visual_chunk(self):
        """A caption-less figure is still referenced by a chunk and fully covered in the report."""
        profile, parsed = self._pdf_document()
        parsed.elements.append(
            Element("e1", "d1", 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf")
        )
        parsed.figures.append(
            FigureObject(
                figure_id="f1",
                page_num=1,
                image_path="/tmp/figure.png",
                confidence=0.5,
                source_parser="pymupdf",
            )
        )

        parsed.chunks = build_agentic_chunks(parsed, profile, load_config())
        report = verify_chunks(parsed, load_config())

        self.assertTrue(
            any(chunk.figure_ids == ["f1"] for chunk in parsed.chunks),
            "expected a chunk whose figure_ids is exactly ['f1']",
        )
        self.assertEqual(report.metrics["figure_chunk_coverage"], 1.0)

    def test_table_chunk_keeps_multimodal_metadata(self):
        """The table chunk uses the natural-language rendering as text and keeps table metadata."""
        table_markdown = "| A | B |\n| --- | --- |\n| 1 | 2 |"
        table_rendering = "Table with columns A, B. Rows: 1: B=2."
        profile, parsed = self._pdf_document()
        parsed.elements.append(
            Element("e1", "d1", 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf")
        )
        parsed.tables.append(
            TableObject(
                table_id="t1",
                page_nums=[1],
                bbox=[(1.0, 2.0, 3.0, 4.0)],
                markdown=table_markdown,
                natural_language_rendering=table_rendering,
                confidence=0.82,
                source_parser="pymupdf",
                provenance={"crop_path": "/tmp/table.png", "source_parsers": ["pymupdf", "docling"]},
            )
        )

        parsed.chunks = build_agentic_chunks(parsed, profile, load_config())

        table_chunk = self._first_chunk_with_strategy(parsed.chunks, "table_object")
        self.assertEqual(table_chunk.text, table_rendering)
        self.assertEqual(table_chunk.metadata["markdown"], table_markdown)
        self.assertEqual(table_chunk.metadata["bbox"], [(1.0, 2.0, 3.0, 4.0)])
        self.assertEqual(table_chunk.metadata["crop_path"], "/tmp/table.png")
        self.assertEqual(table_chunk.metadata["source_parsers"], ["pymupdf", "docling"])

    def test_vision_guided_chunking_exports_visual_regions(self):
        """With ``vision_guided`` on, table/figure chunks need visual context and regions are exported."""
        profile, parsed = self._pdf_document()
        parsed.elements.append(
            Element("e1", "d1", 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf")
        )
        parsed.tables.append(
            TableObject(
                table_id="t1",
                page_nums=[1],
                bbox=[(1.0, 2.0, 3.0, 4.0)],
                markdown="| A | B |\n| --- | --- |\n| 1 | 2 |",
            )
        )
        parsed.figures.append(
            FigureObject(figure_id="f1", page_num=1, bbox=(5.0, 6.0, 7.0, 8.0), source_parser="pymupdf")
        )
        config = load_config(overrides={"chunking": {"vision_guided": True}})

        parsed.chunks = build_agentic_chunks(parsed, profile, config)

        visual_chunks = [chunk for chunk in parsed.chunks if chunk.content_type in {"table", "figure"}]
        self.assertTrue(all(chunk.requires_visual_context for chunk in visual_chunks))
        vision_regions = parsed.provenance["chunking"]["vision_regions"]
        self.assertEqual(len(vision_regions), 2)
        self.assertEqual(vision_regions[0]["region_id"], "t1")

    def test_advanced_chunking_flags_emit_strategy_chunks(self):
        """Enabling every advanced chunking flag emits chunks for each corresponding strategy."""
        profile, parsed = self._pdf_document(page_count=2, chars_per_page=200, quality_score=0.92)
        parsed.elements.extend(
            [
                Element("e1", "d1", 1, "heading", markdown="## Revenue", reading_order=1, source_parser="pymupdf"),
                Element(
                    "e2",
                    "d1",
                    1,
                    "paragraph",
                    text="Revenue increased by 12 percent in Q1. Gross margin improved due to pricing.",
                    reading_order=2,
                    source_parser="pymupdf",
                ),
                Element("e3", "d1", 2, "heading", markdown="## Safety", reading_order=1, source_parser="pymupdf"),
                Element(
                    "e4",
                    "d1",
                    2,
                    "paragraph",
                    text="Safety inspections found three unresolved risks. Corrective actions are due in June.",
                    reading_order=2,
                    source_parser="pymupdf",
                ),
            ]
        )
        parsed.tables.append(
            TableObject(
                table_id="t1",
                page_nums=[1],
                markdown="| Metric | Value |\n| --- | --- |\n| Revenue | 12% |",
                natural_language_rendering="Table t1 reports revenue growth of 12 percent.",
                source_parser="pymupdf",
            )
        )
        parsed.figures.append(
            FigureObject(
                figure_id="f1",
                page_num=2,
                caption="Risk trend chart shows open safety findings.",
                source_parser="pymupdf",
            )
        )
        config = load_config(
            overrides={
                "chunking": {
                    "contextual_retrieval": True,
                    "semantic_chunking": True,
                    "late_chunking": True,
                    "vision_guided": True,
                    "agentic_proposition_chunking": True,
                }
            }
        )

        parsed.chunks = build_agentic_chunks(parsed, profile, config)

        # subTest keeps checking the remaining names after one is missing, so a
        # single run reports every absent strategy/counter instead of only the first.
        strategies = {chunk.strategy for chunk in parsed.chunks}
        expected_strategies = (
            "semantic",
            "late",
            "contextual_retrieval",
            "vision_guided",
            "agentic_proposition",
        )
        for strategy in expected_strategies:
            with self.subTest(strategy=strategy):
                self.assertIn(strategy, strategies)

        chunking_provenance = parsed.provenance["chunking"]
        for counter in ("semantic_chunk_count", "late_chunk_count", "contextual_retrieval_chunk_count"):
            with self.subTest(counter=counter):
                self.assertGreater(chunking_provenance[counter], 0)

        semantic_chunk = self._first_chunk_with_strategy(parsed.chunks, "semantic")
        self.assertEqual(semantic_chunk.metadata["execution_mode"], "lexical_similarity_proxy")

        contextual_chunk = self._first_chunk_with_strategy(parsed.chunks, "contextual_retrieval")
        self.assertIn("source_chunk_id", contextual_chunk.metadata)

        late_chunk = self._first_chunk_with_strategy(parsed.chunks, "late")
        self.assertTrue(late_chunk.metadata["requires_token_level_embeddings"])
# Allow running this test module directly as a script; importing it has no side effects.
if __name__ == "__main__":
    unittest.main()