"""Unit tests for agentic chunk building and chunk verification."""

import unittest

from zsgdp.chunking import build_agentic_chunks
from zsgdp.config import load_config
from zsgdp.schema import (
    DocumentProfile,
    Element,
    FigureObject,
    PageProfile,
    ParsedDocument,
    QualityReport,
    TableObject,
)
from zsgdp.verify import verify_chunks

# Every fixture in this module describes the same logical document.
_DOC_ID = "d1"


def _make_document(source_path, file_type, extension, *, page_count, chars_per_page, quality_score):
    """Build a matching ``(DocumentProfile, ParsedDocument)`` pair for a test.

    Args:
        source_path: Value stored as ``source_path`` on both objects.
        file_type: Value stored as ``file_type`` on both objects.
        extension: Value stored as ``extension`` on the profile.
        page_count: Number of pages; one ``PageProfile`` is created per page,
            numbered from 1.
        chars_per_page: ``digital_text_chars`` given to every page profile.
        quality_score: Score of the ``QualityReport`` attached to the parsed
            document.

    Returns:
        A ``(profile, parsed)`` tuple. ``parsed`` is created without any
        elements, tables or figures; each test adds what it needs.
    """
    profile = DocumentProfile(
        doc_id=_DOC_ID,
        source_path=source_path,
        file_type=file_type,
        page_count=page_count,
        extension=extension,
        pages=[
            PageProfile(page_num=page_num, digital_text_chars=chars_per_page, digital_text_quality=1.0)
            for page_num in range(1, page_count + 1)
        ],
    )
    parsed = ParsedDocument(
        doc_id=_DOC_ID,
        source_path=source_path,
        file_type=file_type,
        quality_report=QualityReport(score=quality_score),
    )
    return profile, parsed


def _make_markdown_document(*, page_count=1):
    """Return the markdown fixture: ``sample.md``, 120 chars per page, score 0.95."""
    return _make_document(
        "sample.md",
        "markdown",
        ".md",
        page_count=page_count,
        chars_per_page=120,
        quality_score=0.95,
    )


def _make_pdf_document(*, page_count=1, chars_per_page=20, quality_score=0.90):
    """Return the PDF fixture: ``sample.pdf`` with the given page/quality settings."""
    return _make_document(
        "sample.pdf",
        "pdf",
        ".pdf",
        page_count=page_count,
        chars_per_page=chars_per_page,
        quality_score=quality_score,
    )


class ChunkingTests(unittest.TestCase):
    """Tests for ``build_agentic_chunks`` and ``verify_chunks``."""

    def _first_chunk_with_strategy(self, chunks, strategy):
        """Return the first chunk whose ``strategy`` equals *strategy*.

        Fails the test with a descriptive message when no such chunk exists,
        instead of letting a bare ``next()`` raise ``StopIteration``.
        """
        match = next((chunk for chunk in chunks if chunk.strategy == strategy), None)
        self.assertIsNotNone(match, f"no chunk with strategy {strategy!r} was produced")
        return match

    def test_agentic_chunking_builds_parent_child_chunks(self):
        """A title plus a long paragraph yields parent and child chunks."""
        profile, parsed = _make_markdown_document()
        parsed.elements.extend(
            [
                Element("e1", _DOC_ID, 1, "title", markdown="# Report", reading_order=1, source_parser="text"),
                Element(
                    "e2",
                    _DOC_ID,
                    1,
                    "paragraph",
                    text=" ".join(["alpha"] * 80),
                    reading_order=2,
                    source_parser="text",
                ),
            ]
        )

        chunks = build_agentic_chunks(parsed, profile, load_config())

        self.assertTrue(any(chunk.content_type == "parent" for chunk in chunks))
        self.assertTrue(any(chunk.parent_chunk_id for chunk in chunks))
        self.assertEqual(parsed.provenance["chunking"]["plan"]["target_tokens"], 512)

    def test_chunk_readiness_adds_metrics(self):
        """``verify_chunks`` reports the chunk count and per-strategy counts."""
        profile, parsed = _make_markdown_document()
        parsed.elements.append(
            Element(
                "e1",
                _DOC_ID,
                1,
                "paragraph",
                text=" ".join(["alpha"] * 80),
                reading_order=1,
                source_parser="text",
            )
        )

        parsed.chunks = build_agentic_chunks(parsed, profile, load_config())
        report = verify_chunks(parsed, load_config())

        self.assertEqual(report.metrics["chunk_count"], len(parsed.chunks))
        strategy_counts = report.metrics["chunk_strategy_counts"]
        self.assertIn("fixed_token_baseline", strategy_counts)
        self.assertIn("recursive_structure", strategy_counts)

    def test_fixed_token_baseline_chunks_are_emitted_with_provenance(self):
        """Baseline chunks span both pages and their count is recorded in provenance."""
        profile, parsed = _make_markdown_document(page_count=2)
        parsed.elements.extend(
            [
                Element(
                    "e1",
                    _DOC_ID,
                    1,
                    "paragraph",
                    text=" ".join(["alpha"] * 18),
                    reading_order=1,
                    source_parser="text",
                ),
                Element(
                    "e2",
                    _DOC_ID,
                    2,
                    "paragraph",
                    text=" ".join(["beta"] * 18),
                    reading_order=1,
                    source_parser="text",
                ),
            ]
        )
        config = load_config(overrides={"chunking": {"target_tokens": 10, "overlap_ratio": 0.2}})

        chunks = build_agentic_chunks(parsed, profile, config)
        baseline_chunks = [chunk for chunk in chunks if chunk.strategy == "fixed_token_baseline"]

        self.assertGreaterEqual(len(baseline_chunks), 4)
        self.assertEqual(baseline_chunks[0].element_ids, ["e1"])
        self.assertEqual(baseline_chunks[-1].page_end, 2)
        self.assertEqual(
            parsed.provenance["chunking"]["fixed_token_baseline_count"],
            len(baseline_chunks),
        )

    def test_figure_without_caption_still_gets_visual_chunk(self):
        """A figure created without a caption is still referenced by some chunk."""
        profile, parsed = _make_pdf_document()
        parsed.elements.append(
            Element("e1", _DOC_ID, 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf")
        )
        parsed.figures.append(
            FigureObject(
                figure_id="f1",
                page_num=1,
                image_path="/tmp/figure.png",
                confidence=0.5,
                source_parser="pymupdf",
            )
        )

        parsed.chunks = build_agentic_chunks(parsed, profile, load_config())
        report = verify_chunks(parsed, load_config())

        self.assertTrue(any(chunk.figure_ids == ["f1"] for chunk in parsed.chunks))
        self.assertEqual(report.metrics["figure_chunk_coverage"], 1.0)

    def test_table_chunk_keeps_multimodal_metadata(self):
        """The table chunk carries the rendering as text and table details as metadata."""
        table_markdown = "| A | B |\n| --- | --- |\n| 1 | 2 |"
        table_rendering = "Table with columns A, B. Rows: 1: B=2."
        table_bbox = [(1.0, 2.0, 3.0, 4.0)]
        source_parsers = ["pymupdf", "docling"]

        profile, parsed = _make_pdf_document()
        parsed.elements.append(
            Element("e1", _DOC_ID, 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf")
        )
        parsed.tables.append(
            TableObject(
                table_id="t1",
                page_nums=[1],
                bbox=[(1.0, 2.0, 3.0, 4.0)],
                markdown=table_markdown,
                natural_language_rendering=table_rendering,
                confidence=0.82,
                source_parser="pymupdf",
                provenance={"crop_path": "/tmp/table.png", "source_parsers": ["pymupdf", "docling"]},
            )
        )

        parsed.chunks = build_agentic_chunks(parsed, profile, load_config())
        table_chunk = self._first_chunk_with_strategy(parsed.chunks, "table_object")

        self.assertEqual(table_chunk.text, table_rendering)
        self.assertEqual(table_chunk.metadata["markdown"], table_markdown)
        self.assertEqual(table_chunk.metadata["bbox"], table_bbox)
        self.assertEqual(table_chunk.metadata["crop_path"], "/tmp/table.png")
        self.assertEqual(table_chunk.metadata["source_parsers"], source_parsers)

    def test_vision_guided_chunking_exports_visual_regions(self):
        """With ``vision_guided`` on, table/figure chunks need visual context and regions are recorded."""
        profile, parsed = _make_pdf_document()
        parsed.elements.append(
            Element("e1", _DOC_ID, 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf")
        )
        parsed.tables.append(
            TableObject(
                table_id="t1",
                page_nums=[1],
                bbox=[(1.0, 2.0, 3.0, 4.0)],
                markdown="| A | B |\n| --- | --- |\n| 1 | 2 |",
            )
        )
        parsed.figures.append(
            FigureObject(figure_id="f1", page_num=1, bbox=(5.0, 6.0, 7.0, 8.0), source_parser="pymupdf")
        )
        config = load_config(overrides={"chunking": {"vision_guided": True}})

        parsed.chunks = build_agentic_chunks(parsed, profile, config)
        visual_chunks = [chunk for chunk in parsed.chunks if chunk.content_type in {"table", "figure"}]
        vision_regions = parsed.provenance["chunking"]["vision_regions"]

        self.assertTrue(all(chunk.requires_visual_context for chunk in visual_chunks))
        self.assertEqual(len(vision_regions), 2)
        self.assertEqual(vision_regions[0]["region_id"], "t1")

    def test_advanced_chunking_flags_emit_strategy_chunks(self):
        """Enabling every advanced chunking flag emits a chunk for each strategy."""
        profile, parsed = _make_pdf_document(page_count=2, chars_per_page=200, quality_score=0.92)
        parsed.elements.extend(
            [
                Element("e1", _DOC_ID, 1, "heading", markdown="## Revenue", reading_order=1, source_parser="pymupdf"),
                Element(
                    "e2",
                    _DOC_ID,
                    1,
                    "paragraph",
                    text="Revenue increased by 12 percent in Q1. Gross margin improved due to pricing.",
                    reading_order=2,
                    source_parser="pymupdf",
                ),
                Element("e3", _DOC_ID, 2, "heading", markdown="## Safety", reading_order=1, source_parser="pymupdf"),
                Element(
                    "e4",
                    _DOC_ID,
                    2,
                    "paragraph",
                    text="Safety inspections found three unresolved risks. Corrective actions are due in June.",
                    reading_order=2,
                    source_parser="pymupdf",
                ),
            ]
        )
        parsed.tables.append(
            TableObject(
                table_id="t1",
                page_nums=[1],
                markdown="| Metric | Value |\n| --- | --- |\n| Revenue | 12% |",
                natural_language_rendering="Table t1 reports revenue growth of 12 percent.",
                source_parser="pymupdf",
            )
        )
        parsed.figures.append(
            FigureObject(
                figure_id="f1",
                page_num=2,
                caption="Risk trend chart shows open safety findings.",
                source_parser="pymupdf",
            )
        )
        config = load_config(
            overrides={
                "chunking": {
                    "contextual_retrieval": True,
                    "semantic_chunking": True,
                    "late_chunking": True,
                    "vision_guided": True,
                    "agentic_proposition_chunking": True,
                }
            }
        )

        parsed.chunks = build_agentic_chunks(parsed, profile, config)
        strategies = {chunk.strategy for chunk in parsed.chunks}
        chunking_provenance = parsed.provenance["chunking"]

        self.assertIn("semantic", strategies)
        self.assertIn("late", strategies)
        self.assertIn("contextual_retrieval", strategies)
        self.assertIn("vision_guided", strategies)
        self.assertIn("agentic_proposition", strategies)
        self.assertGreater(chunking_provenance["semantic_chunk_count"], 0)
        self.assertGreater(chunking_provenance["late_chunk_count"], 0)
        self.assertGreater(chunking_provenance["contextual_retrieval_chunk_count"], 0)

        semantic_chunk = self._first_chunk_with_strategy(parsed.chunks, "semantic")
        self.assertEqual(semantic_chunk.metadata["execution_mode"], "lexical_similarity_proxy")

        contextual_chunk = self._first_chunk_with_strategy(parsed.chunks, "contextual_retrieval")
        self.assertIn("source_chunk_id", contextual_chunk.metadata)

        late_chunk = self._first_chunk_with_strategy(parsed.chunks, "late")
        self.assertTrue(late_chunk.metadata["requires_token_level_embeddings"])


if __name__ == "__main__":
    unittest.main()