Spaces:
Running on Zero
Running on Zero
| import unittest | |
| from zsgdp.chunking import build_agentic_chunks | |
| from zsgdp.config import load_config | |
| from zsgdp.schema import DocumentProfile, Element, FigureObject, PageProfile, ParsedDocument, QualityReport, TableObject | |
| from zsgdp.verify import verify_chunks | |
class ChunkingTests(unittest.TestCase):
    """Tests for ``build_agentic_chunks`` and ``verify_chunks``.

    Each test assembles a small in-memory ``ParsedDocument`` (doc id ``"d1"``)
    together with a matching ``DocumentProfile``, runs the chunker, and then
    asserts on the emitted chunks, on ``parsed.provenance["chunking"]`` or on
    the metrics of the verification report.
    """

    @staticmethod
    def _make_document(
        *,
        file_type: str,
        extension: str,
        chars_per_page: int,
        quality_score: float,
        page_count: int = 1,
    ) -> "tuple[DocumentProfile, ParsedDocument]":
        """Build the profile / parsed-document pair shared by every test.

        Args:
            file_type: Value used for ``file_type`` on both objects.
            extension: File extension including the dot; the source path is
                ``"sample" + extension``.
            chars_per_page: ``digital_text_chars`` given to every page profile.
            quality_score: Score of the ``QualityReport`` attached to the
                parsed document.
            page_count: Number of pages; one ``PageProfile`` is created per
                page, numbered from 1.

        Returns:
            ``(profile, parsed)`` where ``parsed`` has no elements, tables or
            figures added yet.
        """
        source_path = "sample" + extension
        profile = DocumentProfile(
            doc_id="d1",
            source_path=source_path,
            file_type=file_type,
            page_count=page_count,
            extension=extension,
            pages=[
                PageProfile(
                    page_num=page_num,
                    digital_text_chars=chars_per_page,
                    digital_text_quality=1.0,
                )
                for page_num in range(1, page_count + 1)
            ],
        )
        parsed = ParsedDocument(
            doc_id="d1",
            source_path=source_path,
            file_type=file_type,
            quality_report=QualityReport(score=quality_score),
        )
        return profile, parsed

    def _first_chunk_with_strategy(self, chunks, strategy: str):
        """Return the first chunk whose ``strategy`` matches, or fail the test.

        Using ``self.fail`` instead of a bare ``next(...)`` turns a missing
        chunk into a readable test failure rather than a ``StopIteration``
        error.
        """
        for chunk in chunks:
            if chunk.strategy == strategy:
                return chunk
        self.fail(f"no chunk with strategy {strategy!r} was emitted")

    def test_agentic_chunking_builds_parent_child_chunks(self):
        """A title plus an 80-word paragraph yields parent and child chunks."""
        profile, parsed = self._make_document(
            file_type="markdown",
            extension=".md",
            chars_per_page=120,
            quality_score=0.95,
        )
        parsed.elements.extend(
            [
                Element("e1", "d1", 1, "title", markdown="# Report", reading_order=1, source_parser="text"),
                Element(
                    "e2",
                    "d1",
                    1,
                    "paragraph",
                    text=" ".join(["alpha"] * 80),
                    reading_order=2,
                    source_parser="text",
                ),
            ]
        )

        chunks = build_agentic_chunks(parsed, profile, load_config())

        self.assertTrue(any(chunk.content_type == "parent" for chunk in chunks))
        self.assertTrue(any(chunk.parent_chunk_id for chunk in chunks))
        # With no overrides the recorded plan is expected to target 512 tokens.
        self.assertEqual(parsed.provenance["chunking"]["plan"]["target_tokens"], 512)

    def test_chunk_readiness_adds_metrics(self):
        """``verify_chunks`` reports the chunk count and per-strategy counts."""
        profile, parsed = self._make_document(
            file_type="markdown",
            extension=".md",
            chars_per_page=120,
            quality_score=0.95,
        )
        parsed.elements.append(
            Element(
                "e1",
                "d1",
                1,
                "paragraph",
                text=" ".join(["alpha"] * 80),
                reading_order=1,
                source_parser="text",
            )
        )
        parsed.chunks = build_agentic_chunks(parsed, profile, load_config())

        report = verify_chunks(parsed, load_config())

        self.assertEqual(report.metrics["chunk_count"], len(parsed.chunks))
        strategy_counts = report.metrics["chunk_strategy_counts"]
        self.assertIn("fixed_token_baseline", strategy_counts)
        self.assertIn("recursive_structure", strategy_counts)

    def test_fixed_token_baseline_chunks_are_emitted_with_provenance(self):
        """Baseline chunks span both pages and their count is recorded."""
        profile, parsed = self._make_document(
            file_type="markdown",
            extension=".md",
            chars_per_page=120,
            quality_score=0.95,
            page_count=2,
        )
        parsed.elements.extend(
            [
                Element(
                    "e1",
                    "d1",
                    1,
                    "paragraph",
                    text=" ".join(["alpha"] * 18),
                    reading_order=1,
                    source_parser="text",
                ),
                Element(
                    "e2",
                    "d1",
                    2,
                    "paragraph",
                    text=" ".join(["beta"] * 18),
                    reading_order=1,
                    source_parser="text",
                ),
            ]
        )
        # A small token target forces the two 18-word paragraphs to be split.
        config = load_config(overrides={"chunking": {"target_tokens": 10, "overlap_ratio": 0.2}})

        chunks = build_agentic_chunks(parsed, profile, config)

        baseline_chunks = [chunk for chunk in chunks if chunk.strategy == "fixed_token_baseline"]
        self.assertGreaterEqual(len(baseline_chunks), 4)
        self.assertEqual(baseline_chunks[0].element_ids, ["e1"])
        self.assertEqual(baseline_chunks[-1].page_end, 2)
        self.assertEqual(
            parsed.provenance["chunking"]["fixed_token_baseline_count"],
            len(baseline_chunks),
        )

    def test_figure_without_caption_still_gets_visual_chunk(self):
        """A figure with no caption is still covered by a chunk."""
        profile, parsed = self._make_document(
            file_type="pdf",
            extension=".pdf",
            chars_per_page=20,
            quality_score=0.90,
        )
        parsed.elements.append(
            Element("e1", "d1", 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf")
        )
        parsed.figures.append(
            FigureObject(
                figure_id="f1",
                page_num=1,
                image_path="/tmp/figure.png",
                confidence=0.5,
                source_parser="pymupdf",
            )
        )
        parsed.chunks = build_agentic_chunks(parsed, profile, load_config())

        report = verify_chunks(parsed, load_config())

        self.assertTrue(any(chunk.figure_ids == ["f1"] for chunk in parsed.chunks))
        self.assertEqual(report.metrics["figure_chunk_coverage"], 1.0)

    def test_table_chunk_keeps_multimodal_metadata(self):
        """The table chunk carries the rendering, markdown, bbox and provenance."""
        profile, parsed = self._make_document(
            file_type="pdf",
            extension=".pdf",
            chars_per_page=20,
            quality_score=0.90,
        )
        parsed.elements.append(
            Element("e1", "d1", 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf")
        )
        parsed.tables.append(
            TableObject(
                table_id="t1",
                page_nums=[1],
                bbox=[(1.0, 2.0, 3.0, 4.0)],
                markdown="| A | B |\n| --- | --- |\n| 1 | 2 |",
                natural_language_rendering="Table with columns A, B. Rows: 1: B=2.",
                confidence=0.82,
                source_parser="pymupdf",
                provenance={"crop_path": "/tmp/table.png", "source_parsers": ["pymupdf", "docling"]},
            )
        )

        parsed.chunks = build_agentic_chunks(parsed, profile, load_config())

        table_chunk = self._first_chunk_with_strategy(parsed.chunks, "table_object")
        self.assertEqual(table_chunk.text, "Table with columns A, B. Rows: 1: B=2.")
        self.assertEqual(table_chunk.metadata["markdown"], "| A | B |\n| --- | --- |\n| 1 | 2 |")
        self.assertEqual(table_chunk.metadata["bbox"], [(1.0, 2.0, 3.0, 4.0)])
        self.assertEqual(table_chunk.metadata["crop_path"], "/tmp/table.png")
        self.assertEqual(table_chunk.metadata["source_parsers"], ["pymupdf", "docling"])

    def test_vision_guided_chunking_exports_visual_regions(self):
        """With ``vision_guided`` on, table/figure chunks need visual context."""
        profile, parsed = self._make_document(
            file_type="pdf",
            extension=".pdf",
            chars_per_page=20,
            quality_score=0.90,
        )
        parsed.elements.append(
            Element("e1", "d1", 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf")
        )
        parsed.tables.append(
            TableObject(
                table_id="t1",
                page_nums=[1],
                bbox=[(1.0, 2.0, 3.0, 4.0)],
                markdown="| A | B |\n| --- | --- |\n| 1 | 2 |",
            )
        )
        parsed.figures.append(
            FigureObject(figure_id="f1", page_num=1, bbox=(5.0, 6.0, 7.0, 8.0), source_parser="pymupdf")
        )
        config = load_config(overrides={"chunking": {"vision_guided": True}})

        parsed.chunks = build_agentic_chunks(parsed, profile, config)

        visual_chunks = [chunk for chunk in parsed.chunks if chunk.content_type in {"table", "figure"}]
        # all() over an empty list is vacuously true, so make sure there is
        # actually something to check before relying on it.
        self.assertTrue(visual_chunks, "expected at least one table or figure chunk")
        self.assertTrue(all(chunk.requires_visual_context for chunk in visual_chunks))
        vision_regions = parsed.provenance["chunking"]["vision_regions"]
        self.assertEqual(len(vision_regions), 2)
        self.assertEqual(vision_regions[0]["region_id"], "t1")

    def test_advanced_chunking_flags_emit_strategy_chunks(self):
        """Enabling every advanced flag emits a chunk for each strategy."""
        profile, parsed = self._make_document(
            file_type="pdf",
            extension=".pdf",
            chars_per_page=200,
            quality_score=0.92,
            page_count=2,
        )
        parsed.elements.extend(
            [
                Element("e1", "d1", 1, "heading", markdown="## Revenue", reading_order=1, source_parser="pymupdf"),
                Element(
                    "e2",
                    "d1",
                    1,
                    "paragraph",
                    text="Revenue increased by 12 percent in Q1. Gross margin improved due to pricing.",
                    reading_order=2,
                    source_parser="pymupdf",
                ),
                Element("e3", "d1", 2, "heading", markdown="## Safety", reading_order=1, source_parser="pymupdf"),
                Element(
                    "e4",
                    "d1",
                    2,
                    "paragraph",
                    text="Safety inspections found three unresolved risks. Corrective actions are due in June.",
                    reading_order=2,
                    source_parser="pymupdf",
                ),
            ]
        )
        parsed.tables.append(
            TableObject(
                table_id="t1",
                page_nums=[1],
                markdown="| Metric | Value |\n| --- | --- |\n| Revenue | 12% |",
                natural_language_rendering="Table t1 reports revenue growth of 12 percent.",
                source_parser="pymupdf",
            )
        )
        parsed.figures.append(
            FigureObject(
                figure_id="f1",
                page_num=2,
                caption="Risk trend chart shows open safety findings.",
                source_parser="pymupdf",
            )
        )
        config = load_config(
            overrides={
                "chunking": {
                    "contextual_retrieval": True,
                    "semantic_chunking": True,
                    "late_chunking": True,
                    "vision_guided": True,
                    "agentic_proposition_chunking": True,
                }
            }
        )

        parsed.chunks = build_agentic_chunks(parsed, profile, config)

        strategies = {chunk.strategy for chunk in parsed.chunks}
        self.assertIn("semantic", strategies)
        self.assertIn("late", strategies)
        self.assertIn("contextual_retrieval", strategies)
        self.assertIn("vision_guided", strategies)
        self.assertIn("agentic_proposition", strategies)

        chunking_provenance = parsed.provenance["chunking"]
        self.assertGreater(chunking_provenance["semantic_chunk_count"], 0)
        self.assertGreater(chunking_provenance["late_chunk_count"], 0)
        self.assertGreater(chunking_provenance["contextual_retrieval_chunk_count"], 0)

        semantic_chunk = self._first_chunk_with_strategy(parsed.chunks, "semantic")
        self.assertEqual(semantic_chunk.metadata["execution_mode"], "lexical_similarity_proxy")
        contextual_chunk = self._first_chunk_with_strategy(parsed.chunks, "contextual_retrieval")
        self.assertIn("source_chunk_id", contextual_chunk.metadata)
        late_chunk = self._first_chunk_with_strategy(parsed.chunks, "late")
        self.assertTrue(late_chunk.metadata["requires_token_level_embeddings"])
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()