| | """ |
| | Unit Tests for Document Intelligence Subsystem |
| | |
| | Tests core components: |
| | - BoundingBox operations |
| | - Chunk models |
| | - Schema and extraction |
| | - Evidence building |
| | """ |
| |
|
| | import pytest |
| | from pathlib import Path |
| |
|
| |
|
class TestBoundingBox:
    """Exercises the BoundingBox geometry model: construction, derived
    properties, coordinate-space conversion, IoU, and point containment."""

    def test_create_bbox(self):
        from src.document_intelligence.chunks import BoundingBox

        box = BoundingBox(
            x_min=0.1, y_min=0.2, x_max=0.5, y_max=0.6, normalized=True
        )

        # Constructor arguments must round-trip unchanged.
        assert box.x_min == 0.1
        assert box.y_min == 0.2
        assert box.x_max == 0.5
        assert box.y_max == 0.6
        assert box.normalized is True

    def test_bbox_properties(self):
        from src.document_intelligence.chunks import BoundingBox

        box = BoundingBox(x_min=10, y_min=20, x_max=50, y_max=80, normalized=False)

        # Derived geometry: width/height/area plus the center and xyxy tuples.
        assert box.width == 40
        assert box.height == 60
        assert box.area == 2400
        assert box.center == (30, 50)
        assert box.xyxy == (10, 20, 50, 80)

    def test_bbox_to_pixel(self):
        from src.document_intelligence.chunks import BoundingBox

        normalized_box = BoundingBox(
            x_min=0.1, y_min=0.2, x_max=0.5, y_max=0.6, normalized=True
        )

        # Scale onto a 1000x800-pixel page.
        pixel = normalized_box.to_pixel(1000, 800)

        assert pixel.x_min == 100
        assert pixel.y_min == 160
        assert pixel.x_max == 500
        assert pixel.y_max == 480
        assert pixel.normalized is False

    def test_bbox_to_normalized(self):
        from src.document_intelligence.chunks import BoundingBox

        pixel_box = BoundingBox(
            x_min=100, y_min=160, x_max=500, y_max=480, normalized=False
        )

        norm = pixel_box.to_normalized(1000, 800)

        # Division produces floats, so compare against a small tolerance.
        for attr, want in [
            ("x_min", 0.1),
            ("y_min", 0.2),
            ("x_max", 0.5),
            ("y_max", 0.6),
        ]:
            assert abs(getattr(norm, attr) - want) < 0.001
        assert norm.normalized is True

    def test_bbox_iou(self):
        from src.document_intelligence.chunks import BoundingBox

        first = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        second = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150)

        # Overlap is 50x50 = 2500; union is 17500, so IoU is roughly 0.143.
        iou = first.iou(second)
        assert 0.1 < iou < 0.2

    def test_bbox_contains(self):
        from src.document_intelligence.chunks import BoundingBox

        box = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)

        # Interior and edge points count as inside; points past an edge do not.
        for point, inside in [
            ((50, 50), True),
            ((0, 0), True),
            ((100, 100), True),
            ((150, 50), False),
        ]:
            assert box.contains(point) is inside
| |
|
| |
|
class TestDocumentChunk:
    """Exercises DocumentChunk construction and deterministic chunk-ID
    generation."""

    def test_create_chunk(self):
        from src.document_intelligence.chunks import (
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        region = BoundingBox(
            x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3, normalized=True
        )

        paragraph = DocumentChunk(
            chunk_id="test_chunk_001",
            doc_id="doc_001",
            chunk_type=ChunkType.PARAGRAPH,
            text="This is a test paragraph.",
            page=1,
            bbox=region,
            confidence=0.95,
            sequence_index=0,
        )

        # Constructor arguments should be stored as-is on the instance.
        assert paragraph.chunk_id == "test_chunk_001"
        assert paragraph.chunk_type == ChunkType.PARAGRAPH
        assert paragraph.text == "This is a test paragraph."
        assert paragraph.page == 1
        assert paragraph.confidence == 0.95

    def test_generate_chunk_id(self):
        from src.document_intelligence.chunks import (
            DocumentChunk,
            BoundingBox,
        )

        region = BoundingBox(
            x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3, normalized=True
        )

        # Generating the ID twice from identical inputs must be
        # deterministic, and the ID is a fixed-length 16-char digest.
        generated = [
            DocumentChunk.generate_chunk_id(
                doc_id="doc_001",
                page=1,
                bbox=region,
                chunk_type_str="paragraph",
            )
            for _ in range(2)
        ]

        assert generated[0] == generated[1]
        assert len(generated[0]) == 16
| |
|
| |
|
class TestTableChunk:
    """Exercises TableChunk: construction, cell lookup, and markdown export."""

    def test_create_table_chunk(self):
        from src.document_intelligence.chunks import (
            TableChunk,
            TableCell,
            BoundingBox,
        )

        # Local factory keeps the 2x2 cell grid compact and readable.
        def make_cell(r, c, txt, x0, y0, x1, y1, **extra):
            return TableCell(
                row=r,
                col=c,
                text=txt,
                bbox=BoundingBox(x_min=x0, y_min=y0, x_max=x1, y_max=y1),
                **extra,
            )

        grid = [
            make_cell(0, 0, "Header 1", 0.1, 0.2, 0.5, 0.3, is_header=True),
            make_cell(0, 1, "Header 2", 0.5, 0.2, 0.9, 0.3, is_header=True),
            make_cell(1, 0, "Value 1", 0.1, 0.3, 0.5, 0.4),
            make_cell(1, 1, "Value 2", 0.5, 0.3, 0.9, 0.4),
        ]

        table = TableChunk(
            chunk_id="table_001",
            doc_id="doc_001",
            text="Table content",
            page=1,
            bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8),
            confidence=0.9,
            sequence_index=0,
            cells=grid,
            num_rows=2,
            num_cols=2,
        )

        assert table.num_rows == 2
        assert table.num_cols == 2
        assert len(table.cells) == 4

    def test_table_get_cell(self):
        from src.document_intelligence.chunks import (
            TableChunk,
            TableCell,
            BoundingBox,
        )

        # A 2x2 grid labeled A..D in row-major order; every cell shares a
        # unit bbox since geometry is irrelevant to get_cell().
        grid = [
            TableCell(
                row=r,
                col=c,
                text=label,
                bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1),
            )
            for (r, c), label in zip([(0, 0), (0, 1), (1, 0), (1, 1)], "ABCD")
        ]

        table = TableChunk(
            chunk_id="table_001",
            doc_id="doc_001",
            text="Table",
            page=1,
            bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8),
            confidence=0.9,
            sequence_index=0,
            cells=grid,
            num_rows=2,
            num_cols=2,
        )

        # Lookup by (row, col) must return the matching cell text.
        for (r, c), label in zip([(0, 0), (0, 1), (1, 0), (1, 1)], "ABCD"):
            assert table.get_cell(r, c).text == label

    def test_table_to_markdown(self):
        from src.document_intelligence.chunks import (
            TableChunk,
            TableCell,
            BoundingBox,
        )

        grid = [
            TableCell(
                row=r,
                col=c,
                text=label,
                bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1),
            )
            for (r, c), label in zip(
                [(0, 0), (0, 1), (1, 0), (1, 1)],
                ["Name", "Value", "A", "100"],
            )
        ]

        table = TableChunk(
            chunk_id="table_001",
            doc_id="doc_001",
            text="Table",
            page=1,
            bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8),
            confidence=0.9,
            sequence_index=0,
            cells=grid,
            num_rows=2,
            num_cols=2,
        )

        # Markdown export should render header row, separator, and data row.
        rendered = table.to_markdown()
        for expected_line in ("| Name | Value |", "| --- | --- |", "| A | 100 |"):
            assert expected_line in rendered
| |
|
| |
|
class TestExtractionSchema:
    """Tests for ExtractionSchema: programmatic field construction,
    JSON Schema round-tripping, and the preset document schemas."""

    def test_create_schema(self):
        # Fix: dropped the unused `FieldSpec` import that previously
        # accompanied these names.
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            FieldType,
        )

        schema = ExtractionSchema(name="TestSchema")
        schema.add_string_field("name", "Person name", required=True)
        schema.add_number_field("age", "Person age", required=False, is_integer=True)
        schema.add_date_field("birth_date", "Date of birth")

        assert schema.name == "TestSchema"
        assert len(schema.fields) == 3
        assert schema.get_field("name").required is True
        # is_integer=True should map the number field to the INTEGER type.
        assert schema.get_field("age").field_type == FieldType.INTEGER

    def test_schema_to_json_schema(self):
        from src.document_intelligence.extraction import ExtractionSchema

        schema = ExtractionSchema(name="Invoice")
        schema.add_string_field("invoice_number", required=True)
        schema.add_currency_field("total_amount", required=True)

        json_schema = schema.to_json_schema()

        # Exported schema must be a JSON Schema object with both fields
        # present and the required list populated.
        assert json_schema["type"] == "object"
        assert "invoice_number" in json_schema["properties"]
        assert "total_amount" in json_schema["properties"]
        assert "invoice_number" in json_schema["required"]

    def test_schema_from_json_schema(self):
        from src.document_intelligence.extraction import ExtractionSchema

        json_schema = {
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "Name"},
                "value": {"type": "number", "minimum": 0},
            },
            "required": ["name"],
        }

        schema = ExtractionSchema.from_json_schema(json_schema, name="Test")

        # Importing must honor the JSON Schema "required" list.
        assert len(schema.fields) == 2
        assert schema.get_field("name").required is True
        assert schema.get_field("value").required is False

    def test_preset_schemas(self):
        from src.document_intelligence.extraction import (
            create_invoice_schema,
            create_receipt_schema,
            create_contract_schema,
        )

        # Each preset factory should ship its signature fields.
        invoice = create_invoice_schema()
        assert invoice.get_field("invoice_number") is not None
        assert invoice.get_field("total_amount") is not None

        receipt = create_receipt_schema()
        assert receipt.get_field("merchant_name") is not None

        contract = create_contract_schema()
        assert contract.get_field("effective_date") is not None
| |
|
| |
|
class TestEvidenceBuilder:
    """Exercises EvidenceBuilder's evidence-record construction."""

    def test_create_evidence(self):
        from src.document_intelligence.grounding import EvidenceBuilder
        from src.document_intelligence.chunks import (
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        source_chunk = DocumentChunk(
            chunk_id="chunk_001",
            doc_id="doc_001",
            chunk_type=ChunkType.PARAGRAPH,
            text="The total amount is $500.00.",
            page=1,
            bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3),
            confidence=0.9,
            sequence_index=0,
        )

        evidence = EvidenceBuilder().create_evidence(
            chunk=source_chunk,
            value="$500.00",
            field_name="total_amount",
        )

        # Evidence must point back at the originating chunk/page, and the
        # snippet must carry the extracted value (possibly re-formatted).
        assert evidence.chunk_id == "chunk_001"
        assert evidence.page == 1
        assert "$500.00" in evidence.snippet or "500" in evidence.snippet
| |
|
| |
|
class TestSemanticChunker:
    """Exercises SemanticChunker's size-bounded text chunking."""

    def test_chunk_text(self):
        from src.document_intelligence.parsing import SemanticChunker, ChunkingConfig

        chunker = SemanticChunker(
            ChunkingConfig(
                min_chunk_chars=10,
                max_chunk_chars=100,
                target_chunk_chars=50,
            )
        )

        markdown = """# Heading 1

This is the first paragraph with some text content.

This is the second paragraph with more content.

# Heading 2

Another section with different content.
"""

        pieces = chunker.chunk_text(markdown)

        # Every produced chunk carries text and respects the minimum size.
        assert len(pieces) > 0
        for piece in pieces:
            assert "text" in piece
            assert len(piece["text"]) >= 10

    def test_chunk_long_text(self):
        from src.document_intelligence.parsing import SemanticChunker, ChunkingConfig

        chunker = SemanticChunker(
            ChunkingConfig(
                min_chunk_chars=10,
                max_chunk_chars=200,
                target_chunk_chars=100,
            )
        )

        # 50 short sentences — far more than one chunk's worth.
        long_text = " ".join(f"This is sentence number {i}." for i in range(50))

        pieces = chunker.chunk_text(long_text)

        # Text must be split, and no chunk may exceed the cap by more
        # than the 10% slack the chunker is allowed.
        assert len(pieces) > 1
        for piece in pieces:
            assert len(piece["text"]) <= 200 * 1.1
| |
|
| |
|
class TestValidation:
    """Tests for ExtractionValidator: clean results pass, missing
    required fields fail."""

    def test_validate_extraction(self):
        # Fix: dropped the unused `FieldExtraction` import that previously
        # accompanied `ExtractionResult`.
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            ExtractionValidator,
        )
        from src.document_intelligence.chunks import ExtractionResult

        schema = ExtractionSchema(name="Test")
        schema.add_string_field("name", required=True)
        schema.add_number_field("value", required=False, is_integer=True)

        # A result satisfying every required field with correctly typed values.
        result = ExtractionResult(
            data={"name": "Test Name", "value": 42},
            fields=[],
            evidence=[],
            overall_confidence=0.8,
            abstained_fields=[],
        )

        validator = ExtractionValidator()
        validation = validator.validate(result, schema)

        assert validation.is_valid is True
        assert validation.error_count == 0

    def test_validate_missing_required(self):
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            ExtractionValidator,
        )
        from src.document_intelligence.chunks import ExtractionResult

        schema = ExtractionSchema(name="Test")
        schema.add_string_field("name", required=True)
        schema.add_string_field("description", required=True)

        # "description" is required but absent (the extractor abstained),
        # so validation must report at least one error.
        result = ExtractionResult(
            data={"name": "Test"},
            fields=[],
            evidence=[],
            overall_confidence=0.5,
            abstained_fields=["description"],
        )

        validator = ExtractionValidator()
        validation = validator.validate(result, schema)

        assert validation.is_valid is False
        assert validation.error_count >= 1
| |
|
| |
|
# Allow running this module directly (python <file>) instead of via pytest.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
| |
|