"""
Integration Tests for Document Processing Pipeline

Tests the full document processing workflow:
- OCR extraction
- Layout detection
- Reading order reconstruction
- Chunking
"""
| |
|
| | import pytest |
| | from pathlib import Path |
| | from unittest.mock import Mock, patch, MagicMock |
| | import numpy as np |
| |
|
| | |
@pytest.fixture
def sample_image():
    """Return an all-zero uint8 array of shape (1000, 800, 3) for use as a test image."""
    shape = (1000, 800, 3)
    blank = np.zeros(shape, dtype=np.uint8)
    return blank
| |
|
| |
|
@pytest.fixture
def mock_ocr_result():
    """Build a successful OCRResult for page 0 holding two mock-engine regions."""
    from src.document.ocr import OCRResult
    from src.document.schemas.core import OCRRegion, BoundingBox

    # One row per region: (text, confidence, (x_min, y_min, x_max, y_max)).
    region_specs = (
        ("Sample Title", 0.95, (100, 50, 700, 100)),
        (
            "This is paragraph text that contains important information.",
            0.92,
            (100, 150, 700, 250),
        ),
    )

    regions = [
        OCRRegion(
            text=text,
            confidence=confidence,
            bbox=BoundingBox(x_min=left, y_min=top, x_max=right, y_max=bottom),
            page=0,
            engine="mock",
        )
        for text, confidence, (left, top, right, bottom) in region_specs
    ]

    result = OCRResult(
        success=True,
        regions=regions,
        page_num=0,
        processing_time=0.5,
    )
    return result
| |
|
| |
|
class TestDocumentSchemas:
    """Checks for the core document schema models."""

    def test_bounding_box_creation(self):
        """A BoundingBox reports the expected width, height, area and center."""
        from src.document.schemas.core import BoundingBox

        box = BoundingBox(x_min=10, y_min=20, x_max=100, y_max=80)

        # 100 - 10 = 90 wide, 80 - 20 = 60 tall.
        assert box.width == 90
        assert box.height == 60
        assert box.area == 5400
        assert box.center == (55.0, 50.0)

    def test_bounding_box_normalization(self):
        """normalize() flags the result as normalized and keeps coordinates in [0, 1]."""
        from src.document.schemas.core import BoundingBox

        pixel_box = BoundingBox(x_min=100, y_min=200, x_max=300, y_max=400)
        unit_box = pixel_box.normalize(1000, 800)

        assert unit_box.normalized is True
        assert 0 <= unit_box.x_min <= 1
        assert 0 <= unit_box.y_max <= 1

    def test_bounding_box_iou(self):
        """iou() is strictly between 0 and 1 for partial overlap and 0 for disjoint boxes."""
        from src.document.schemas.core import BoundingBox

        base = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        shifted = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150)
        distant = BoundingBox(x_min=200, y_min=200, x_max=300, y_max=300)

        # `shifted` shares the (50, 50)-(100, 100) square with `base`.
        partial_overlap = base.iou(shifted)
        assert 0 < partial_overlap < 1

        # `distant` starts at 200, beyond every edge of `base`.
        no_overlap = base.iou(distant)
        assert no_overlap == 0

    def test_ocr_region_creation(self):
        """An OCRRegion keeps the text and confidence it was built with."""
        from src.document.schemas.core import OCRRegion, BoundingBox

        region_box = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=50)
        ocr_region = OCRRegion(
            text="Sample text",
            confidence=0.95,
            bbox=region_box,
            page=0,
            engine="paddleocr",
        )

        assert ocr_region.text == "Sample text"
        assert ocr_region.confidence == 0.95

    def test_document_chunk_creation(self):
        """A DocumentChunk keeps the id and chunk type it was built with."""
        from src.document.schemas.core import DocumentChunk, ChunkType, BoundingBox

        chunk_fields = dict(
            chunk_id="chunk_001",
            chunk_type=ChunkType.TEXT,
            text="Sample chunk text",
            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100),
            page=0,
            document_id="doc_001",
            source_path="/path/to/doc.pdf",
            sequence_index=0,
            confidence=0.9,
        )
        doc_chunk = DocumentChunk(**chunk_fields)

        assert doc_chunk.chunk_id == "chunk_001"
        assert doc_chunk.chunk_type == ChunkType.TEXT
| |
|
| |
|
class TestOCREngines:
    """Test OCR engine configuration and factory availability.

    No real OCR engine is built in these tests; the backend availability
    flags (and the PaddleOCR class) are patched so the checks do not depend
    on the OCR libraries being installed.
    """

    def test_ocr_config_defaults(self):
        """Test OCRConfig default values."""
        from src.document.ocr import OCRConfig

        config = OCRConfig()
        assert config.engine == "paddleocr"
        assert config.language == "en"

    def test_ocr_factory_paddleocr(self):
        """Test that a PaddleOCR config can be built alongside the factory.

        Previously this test contained no assertions and passed vacuously.
        """
        from src.document.ocr import get_ocr_engine, OCRConfig

        with patch("src.document.ocr.paddle_ocr.PADDLEOCR_AVAILABLE", True), patch(
            "src.document.ocr.paddle_ocr.PaddleOCR"
        ):
            config = OCRConfig(engine="paddleocr")

            # The factory is only checked for availability; it is not invoked
            # because that would construct a backend-specific engine.
            assert callable(get_ocr_engine)
            assert config.engine == "paddleocr"

    def test_ocr_factory_tesseract(self):
        """Test that a Tesseract config can be built alongside the factory.

        Previously this test contained no assertions and passed vacuously.
        """
        from src.document.ocr import get_ocr_engine, OCRConfig

        with patch("src.document.ocr.tesseract_ocr.TESSERACT_AVAILABLE", True):
            config = OCRConfig(engine="tesseract")

            # Same reasoning as the PaddleOCR test: check, but do not call.
            assert callable(get_ocr_engine)
            assert config.engine == "tesseract"
| |
|
| |
|
class TestLayoutDetection:
    """Checks for layout detection configuration and types."""

    def test_layout_config_defaults(self):
        """A default LayoutConfig uses the "rule_based" method."""
        from src.document.layout import LayoutConfig

        default_config = LayoutConfig()
        assert default_config.method == "rule_based"

    def test_layout_type_enum(self):
        """LayoutType members carry their lowercase string values."""
        from src.document.schemas.core import LayoutType

        expected_values = (
            (LayoutType.TEXT, "text"),
            (LayoutType.TITLE, "title"),
            (LayoutType.TABLE, "table"),
        )
        for member, value in expected_values:
            assert member.value == value
| |
|
| |
|
class TestReadingOrder:
    """Checks for reading order reconstruction configuration."""

    def test_reading_order_config(self):
        """A default ReadingOrderConfig is rule-based and left-to-right."""
        from src.document.reading_order import ReadingOrderConfig

        default_config = ReadingOrderConfig()

        method = default_config.method
        assert method == "rule_based"

        direction = default_config.reading_direction
        assert direction == "ltr"
| |
|
| |
|
class TestChunking:
    """Checks for document chunking configuration and construction."""

    def test_chunker_config(self):
        """Default chunk sizes are positive and max is at least the target."""
        from src.document.chunking import ChunkerConfig

        default_config = ChunkerConfig()

        assert default_config.target_chunk_size > 0
        assert default_config.max_chunk_size >= default_config.target_chunk_size

    def test_semantic_chunker_creation(self):
        """SemanticChunker exposes the config it was constructed with."""
        from src.document.chunking import SemanticChunker, ChunkerConfig

        semantic_chunker = SemanticChunker(ChunkerConfig(target_chunk_size=256))

        assert semantic_chunker.config.target_chunk_size == 256
| |
|
| |
|
class TestValidation:
    """Checks for the validation enums."""

    def test_validation_status_enum(self):
        """ValidationStatus members carry their lowercase string values."""
        from src.document.validation.critic import ValidationStatus

        expected_values = (
            (ValidationStatus.VALID, "valid"),
            (ValidationStatus.INVALID, "invalid"),
            (ValidationStatus.ABSTAIN, "abstain"),
        )
        for member, value in expected_values:
            assert member.value == value

    def test_evidence_strength_enum(self):
        """EvidenceStrength.STRONG and .NONE carry their lowercase string values."""
        from src.document.validation.verifier import EvidenceStrength

        strong_value = EvidenceStrength.STRONG.value
        none_value = EvidenceStrength.NONE.value

        assert strong_value == "strong"
        assert none_value == "none"
| |
|
| |
|
class TestPipelineIntegration:
    """Integration tests for full pipeline."""

    def test_pipeline_config_creation(self):
        """Test that PipelineConfig keeps the render_dpi and max_pages it is given."""
        from src.document.pipeline import PipelineConfig
        from src.document.ocr import OCRConfig

        config = PipelineConfig(
            ocr=OCRConfig(engine="paddleocr"),
            render_dpi=300,
            max_pages=10,
        )

        assert config.render_dpi == 300
        assert config.max_pages == 10

    def test_processed_document_structure(self):
        """Test that ProcessedDocument exposes its metadata and chunks.

        Builds one DocumentMetadata and one DocumentChunk, wraps them in a
        ProcessedDocument with empty OCR/layout region lists, and checks the
        document id and chunk count.
        """
        # OCRRegion and LayoutRegion are not used below; importing them still
        # fails this test if the schema module stops exporting them.
        from src.document.schemas.core import (
            ProcessedDocument,
            DocumentMetadata,
            OCRRegion,
            LayoutRegion,
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )
        from datetime import datetime, timezone

        metadata = DocumentMetadata(
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
            filename="doc.pdf",
            file_type="pdf",
            file_size_bytes=1000,
            num_pages=1,
            page_dimensions=[(800, 1000)],
            # datetime.utcnow() is deprecated since Python 3.12 and returns a
            # naive value; use an explicit timezone-aware UTC timestamp.
            processed_at=datetime.now(timezone.utc),
            total_chunks=1,
            total_characters=100,
        )

        chunk = DocumentChunk(
            chunk_id="chunk_1",
            chunk_type=ChunkType.TEXT,
            text="Sample text",
            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100),
            page=0,
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
            sequence_index=0,
            confidence=0.9,
        )

        doc = ProcessedDocument(
            metadata=metadata,
            ocr_regions=[],
            layout_regions=[],
            chunks=[chunk],
            full_text="Sample text",
            status="completed",
        )

        assert doc.metadata.document_id == "test_doc"
        assert len(doc.chunks) == 1
| |
|