""" Integration Tests for Document Processing Pipeline Tests the full document processing workflow: - OCR extraction - Layout detection - Reading order reconstruction - Chunking """ import pytest from pathlib import Path from unittest.mock import Mock, patch, MagicMock import numpy as np # Test fixtures @pytest.fixture def sample_image(): """Create a sample image for testing.""" return np.zeros((1000, 800, 3), dtype=np.uint8) @pytest.fixture def mock_ocr_result(): """Mock OCR result.""" from src.document.ocr import OCRResult from src.document.schemas.core import OCRRegion, BoundingBox regions = [ OCRRegion( text="Sample Title", confidence=0.95, bbox=BoundingBox(x_min=100, y_min=50, x_max=700, y_max=100), page=0, engine="mock", ), OCRRegion( text="This is paragraph text that contains important information.", confidence=0.92, bbox=BoundingBox(x_min=100, y_min=150, x_max=700, y_max=250), page=0, engine="mock", ), ] return OCRResult( success=True, regions=regions, page_num=0, processing_time=0.5, ) class TestDocumentSchemas: """Test document schema models.""" def test_bounding_box_creation(self): """Test BoundingBox creation and properties.""" from src.document.schemas.core import BoundingBox bbox = BoundingBox(x_min=10, y_min=20, x_max=100, y_max=80) assert bbox.width == 90 assert bbox.height == 60 assert bbox.area == 5400 assert bbox.center == (55.0, 50.0) def test_bounding_box_normalization(self): """Test BoundingBox normalization.""" from src.document.schemas.core import BoundingBox bbox = BoundingBox(x_min=100, y_min=200, x_max=300, y_max=400) normalized = bbox.normalize(1000, 800) assert normalized.normalized is True assert 0 <= normalized.x_min <= 1 assert 0 <= normalized.y_max <= 1 def test_bounding_box_iou(self): """Test BoundingBox IoU calculation.""" from src.document.schemas.core import BoundingBox bbox1 = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100) bbox2 = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150) bbox3 = BoundingBox(x_min=200, y_min=200, x_max=300, y_max=300) # Overlapping boxes iou = bbox1.iou(bbox2) assert 0 < iou < 1 # Non-overlapping boxes iou = bbox1.iou(bbox3) assert iou == 0 def test_ocr_region_creation(self): """Test OCRRegion creation.""" from src.document.schemas.core import OCRRegion, BoundingBox region = OCRRegion( text="Sample text", confidence=0.95, bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=50), page=0, engine="paddleocr", ) assert region.text == "Sample text" assert region.confidence == 0.95 def test_document_chunk_creation(self): """Test DocumentChunk creation.""" from src.document.schemas.core import DocumentChunk, ChunkType, BoundingBox chunk = DocumentChunk( chunk_id="chunk_001", chunk_type=ChunkType.TEXT, text="Sample chunk text", bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100), page=0, document_id="doc_001", source_path="/path/to/doc.pdf", sequence_index=0, confidence=0.9, ) assert chunk.chunk_id == "chunk_001" assert chunk.chunk_type == ChunkType.TEXT class TestOCREngines: """Test OCR engine implementations.""" def test_ocr_config_defaults(self): """Test OCRConfig default values.""" from src.document.ocr import OCRConfig config = OCRConfig() assert config.engine == "paddleocr" assert config.language == "en" def test_ocr_factory_paddleocr(self): """Test OCR factory for PaddleOCR.""" from src.document.ocr import get_ocr_engine, OCRConfig with patch("src.document.ocr.paddle_ocr.PADDLEOCR_AVAILABLE", True): with patch("src.document.ocr.paddle_ocr.PaddleOCR"): config = OCRConfig(engine="paddleocr") # Factory should return PaddleOCREngine # (actual instantiation mocked) def test_ocr_factory_tesseract(self): """Test OCR factory for Tesseract.""" from src.document.ocr import get_ocr_engine, OCRConfig with patch("src.document.ocr.tesseract_ocr.TESSERACT_AVAILABLE", True): config = OCRConfig(engine="tesseract") # Factory should return TesseractOCREngine class TestLayoutDetection: """Test layout detection functionality.""" def test_layout_config_defaults(self): """Test LayoutConfig defaults.""" from src.document.layout import LayoutConfig config = LayoutConfig() assert config.method == "rule_based" def test_layout_type_enum(self): """Test LayoutType enum values.""" from src.document.schemas.core import LayoutType assert LayoutType.TEXT.value == "text" assert LayoutType.TITLE.value == "title" assert LayoutType.TABLE.value == "table" class TestReadingOrder: """Test reading order reconstruction.""" def test_reading_order_config(self): """Test ReadingOrderConfig.""" from src.document.reading_order import ReadingOrderConfig config = ReadingOrderConfig() assert config.method == "rule_based" assert config.reading_direction == "ltr" class TestChunking: """Test document chunking.""" def test_chunker_config(self): """Test ChunkerConfig.""" from src.document.chunking import ChunkerConfig config = ChunkerConfig() assert config.target_chunk_size > 0 assert config.max_chunk_size >= config.target_chunk_size def test_semantic_chunker_creation(self): """Test SemanticChunker creation.""" from src.document.chunking import SemanticChunker, ChunkerConfig config = ChunkerConfig(target_chunk_size=256) chunker = SemanticChunker(config) assert chunker.config.target_chunk_size == 256 class TestValidation: """Test validation components.""" def test_validation_status_enum(self): """Test ValidationStatus enum.""" from src.document.validation.critic import ValidationStatus assert ValidationStatus.VALID.value == "valid" assert ValidationStatus.INVALID.value == "invalid" assert ValidationStatus.ABSTAIN.value == "abstain" def test_evidence_strength_enum(self): """Test EvidenceStrength enum.""" from src.document.validation.verifier import EvidenceStrength assert EvidenceStrength.STRONG.value == "strong" assert EvidenceStrength.NONE.value == "none" class TestPipelineIntegration: """Integration tests for full pipeline.""" def test_pipeline_config_creation(self): """Test PipelineConfig creation.""" from src.document.pipeline import PipelineConfig from src.document.ocr import OCRConfig config = PipelineConfig( ocr=OCRConfig(engine="paddleocr"), render_dpi=300, max_pages=10, ) assert config.render_dpi == 300 assert config.max_pages == 10 def test_processed_document_structure(self): """Test ProcessedDocument structure.""" from src.document.schemas.core import ( ProcessedDocument, DocumentMetadata, OCRRegion, LayoutRegion, DocumentChunk, ChunkType, BoundingBox, ) from datetime import datetime metadata = DocumentMetadata( document_id="test_doc", source_path="/path/to/doc.pdf", filename="doc.pdf", file_type="pdf", file_size_bytes=1000, num_pages=1, page_dimensions=[(800, 1000)], processed_at=datetime.utcnow(), total_chunks=1, total_characters=100, ) chunk = DocumentChunk( chunk_id="chunk_1", chunk_type=ChunkType.TEXT, text="Sample text", bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100), page=0, document_id="test_doc", source_path="/path/to/doc.pdf", sequence_index=0, confidence=0.9, ) doc = ProcessedDocument( metadata=metadata, ocr_regions=[], layout_regions=[], chunks=[chunk], full_text="Sample text", status="completed", ) assert doc.metadata.document_id == "test_doc" assert len(doc.chunks) == 1