|
|
""" |
|
|
Integration Tests for Document Processing Pipeline

Tests the full document processing workflow:
- OCR extraction
- Layout detection
- Reading order reconstruction
- Chunking
|
|
""" |
|
|
|
|
|
import pytest |
|
|
from pathlib import Path |
|
|
from unittest.mock import Mock, patch, MagicMock |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
@pytest.fixture
def sample_image():
    """Provide an all-zero uint8 array of shape (1000, 800, 3) as a test image."""
    # presumably (rows, cols, channels) for an image array; verify against consumers
    image_shape = (1000, 800, 3)
    blank = np.zeros(image_shape, dtype=np.uint8)
    return blank
|
|
|
|
|
|
|
|
@pytest.fixture
def mock_ocr_result():
    """Build a successful ``OCRResult`` holding two mock regions on page 0.

    The first region carries a title string and the second a paragraph
    string; both are tagged with the ``"mock"`` engine.
    """
    from src.document.ocr import OCRResult
    from src.document.schemas.core import OCRRegion, BoundingBox

    title_region = OCRRegion(
        text="Sample Title",
        confidence=0.95,
        bbox=BoundingBox(x_min=100, y_min=50, x_max=700, y_max=100),
        page=0,
        engine="mock",
    )
    paragraph_region = OCRRegion(
        text="This is paragraph text that contains important information.",
        confidence=0.92,
        bbox=BoundingBox(x_min=100, y_min=150, x_max=700, y_max=250),
        page=0,
        engine="mock",
    )

    return OCRResult(
        success=True,
        regions=[title_region, paragraph_region],
        page_num=0,
        processing_time=0.5,
    )
|
|
|
|
|
|
|
|
class TestDocumentSchemas:
    """Test document schema models.

    Covers ``BoundingBox`` geometry helpers and construction of
    ``OCRRegion`` and ``DocumentChunk`` records.
    """

    def test_bounding_box_creation(self):
        """Test that BoundingBox exposes width, height, area and center."""
        from src.document.schemas.core import BoundingBox

        bbox = BoundingBox(x_min=10, y_min=20, x_max=100, y_max=80)

        assert bbox.width == 90
        assert bbox.height == 60
        assert bbox.area == 5400
        assert bbox.center == (55.0, 50.0)

    def test_bounding_box_normalization(self):
        """Test that normalize() flags the result and keeps coordinates in [0, 1]."""
        from src.document.schemas.core import BoundingBox

        bbox = BoundingBox(x_min=100, y_min=200, x_max=300, y_max=400)

        normalized = bbox.normalize(1000, 800)

        assert normalized.normalized is True
        # Check every corner coordinate, not just one per axis, so a bug that
        # only affects x_max or y_min cannot slip through.
        for coord in ("x_min", "y_min", "x_max", "y_max"):
            assert 0 <= getattr(normalized, coord) <= 1, coord

    def test_bounding_box_iou(self):
        """Test IoU for a partially overlapping pair and a disjoint pair."""
        from src.document.schemas.core import BoundingBox

        bbox1 = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        bbox2 = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150)
        bbox3 = BoundingBox(x_min=200, y_min=200, x_max=300, y_max=300)

        # Partial overlap: strictly between "no overlap" and "identical".
        overlapping_iou = bbox1.iou(bbox2)
        assert 0 < overlapping_iou < 1

        # Disjoint boxes share no area.
        disjoint_iou = bbox1.iou(bbox3)
        assert disjoint_iou == 0

    def test_ocr_region_creation(self):
        """Test that OCRRegion stores the text and confidence it was given."""
        from src.document.schemas.core import OCRRegion, BoundingBox

        region = OCRRegion(
            text="Sample text",
            confidence=0.95,
            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=50),
            page=0,
            engine="paddleocr",
        )

        assert region.text == "Sample text"
        assert region.confidence == 0.95

    def test_document_chunk_creation(self):
        """Test that DocumentChunk stores the id and chunk type it was given."""
        from src.document.schemas.core import DocumentChunk, ChunkType, BoundingBox

        chunk = DocumentChunk(
            chunk_id="chunk_001",
            chunk_type=ChunkType.TEXT,
            text="Sample chunk text",
            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100),
            page=0,
            document_id="doc_001",
            source_path="/path/to/doc.pdf",
            sequence_index=0,
            confidence=0.9,
        )

        assert chunk.chunk_id == "chunk_001"
        assert chunk.chunk_type == ChunkType.TEXT
|
|
|
|
|
|
|
|
class TestOCREngines:
    """Test OCR engine configuration and factory availability."""

    def test_ocr_config_defaults(self):
        """Test OCRConfig default values."""
        from src.document.ocr import OCRConfig

        config = OCRConfig()
        assert config.engine == "paddleocr"
        assert config.language == "en"

    def test_ocr_factory_paddleocr(self):
        """Test PaddleOCR configuration with the PaddleOCR backend patched out."""
        from src.document.ocr import get_ocr_engine, OCRConfig

        availability = patch("src.document.ocr.paddle_ocr.PADDLEOCR_AVAILABLE", True)
        backend = patch("src.document.ocr.paddle_ocr.PaddleOCR")
        with availability, backend:
            config = OCRConfig(engine="paddleocr")

            # Previously this test asserted nothing; pin what can be verified.
            assert config.engine == "paddleocr"
            assert callable(get_ocr_engine)
            # NOTE(review): the factory itself is not invoked here; confirm
            # its call signature before adding an end-to-end assertion.

    def test_ocr_factory_tesseract(self):
        """Test Tesseract configuration with availability forced on."""
        from src.document.ocr import get_ocr_engine, OCRConfig

        with patch("src.document.ocr.tesseract_ocr.TESSERACT_AVAILABLE", True):
            config = OCRConfig(engine="tesseract")

            # Previously this test asserted nothing; pin what can be verified.
            assert config.engine == "tesseract"
            assert callable(get_ocr_engine)
            # NOTE(review): the factory itself is not invoked here; confirm
            # its call signature before adding an end-to-end assertion.
|
|
|
|
|
|
|
|
|
|
|
class TestLayoutDetection:
    """Checks for layout detection configuration and layout type values."""

    def test_layout_config_defaults(self):
        """A default-constructed LayoutConfig reports the "rule_based" method."""
        from src.document.layout import LayoutConfig

        default_cfg = LayoutConfig()

        assert default_cfg.method == "rule_based"

    def test_layout_type_enum(self):
        """LayoutType members TEXT, TITLE and TABLE carry their string values."""
        from src.document.schemas.core import LayoutType

        expected_pairs = (
            (LayoutType.TEXT, "text"),
            (LayoutType.TITLE, "title"),
            (LayoutType.TABLE, "table"),
        )
        for member, raw_value in expected_pairs:
            assert member.value == raw_value
|
|
|
|
|
|
|
|
class TestReadingOrder:
    """Checks for reading order reconstruction configuration."""

    def test_reading_order_config(self):
        """Default ReadingOrderConfig is rule-based and reads left to right."""
        from src.document.reading_order import ReadingOrderConfig

        default_cfg = ReadingOrderConfig()

        assert default_cfg.method == "rule_based"
        assert default_cfg.reading_direction == "ltr"
|
|
|
|
|
|
|
|
class TestChunking:
    """Checks for document chunking configuration and chunker construction."""

    def test_chunker_config(self):
        """Default sizes are positive and the maximum is not below the target."""
        from src.document.chunking import ChunkerConfig

        default_cfg = ChunkerConfig()
        target = default_cfg.target_chunk_size

        assert target > 0
        assert default_cfg.max_chunk_size >= default_cfg.target_chunk_size

    def test_semantic_chunker_creation(self):
        """SemanticChunker keeps the config it was constructed with."""
        from src.document.chunking import SemanticChunker, ChunkerConfig

        chunker = SemanticChunker(ChunkerConfig(target_chunk_size=256))

        assert chunker.config.target_chunk_size == 256
|
|
|
|
|
|
|
|
class TestValidation:
    """Checks for the enums used by the validation components."""

    def test_validation_status_enum(self):
        """ValidationStatus members VALID, INVALID and ABSTAIN carry their string values."""
        from src.document.validation.critic import ValidationStatus

        expected_pairs = (
            (ValidationStatus.VALID, "valid"),
            (ValidationStatus.INVALID, "invalid"),
            (ValidationStatus.ABSTAIN, "abstain"),
        )
        for member, raw_value in expected_pairs:
            assert member.value == raw_value

    def test_evidence_strength_enum(self):
        """EvidenceStrength members STRONG and NONE carry their string values."""
        from src.document.validation.verifier import EvidenceStrength

        expected_pairs = (
            (EvidenceStrength.STRONG, "strong"),
            (EvidenceStrength.NONE, "none"),
        )
        for member, raw_value in expected_pairs:
            assert member.value == raw_value
|
|
|
|
|
|
|
|
class TestPipelineIntegration:
    """Integration tests for full pipeline.

    These tests build the pipeline configuration and the processed-document
    container; they do not run the pipeline itself.
    """

    def test_pipeline_config_creation(self):
        """Test that PipelineConfig keeps the render DPI and page limit it was given."""
        from src.document.pipeline import PipelineConfig
        from src.document.ocr import OCRConfig

        config = PipelineConfig(
            ocr=OCRConfig(engine="paddleocr"),
            render_dpi=300,
            max_pages=10,
        )

        assert config.render_dpi == 300
        assert config.max_pages == 10

    def test_processed_document_structure(self):
        """Test that ProcessedDocument exposes its metadata and chunks."""
        from src.document.schemas.core import (
            ProcessedDocument,
            DocumentMetadata,
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )
        from datetime import datetime, timezone

        metadata = DocumentMetadata(
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
            filename="doc.pdf",
            file_type="pdf",
            file_size_bytes=1000,
            num_pages=1,
            page_dimensions=[(800, 1000)],
            # datetime.utcnow() is deprecated since Python 3.12; use an
            # explicit timezone-aware UTC timestamp instead.
            processed_at=datetime.now(timezone.utc),
            total_chunks=1,
            total_characters=100,
        )

        chunk = DocumentChunk(
            chunk_id="chunk_1",
            chunk_type=ChunkType.TEXT,
            text="Sample text",
            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100),
            page=0,
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
            sequence_index=0,
            confidence=0.9,
        )

        doc = ProcessedDocument(
            metadata=metadata,
            ocr_regions=[],
            layout_regions=[],
            chunks=[chunk],
            full_text="Sample text",
            status="completed",
        )

        assert doc.metadata.document_id == "test_doc"
        assert len(doc.chunks) == 1
|
|
|