# Listing metadata: file size 9,011 bytes; revision d520909
"""
Integration Tests for Document Processing Pipeline
Tests the full document processing workflow:
- OCR extraction
- Layout detection
- Reading order reconstruction
- Chunking
"""
import pytest
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import numpy as np
# Test fixtures
@pytest.fixture
def sample_image():
    """Provide a zero-filled uint8 array of shape (1000, 800, 3) as a test image."""
    shape = (1000, 800, 3)
    return np.zeros(shape=shape, dtype=np.uint8)
@pytest.fixture
def mock_ocr_result():
    """Build a successful page-0 OCRResult holding a title region and a paragraph region."""
    from src.document.ocr import OCRResult
    from src.document.schemas.core import OCRRegion, BoundingBox

    # (text, confidence, (x_min, y_min, x_max, y_max)) for each mocked region.
    region_specs = [
        ("Sample Title", 0.95, (100, 50, 700, 100)),
        (
            "This is paragraph text that contains important information.",
            0.92,
            (100, 150, 700, 250),
        ),
    ]
    regions = [
        OCRRegion(
            text=text,
            confidence=confidence,
            bbox=BoundingBox(x_min=x0, y_min=y0, x_max=x1, y_max=y1),
            page=0,
            engine="mock",
        )
        for text, confidence, (x0, y0, x1, y1) in region_specs
    ]

    return OCRResult(
        success=True,
        regions=regions,
        page_num=0,
        processing_time=0.5,
    )
class TestDocumentSchemas:
    """Checks for the core document schema models."""

    def test_bounding_box_creation(self):
        """A BoundingBox reports the expected width, height, area and center."""
        from src.document.schemas.core import BoundingBox

        box = BoundingBox(x_min=10, y_min=20, x_max=100, y_max=80)

        expected = (
            ("width", 90),
            ("height", 60),
            ("area", 5400),
            ("center", (55.0, 50.0)),
        )
        for attribute, value in expected:
            assert getattr(box, attribute) == value

    def test_bounding_box_normalization(self):
        """normalize() flags the result as normalized with coordinates inside [0, 1]."""
        from src.document.schemas.core import BoundingBox

        result = BoundingBox(
            x_min=100, y_min=200, x_max=300, y_max=400
        ).normalize(1000, 800)

        assert result.normalized is True
        # Only x_min and y_max are spot-checked, as in the original test.
        assert 0 <= result.x_min <= 1
        assert 0 <= result.y_max <= 1

    def test_bounding_box_iou(self):
        """iou() lies strictly between 0 and 1 on partial overlap and is 0 when disjoint."""
        from src.document.schemas.core import BoundingBox

        base = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        overlapping = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150)
        disjoint = BoundingBox(x_min=200, y_min=200, x_max=300, y_max=300)

        # Partially overlapping pair.
        partial = base.iou(overlapping)
        assert 0 < partial < 1

        # Pair with no shared area.
        separate = base.iou(disjoint)
        assert separate == 0

    def test_ocr_region_creation(self):
        """An OCRRegion keeps the text and confidence it was constructed with."""
        from src.document.schemas.core import OCRRegion, BoundingBox

        region_box = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=50)
        region = OCRRegion(
            text="Sample text",
            confidence=0.95,
            bbox=region_box,
            page=0,
            engine="paddleocr",
        )

        assert region.text == "Sample text"
        assert region.confidence == 0.95

    def test_document_chunk_creation(self):
        """A DocumentChunk keeps the chunk_id and chunk_type it was constructed with."""
        from src.document.schemas.core import DocumentChunk, ChunkType, BoundingBox

        chunk_box = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        chunk = DocumentChunk(
            chunk_id="chunk_001",
            chunk_type=ChunkType.TEXT,
            text="Sample chunk text",
            bbox=chunk_box,
            page=0,
            document_id="doc_001",
            source_path="/path/to/doc.pdf",
            sequence_index=0,
            confidence=0.9,
        )

        assert chunk.chunk_id == "chunk_001"
        assert chunk.chunk_type == ChunkType.TEXT
class TestOCREngines:
    """Test OCR engine configuration and the engine factory entry point."""

    def test_ocr_config_defaults(self):
        """OCRConfig() defaults to engine "paddleocr" and language "en"."""
        from src.document.ocr import OCRConfig

        config = OCRConfig()

        assert config.engine == "paddleocr"
        assert config.language == "en"

    def test_ocr_factory_paddleocr(self):
        """A PaddleOCR config can be built while the PaddleOCR backend is patched out."""
        from src.document.ocr import get_ocr_engine, OCRConfig

        with patch("src.document.ocr.paddle_ocr.PADDLEOCR_AVAILABLE", True), patch(
            "src.document.ocr.paddle_ocr.PaddleOCR"
        ):
            config = OCRConfig(engine="paddleocr")

            # Previously this test asserted nothing and could never fail on
            # behaviour; pin at least the selected engine and the factory symbol.
            assert config.engine == "paddleocr"
            assert callable(get_ocr_engine)
            # TODO(review): call get_ocr_engine with this config and assert it
            # returns a PaddleOCREngine once the factory signature is confirmed.

    def test_ocr_factory_tesseract(self):
        """A Tesseract config can be built while Tesseract is patched as available."""
        from src.document.ocr import get_ocr_engine, OCRConfig

        with patch("src.document.ocr.tesseract_ocr.TESSERACT_AVAILABLE", True):
            config = OCRConfig(engine="tesseract")

            # Same minimal guarantees as the PaddleOCR case above.
            assert config.engine == "tesseract"
            assert callable(get_ocr_engine)
            # TODO(review): call get_ocr_engine with this config and assert it
            # returns a TesseractOCREngine once the factory signature is confirmed.
class TestLayoutDetection:
    """Checks for layout detection configuration and types."""

    def test_layout_config_defaults(self):
        """LayoutConfig() selects the "rule_based" method by default."""
        from src.document.layout import LayoutConfig

        default_config = LayoutConfig()

        assert default_config.method == "rule_based"

    def test_layout_type_enum(self):
        """TEXT, TITLE and TABLE members carry their lowercase string values."""
        from src.document.schemas.core import LayoutType

        text_value = LayoutType.TEXT.value
        assert text_value == "text"

        title_value = LayoutType.TITLE.value
        assert title_value == "title"

        table_value = LayoutType.TABLE.value
        assert table_value == "table"
class TestReadingOrder:
    """Checks for reading order reconstruction settings."""

    def test_reading_order_config(self):
        """ReadingOrderConfig() defaults to method "rule_based" and direction "ltr"."""
        from src.document.reading_order import ReadingOrderConfig

        defaults = ReadingOrderConfig()

        assert defaults.method == "rule_based"
        assert defaults.reading_direction == "ltr"
class TestChunking:
    """Checks for the document chunking components."""

    def test_chunker_config(self):
        """Default ChunkerConfig has a positive target size not exceeding the max size."""
        from src.document.chunking import ChunkerConfig

        defaults = ChunkerConfig()

        assert defaults.target_chunk_size > 0
        assert defaults.max_chunk_size >= defaults.target_chunk_size

    def test_semantic_chunker_creation(self):
        """SemanticChunker exposes the config it was built with via .config."""
        from src.document.chunking import SemanticChunker, ChunkerConfig

        chunker = SemanticChunker(ChunkerConfig(target_chunk_size=256))

        assert chunker.config.target_chunk_size == 256
class TestValidation:
    """Checks for the validation enums."""

    def test_validation_status_enum(self):
        """VALID, INVALID and ABSTAIN carry their lowercase string values."""
        from src.document.validation.critic import ValidationStatus

        valid_value = ValidationStatus.VALID.value
        assert valid_value == "valid"

        invalid_value = ValidationStatus.INVALID.value
        assert invalid_value == "invalid"

        abstain_value = ValidationStatus.ABSTAIN.value
        assert abstain_value == "abstain"

    def test_evidence_strength_enum(self):
        """STRONG and NONE carry their lowercase string values."""
        from src.document.validation.verifier import EvidenceStrength

        strong_value = EvidenceStrength.STRONG.value
        assert strong_value == "strong"

        none_value = EvidenceStrength.NONE.value
        assert none_value == "none"
class TestPipelineIntegration:
    """Integration tests for full pipeline."""

    def test_pipeline_config_creation(self):
        """PipelineConfig keeps the render_dpi and max_pages it was constructed with."""
        from src.document.pipeline import PipelineConfig
        from src.document.ocr import OCRConfig

        config = PipelineConfig(
            ocr=OCRConfig(engine="paddleocr"),
            render_dpi=300,
            max_pages=10,
        )

        assert config.render_dpi == 300
        assert config.max_pages == 10

    def test_processed_document_structure(self):
        """A ProcessedDocument built from metadata and one chunk exposes both."""
        from src.document.schemas.core import (
            ProcessedDocument,
            DocumentMetadata,
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )
        from datetime import datetime, timezone

        metadata = DocumentMetadata(
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
            filename="doc.pdf",
            file_type="pdf",
            file_size_bytes=1000,
            num_pages=1,
            page_dimensions=[(800, 1000)],
            # datetime.utcnow() is deprecated since Python 3.12 and returns a
            # naive value; use an explicit timezone-aware UTC timestamp.
            processed_at=datetime.now(timezone.utc),
            total_chunks=1,
            total_characters=100,
        )

        chunk = DocumentChunk(
            chunk_id="chunk_1",
            chunk_type=ChunkType.TEXT,
            text="Sample text",
            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100),
            page=0,
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
            sequence_index=0,
            confidence=0.9,
        )

        doc = ProcessedDocument(
            metadata=metadata,
            ocr_regions=[],
            layout_regions=[],
            chunks=[chunk],
            full_text="Sample text",
            status="completed",
        )

        assert doc.metadata.document_id == "test_doc"
        assert len(doc.chunks) == 1
|