# SPARKNET/tests/integration/test_document_pipeline.py
# MHamdan: Initial commit: SPARKNET framework (d520909)
"""
Integration Tests for Document Processing Pipeline
Tests the full document processing workflow:
- OCR extraction
- Layout detection
- Reading order reconstruction
- Chunking
"""
import pytest
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import numpy as np
# Test fixtures
@pytest.fixture
def sample_image():
    """Provide an all-zero uint8 array of shape (1000, 800, 3) as a stand-in image."""
    shape = (1000, 800, 3)
    return np.zeros(shape, dtype=np.uint8)
@pytest.fixture
def mock_ocr_result():
    """Build a successful OCRResult holding two "mock"-engine regions on page 0."""
    from src.document.ocr import OCRResult
    from src.document.schemas.core import OCRRegion, BoundingBox

    # Each spec is (text, confidence, y_min, y_max); both regions share
    # the same horizontal extent, x_min=100 to x_max=700.
    region_specs = (
        ("Sample Title", 0.95, 50, 100),
        (
            "This is paragraph text that contains important information.",
            0.92,
            150,
            250,
        ),
    )
    mock_regions = [
        OCRRegion(
            text=text,
            confidence=confidence,
            bbox=BoundingBox(x_min=100, y_min=y_low, x_max=700, y_max=y_high),
            page=0,
            engine="mock",
        )
        for text, confidence, y_low, y_high in region_specs
    ]

    return OCRResult(
        success=True,
        regions=mock_regions,
        page_num=0,
        processing_time=0.5,
    )
class TestDocumentSchemas:
    """Checks for the core document schema models."""

    def test_bounding_box_creation(self):
        """A box built from corner coordinates reports width, height, area and center."""
        from src.document.schemas.core import BoundingBox

        box = BoundingBox(x_min=10, y_min=20, x_max=100, y_max=80)

        # 100 - 10 = 90 wide, 80 - 20 = 60 tall.
        assert box.width == 90
        assert box.height == 60
        assert box.area == 5400
        assert box.center == (55.0, 50.0)

    def test_bounding_box_normalization(self):
        """normalize() flags the result as normalized with coordinates in [0, 1]."""
        from src.document.schemas.core import BoundingBox

        pixel_box = BoundingBox(x_min=100, y_min=200, x_max=300, y_max=400)
        scaled = pixel_box.normalize(1000, 800)

        assert scaled.normalized is True
        assert 0 <= scaled.x_min <= 1
        assert 0 <= scaled.y_max <= 1

    def test_bounding_box_iou(self):
        """IoU is strictly between 0 and 1 for partial overlap and 0 for disjoint boxes."""
        from src.document.schemas.core import BoundingBox

        base = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        overlapping = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150)
        disjoint = BoundingBox(x_min=200, y_min=200, x_max=300, y_max=300)

        # Partial overlap: neither empty nor a perfect match.
        partial_iou = base.iou(overlapping)
        assert 0 < partial_iou < 1

        # No shared area at all.
        disjoint_iou = base.iou(disjoint)
        assert disjoint_iou == 0

    def test_ocr_region_creation(self):
        """OCRRegion keeps the text and confidence it was constructed with."""
        from src.document.schemas.core import OCRRegion, BoundingBox

        fields = dict(
            text="Sample text",
            confidence=0.95,
            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=50),
            page=0,
            engine="paddleocr",
        )
        ocr_region = OCRRegion(**fields)

        assert ocr_region.text == "Sample text"
        assert ocr_region.confidence == 0.95

    def test_document_chunk_creation(self):
        """DocumentChunk keeps the chunk id and chunk type it was constructed with."""
        from src.document.schemas.core import DocumentChunk, ChunkType, BoundingBox

        fields = dict(
            chunk_id="chunk_001",
            chunk_type=ChunkType.TEXT,
            text="Sample chunk text",
            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100),
            page=0,
            document_id="doc_001",
            source_path="/path/to/doc.pdf",
            sequence_index=0,
            confidence=0.9,
        )
        doc_chunk = DocumentChunk(**fields)

        assert doc_chunk.chunk_id == "chunk_001"
        assert doc_chunk.chunk_type == ChunkType.TEXT
class TestOCREngines:
    """Test OCR engine configuration and the engine factory's import surface."""

    def test_ocr_config_defaults(self):
        """OCRConfig defaults to the "paddleocr" engine and the "en" language."""
        from src.document.ocr import OCRConfig

        config = OCRConfig()

        assert config.engine == "paddleocr"
        assert config.language == "en"

    def test_ocr_factory_paddleocr(self):
        """A PaddleOCR config can be built while the PaddleOCR backend is mocked.

        The original test body created a config and asserted nothing, so it
        could never fail. It now verifies that the requested engine name is
        stored on the config and that the factory entry point is callable.
        """
        from src.document.ocr import get_ocr_engine, OCRConfig

        with patch("src.document.ocr.paddle_ocr.PADDLEOCR_AVAILABLE", True):
            with patch("src.document.ocr.paddle_ocr.PaddleOCR"):
                config = OCRConfig(engine="paddleocr")

                assert config.engine == "paddleocr"
                assert callable(get_ocr_engine)
                # TODO(review): the factory should return a PaddleOCREngine for
                # this config; call get_ocr_engine here and assert on the result
                # once its call signature has been confirmed.

    def test_ocr_factory_tesseract(self):
        """A Tesseract config can be built while Tesseract is flagged as available.

        As with the PaddleOCR case, the original body asserted nothing; it now
        checks the stored engine name and that the factory is callable.
        """
        from src.document.ocr import get_ocr_engine, OCRConfig

        with patch("src.document.ocr.tesseract_ocr.TESSERACT_AVAILABLE", True):
            config = OCRConfig(engine="tesseract")

            assert config.engine == "tesseract"
            assert callable(get_ocr_engine)
            # TODO(review): the factory should return a TesseractOCREngine for
            # this config; assert on get_ocr_engine's result once its call
            # signature has been confirmed.
class TestLayoutDetection:
    """Checks for layout detection configuration and layout type values."""

    def test_layout_config_defaults(self):
        """A default LayoutConfig uses the "rule_based" method."""
        from src.document.layout import LayoutConfig

        default_config = LayoutConfig()

        assert default_config.method == "rule_based"

    def test_layout_type_enum(self):
        """LayoutType members TEXT, TITLE and TABLE carry their lowercase names as values."""
        from src.document.schemas.core import LayoutType

        expected_values = {"TEXT": "text", "TITLE": "title", "TABLE": "table"}
        for member_name, expected in expected_values.items():
            assert getattr(LayoutType, member_name).value == expected
class TestReadingOrder:
    """Checks for reading order reconstruction configuration."""

    def test_reading_order_config(self):
        """A default ReadingOrderConfig is rule based and reads left to right ("ltr")."""
        from src.document.reading_order import ReadingOrderConfig

        default_config = ReadingOrderConfig()

        assert default_config.method == "rule_based"
        assert default_config.reading_direction == "ltr"
class TestChunking:
    """Checks for chunker configuration and chunker construction."""

    def test_chunker_config(self):
        """Default chunk sizes are positive and the maximum is not below the target."""
        from src.document.chunking import ChunkerConfig

        default_config = ChunkerConfig()
        target_size = default_config.target_chunk_size

        assert target_size > 0
        assert default_config.max_chunk_size >= default_config.target_chunk_size

    def test_semantic_chunker_creation(self):
        """SemanticChunker exposes the config it was built with via ``.config``."""
        from src.document.chunking import SemanticChunker, ChunkerConfig

        semantic_chunker = SemanticChunker(ChunkerConfig(target_chunk_size=256))

        assert semantic_chunker.config.target_chunk_size == 256
class TestValidation:
    """Checks for the enums used by the validation components."""

    def test_validation_status_enum(self):
        """ValidationStatus members VALID, INVALID and ABSTAIN map to lowercase strings."""
        from src.document.validation.critic import ValidationStatus

        expected_values = {
            "VALID": "valid",
            "INVALID": "invalid",
            "ABSTAIN": "abstain",
        }
        for member_name, expected in expected_values.items():
            assert getattr(ValidationStatus, member_name).value == expected

    def test_evidence_strength_enum(self):
        """EvidenceStrength members STRONG and NONE map to lowercase strings."""
        from src.document.validation.verifier import EvidenceStrength

        expected_values = {"STRONG": "strong", "NONE": "none"}
        for member_name, expected in expected_values.items():
            assert getattr(EvidenceStrength, member_name).value == expected
class TestPipelineIntegration:
    """Integration tests for full pipeline configuration and result structure."""

    def test_pipeline_config_creation(self):
        """PipelineConfig keeps the render DPI and page limit it was built with."""
        from src.document.pipeline import PipelineConfig
        from src.document.ocr import OCRConfig

        config = PipelineConfig(
            ocr=OCRConfig(engine="paddleocr"),
            render_dpi=300,
            max_pages=10,
        )

        assert config.render_dpi == 300
        assert config.max_pages == 10

    def test_processed_document_structure(self):
        """A ProcessedDocument built from metadata and one chunk exposes both."""
        # OCRRegion and LayoutRegion are not used below; they stay in the
        # import list so a broken export in the schema module still fails here.
        from src.document.schemas.core import (
            ProcessedDocument,
            DocumentMetadata,
            OCRRegion,
            LayoutRegion,
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )
        from datetime import datetime, timezone

        metadata = DocumentMetadata(
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
            filename="doc.pdf",
            file_type="pdf",
            file_size_bytes=1000,
            num_pages=1,
            page_dimensions=[(800, 1000)],
            # datetime.utcnow() is deprecated since Python 3.12 and returns a
            # naive value; use an explicit timezone-aware UTC timestamp.
            processed_at=datetime.now(timezone.utc),
            total_chunks=1,
            total_characters=100,
        )
        chunk = DocumentChunk(
            chunk_id="chunk_1",
            chunk_type=ChunkType.TEXT,
            text="Sample text",
            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100),
            page=0,
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
            sequence_index=0,
            confidence=0.9,
        )
        doc = ProcessedDocument(
            metadata=metadata,
            ocr_regions=[],
            layout_regions=[],
            chunks=[chunk],
            full_text="Sample text",
            status="completed",
        )

        assert doc.metadata.document_id == "test_doc"
        assert len(doc.chunks) == 1