# Spaces: MHamdan / SPARKNET (Sleeping)
# File size: 9,011 Bytes
# d520909

"""
Integration Tests for Document Processing Pipeline

Tests the full document processing workflow:
- OCR extraction
- Layout detection
- Reading order reconstruction
- Chunking
"""

import pytest
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import numpy as np

# Test fixtures
@pytest.fixture
def sample_image():
    """Provide an all-zero uint8 array of shape (1000, 800, 3) as a test image."""
    rows, cols, channels = 1000, 800, 3
    blank = np.zeros(shape=(rows, cols, channels), dtype=np.uint8)
    return blank


@pytest.fixture
def mock_ocr_result():
    """Build a successful OCRResult for page 0 holding two "mock"-engine regions."""
    from src.document.ocr import OCRResult
    from src.document.schemas.core import OCRRegion, BoundingBox

    # One row per region: (text, confidence, y_min, y_max).
    # Both regions share the same horizontal extent (x from 100 to 700).
    region_specs = (
        ("Sample Title", 0.95, 50, 100),
        (
            "This is paragraph text that contains important information.",
            0.92,
            150,
            250,
        ),
    )

    regions = [
        OCRRegion(
            text=text,
            confidence=score,
            bbox=BoundingBox(x_min=100, y_min=top, x_max=700, y_max=bottom),
            page=0,
            engine="mock",
        )
        for text, score, top, bottom in region_specs
    ]

    result = OCRResult(
        success=True,
        regions=regions,
        page_num=0,
        processing_time=0.5,
    )
    return result


class TestDocumentSchemas:
    """Checks for the core document schema models."""

    def test_bounding_box_creation(self):
        """A freshly built BoundingBox exposes width, height, area and center."""
        from src.document.schemas.core import BoundingBox

        corners = {"x_min": 10, "y_min": 20, "x_max": 100, "y_max": 80}
        box = BoundingBox(**corners)

        # Expected values follow from the corners: 100 - 10 and 80 - 20.
        assert box.width == 90
        assert box.height == 60
        assert box.area == 5400
        assert box.center == (55.0, 50.0)

    def test_bounding_box_normalization(self):
        """normalize() flags the result as normalized with coordinates in [0, 1]."""
        from src.document.schemas.core import BoundingBox

        pixel_box = BoundingBox(x_min=100, y_min=200, x_max=300, y_max=400)
        unit_box = pixel_box.normalize(1000, 800)

        assert unit_box.normalized is True
        assert 0 <= unit_box.x_min <= 1
        assert 0 <= unit_box.y_max <= 1

    def test_bounding_box_iou(self):
        """iou() is strictly between 0 and 1 for partial overlap and 0 for none."""
        from src.document.schemas.core import BoundingBox

        base = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        shifted = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150)
        distant = BoundingBox(x_min=200, y_min=200, x_max=300, y_max=300)

        # `shifted` shares a corner area with `base`.
        partial_overlap = base.iou(shifted)
        assert 0 < partial_overlap < 1

        # `distant` does not touch `base` at all.
        no_overlap = base.iou(distant)
        assert no_overlap == 0

    def test_ocr_region_creation(self):
        """An OCRRegion keeps the text and confidence it was constructed with."""
        from src.document.schemas.core import OCRRegion, BoundingBox

        region_box = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=50)
        ocr_region = OCRRegion(
            text="Sample text",
            confidence=0.95,
            bbox=region_box,
            page=0,
            engine="paddleocr",
        )

        assert ocr_region.text == "Sample text"
        assert ocr_region.confidence == 0.95

    def test_document_chunk_creation(self):
        """A DocumentChunk keeps the chunk id and chunk type it was given."""
        from src.document.schemas.core import DocumentChunk, ChunkType, BoundingBox

        chunk_box = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        text_chunk = DocumentChunk(
            chunk_id="chunk_001",
            chunk_type=ChunkType.TEXT,
            text="Sample chunk text",
            bbox=chunk_box,
            page=0,
            document_id="doc_001",
            source_path="/path/to/doc.pdf",
            sequence_index=0,
            confidence=0.9,
        )

        assert text_chunk.chunk_id == "chunk_001"
        assert text_chunk.chunk_type == ChunkType.TEXT


class TestOCREngines:
    """Test OCR engine configuration and the engine factory entry point."""

    def test_ocr_config_defaults(self):
        """A default OCRConfig selects the "paddleocr" engine and "en" language."""
        from src.document.ocr import OCRConfig

        config = OCRConfig()
        assert config.engine == "paddleocr"
        assert config.language == "en"

    def test_ocr_factory_paddleocr(self):
        """PaddleOCR can be selected via OCRConfig while the backend is patched.

        The PaddleOCR availability flag and class are patched so the test does
        not depend on the real backend being installed.
        """
        from src.document.ocr import get_ocr_engine, OCRConfig

        paddle_module = "src.document.ocr.paddle_ocr"
        with patch(f"{paddle_module}.PADDLEOCR_AVAILABLE", True), patch(
            f"{paddle_module}.PaddleOCR"
        ):
            config = OCRConfig(engine="paddleocr")

            # Previously this test asserted nothing and passed vacuously.
            assert config.engine == "paddleocr"
            assert callable(get_ocr_engine)
            # NOTE(review): the factory is expected to return a PaddleOCREngine
            # for this config; its call signature is not visible from this
            # file, so invoking it is left as a TODO to confirm.

    def test_ocr_factory_tesseract(self):
        """Tesseract can be selected via OCRConfig while availability is patched."""
        from src.document.ocr import get_ocr_engine, OCRConfig

        with patch("src.document.ocr.tesseract_ocr.TESSERACT_AVAILABLE", True):
            config = OCRConfig(engine="tesseract")

            # Previously this test asserted nothing and passed vacuously.
            assert config.engine == "tesseract"
            assert callable(get_ocr_engine)
            # NOTE(review): the factory is expected to return a
            # TesseractOCREngine for this config; confirm the factory's
            # signature before asserting on its return value.


class TestLayoutDetection:
    """Checks for layout detection configuration and layout type values."""

    def test_layout_config_defaults(self):
        """A default-constructed LayoutConfig uses the "rule_based" method."""
        from src.document.layout import LayoutConfig

        default_config = LayoutConfig()
        assert default_config.method == "rule_based"

    def test_layout_type_enum(self):
        """TEXT, TITLE and TABLE members carry their lowercase string values."""
        from src.document.schemas.core import LayoutType

        expectations = (
            (LayoutType.TEXT, "text"),
            (LayoutType.TITLE, "title"),
            (LayoutType.TABLE, "table"),
        )
        for member, expected_value in expectations:
            assert member.value == expected_value


class TestReadingOrder:
    """Checks for reading order reconstruction configuration."""

    def test_reading_order_config(self):
        """Default ReadingOrderConfig is "rule_based" with "ltr" direction."""
        from src.document.reading_order import ReadingOrderConfig

        defaults = ReadingOrderConfig()

        assert defaults.method == "rule_based"
        assert defaults.reading_direction == "ltr"


class TestChunking:
    """Checks for chunker configuration and chunker construction."""

    def test_chunker_config(self):
        """Default sizes are positive and max is not below target."""
        from src.document.chunking import ChunkerConfig

        defaults = ChunkerConfig()

        assert defaults.target_chunk_size > 0
        assert defaults.max_chunk_size >= defaults.target_chunk_size

    def test_semantic_chunker_creation(self):
        """SemanticChunker exposes the config it was constructed with."""
        from src.document.chunking import SemanticChunker, ChunkerConfig

        requested_size = 256
        chunker = SemanticChunker(ChunkerConfig(target_chunk_size=requested_size))

        assert chunker.config.target_chunk_size == 256


class TestValidation:
    """Checks for the validation enums."""

    def test_validation_status_enum(self):
        """VALID, INVALID and ABSTAIN carry their lowercase string values."""
        from src.document.validation.critic import ValidationStatus

        expectations = (
            (ValidationStatus.VALID, "valid"),
            (ValidationStatus.INVALID, "invalid"),
            (ValidationStatus.ABSTAIN, "abstain"),
        )
        for status, expected_value in expectations:
            assert status.value == expected_value

    def test_evidence_strength_enum(self):
        """STRONG and NONE carry their lowercase string values."""
        from src.document.validation.verifier import EvidenceStrength

        expectations = (
            (EvidenceStrength.STRONG, "strong"),
            (EvidenceStrength.NONE, "none"),
        )
        for strength, expected_value in expectations:
            assert strength.value == expected_value


class TestPipelineIntegration:
    """Integration tests for full pipeline."""

    def test_pipeline_config_creation(self):
        """PipelineConfig keeps the render_dpi and max_pages it was given."""
        from src.document.pipeline import PipelineConfig
        from src.document.ocr import OCRConfig

        config = PipelineConfig(
            ocr=OCRConfig(engine="paddleocr"),
            render_dpi=300,
            max_pages=10,
        )

        assert config.render_dpi == 300
        assert config.max_pages == 10

    def test_processed_document_structure(self):
        """A ProcessedDocument built from metadata and one chunk exposes both."""
        from src.document.schemas.core import (
            ProcessedDocument,
            DocumentMetadata,
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )
        from datetime import datetime, timezone

        metadata = DocumentMetadata(
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
            filename="doc.pdf",
            file_type="pdf",
            file_size_bytes=1000,
            num_pages=1,
            page_dimensions=[(800, 1000)],
            # datetime.utcnow() is deprecated since Python 3.12 and returns a
            # naive value; use an explicit timezone-aware UTC timestamp.
            processed_at=datetime.now(timezone.utc),
            total_chunks=1,
            total_characters=100,
        )

        chunk = DocumentChunk(
            chunk_id="chunk_1",
            chunk_type=ChunkType.TEXT,
            text="Sample text",
            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100),
            page=0,
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
            sequence_index=0,
            confidence=0.9,
        )

        doc = ProcessedDocument(
            metadata=metadata,
            ocr_regions=[],
            layout_regions=[],
            chunks=[chunk],
            full_text="Sample text",
            status="completed",
        )

        assert doc.metadata.document_id == "test_doc"
        assert len(doc.chunks) == 1