# File size: 9,011 Bytes
# d520909
"""
Integration Tests for Document Processing Pipeline

Tests the full document processing workflow:
- OCR extraction
- Layout detection
- Reading order reconstruction
- Chunking
"""

import pytest
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import numpy as np

# Test fixtures
@pytest.fixture
def sample_image():
    """Return an all-zero uint8 array of shape (1000, 800, 3) for tests."""
    dims = (1000, 800, 3)
    return np.zeros(shape=dims, dtype=np.uint8)


@pytest.fixture
def mock_ocr_result():
    """Build a successful OCRResult for page 0 holding two mock regions."""
    from src.document.ocr import OCRResult
    from src.document.schemas.core import OCRRegion, BoundingBox

    # One row per region: (text, confidence, y_min, y_max).
    # Both regions share the same horizontal extent (x from 100 to 700).
    region_specs = (
        ("Sample Title", 0.95, 50, 100),
        (
            "This is paragraph text that contains important information.",
            0.92,
            150,
            250,
        ),
    )

    mock_regions = [
        OCRRegion(
            text=text,
            confidence=confidence,
            bbox=BoundingBox(x_min=100, y_min=top, x_max=700, y_max=bottom),
            page=0,
            engine="mock",
        )
        for text, confidence, top, bottom in region_specs
    ]

    return OCRResult(
        success=True,
        regions=mock_regions,
        page_num=0,
        processing_time=0.5,
    )


class TestDocumentSchemas:
    """Checks for the core document schema models."""

    def test_bounding_box_creation(self):
        """Check the derived width, height, area and center of a box."""
        from src.document.schemas.core import BoundingBox

        coords = {"x_min": 10, "y_min": 20, "x_max": 100, "y_max": 80}
        box = BoundingBox(**coords)

        assert box.width == 90
        assert box.height == 60
        assert box.area == 5400
        assert box.center == (55.0, 50.0)

    def test_bounding_box_normalization(self):
        """Check that normalize() flags the result and yields values in [0, 1]."""
        from src.document.schemas.core import BoundingBox

        pixel_box = BoundingBox(x_min=100, y_min=200, x_max=300, y_max=400)
        unit_box = pixel_box.normalize(1000, 800)

        assert unit_box.normalized is True
        # Only x_min and y_max are range-checked here.
        for coordinate in (unit_box.x_min, unit_box.y_max):
            assert 0 <= coordinate <= 1

    def test_bounding_box_iou(self):
        """Check IoU for a partially overlapping pair and a disjoint pair."""
        from src.document.schemas.core import BoundingBox

        reference = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        overlapping = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150)
        disjoint = BoundingBox(x_min=200, y_min=200, x_max=300, y_max=300)

        # Partial overlap must give a value strictly between 0 and 1.
        assert 0 < reference.iou(overlapping) < 1

        # No overlap at all must give exactly zero.
        assert reference.iou(disjoint) == 0

    def test_ocr_region_creation(self):
        """Check that an OCRRegion keeps the text and confidence it was given."""
        from src.document.schemas.core import OCRRegion, BoundingBox

        region_box = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=50)
        ocr_region = OCRRegion(
            text="Sample text",
            confidence=0.95,
            bbox=region_box,
            page=0,
            engine="paddleocr",
        )

        assert ocr_region.text == "Sample text"
        assert ocr_region.confidence == 0.95

    def test_document_chunk_creation(self):
        """Check that a DocumentChunk keeps the id and type it was given."""
        from src.document.schemas.core import DocumentChunk, ChunkType, BoundingBox

        chunk_box = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        text_chunk = DocumentChunk(
            chunk_id="chunk_001",
            chunk_type=ChunkType.TEXT,
            text="Sample chunk text",
            bbox=chunk_box,
            page=0,
            document_id="doc_001",
            source_path="/path/to/doc.pdf",
            sequence_index=0,
            confidence=0.9,
        )

        assert text_chunk.chunk_id == "chunk_001"
        assert text_chunk.chunk_type == ChunkType.TEXT


class TestOCREngines:
    """Test OCR engine configuration and the engine factory entry point."""

    def test_ocr_config_defaults(self):
        """Test OCRConfig default values."""
        from src.document.ocr import OCRConfig

        config = OCRConfig()
        assert config.engine == "paddleocr"
        assert config.language == "en"

    def test_ocr_factory_paddleocr(self):
        """Test OCR factory prerequisites for PaddleOCR.

        The PaddleOCR availability flag and backend class are patched so the
        test does not depend on the real backend.
        """
        from src.document.ocr import get_ocr_engine, OCRConfig

        availability = patch("src.document.ocr.paddle_ocr.PADDLEOCR_AVAILABLE", True)
        backend = patch("src.document.ocr.paddle_ocr.PaddleOCR")
        with availability, backend:
            config = OCRConfig(engine="paddleocr")

            # Previously this test asserted nothing and passed vacuously.
            assert callable(get_ocr_engine)
            assert config.engine == "paddleocr"
            # TODO(review): call get_ocr_engine with this config and assert it
            # returns a PaddleOCREngine once the factory signature is confirmed.

    def test_ocr_factory_tesseract(self):
        """Test OCR factory prerequisites for Tesseract.

        The Tesseract availability flag is patched so the test does not depend
        on the real backend.
        """
        from src.document.ocr import get_ocr_engine, OCRConfig

        with patch("src.document.ocr.tesseract_ocr.TESSERACT_AVAILABLE", True):
            config = OCRConfig(engine="tesseract")

            # Previously this test asserted nothing and passed vacuously.
            assert callable(get_ocr_engine)
            assert config.engine == "tesseract"
            # TODO(review): call get_ocr_engine with this config and assert it
            # returns a TesseractOCREngine once the factory signature is confirmed.


class TestLayoutDetection:
    """Checks for layout detection configuration and region types."""

    def test_layout_config_defaults(self):
        """Check the method selected by a default LayoutConfig."""
        from src.document.layout import LayoutConfig

        defaults = LayoutConfig()
        assert defaults.method == "rule_based"

    def test_layout_type_enum(self):
        """Check the string values behind three LayoutType members."""
        from src.document.schemas.core import LayoutType

        expected_values = (
            (LayoutType.TEXT, "text"),
            (LayoutType.TITLE, "title"),
            (LayoutType.TABLE, "table"),
        )
        for member, expected in expected_values:
            assert member.value == expected


class TestReadingOrder:
    """Checks for reading order reconstruction settings."""

    def test_reading_order_config(self):
        """Check the method and direction of a default ReadingOrderConfig."""
        from src.document.reading_order import ReadingOrderConfig

        defaults = ReadingOrderConfig()

        assert defaults.method == "rule_based"
        assert defaults.reading_direction == "ltr"


class TestChunking:
    """Checks for document chunking configuration and construction."""

    def test_chunker_config(self):
        """Check that default chunk sizes are positive and consistently ordered."""
        from src.document.chunking import ChunkerConfig

        defaults = ChunkerConfig()
        target_size = defaults.target_chunk_size

        assert target_size > 0
        # The maximum must never be smaller than the target.
        assert defaults.max_chunk_size >= target_size

    def test_semantic_chunker_creation(self):
        """Check that a SemanticChunker exposes the config it was built with."""
        from src.document.chunking import SemanticChunker, ChunkerConfig

        chunker = SemanticChunker(ChunkerConfig(target_chunk_size=256))

        assert chunker.config.target_chunk_size == 256


class TestValidation:
    """Checks for the validation enums."""

    def test_validation_status_enum(self):
        """Check the string values behind the ValidationStatus members."""
        from src.document.validation.critic import ValidationStatus

        expected_values = (
            (ValidationStatus.VALID, "valid"),
            (ValidationStatus.INVALID, "invalid"),
            (ValidationStatus.ABSTAIN, "abstain"),
        )
        for member, expected in expected_values:
            assert member.value == expected

    def test_evidence_strength_enum(self):
        """Check the string values behind two EvidenceStrength members."""
        from src.document.validation.verifier import EvidenceStrength

        expected_values = (
            (EvidenceStrength.STRONG, "strong"),
            (EvidenceStrength.NONE, "none"),
        )
        for member, expected in expected_values:
            assert member.value == expected


class TestPipelineIntegration:
    """Integration tests for full pipeline."""

    def test_pipeline_config_creation(self):
        """Test that PipelineConfig keeps the render DPI and page limit given."""
        from src.document.pipeline import PipelineConfig
        from src.document.ocr import OCRConfig

        config = PipelineConfig(
            ocr=OCRConfig(engine="paddleocr"),
            render_dpi=300,
            max_pages=10,
        )

        assert config.render_dpi == 300
        assert config.max_pages == 10

    def test_processed_document_structure(self):
        """Test assembling a ProcessedDocument from metadata and one chunk."""
        # OCRRegion and LayoutRegion are unused below; they are kept so the
        # test still fails if these schema names stop being importable.
        from src.document.schemas.core import (
            ProcessedDocument,
            DocumentMetadata,
            OCRRegion,
            LayoutRegion,
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )
        from datetime import datetime, timezone

        metadata = DocumentMetadata(
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
            filename="doc.pdf",
            file_type="pdf",
            file_size_bytes=1000,
            num_pages=1,
            page_dimensions=[(800, 1000)],
            # datetime.utcnow() is deprecated since Python 3.12 and returns a
            # naive value; use an explicit timezone-aware UTC timestamp.
            processed_at=datetime.now(timezone.utc),
            total_chunks=1,
            total_characters=100,
        )

        chunk = DocumentChunk(
            chunk_id="chunk_1",
            chunk_type=ChunkType.TEXT,
            text="Sample text",
            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100),
            page=0,
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
            sequence_index=0,
            confidence=0.9,
        )

        doc = ProcessedDocument(
            metadata=metadata,
            ocr_regions=[],
            layout_regions=[],
            chunks=[chunk],
            full_text="Sample text",
            status="completed",
        )

        assert doc.metadata.document_id == "test_doc"
        assert len(doc.chunks) == 1