Spaces:

MHamdan
/

SPARKNET

Sleeping

App Files Files Community

SPARKNET / tests /integration /test_document_pipeline.py

MHamdan

Initial commit: SPARKNET framework

d520909 26 days ago

raw

history blame contribute delete

9.01 kB

	"""
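	Integration Tests for Document Processing Pipeline

	Covers the building blocks of the document processing workflow:
	- Core schemas (bounding boxes, OCR regions, chunks)
	- OCR extraction
	- Layout detection
	- Reading order reconstruction
	- Chunking
	- Validation
	- Pipeline configuration and processed-document structure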
	"""

	import pytest
	from pathlib import Path
	from unittest.mock import Mock, patch, MagicMock
	import numpy as np

	# Test fixtures
	@pytest.fixture
	def sample_image():
	    """Provide an all-zero uint8 array of shape (1000, 800, 3) as a stand-in image."""
	    blank_shape = (1000, 800, 3)
	    blank = np.zeros(blank_shape, dtype=np.uint8)
	    return blank


	@pytest.fixture
	def mock_ocr_result():
	    """Provide a successful OCRResult for page 0 holding two mock-engine regions."""
	    from src.document.ocr import OCRResult
	    from src.document.schemas.core import OCRRegion, BoundingBox

	    # One row per region: (text, confidence, y_min, y_max).
	    # Both regions share the same horizontal extent (x_min=100, x_max=700).
	    region_specs = (
	        ("Sample Title", 0.95, 50, 100),
	        (
	            "This is paragraph text that contains important information.",
	            0.92,
	            150,
	            250,
	        ),
	    )

	    regions = [
	        OCRRegion(
	            text=content,
	            confidence=score,
	            bbox=BoundingBox(x_min=100, y_min=top, x_max=700, y_max=bottom),
	            page=0,
	            engine="mock",
	        )
	        for content, score, top, bottom in region_specs
	    ]

	    result = OCRResult(
	        success=True,
	        regions=regions,
	        page_num=0,
	        processing_time=0.5,
	    )
	    return result


	class TestDocumentSchemas:
	    """Checks for the core document schema models."""

	    def test_bounding_box_creation(self):
	        """A BoundingBox reports width, height, area and center for its corners."""
	        from src.document.schemas.core import BoundingBox

	        corners = {"x_min": 10, "y_min": 20, "x_max": 100, "y_max": 80}
	        box = BoundingBox(**corners)

	        assert box.width == 90
	        assert box.height == 60
	        assert box.area == 5400
	        assert box.center == (55.0, 50.0)

	    def test_bounding_box_normalization(self):
	        """normalize(1000, 800) flags the box as normalized with coordinates in [0, 1]."""
	        from src.document.schemas.core import BoundingBox

	        pixel_box = BoundingBox(x_min=100, y_min=200, x_max=300, y_max=400)
	        unit_box = pixel_box.normalize(1000, 800)

	        assert unit_box.normalized is True
	        assert 0 <= unit_box.x_min <= 1
	        assert 0 <= unit_box.y_max <= 1

	    def test_bounding_box_iou(self):
	        """IoU is strictly between 0 and 1 for overlapping boxes and 0 for disjoint ones."""
	        from src.document.schemas.core import BoundingBox

	        base = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
	        shifted = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150)
	        distant = BoundingBox(x_min=200, y_min=200, x_max=300, y_max=300)

	        # Partially overlapping pair.
	        partial_overlap = base.iou(shifted)
	        assert 0 < partial_overlap < 1

	        # Pair with no shared area.
	        no_overlap = base.iou(distant)
	        assert no_overlap == 0

	    def test_ocr_region_creation(self):
	        """An OCRRegion keeps the text and confidence it was constructed with."""
	        from src.document.schemas.core import OCRRegion, BoundingBox

	        box = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=50)
	        region = OCRRegion(
	            text="Sample text",
	            confidence=0.95,
	            bbox=box,
	            page=0,
	            engine="paddleocr",
	        )

	        assert region.text == "Sample text"
	        assert region.confidence == 0.95

	    def test_document_chunk_creation(self):
	        """A DocumentChunk keeps the chunk_id and chunk_type it was constructed with."""
	        from src.document.schemas.core import DocumentChunk, ChunkType, BoundingBox

	        box = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
	        chunk = DocumentChunk(
	            chunk_id="chunk_001",
	            chunk_type=ChunkType.TEXT,
	            text="Sample chunk text",
	            bbox=box,
	            page=0,
	            document_id="doc_001",
	            source_path="/path/to/doc.pdf",
	            sequence_index=0,
	            confidence=0.9,
	        )

	        assert chunk.chunk_id == "chunk_001"
	        assert chunk.chunk_type == ChunkType.TEXT


	class TestOCREngines:
	    """Test OCR engine configuration and the prerequisites of the engine factory."""

	    def test_ocr_config_defaults(self):
	        """A default OCRConfig selects the "paddleocr" engine and the "en" language."""
	        from src.document.ocr import OCRConfig

	        config = OCRConfig()
	        assert config.engine == "paddleocr"
	        assert config.language == "en"

	    def test_ocr_factory_paddleocr(self):
	        """Build a PaddleOCR config with the PaddleOCR backend patched out.

	        The availability flag is forced to True and the PaddleOCR class is
	        replaced by a mock, so no real engine is instantiated.
	        """
	        from src.document.ocr import get_ocr_engine, OCRConfig

	        with patch("src.document.ocr.paddle_ocr.PADDLEOCR_AVAILABLE", True):
	            with patch("src.document.ocr.paddle_ocr.PaddleOCR"):
	                config = OCRConfig(engine="paddleocr")

	                # Previously this test asserted nothing and passed vacuously.
	                assert config.engine == "paddleocr"
	                assert callable(get_ocr_engine)
	                # NOTE(review): the factory itself is still not called because
	                # its signature is not visible from this file. TODO: call
	                # get_ocr_engine and assert that it returns a PaddleOCREngine.

	    def test_ocr_factory_tesseract(self):
	        """Build a Tesseract config with the availability flag forced to True."""
	        from src.document.ocr import get_ocr_engine, OCRConfig

	        with patch("src.document.ocr.tesseract_ocr.TESSERACT_AVAILABLE", True):
	            config = OCRConfig(engine="tesseract")

	            # Previously this test asserted nothing and passed vacuously.
	            assert config.engine == "tesseract"
	            assert callable(get_ocr_engine)
	            # NOTE(review): TODO call get_ocr_engine and assert that it
	            # returns a TesseractOCREngine once the signature is confirmed.


	class TestLayoutDetection:
	    """Checks for layout detection configuration and layout types."""

	    def test_layout_config_defaults(self):
	        """A default LayoutConfig uses the "rule_based" method."""
	        from src.document.layout import LayoutConfig

	        layout_defaults = LayoutConfig()
	        assert layout_defaults.method == "rule_based"

	    def test_layout_type_enum(self):
	        """TEXT, TITLE and TABLE carry their lowercase string values."""
	        from src.document.schemas.core import LayoutType

	        expected_values = (
	            (LayoutType.TEXT, "text"),
	            (LayoutType.TITLE, "title"),
	            (LayoutType.TABLE, "table"),
	        )
	        for member, value in expected_values:
	            assert member.value == value


	class TestReadingOrder:
	    """Checks for reading order reconstruction settings."""

	    def test_reading_order_config(self):
	        """A default ReadingOrderConfig uses "rule_based" and reads "ltr"."""
	        from src.document.reading_order import ReadingOrderConfig

	        defaults = ReadingOrderConfig()
	        expected_fields = (
	            ("method", "rule_based"),
	            ("reading_direction", "ltr"),
	        )
	        for field_name, expected in expected_fields:
	            assert getattr(defaults, field_name) == expected


	class TestChunking:
	    """Checks for document chunking configuration and chunker construction."""

	    def test_chunker_config(self):
	        """Default sizes are positive and max_chunk_size is not below the target."""
	        from src.document.chunking import ChunkerConfig

	        defaults = ChunkerConfig()
	        target = defaults.target_chunk_size

	        assert target > 0
	        assert defaults.max_chunk_size >= target

	    def test_semantic_chunker_creation(self):
	        """A SemanticChunker exposes the config it was built with."""
	        from src.document.chunking import SemanticChunker, ChunkerConfig

	        requested_size = 256
	        chunker = SemanticChunker(ChunkerConfig(target_chunk_size=requested_size))

	        assert chunker.config.target_chunk_size == requested_size


	class TestValidation:
	    """Checks for the validation enums."""

	    def test_validation_status_enum(self):
	        """VALID, INVALID and ABSTAIN carry their lowercase string values."""
	        from src.document.validation.critic import ValidationStatus

	        expected_values = (
	            ("VALID", "valid"),
	            ("INVALID", "invalid"),
	            ("ABSTAIN", "abstain"),
	        )
	        for member_name, value in expected_values:
	            assert getattr(ValidationStatus, member_name).value == value

	    def test_evidence_strength_enum(self):
	        """STRONG and NONE carry their lowercase string values."""
	        from src.document.validation.verifier import EvidenceStrength

	        expected_values = (
	            ("STRONG", "strong"),
	            ("NONE", "none"),
	        )
	        for member_name, value in expected_values:
	            assert getattr(EvidenceStrength, member_name).value == value


	class TestPipelineIntegration:
	    """Integration tests for pipeline configuration and its output document model."""

	    def test_pipeline_config_creation(self):
	        """A PipelineConfig keeps the render_dpi and max_pages it was given."""
	        from src.document.pipeline import PipelineConfig
	        from src.document.ocr import OCRConfig

	        config = PipelineConfig(
	            ocr=OCRConfig(engine="paddleocr"),
	            render_dpi=300,
	            max_pages=10,
	        )

	        assert config.render_dpi == 300
	        assert config.max_pages == 10

	    def test_processed_document_structure(self):
	        """A ProcessedDocument exposes the metadata and chunks it was built from."""
	        # OCRRegion and LayoutRegion are not used below; the imports are kept
	        # so the test still fails if those names stop being importable.
	        from src.document.schemas.core import (
	            ProcessedDocument,
	            DocumentMetadata,
	            OCRRegion,
	            LayoutRegion,
	            DocumentChunk,
	            ChunkType,
	            BoundingBox,
	        )
	        from datetime import datetime, timezone

	        metadata = DocumentMetadata(
	            document_id="test_doc",
	            source_path="/path/to/doc.pdf",
	            filename="doc.pdf",
	            file_type="pdf",
	            file_size_bytes=1000,
	            num_pages=1,
	            page_dimensions=[(800, 1000)],
	            # datetime.utcnow() is deprecated since Python 3.12 and returns a
	            # naive value; use an explicit timezone-aware UTC timestamp.
	            processed_at=datetime.now(timezone.utc),
	            total_chunks=1,
	            total_characters=100,
	        )

	        chunk = DocumentChunk(
	            chunk_id="chunk_1",
	            chunk_type=ChunkType.TEXT,
	            text="Sample text",
	            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100),
	            page=0,
	            document_id="test_doc",
	            source_path="/path/to/doc.pdf",
	            sequence_index=0,
	            confidence=0.9,
	        )

	        doc = ProcessedDocument(
	            metadata=metadata,
	            ocr_regions=[],
	            layout_regions=[],
	            chunks=[chunk],
	            full_text="Sample text",
	            status="completed",
	        )

	        assert doc.metadata.document_id == "test_doc"
	        assert len(doc.chunks) == 1