| | """ |
| | Unit Tests for Document Intelligence Subsystem |
| | |
| | Tests core components: |
| | - BoundingBox operations |
| | - Chunk models |
| | - Schema and extraction |
| | - Evidence building |
| | """ |
| |
|
| | import pytest |
| | from pathlib import Path |
| |
|
| |
|
class TestBoundingBox:
    """Exercises the BoundingBox geometry model: construction, derived
    properties, coordinate-space conversion, IoU, and point containment."""

    def test_create_bbox(self):
        from src.document_intelligence.chunks import BoundingBox

        box = BoundingBox(
            x_min=0.1, y_min=0.2, x_max=0.5, y_max=0.6, normalized=True
        )

        # Constructor arguments must round-trip unchanged.
        assert box.x_min == 0.1
        assert box.y_min == 0.2
        assert box.x_max == 0.5
        assert box.y_max == 0.6
        assert box.normalized is True

    def test_bbox_properties(self):
        from src.document_intelligence.chunks import BoundingBox

        box = BoundingBox(x_min=10, y_min=20, x_max=50, y_max=80, normalized=False)

        # Derived geometry: width/height/area plus the center and xyxy tuples.
        assert box.width == 40
        assert box.height == 60
        assert box.area == 2400
        assert box.center == (30, 50)
        assert box.xyxy == (10, 20, 50, 80)

    def test_bbox_to_pixel(self):
        from src.document_intelligence.chunks import BoundingBox

        normalized_box = BoundingBox(
            x_min=0.1, y_min=0.2, x_max=0.5, y_max=0.6, normalized=True
        )

        # Scale onto a 1000x800-pixel page.
        pixel = normalized_box.to_pixel(1000, 800)

        assert pixel.x_min == 100
        assert pixel.y_min == 160
        assert pixel.x_max == 500
        assert pixel.y_max == 480
        assert pixel.normalized is False

    def test_bbox_to_normalized(self):
        from src.document_intelligence.chunks import BoundingBox

        pixel_box = BoundingBox(
            x_min=100, y_min=160, x_max=500, y_max=480, normalized=False
        )

        norm = pixel_box.to_normalized(1000, 800)

        # Division produces floats, so compare against a small tolerance.
        for attr, want in [
            ("x_min", 0.1),
            ("y_min", 0.2),
            ("x_max", 0.5),
            ("y_max", 0.6),
        ]:
            assert abs(getattr(norm, attr) - want) < 0.001
        assert norm.normalized is True

    def test_bbox_iou(self):
        from src.document_intelligence.chunks import BoundingBox

        first = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        second = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150)

        # Overlap is 50x50 = 2500; union is 17500, so IoU is roughly 0.143.
        iou = first.iou(second)
        assert 0.1 < iou < 0.2

    def test_bbox_contains(self):
        from src.document_intelligence.chunks import BoundingBox

        box = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)

        # Interior and edge points count as inside; points past an edge do not.
        for point, inside in [
            ((50, 50), True),
            ((0, 0), True),
            ((100, 100), True),
            ((150, 50), False),
        ]:
            assert box.contains(point) is inside
| |
|
| |
|
class TestDocumentChunk:
    """Exercises DocumentChunk construction and deterministic chunk-ID
    generation."""

    def test_create_chunk(self):
        from src.document_intelligence.chunks import (
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        region = BoundingBox(
            x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3, normalized=True
        )

        paragraph = DocumentChunk(
            chunk_id="test_chunk_001",
            doc_id="doc_001",
            chunk_type=ChunkType.PARAGRAPH,
            text="This is a test paragraph.",
            page=1,
            bbox=region,
            confidence=0.95,
            sequence_index=0,
        )

        # Constructor arguments should be stored as-is on the instance.
        assert paragraph.chunk_id == "test_chunk_001"
        assert paragraph.chunk_type == ChunkType.PARAGRAPH
        assert paragraph.text == "This is a test paragraph."
        assert paragraph.page == 1
        assert paragraph.confidence == 0.95

    def test_generate_chunk_id(self):
        from src.document_intelligence.chunks import (
            DocumentChunk,
            BoundingBox,
        )

        region = BoundingBox(
            x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3, normalized=True
        )

        # Generating the ID twice from identical inputs must be
        # deterministic, and the ID is a fixed-length 16-char digest.
        generated = [
            DocumentChunk.generate_chunk_id(
                doc_id="doc_001",
                page=1,
                bbox=region,
                chunk_type_str="paragraph",
            )
            for _ in range(2)
        ]

        assert generated[0] == generated[1]
        assert len(generated[0]) == 16
| |
|
| |
|
class TestTableChunk:
    """Exercises TableChunk: construction, cell lookup, and markdown export."""

    def test_create_table_chunk(self):
        from src.document_intelligence.chunks import (
            TableChunk,
            TableCell,
            BoundingBox,
        )

        # Local factory keeps the 2x2 cell grid compact and readable.
        def make_cell(r, c, txt, x0, y0, x1, y1, **extra):
            return TableCell(
                row=r,
                col=c,
                text=txt,
                bbox=BoundingBox(x_min=x0, y_min=y0, x_max=x1, y_max=y1),
                **extra,
            )

        grid = [
            make_cell(0, 0, "Header 1", 0.1, 0.2, 0.5, 0.3, is_header=True),
            make_cell(0, 1, "Header 2", 0.5, 0.2, 0.9, 0.3, is_header=True),
            make_cell(1, 0, "Value 1", 0.1, 0.3, 0.5, 0.4),
            make_cell(1, 1, "Value 2", 0.5, 0.3, 0.9, 0.4),
        ]

        table = TableChunk(
            chunk_id="table_001",
            doc_id="doc_001",
            text="Table content",
            page=1,
            bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8),
            confidence=0.9,
            sequence_index=0,
            cells=grid,
            num_rows=2,
            num_cols=2,
        )

        assert table.num_rows == 2
        assert table.num_cols == 2
        assert len(table.cells) == 4

    def test_table_get_cell(self):
        from src.document_intelligence.chunks import (
            TableChunk,
            TableCell,
            BoundingBox,
        )

        # A 2x2 grid labeled A..D in row-major order; every cell shares a
        # unit bbox since geometry is irrelevant to get_cell().
        grid = [
            TableCell(
                row=r,
                col=c,
                text=label,
                bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1),
            )
            for (r, c), label in zip([(0, 0), (0, 1), (1, 0), (1, 1)], "ABCD")
        ]

        table = TableChunk(
            chunk_id="table_001",
            doc_id="doc_001",
            text="Table",
            page=1,
            bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8),
            confidence=0.9,
            sequence_index=0,
            cells=grid,
            num_rows=2,
            num_cols=2,
        )

        # Lookup by (row, col) must return the matching cell text.
        for (r, c), label in zip([(0, 0), (0, 1), (1, 0), (1, 1)], "ABCD"):
            assert table.get_cell(r, c).text == label

    def test_table_to_markdown(self):
        from src.document_intelligence.chunks import (
            TableChunk,
            TableCell,
            BoundingBox,
        )

        grid = [
            TableCell(
                row=r,
                col=c,
                text=label,
                bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1),
            )
            for (r, c), label in zip(
                [(0, 0), (0, 1), (1, 0), (1, 1)],
                ["Name", "Value", "A", "100"],
            )
        ]

        table = TableChunk(
            chunk_id="table_001",
            doc_id="doc_001",
            text="Table",
            page=1,
            bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8),
            confidence=0.9,
            sequence_index=0,
            cells=grid,
            num_rows=2,
            num_cols=2,
        )

        # Markdown export should render header row, separator, and data row.
        rendered = table.to_markdown()
        for expected_line in ("| Name | Value |", "| --- | --- |", "| A | 100 |"):
            assert expected_line in rendered
| |
|
| |
|
class TestExtractionSchema:
    """Tests for ExtractionSchema: programmatic field construction,
    JSON Schema round-tripping, and the preset document schemas."""

    def test_create_schema(self):
        # Fix: dropped the unused `FieldSpec` import that previously
        # accompanied these names.
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            FieldType,
        )

        schema = ExtractionSchema(name="TestSchema")
        schema.add_string_field("name", "Person name", required=True)
        schema.add_number_field("age", "Person age", required=False, is_integer=True)
        schema.add_date_field("birth_date", "Date of birth")

        assert schema.name == "TestSchema"
        assert len(schema.fields) == 3
        assert schema.get_field("name").required is True
        # is_integer=True should map the number field to the INTEGER type.
        assert schema.get_field("age").field_type == FieldType.INTEGER

    def test_schema_to_json_schema(self):
        from src.document_intelligence.extraction import ExtractionSchema

        schema = ExtractionSchema(name="Invoice")
        schema.add_string_field("invoice_number", required=True)
        schema.add_currency_field("total_amount", required=True)

        json_schema = schema.to_json_schema()

        # Exported schema must be a JSON Schema object with both fields
        # present and the required list populated.
        assert json_schema["type"] == "object"
        assert "invoice_number" in json_schema["properties"]
        assert "total_amount" in json_schema["properties"]
        assert "invoice_number" in json_schema["required"]

    def test_schema_from_json_schema(self):
        from src.document_intelligence.extraction import ExtractionSchema

        json_schema = {
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "Name"},
                "value": {"type": "number", "minimum": 0},
            },
            "required": ["name"],
        }

        schema = ExtractionSchema.from_json_schema(json_schema, name="Test")

        # Importing must honor the JSON Schema "required" list.
        assert len(schema.fields) == 2
        assert schema.get_field("name").required is True
        assert schema.get_field("value").required is False

    def test_preset_schemas(self):
        from src.document_intelligence.extraction import (
            create_invoice_schema,
            create_receipt_schema,
            create_contract_schema,
        )

        # Each preset factory should ship its signature fields.
        invoice = create_invoice_schema()
        assert invoice.get_field("invoice_number") is not None
        assert invoice.get_field("total_amount") is not None

        receipt = create_receipt_schema()
        assert receipt.get_field("merchant_name") is not None

        contract = create_contract_schema()
        assert contract.get_field("effective_date") is not None
| |
|
| |
|
class TestEvidenceBuilder:
    """Exercises EvidenceBuilder's evidence-record construction."""

    def test_create_evidence(self):
        from src.document_intelligence.grounding import EvidenceBuilder
        from src.document_intelligence.chunks import (
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        source_chunk = DocumentChunk(
            chunk_id="chunk_001",
            doc_id="doc_001",
            chunk_type=ChunkType.PARAGRAPH,
            text="The total amount is $500.00.",
            page=1,
            bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3),
            confidence=0.9,
            sequence_index=0,
        )

        evidence = EvidenceBuilder().create_evidence(
            chunk=source_chunk,
            value="$500.00",
            field_name="total_amount",
        )

        # Evidence must point back at the originating chunk/page, and the
        # snippet must carry the extracted value (possibly re-formatted).
        assert evidence.chunk_id == "chunk_001"
        assert evidence.page == 1
        assert "$500.00" in evidence.snippet or "500" in evidence.snippet
| |
|
| |
|
class TestSemanticChunker:
    """Exercises SemanticChunker's size-bounded text chunking."""

    def test_chunk_text(self):
        from src.document_intelligence.parsing import SemanticChunker, ChunkingConfig

        chunker = SemanticChunker(
            ChunkingConfig(
                min_chunk_chars=10,
                max_chunk_chars=100,
                target_chunk_chars=50,
            )
        )

        markdown = """# Heading 1

This is the first paragraph with some text content.

This is the second paragraph with more content.

# Heading 2

Another section with different content.
"""

        pieces = chunker.chunk_text(markdown)

        # Every produced chunk carries text and respects the minimum size.
        assert len(pieces) > 0
        for piece in pieces:
            assert "text" in piece
            assert len(piece["text"]) >= 10

    def test_chunk_long_text(self):
        from src.document_intelligence.parsing import SemanticChunker, ChunkingConfig

        chunker = SemanticChunker(
            ChunkingConfig(
                min_chunk_chars=10,
                max_chunk_chars=200,
                target_chunk_chars=100,
            )
        )

        # 50 short sentences — far more than one chunk's worth.
        long_text = " ".join(f"This is sentence number {i}." for i in range(50))

        pieces = chunker.chunk_text(long_text)

        # Text must be split, and no chunk may exceed the cap by more
        # than the 10% slack the chunker is allowed.
        assert len(pieces) > 1
        for piece in pieces:
            assert len(piece["text"]) <= 200 * 1.1
| |
|
| |
|
class TestValidation:
    """Tests for ExtractionValidator: clean results pass, missing
    required fields fail."""

    def test_validate_extraction(self):
        # Fix: dropped the unused `FieldExtraction` import that previously
        # accompanied `ExtractionResult`.
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            ExtractionValidator,
        )
        from src.document_intelligence.chunks import ExtractionResult

        schema = ExtractionSchema(name="Test")
        schema.add_string_field("name", required=True)
        schema.add_number_field("value", required=False, is_integer=True)

        # A result satisfying every required field with correctly typed values.
        result = ExtractionResult(
            data={"name": "Test Name", "value": 42},
            fields=[],
            evidence=[],
            overall_confidence=0.8,
            abstained_fields=[],
        )

        validator = ExtractionValidator()
        validation = validator.validate(result, schema)

        assert validation.is_valid is True
        assert validation.error_count == 0

    def test_validate_missing_required(self):
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            ExtractionValidator,
        )
        from src.document_intelligence.chunks import ExtractionResult

        schema = ExtractionSchema(name="Test")
        schema.add_string_field("name", required=True)
        schema.add_string_field("description", required=True)

        # "description" is required but absent (the extractor abstained),
        # so validation must report at least one error.
        result = ExtractionResult(
            data={"name": "Test"},
            fields=[],
            evidence=[],
            overall_confidence=0.5,
            abstained_fields=["description"],
        )

        validator = ExtractionValidator()
        validation = validator.validate(result, schema)

        assert validation.is_valid is False
        assert validation.error_count >= 1
| |
|
| |
|
# Allow running this module directly (python <file>) instead of via pytest.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
| |
|