|
|
""" |
|
|
Unit Tests for Table-Aware Chunker (FG-002) |
|
|
|
|
|
Tests the enhanced table extraction and structure preservation functionality. |
|
|
""" |
|
|
|
|
|
import pytest |
|
|
import sys |
|
|
from pathlib import Path |
|
|
from typing import List |
|
|
|
|
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent)) |
|
|
|
|
|
from src.document.schemas.core import ( |
|
|
BoundingBox, |
|
|
OCRRegion, |
|
|
LayoutRegion, |
|
|
LayoutType, |
|
|
ChunkType, |
|
|
) |
|
|
from src.document.chunking.chunker import ( |
|
|
SemanticChunker, |
|
|
ChunkerConfig, |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def chunker():
    """Provide a SemanticChunker configured for table-aware chunking.

    Table structure preservation and header detection are both switched on,
    with a row threshold of 10.0 and a column threshold of 20.0.
    """
    return SemanticChunker(
        ChunkerConfig(
            preserve_table_structure=True,
            table_row_threshold=10.0,
            table_col_threshold=20.0,
            detect_table_headers=True,
        )
    )
|
|
|
|
|
|
|
|
@pytest.fixture
def simple_table_regions() -> List[OCRRegion]:
    """Build the OCR regions of a 3x3 table (one header row, two data rows).

    Regions are returned row by row, left to right, all on page 0.
    """
    # Each entry: ((y_min, y_max) of the row, [(text, confidence, x_min, x_max), ...]).
    row_specs = [
        # Header row
        ((100, 120), [
            ("Name", 0.95, 50, 100),
            ("Age", 0.95, 150, 200),
            ("City", 0.95, 250, 300),
        ]),
        # First data row
        ((130, 150), [
            ("Alice", 0.92, 50, 100),
            ("25", 0.98, 150, 200),
            ("New York", 0.90, 250, 320),
        ]),
        # Second data row
        ((160, 180), [
            ("Bob", 0.94, 50, 100),
            ("30", 0.97, 150, 200),
            ("London", 0.93, 250, 310),
        ]),
    ]

    built: List[OCRRegion] = []
    for (top, bottom), cell_specs in row_specs:
        for label, score, left, right in cell_specs:
            built.append(
                OCRRegion(
                    text=label,
                    confidence=score,
                    bbox=BoundingBox(x_min=left, y_min=top, x_max=right, y_max=bottom),
                    page=0,
                )
            )
    return built
|
|
|
|
|
|
|
|
@pytest.fixture
def numeric_table_regions() -> List[OCRRegion]:
    """Build the OCR regions of a 4x3 table: a text header row over numeric rows.

    Regions are returned row by row, left to right, all on page 0.
    """
    # Horizontal extent (x_min, x_max) shared by every cell of a column.
    column_spans = [(50, 100), (150, 220), (270, 330)]

    # Each entry: ((y_min, y_max) of the row, [(text, confidence) per column]).
    row_specs = [
        # Header row
        ((100, 120), [("Year", 0.95), ("Revenue", 0.95), ("Growth", 0.95)]),
        # Data rows
        ((130, 150), [("2021", 0.98), ("$1.5M", 0.92), ("15%", 0.94)]),
        ((160, 180), [("2022", 0.98), ("$2.0M", 0.93), ("33%", 0.95)]),
        ((190, 210), [("2023", 0.98), ("$2.8M", 0.91), ("40%", 0.96)]),
    ]

    built: List[OCRRegion] = []
    for (top, bottom), row_cells in row_specs:
        for (left, right), (label, score) in zip(column_spans, row_cells):
            built.append(
                OCRRegion(
                    text=label,
                    confidence=score,
                    bbox=BoundingBox(x_min=left, y_min=top, x_max=right, y_max=bottom),
                    page=0,
                )
            )
    return built
|
|
|
|
|
|
|
|
@pytest.fixture
def table_layout_region() -> LayoutRegion:
    """Provide a TABLE layout region on page 0 spanning (40, 90) to (350, 220)."""
    table_bbox = BoundingBox(x_min=40, y_min=90, x_max=350, y_max=220)
    return LayoutRegion(
        id="table_001",
        type=LayoutType.TABLE,
        confidence=0.95,
        bbox=table_bbox,
        page=0,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestTableStructureReconstruction:
    """Test table structure reconstruction from OCR regions."""

    def test_reconstruct_simple_table(self, chunker, simple_table_regions):
        """The 3x3 fixture is reported as 3 rows x 3 columns with a header row."""
        result = chunker._reconstruct_table_structure(simple_table_regions)

        assert result["row_count"] == 3
        assert result["col_count"] == 3
        # Truthiness assertion rather than `== True` (PEP 8 / E712).
        assert result["has_header"]
        assert result["headers"] == ["Name", "Age", "City"]

    def test_detect_rows_correctly(self, chunker, simple_table_regions):
        """Cells come back grouped into rows, header row first."""
        result = chunker._reconstruct_table_structure(simple_table_regions)

        cells = result["cells"]
        assert len(cells) == 3

        # Header row
        assert cells[0] == ["Name", "Age", "City"]

        # Data rows
        assert cells[1] == ["Alice", "25", "New York"]
        assert cells[2] == ["Bob", "30", "London"]

    def test_detect_columns_correctly(self, chunker, simple_table_regions):
        """Every reconstructed row holds exactly three cells."""
        result = chunker._reconstruct_table_structure(simple_table_regions)

        for row in result["cells"]:
            assert len(row) == 3

    def test_header_detection_numeric_data(self, chunker, numeric_table_regions):
        """A text row above numeric data rows is reported as the header."""
        result = chunker._reconstruct_table_structure(numeric_table_regions)

        assert result["has_header"]
        assert result["headers"] == ["Year", "Revenue", "Growth"]

    def test_empty_table(self, chunker):
        """No OCR regions yields zero counts, no cells and no header."""
        result = chunker._reconstruct_table_structure([])

        assert result["row_count"] == 0
        assert result["col_count"] == 0
        assert result["cells"] == []
        assert not result["has_header"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestMarkdownGeneration:
    """Checks on the markdown text produced for a table."""

    def test_generate_markdown_with_headers(self, chunker, simple_table_regions):
        """Header, separator and both data rows appear in the rendered markdown."""
        structure = chunker._reconstruct_table_structure(simple_table_regions)
        rendered = chunker._table_to_markdown(
            structure["rows"],
            structure["headers"],
            structure["has_header"],
        )

        expected_lines = (
            "| Name | Age | City |",
            "| --- | --- | --- |",
            "| Alice | 25 | New York |",
            "| Bob | 30 | London |",
        )
        for line in expected_lines:
            assert line in rendered

    def test_generate_markdown_without_headers(self, chunker):
        """Without headers, generic Col1..ColN names head the table and all rows are kept."""
        body_rows = [["A", "B", "C"], ["1", "2", "3"]]

        rendered = chunker._table_to_markdown(body_rows, [], False)

        expected_lines = (
            "| Col1 | Col2 | Col3 |",
            "| A | B | C |",
            "| 1 | 2 | 3 |",
        )
        for line in expected_lines:
            assert line in rendered

    def test_escape_pipe_characters(self, chunker):
        """Pipe characters inside a cell come out backslash-escaped."""
        header_names = ["Header1", "Header2"]
        body_rows = [
            ["Header1", "Header2"],
            ["Value|With|Pipes", "Normal"],
        ]

        rendered = chunker._table_to_markdown(body_rows, header_names, True)

        assert "Value\\|With\\|Pipes" in rendered

    def test_empty_table_returns_placeholder(self, chunker):
        """An empty table renders as the literal placeholder text."""
        rendered = chunker._table_to_markdown([], [], False)
        assert rendered == "[Empty Table]"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestTableChunkCreation:
    """Test complete table chunk creation."""

    def test_create_table_chunk_with_structure(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """A table chunk carries markdown text plus structured table metadata."""
        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
        )

        # Basic chunk properties
        assert chunk.chunk_type == ChunkType.TABLE
        assert chunk.document_id == "test_doc"
        assert chunk.page == 0

        # Text is rendered as a markdown table
        assert "| Name | Age | City |" in chunk.text
        assert "| --- |" in chunk.text

        # Structured data is stored under chunk.extra
        assert "table_structure" in chunk.extra
        table_struct = chunk.extra["table_structure"]

        assert table_struct["row_count"] == 3
        assert table_struct["col_count"] == 3
        # Truthiness assertion rather than `== True` (PEP 8 / E712).
        assert table_struct["has_header"]
        assert table_struct["headers"] == ["Name", "Age", "City"]
        assert table_struct["cells"] is not None

    def test_create_table_chunk_with_cell_positions(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """Each cell position entry exposes text, bbox and confidence."""
        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        cell_positions = chunk.extra["table_structure"]["cell_positions"]

        # One list per row, one entry per cell
        assert len(cell_positions) == 3
        for row_positions in cell_positions:
            assert len(row_positions) == 3
            for cell in row_positions:
                assert "text" in cell
                assert "bbox" in cell
                assert "confidence" in cell

    def test_create_table_chunk_searchable_text(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """Searchable text includes a header marker and the data values."""
        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        searchable = chunk.extra["searchable_text"]

        # Header context is present
        assert "Headers:" in searchable

        # Only the presence of each value is pinned. A "Name: Alice" style
        # pairing satisfies these too, since it contains the bare value.
        assert "Alice" in searchable
        assert "25" in searchable

    def test_create_empty_table_chunk(self, chunker, table_layout_region):
        """With no OCR regions the chunk is the placeholder with zero confidence."""
        chunk = chunker._create_table_chunk(
            [],
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        assert chunk.text == "[Empty Table]"
        assert chunk.confidence == 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestChunkerConfiguration:
    """Test chunker configuration options."""

    def test_disable_table_structure_preservation(self, simple_table_regions, table_layout_region):
        """With preservation off, text is pipe-delimited but has no separator row."""
        config = ChunkerConfig(preserve_table_structure=False)
        chunker = SemanticChunker(config)

        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        # Pipes are still expected, but not the markdown header separator.
        assert "|" in chunk.text
        assert "| --- |" not in chunk.text

    def test_disable_header_detection(self, simple_table_regions, table_layout_region):
        """With header detection off, no header is reported for the table."""
        config = ChunkerConfig(
            preserve_table_structure=True,
            detect_table_headers=False,
        )
        chunker = SemanticChunker(config)

        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        table_struct = chunk.extra["table_structure"]
        # Truthiness assertion rather than `== False` (PEP 8 / E712).
        assert not table_struct["has_header"]
        assert table_struct["headers"] == []

    def test_custom_row_threshold(self):
        """A 5.0 row threshold keeps regions offset by 8 units in separate rows."""
        config = ChunkerConfig(table_row_threshold=5.0)
        chunker = SemanticChunker(config)

        # Same column, vertically offset by 8 units (more than the threshold).
        regions = [
            OCRRegion(
                text="A",
                confidence=0.9,
                bbox=BoundingBox(x_min=50, y_min=100, x_max=100, y_max=120),
                page=0,
            ),
            OCRRegion(
                text="B",
                confidence=0.9,
                bbox=BoundingBox(x_min=50, y_min=108, x_max=100, y_max=128),
                page=0,
            ),
        ]

        result = chunker._reconstruct_table_structure(regions)

        assert result["row_count"] == 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestNumericDetection:
    """Test numeric value detection for header identification.

    Assertions use plain truthiness instead of ``== True`` / ``== False``
    comparisons (PEP 8 / E712).
    """

    def test_detect_pure_number(self, chunker):
        """Plain digit strings are numeric."""
        assert chunker._is_numeric("123")
        assert chunker._is_numeric("0")
        assert chunker._is_numeric("999999")

    def test_detect_currency(self, chunker):
        """Currency-prefixed amounts are numeric."""
        assert chunker._is_numeric("$1,234.56")
        assert chunker._is_numeric("€100")
        assert chunker._is_numeric("£50.00")

    def test_detect_percentage(self, chunker):
        """Percentages are numeric."""
        assert chunker._is_numeric("15%")
        assert chunker._is_numeric("100.5%")

    def test_detect_negative_numbers(self, chunker):
        """Negative numbers, including a parenthesised one, are numeric."""
        assert chunker._is_numeric("-123")
        assert chunker._is_numeric("(-50)")

    def test_non_numeric_text(self, chunker):
        """Plain words are not numeric."""
        assert not chunker._is_numeric("Name")
        assert not chunker._is_numeric("Alice")
        assert not chunker._is_numeric("Revenue Growth")

    def test_mixed_content(self, chunker):
        """Text mixing letters and digits is not numeric."""
        assert not chunker._is_numeric("Q1 2023")
        assert not chunker._is_numeric("Rev: $100")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestFullChunkingPipeline:
    """End-to-end checks of table handling through create_chunks."""

    def test_chunk_document_with_table(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """A document holding only a table yields a single TABLE chunk in markdown."""
        produced = chunker.create_chunks(
            ocr_regions=simple_table_regions,
            layout_regions=[table_layout_region],
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
        )

        assert len(produced) == 1
        only_chunk = produced[0]
        assert only_chunk.chunk_type == ChunkType.TABLE
        assert "| Name | Age | City |" in only_chunk.text

    def test_chunk_document_mixed_content(self, chunker):
        """A paragraph followed by a table yields one chunk of each type."""
        # Layout: a paragraph area at the top of the page, a table area below it.
        paragraph_area = LayoutRegion(
            id="text_001",
            type=LayoutType.PARAGRAPH,
            confidence=0.9,
            bbox=BoundingBox(x_min=40, y_min=40, x_max=350, y_max=110),
            page=0,
        )
        table_area = LayoutRegion(
            id="table_001",
            type=LayoutType.TABLE,
            confidence=0.95,
            bbox=BoundingBox(x_min=40, y_min=140, x_max=250, y_max=210),
            page=0,
        )

        # OCR text lying inside the paragraph area.
        paragraph_ocr = [
            OCRRegion(
                text="Introduction",
                confidence=0.95,
                bbox=BoundingBox(x_min=50, y_min=50, x_max=200, y_max=70),
                page=0,
            ),
            OCRRegion(
                text="This document contains data.",
                confidence=0.92,
                bbox=BoundingBox(x_min=50, y_min=80, x_max=300, y_max=100),
                page=0,
            ),
        ]

        # OCR text lying inside the table area (2x2 grid).
        table_ocr = [
            OCRRegion(
                text="Col1",
                confidence=0.95,
                bbox=BoundingBox(x_min=50, y_min=150, x_max=100, y_max=170),
                page=0,
            ),
            OCRRegion(
                text="Col2",
                confidence=0.95,
                bbox=BoundingBox(x_min=150, y_min=150, x_max=200, y_max=170),
                page=0,
            ),
            OCRRegion(
                text="A",
                confidence=0.95,
                bbox=BoundingBox(x_min=50, y_min=180, x_max=100, y_max=200),
                page=0,
            ),
            OCRRegion(
                text="B",
                confidence=0.95,
                bbox=BoundingBox(x_min=150, y_min=180, x_max=200, y_max=200),
                page=0,
            ),
        ]

        produced = chunker.create_chunks(
            ocr_regions=paragraph_ocr + table_ocr,
            layout_regions=[paragraph_area, table_area],
            document_id="test_doc",
            source_path=None,
        )

        # One chunk per layout region
        assert len(produced) == 2

        assert any(item.chunk_type == ChunkType.PARAGRAPH for item in produced)
        assert any(item.chunk_type == ChunkType.TABLE for item in produced)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # pytest.main returns the run's exit code; propagate it so that running
    # this file directly exits non-zero when a test fails.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))
|
|
|