"""
Unit Tests for Table-Aware Chunker (FG-002)

Tests the enhanced table extraction and structure preservation functionality.
"""

import pytest
import sys
from pathlib import Path
from typing import List

# Add project root to path (must run before the ``src`` imports below).
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from src.document.schemas.core import (
    BoundingBox,
    OCRRegion,
    LayoutRegion,
    LayoutType,
    ChunkType,
)
from src.document.chunking.chunker import (
    SemanticChunker,
    ChunkerConfig,
)


# ==============================================================================
# Helpers
# ==============================================================================

def _ocr_region(
    text: str,
    confidence: float,
    x_min: float,
    y_min: float,
    x_max: float,
    y_max: float,
    page: int = 0,
) -> OCRRegion:
    """Build an OCRRegion whose bounding box is given by its four edges.

    Args:
        text: Recognised text of the region.
        confidence: OCR confidence passed through to the region.
        x_min: Left edge of the bounding box.
        y_min: Top edge of the bounding box.
        x_max: Right edge of the bounding box.
        y_max: Bottom edge of the bounding box.
        page: Page index of the region; every fixture here uses page 0.

    Returns:
        The constructed OCRRegion.
    """
    return OCRRegion(
        text=text,
        confidence=confidence,
        bbox=BoundingBox(x_min=x_min, y_min=y_min, x_max=x_max, y_max=y_max),
        page=page,
    )


# ==============================================================================
# Fixtures
# ==============================================================================

@pytest.fixture
def chunker():
    """Create a SemanticChunker with table structure and header detection on."""
    config = ChunkerConfig(
        preserve_table_structure=True,
        table_row_threshold=10.0,
        table_col_threshold=20.0,
        detect_table_headers=True,
    )
    return SemanticChunker(config)


@pytest.fixture
def simple_table_regions() -> List[OCRRegion]:
    """Create OCR regions representing a simple 3x3 table."""
    # Simple table:
    # | Name  | Age | City     |
    # | Alice | 25  | New York |
    # | Bob   | 30  | London   |
    regions = [
        # Header row (y=100)
        _ocr_region("Name", 0.95, 50, 100, 100, 120),
        _ocr_region("Age", 0.95, 150, 100, 200, 120),
        _ocr_region("City", 0.95, 250, 100, 300, 120),
        # Data row 1 (y=130)
        _ocr_region("Alice", 0.92, 50, 130, 100, 150),
        _ocr_region("25", 0.98, 150, 130, 200, 150),
        _ocr_region("New York", 0.90, 250, 130, 320, 150),
        # Data row 2 (y=160)
        _ocr_region("Bob", 0.94, 50, 160, 100, 180),
        _ocr_region("30", 0.97, 150, 160, 200, 180),
        _ocr_region("London", 0.93, 250, 160, 310, 180),
    ]
    return regions


@pytest.fixture
def numeric_table_regions() -> List[OCRRegion]:
    """Create OCR regions for a numeric data table (1 header + 3 data rows)."""
    # Table:
    # | Year | Revenue | Growth |
    # | 2021 | $1.5M   | 15%    |
    # | 2022 | $2.0M   | 33%    |
    # | 2023 | $2.8M   | 40%    |
    regions = [
        # Header row
        _ocr_region("Year", 0.95, 50, 100, 100, 120),
        _ocr_region("Revenue", 0.95, 150, 100, 220, 120),
        _ocr_region("Growth", 0.95, 270, 100, 330, 120),
        # Data rows
        _ocr_region("2021", 0.98, 50, 130, 100, 150),
        _ocr_region("$1.5M", 0.92, 150, 130, 220, 150),
        _ocr_region("15%", 0.94, 270, 130, 330, 150),
        _ocr_region("2022", 0.98, 50, 160, 100, 180),
        _ocr_region("$2.0M", 0.93, 150, 160, 220, 180),
        _ocr_region("33%", 0.95, 270, 160, 330, 180),
        _ocr_region("2023", 0.98, 50, 190, 100, 210),
        _ocr_region("$2.8M", 0.91, 150, 190, 220, 210),
        _ocr_region("40%", 0.96, 270, 190, 330, 210),
    ]
    return regions


@pytest.fixture
def table_layout_region() -> LayoutRegion:
    """Create a TABLE layout region whose bbox encloses the table fixtures."""
    return LayoutRegion(
        id="table_001",
        type=LayoutType.TABLE,
        confidence=0.95,
        bbox=BoundingBox(x_min=40, y_min=90, x_max=350, y_max=220),
        page=0,
    )


# ==============================================================================
# Table Structure Reconstruction Tests
# ==============================================================================

class TestTableStructureReconstruction:
    """Test table structure reconstruction from OCR regions."""

    def test_reconstruct_simple_table(self, chunker, simple_table_regions):
        """Test reconstructing a simple table structure."""
        result = chunker._reconstruct_table_structure(simple_table_regions)

        assert result["row_count"] == 3
        assert result["col_count"] == 3
        assert result["has_header"]
        assert result["headers"] == ["Name", "Age", "City"]

    def test_detect_rows_correctly(self, chunker, simple_table_regions):
        """Test that rows are detected based on y-coordinate proximity."""
        result = chunker._reconstruct_table_structure(simple_table_regions)

        cells = result["cells"]
        assert len(cells) == 3  # 3 rows

        # First row is header
        assert cells[0] == ["Name", "Age", "City"]
        # Data rows
        assert cells[1] == ["Alice", "25", "New York"]
        assert cells[2] == ["Bob", "30", "London"]

    def test_detect_columns_correctly(self, chunker, simple_table_regions):
        """Test that columns are detected based on x-coordinate clustering."""
        result = chunker._reconstruct_table_structure(simple_table_regions)

        # All rows should have 3 columns
        for row in result["cells"]:
            assert len(row) == 3

    def test_header_detection_numeric_data(self, chunker, numeric_table_regions):
        """Test header detection when data rows are numeric."""
        result = chunker._reconstruct_table_structure(numeric_table_regions)

        assert result["has_header"]
        assert result["headers"] == ["Year", "Revenue", "Growth"]

    def test_empty_table(self, chunker):
        """Test handling of empty table (no OCR regions)."""
        result = chunker._reconstruct_table_structure([])

        assert result["row_count"] == 0
        assert result["col_count"] == 0
        assert result["cells"] == []
        assert not result["has_header"]


# ==============================================================================
# Markdown Generation Tests
# ==============================================================================

class TestMarkdownGeneration:
    """Test markdown table generation."""

    def test_generate_markdown_with_headers(self, chunker, simple_table_regions):
        """Test markdown generation with detected headers."""
        table_data = chunker._reconstruct_table_structure(simple_table_regions)

        # NOTE(review): every other test in this file reads the grid from
        # "cells"; confirm that the reconstruction result also exposes "rows".
        markdown = chunker._table_to_markdown(
            table_data["rows"],
            table_data["headers"],
            table_data["has_header"],
        )

        assert "| Name | Age | City |" in markdown
        assert "| --- | --- | --- |" in markdown
        assert "| Alice | 25 | New York |" in markdown
        assert "| Bob | 30 | London |" in markdown

    def test_generate_markdown_without_headers(self, chunker):
        """Test markdown generation without headers (generic Col1, Col2...)."""
        rows = [
            ["A", "B", "C"],
            ["1", "2", "3"],
        ]

        markdown = chunker._table_to_markdown(rows, [], False)

        assert "| Col1 | Col2 | Col3 |" in markdown
        assert "| A | B | C |" in markdown
        assert "| 1 | 2 | 3 |" in markdown

    def test_escape_pipe_characters(self, chunker):
        """Test that pipe characters in cell content are escaped."""
        rows = [
            ["Header1", "Header2"],
            ["Value|With|Pipes", "Normal"],
        ]

        markdown = chunker._table_to_markdown(rows, ["Header1", "Header2"], True)

        assert "Value\\|With\\|Pipes" in markdown

    def test_empty_table_returns_placeholder(self, chunker):
        """Test that empty table returns placeholder text."""
        markdown = chunker._table_to_markdown([], [], False)

        assert markdown == "[Empty Table]"


# ==============================================================================
# Table Chunk Creation Tests
# ==============================================================================

class TestTableChunkCreation:
    """Test complete table chunk creation."""

    def test_create_table_chunk_with_structure(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """Test creating a table chunk with preserved structure."""
        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
        )

        # Basic chunk properties
        assert chunk.chunk_type == ChunkType.TABLE
        assert chunk.document_id == "test_doc"
        assert chunk.page == 0

        # Text should be markdown
        assert "| Name | Age | City |" in chunk.text
        assert "| --- |" in chunk.text

        # Extra should contain structured data
        assert "table_structure" in chunk.extra
        table_struct = chunk.extra["table_structure"]

        assert table_struct["row_count"] == 3
        assert table_struct["col_count"] == 3
        assert table_struct["has_header"]
        assert table_struct["headers"] == ["Name", "Age", "City"]
        assert table_struct["cells"] is not None

    def test_create_table_chunk_with_cell_positions(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """Test that cell positions are preserved for highlighting."""
        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        cell_positions = chunk.extra["table_structure"]["cell_positions"]

        # Should have positions for all cells
        assert len(cell_positions) == 3  # 3 rows
        for row_positions in cell_positions:
            assert len(row_positions) == 3  # 3 cols per row
            for cell in row_positions:
                assert "text" in cell
                assert "bbox" in cell
                assert "confidence" in cell

    def test_create_table_chunk_searchable_text(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """Test that searchable text includes header context."""
        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        searchable = chunk.extra["searchable_text"]

        # Headers should be labeled
        assert "Headers:" in searchable
        # Data should have header context. The bare-value fallback means these
        # assertions also pass when the value appears without its header label.
        assert "Name: Alice" in searchable or "Alice" in searchable
        assert "Age: 25" in searchable or "25" in searchable

    def test_create_empty_table_chunk(self, chunker, table_layout_region):
        """Test creating chunk for empty table."""
        chunk = chunker._create_table_chunk(
            [],
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        assert chunk.text == "[Empty Table]"
        assert chunk.confidence == 0.0


# ==============================================================================
# Configuration Tests
# ==============================================================================

class TestChunkerConfiguration:
    """Test chunker configuration options."""

    def test_disable_table_structure_preservation(
        self, simple_table_regions, table_layout_region
    ):
        """Test disabling table structure preservation."""
        config = ChunkerConfig(preserve_table_structure=False)
        chunker = SemanticChunker(config)

        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        # Should use simple pipe-separated format
        assert "|" in chunk.text
        assert "| --- |" not in chunk.text  # No markdown separator

    def test_disable_header_detection(
        self, simple_table_regions, table_layout_region
    ):
        """Test disabling header detection."""
        config = ChunkerConfig(
            preserve_table_structure=True,
            detect_table_headers=False,
        )
        chunker = SemanticChunker(config)

        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        # Should use generic headers
        table_struct = chunk.extra["table_structure"]
        assert not table_struct["has_header"]
        assert table_struct["headers"] == []

    def test_custom_row_threshold(self):
        """Test custom row grouping threshold."""
        # With small threshold, rows might be split incorrectly
        config = ChunkerConfig(table_row_threshold=5.0)
        chunker = SemanticChunker(config)

        # Create regions with y-positions slightly apart
        regions = [
            _ocr_region("A", 0.9, 50, 100, 100, 120),
            _ocr_region("B", 0.9, 50, 108, 100, 128),
        ]

        result = chunker._reconstruct_table_structure(regions)

        # With threshold of 5, these should be separate rows (8 > 5)
        assert result["row_count"] == 2


# ==============================================================================
# Numeric Detection Tests
# ==============================================================================

class TestNumericDetection:
    """Test numeric value detection for header identification."""

    def test_detect_pure_number(self, chunker):
        """Test detection of pure numbers."""
        assert chunker._is_numeric("123")
        assert chunker._is_numeric("0")
        assert chunker._is_numeric("999999")

    def test_detect_currency(self, chunker):
        """Test detection of currency values."""
        assert chunker._is_numeric("$1,234.56")
        assert chunker._is_numeric("€100")
        assert chunker._is_numeric("£50.00")

    def test_detect_percentage(self, chunker):
        """Test detection of percentage values."""
        assert chunker._is_numeric("15%")
        assert chunker._is_numeric("100.5%")

    def test_detect_negative_numbers(self, chunker):
        """Test detection of negative numbers."""
        assert chunker._is_numeric("-123")
        assert chunker._is_numeric("(-50)")

    def test_non_numeric_text(self, chunker):
        """Test that text is not detected as numeric."""
        assert not chunker._is_numeric("Name")
        assert not chunker._is_numeric("Alice")
        assert not chunker._is_numeric("Revenue Growth")

    def test_mixed_content(self, chunker):
        """Test mixed alphanumeric content."""
        assert not chunker._is_numeric("Q1 2023")
        assert not chunker._is_numeric("Rev: $100")


# ==============================================================================
# Integration with Full Chunking Pipeline
# ==============================================================================

class TestFullChunkingPipeline:
    """Test table handling in full chunking pipeline."""

    def test_chunk_document_with_table(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """Test chunking a document that contains a table."""
        layout_regions = [table_layout_region]

        chunks = chunker.create_chunks(
            ocr_regions=simple_table_regions,
            layout_regions=layout_regions,
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
        )

        assert len(chunks) == 1
        assert chunks[0].chunk_type == ChunkType.TABLE
        assert "| Name | Age | City |" in chunks[0].text

    def test_chunk_document_mixed_content(self, chunker):
        """Test chunking document with tables and text."""
        # Create mixed content: text + table
        text_regions = [
            _ocr_region("Introduction", 0.95, 50, 50, 200, 70),
            _ocr_region("This document contains data.", 0.92, 50, 80, 300, 100),
        ]

        table_regions = [
            _ocr_region("Col1", 0.95, 50, 150, 100, 170),
            _ocr_region("Col2", 0.95, 150, 150, 200, 170),
            _ocr_region("A", 0.95, 50, 180, 100, 200),
            _ocr_region("B", 0.95, 150, 180, 200, 200),
        ]

        all_regions = text_regions + table_regions

        layout_regions = [
            LayoutRegion(
                id="text_001",
                type=LayoutType.PARAGRAPH,
                confidence=0.9,
                bbox=BoundingBox(x_min=40, y_min=40, x_max=350, y_max=110),
                page=0,
            ),
            LayoutRegion(
                id="table_001",
                type=LayoutType.TABLE,
                confidence=0.95,
                bbox=BoundingBox(x_min=40, y_min=140, x_max=250, y_max=210),
                page=0,
            ),
        ]

        chunks = chunker.create_chunks(
            ocr_regions=all_regions,
            layout_regions=layout_regions,
            document_id="test_doc",
            source_path=None,
        )

        # Should have 2 chunks: text and table
        assert len(chunks) == 2

        chunk_types = [c.chunk_type for c in chunks]
        assert ChunkType.PARAGRAPH in chunk_types
        assert ChunkType.TABLE in chunk_types


# ==============================================================================
# Main Entry Point
# ==============================================================================

if __name__ == "__main__":
    # pytest.main() returns the exit code instead of exiting; hand it to
    # sys.exit so a failing run is visible to the calling shell / CI.
    sys.exit(pytest.main([__file__, "-v", "--tb=short"]))