"""
Unit Tests for Table-Aware Chunker (FG-002)

Tests the enhanced table extraction and structure preservation functionality.
"""

import pytest
import sys
from pathlib import Path
from typing import List

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from src.document.schemas.core import (
    BoundingBox,
    OCRRegion,
    LayoutRegion,
    LayoutType,
    ChunkType,
)
from src.document.chunking.chunker import (
    SemanticChunker,
    ChunkerConfig,
)


# ==============================================================================
# Fixtures
# ==============================================================================

@pytest.fixture
def chunker():
    """Provide a SemanticChunker whose table options are set explicitly.

    Table structure preservation and header detection are switched on, with
    a row threshold of 10.0 and a column threshold of 20.0.
    """
    table_options = {
        "preserve_table_structure": True,
        "table_row_threshold": 10.0,
        "table_col_threshold": 20.0,
        "detect_table_headers": True,
    }
    return SemanticChunker(ChunkerConfig(**table_options))


@pytest.fixture
def simple_table_regions() -> List[OCRRegion]:
    """Provide the OCR regions of a 3x3 table (one header row, two data rows).

    Layout being modelled:

        | Name    | Age | City     |
        | Alice   | 25  | New York |
        | Bob     | 30  | London   |

    Regions are listed row by row, left to right, all on page 0.
    """
    # One entry per cell: (text, confidence, x_min, y_min, x_max, y_max).
    cell_specs = [
        # Header row (y=100)
        ("Name", 0.95, 50, 100, 100, 120),
        ("Age", 0.95, 150, 100, 200, 120),
        ("City", 0.95, 250, 100, 300, 120),
        # Data row 1 (y=130)
        ("Alice", 0.92, 50, 130, 100, 150),
        ("25", 0.98, 150, 130, 200, 150),
        ("New York", 0.90, 250, 130, 320, 150),
        # Data row 2 (y=160)
        ("Bob", 0.94, 50, 160, 100, 180),
        ("30", 0.97, 150, 160, 200, 180),
        ("London", 0.93, 250, 160, 310, 180),
    ]
    return [
        OCRRegion(
            text=label,
            confidence=score,
            bbox=BoundingBox(x_min=left, y_min=top, x_max=right, y_max=bottom),
            page=0,
        )
        for label, score, left, top, right, bottom in cell_specs
    ]


@pytest.fixture
def numeric_table_regions() -> List[OCRRegion]:
    """Provide the OCR regions of a table with a text header and numeric data.

    Layout being modelled:

        | Year | Revenue | Growth |
        | 2021 | $1.5M   | 15%    |
        | 2022 | $2.0M   | 33%    |
        | 2023 | $2.8M   | 40%    |

    Regions are listed row by row, left to right, all on page 0.
    """
    # Every row shares the same three horizontal spans: (x_min, x_max).
    column_spans = [(50, 100), (150, 220), (270, 330)]
    # Each box is 20 units tall, so y_max is derived from the row's y_min.
    box_height = 20
    # (y_min, [(text, confidence), ...]) -- header row first, then data rows.
    row_specs = [
        (100, [("Year", 0.95), ("Revenue", 0.95), ("Growth", 0.95)]),
        (130, [("2021", 0.98), ("$1.5M", 0.92), ("15%", 0.94)]),
        (160, [("2022", 0.98), ("$2.0M", 0.93), ("33%", 0.95)]),
        (190, [("2023", 0.98), ("$2.8M", 0.91), ("40%", 0.96)]),
    ]

    regions = []
    for top, cells in row_specs:
        for (left, right), (label, score) in zip(column_spans, cells):
            regions.append(
                OCRRegion(
                    text=label,
                    confidence=score,
                    bbox=BoundingBox(
                        x_min=left,
                        y_min=top,
                        x_max=right,
                        y_max=top + box_height,
                    ),
                    page=0,
                )
            )
    return regions


@pytest.fixture
def table_layout_region() -> LayoutRegion:
    """Provide a TABLE-typed layout region on page 0.

    Its bounding box (40, 90)-(350, 220) encloses every cell of the table
    fixtures defined in this module.
    """
    table_bounds = BoundingBox(x_min=40, y_min=90, x_max=350, y_max=220)
    return LayoutRegion(
        id="table_001",
        type=LayoutType.TABLE,
        confidence=0.95,
        bbox=table_bounds,
        page=0,
    )


# ==============================================================================
# Table Structure Reconstruction Tests
# ==============================================================================

class TestTableStructureReconstruction:
    """Checks on the dict returned by ``_reconstruct_table_structure``."""

    def test_reconstruct_simple_table(self, chunker, simple_table_regions):
        """The 3x3 fixture yields 3 rows, 3 columns and its header row."""
        table = chunker._reconstruct_table_structure(simple_table_regions)

        expected_fields = {
            "row_count": 3,
            "col_count": 3,
            "has_header": True,
            "headers": ["Name", "Age", "City"],
        }
        for key, expected in expected_fields.items():
            assert table[key] == expected

    def test_detect_rows_correctly(self, chunker, simple_table_regions):
        """Cells come back grouped into the three expected rows, in order."""
        grid = chunker._reconstruct_table_structure(simple_table_regions)["cells"]

        # Header row first, then the two data rows.
        expected_rows = [
            ["Name", "Age", "City"],
            ["Alice", "25", "New York"],
            ["Bob", "30", "London"],
        ]

        assert len(grid) == 3  # 3 rows
        for index, expected in enumerate(expected_rows):
            assert grid[index] == expected

    def test_detect_columns_correctly(self, chunker, simple_table_regions):
        """Every reconstructed row contains exactly three columns."""
        table = chunker._reconstruct_table_structure(simple_table_regions)

        assert all(len(row) == 3 for row in table["cells"])

    def test_header_detection_numeric_data(self, chunker, numeric_table_regions):
        """A text row above numeric data rows is reported as the header."""
        table = chunker._reconstruct_table_structure(numeric_table_regions)

        expected_fields = {
            "has_header": True,
            "headers": ["Year", "Revenue", "Growth"],
        }
        for key, expected in expected_fields.items():
            assert table[key] == expected

    def test_empty_table(self, chunker):
        """No OCR regions gives zero counts, no cells and no header."""
        table = chunker._reconstruct_table_structure([])

        expected_fields = {
            "row_count": 0,
            "col_count": 0,
            "cells": [],
            "has_header": False,
        }
        for key, expected in expected_fields.items():
            assert table[key] == expected


# ==============================================================================
# Markdown Generation Tests
# ==============================================================================

class TestMarkdownGeneration:
    """Checks on the markdown text produced by ``_table_to_markdown``."""

    def test_generate_markdown_with_headers(self, chunker, simple_table_regions):
        """Reconstructed table renders with its header, separator and data rows."""
        structure = chunker._reconstruct_table_structure(simple_table_regions)

        # NOTE(review): this reads the "rows" key, whereas the other tests in
        # this module read "cells" -- confirm the structure dict exposes both.
        rendered = chunker._table_to_markdown(
            structure["rows"],
            structure["headers"],
            structure["has_header"],
        )

        expected_lines = (
            "| Name | Age | City |",
            "| --- | --- | --- |",
            "| Alice | 25 | New York |",
            "| Bob | 30 | London |",
        )
        for line in expected_lines:
            assert line in rendered

    def test_generate_markdown_without_headers(self, chunker):
        """Without headers, generic Col1, Col2... names are emitted."""
        body_rows = [
            ["A", "B", "C"],
            ["1", "2", "3"],
        ]

        rendered = chunker._table_to_markdown(body_rows, [], False)

        expected_lines = (
            "| Col1 | Col2 | Col3 |",
            "| A | B | C |",
            "| 1 | 2 | 3 |",
        )
        for line in expected_lines:
            assert line in rendered

    def test_escape_pipe_characters(self, chunker):
        """Pipe characters inside a cell are backslash-escaped in the output."""
        header_names = ["Header1", "Header2"]
        table_rows = [
            ["Header1", "Header2"],
            ["Value|With|Pipes", "Normal"],
        ]

        rendered = chunker._table_to_markdown(table_rows, header_names, True)

        assert "Value\\|With\\|Pipes" in rendered

    def test_empty_table_returns_placeholder(self, chunker):
        """An empty row list renders as the placeholder text."""
        rendered = chunker._table_to_markdown([], [], False)

        assert rendered == "[Empty Table]"


# ==============================================================================
# Table Chunk Creation Tests
# ==============================================================================

class TestTableChunkCreation:
    """Checks on the chunk object returned by ``_create_table_chunk``."""

    def test_create_table_chunk_with_structure(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """Chunk carries markdown text plus structured table data in ``extra``."""
        table_chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
        )

        # Basic chunk properties
        assert table_chunk.chunk_type == ChunkType.TABLE
        assert table_chunk.document_id == "test_doc"
        assert table_chunk.page == 0

        # Text should be markdown
        for fragment in ("| Name | Age | City |", "| --- |"):
            assert fragment in table_chunk.text

        # Extra should contain structured data
        assert "table_structure" in table_chunk.extra
        structure = table_chunk.extra["table_structure"]

        expected_fields = {
            "row_count": 3,
            "col_count": 3,
            "has_header": True,
            "headers": ["Name", "Age", "City"],
        }
        for key, expected in expected_fields.items():
            assert structure[key] == expected
        assert structure["cells"] is not None

    def test_create_table_chunk_with_cell_positions(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """Each cell keeps its text, bbox and confidence for highlighting."""
        table_chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        positions = table_chunk.extra["table_structure"]["cell_positions"]
        required_keys = ("text", "bbox", "confidence")

        # Should have positions for all cells
        assert len(positions) == 3  # 3 rows
        for row in positions:
            assert len(row) == 3  # 3 cols per row
            for cell in row:
                for key in required_keys:
                    assert key in cell

    def test_create_table_chunk_searchable_text(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """Searchable text labels the headers and contains the cell values."""
        table_chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        flattened = table_chunk.extra["searchable_text"]

        # Headers should be labeled
        assert "Headers:" in flattened

        # NOTE(review): in each check below the second alternative already
        # matches whenever the first does, so only the bare value is really
        # verified, not the "Header: value" pairing -- tighten once the
        # intended searchable-text format is confirmed.
        assert "Name: Alice" in flattened or "Alice" in flattened
        assert "Age: 25" in flattened or "25" in flattened

    def test_create_empty_table_chunk(self, chunker, table_layout_region):
        """No OCR regions gives placeholder text and zero confidence."""
        empty_chunk = chunker._create_table_chunk(
            [],
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        assert empty_chunk.text == "[Empty Table]"
        assert empty_chunk.confidence == 0.0


# ==============================================================================
# Configuration Tests
# ==============================================================================

class TestChunkerConfiguration:
    """Checks that ``ChunkerConfig`` options change table handling."""

    def test_disable_table_structure_preservation(self, simple_table_regions, table_layout_region):
        """With preservation off, text has pipes but no markdown separator."""
        plain_chunker = SemanticChunker(ChunkerConfig(preserve_table_structure=False))

        table_chunk = plain_chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        # Should use simple pipe-separated format
        assert "|" in table_chunk.text
        assert "| --- |" not in table_chunk.text  # No markdown separator

    def test_disable_header_detection(self, simple_table_regions, table_layout_region):
        """With header detection off, no header row is reported."""
        headerless_chunker = SemanticChunker(
            ChunkerConfig(
                preserve_table_structure=True,
                detect_table_headers=False,
            )
        )

        table_chunk = headerless_chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        # No detected header: flag is off and the header list is empty
        structure = table_chunk.extra["table_structure"]
        assert structure["has_header"] == False
        assert structure["headers"] == []

    def test_custom_row_threshold(self):
        """A 5.0 row threshold keeps boxes 8 units apart in separate rows."""
        strict_chunker = SemanticChunker(ChunkerConfig(table_row_threshold=5.0))

        # Two boxes in the same column whose tops are 8 units apart.
        stacked_regions = [
            OCRRegion(
                text=label,
                confidence=0.9,
                bbox=BoundingBox(x_min=50, y_min=top, x_max=100, y_max=top + 20),
                page=0,
            )
            for label, top in (("A", 100), ("B", 108))
        ]

        table = strict_chunker._reconstruct_table_structure(stacked_regions)

        # With threshold of 5, these should be separate rows (8 > 5)
        assert table["row_count"] == 2


# ==============================================================================
# Numeric Detection Tests
# ==============================================================================

class TestNumericDetection:
    """Checks on ``_is_numeric``, which the header tests rely on."""

    def test_detect_pure_number(self, chunker):
        """Plain digit strings count as numeric."""
        for value in ("123", "0", "999999"):
            assert chunker._is_numeric(value) == True

    def test_detect_currency(self, chunker):
        """Currency-prefixed amounts count as numeric."""
        for value in ("$1,234.56", "€100", "£50.00"):
            assert chunker._is_numeric(value) == True

    def test_detect_percentage(self, chunker):
        """Percentages count as numeric."""
        for value in ("15%", "100.5%"):
            assert chunker._is_numeric(value) == True

    def test_detect_negative_numbers(self, chunker):
        """Negative numbers, bare or parenthesised, count as numeric."""
        for value in ("-123", "(-50)"):
            assert chunker._is_numeric(value) == True

    def test_non_numeric_text(self, chunker):
        """Ordinary words do not count as numeric."""
        for value in ("Name", "Alice", "Revenue Growth"):
            assert chunker._is_numeric(value) == False

    def test_mixed_content(self, chunker):
        """Strings mixing letters and digits do not count as numeric."""
        for value in ("Q1 2023", "Rev: $100"):
            assert chunker._is_numeric(value) == False


# ==============================================================================
# Integration with Full Chunking Pipeline
# ==============================================================================

class TestFullChunkingPipeline:
    """Checks table handling through the public ``create_chunks`` entry point."""

    def test_chunk_document_with_table(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """A document holding only a table yields a single TABLE chunk."""
        produced = chunker.create_chunks(
            ocr_regions=simple_table_regions,
            layout_regions=[table_layout_region],
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
        )

        assert len(produced) == 1
        only_chunk = produced[0]
        assert only_chunk.chunk_type == ChunkType.TABLE
        assert "| Name | Age | City |" in only_chunk.text

    def test_chunk_document_mixed_content(self, chunker):
        """A paragraph followed by a table yields one chunk of each type."""

        def make_region(label, score, left, top, right, bottom):
            # Build a page-0 OCR region from its text, confidence and box.
            return OCRRegion(
                text=label,
                confidence=score,
                bbox=BoundingBox(x_min=left, y_min=top, x_max=right, y_max=bottom),
                page=0,
            )

        # Create mixed content: text + table
        paragraph_regions = [
            make_region("Introduction", 0.95, 50, 50, 200, 70),
            make_region("This document contains data.", 0.92, 50, 80, 300, 100),
        ]
        grid_regions = [
            make_region("Col1", 0.95, 50, 150, 100, 170),
            make_region("Col2", 0.95, 150, 150, 200, 170),
            make_region("A", 0.95, 50, 180, 100, 200),
            make_region("B", 0.95, 150, 180, 200, 200),
        ]

        paragraph_layout = LayoutRegion(
            id="text_001",
            type=LayoutType.PARAGRAPH,
            confidence=0.9,
            bbox=BoundingBox(x_min=40, y_min=40, x_max=350, y_max=110),
            page=0,
        )
        table_layout = LayoutRegion(
            id="table_001",
            type=LayoutType.TABLE,
            confidence=0.95,
            bbox=BoundingBox(x_min=40, y_min=140, x_max=250, y_max=210),
            page=0,
        )

        produced = chunker.create_chunks(
            ocr_regions=[*paragraph_regions, *grid_regions],
            layout_regions=[paragraph_layout, table_layout],
            document_id="test_doc",
            source_path=None,
        )

        # Should have 2 chunks: text and table
        assert len(produced) == 2

        produced_types = [chunk.chunk_type for chunk in produced]
        for expected_type in (ChunkType.PARAGRAPH, ChunkType.TABLE):
            assert expected_type in produced_types


# ==============================================================================
# Main Entry Point
# ==============================================================================

if __name__ == "__main__":
    # pytest.main() returns the session's exit code rather than exiting on
    # its own; hand it to sys.exit so that running this file directly signals
    # test failures to the calling shell/CI instead of always exiting with 0.
    sys.exit(pytest.main([__file__, "-v", "--tb=short"]))