"""
Tests for OCR fallback — issue #282.

All external I/O (fitz, pytesseract, easyocr, Pillow) is mocked so the
suite runs without Tesseract or any GPU dependency installed.
"""
import types
from unittest.mock import MagicMock, patch, PropertyMock

import pytest


# ── helpers ──────────────────────────────────────────────────────────────────

def _make_fitz_page(text: str = "") -> MagicMock:
    """Return a mock fitz.Page whose get_text() returns *text*."""
    page = MagicMock()
    page.get_text.return_value = text
    pix = MagicMock()
    pix.tobytes.return_value = b"PNG_BYTES"
    page.get_pixmap.return_value = pix
    return page


def _make_fitz_doc(pages_text: list[str]) -> tuple[MagicMock, list[MagicMock]]:
    """Return ``(doc, pages)``: a mock fitz document plus its mock pages.

    One mock page is created per entry in *pages_text*. The document mock
    supports iteration, ``len()`` and use as a context manager (entering it
    yields the document itself; exiting does not suppress exceptions).
    """
    pages = [_make_fitz_page(t) for t in pages_text]
    doc = MagicMock()
    # Hand MagicMock an *iterable*, not an iterator: a pre-built iterator is
    # consumed by the first loop, so any later iteration over the document
    # would silently see zero pages.
    doc.__iter__.return_value = pages
    doc.__len__.return_value = len(pages)
    doc.__enter__.return_value = doc
    doc.__exit__.return_value = False
    return doc, pages


# ── _page_is_image_only ───────────────────────────────────────────────────────

class TestPageIsImageOnly:
    """Checks ``_page_is_image_only`` for empty, sparse, rich and boundary-length text."""

    def test_empty_page_is_image_only(self):
        """A page with no native text is reported as image-only."""
        import app.rag.ocr as ocr

        assert ocr._page_is_image_only(_make_fitz_page("")) is True

    def test_sparse_page_is_image_only(self):
        """A two-character page is reported as image-only."""
        import app.rag.ocr as ocr

        assert ocr._page_is_image_only(_make_fitz_page("hi")) is True

    def test_page_with_enough_text_is_not_image_only(self):
        """Fifty characters of native text is not image-only."""
        import app.rag.ocr as ocr

        fifty_chars = "A" * 50
        assert ocr._page_is_image_only(_make_fitz_page(fifty_chars)) is False

    def test_boundary_exactly_at_min_chars(self):
        """Text of exactly ``MIN_TEXT_CHARS`` characters is not image-only."""
        import app.rag.ocr as ocr

        at_threshold = "A" * ocr.MIN_TEXT_CHARS
        assert ocr._page_is_image_only(_make_fitz_page(at_threshold)) is False

    def test_one_below_boundary_is_image_only(self):
        """Text one character short of ``MIN_TEXT_CHARS`` is image-only."""
        import app.rag.ocr as ocr

        just_under = "A" * (ocr.MIN_TEXT_CHARS - 1)
        assert ocr._page_is_image_only(_make_fitz_page(just_under)) is True


# ── _render_page_to_image ─────────────────────────────────────────────────────

class TestRenderPageToImage:
    """Checks that ``_render_page_to_image`` returns the pixmap's bytes."""

    def test_returns_png_bytes(self):
        """Rendering requests exactly one pixmap and returns its bytes."""
        import app.rag.ocr as ocr

        fake_page = _make_fitz_page()
        rendered = ocr._render_page_to_image(fake_page, dpi=72)

        fake_page.get_pixmap.assert_called_once()
        assert rendered == b"PNG_BYTES"


# ── _ocr_with_tesseract ───────────────────────────────────────────────────────

class TestOcrWithTesseract:
    """Checks ``_ocr_with_tesseract`` with pytesseract and PIL stubbed in ``sys.modules``."""

    def test_returns_extracted_text(self):
        """The OCR string is returned with surrounding whitespace removed."""
        from app.rag.ocr import _ocr_with_tesseract

        mock_pytesseract = types.ModuleType("pytesseract")
        # Padded on purpose: the final assertion expects the stripped form.
        mock_pytesseract.image_to_string = MagicMock(return_value="  Hello OCR  ")

        mock_pil_image = MagicMock()
        mock_pil_module = types.ModuleType("PIL")
        mock_pil_module.Image = MagicMock()
        mock_pil_module.Image.open = MagicMock(return_value=mock_pil_image)

        # patch.dict restores sys.modules when the block exits, so the stub
        # modules do not leak into other tests.
        with patch.dict(
            "sys.modules",
            {
                "pytesseract": mock_pytesseract,
                "PIL": mock_pil_module,
                "PIL.Image": mock_pil_module.Image,
            },
        ):
            result = _ocr_with_tesseract(b"PNG_BYTES")

        assert result == "Hello OCR"

    def test_raises_import_error_when_tesseract_missing(self):
        """An ImportError mentioning pytesseract surfaces when imports fail."""
        from app.rag.ocr import _ocr_with_tesseract

        # A None entry in sys.modules makes importing that name fail.
        with patch.dict("sys.modules", {"pytesseract": None, "PIL": None, "PIL.Image": None}):
            with pytest.raises(ImportError, match="pytesseract"):
                _ocr_with_tesseract(b"PNG_BYTES")


# ── ocr_page ─────────────────────────────────────────────────────────────────

class TestOcrPage:
    """Checks that ``ocr_page`` dispatches according to ``OCR_BACKEND``."""

    def test_uses_tesseract_by_default(self, monkeypatch):
        """With the backend set to "tesseract", the tesseract helper's text is returned."""
        import app.rag.ocr as ocr_module

        def fake_render(page, dpi):
            return b"PNG"

        def fake_tesseract(b):
            return "tesseract text"

        monkeypatch.setattr(ocr_module, "OCR_BACKEND", "tesseract")
        monkeypatch.setattr(ocr_module, "_render_page_to_image", fake_render)
        monkeypatch.setattr(ocr_module, "_ocr_with_tesseract", fake_tesseract)

        assert ocr_module.ocr_page(_make_fitz_page()) == "tesseract text"

    def test_uses_easyocr_when_configured(self, monkeypatch):
        """With the backend set to "easyocr", the easyocr helper's text is returned."""
        import app.rag.ocr as ocr_module

        def fake_render(page, dpi):
            return b"PNG"

        def fake_easyocr(b):
            return "easyocr text"

        monkeypatch.setattr(ocr_module, "OCR_BACKEND", "easyocr")
        monkeypatch.setattr(ocr_module, "_render_page_to_image", fake_render)
        monkeypatch.setattr(ocr_module, "_ocr_with_easyocr", fake_easyocr)

        assert ocr_module.ocr_page(_make_fitz_page()) == "easyocr text"


# ── extract_pdf_with_ocr ─────────────────────────────────────────────────────

class TestExtractPdfWithOcr:
    """Checks ``extract_pdf_with_ocr`` with ``fitz.open`` and ``ocr_page`` replaced by stubs."""

    def test_native_text_pages_skip_ocr(self, monkeypatch, tmp_path):
        """A page with plenty of native text is returned as-is, without OCR."""
        import app.rag.ocr as ocr_module

        rich_text = "A" * 100
        doc, _ = _make_fitz_doc([rich_text])

        monkeypatch.setattr("fitz.open", lambda path: doc)
        ocr_called = []
        # `append` returns None, so `or ""` makes the stub record the call
        # and still return an empty string.
        monkeypatch.setattr(ocr_module, "ocr_page", lambda page, dpi=200: ocr_called.append(1) or "")

        result = ocr_module.extract_pdf_with_ocr(str(tmp_path / "test.pdf"))

        assert len(result) == 1
        assert result[0]["ocr"] is False
        assert result[0]["text"] == rich_text.strip()
        assert ocr_called == []

    def test_image_only_pages_trigger_ocr(self, monkeypatch, tmp_path):
        """An empty page is OCR'd and reported with ``ocr`` set and page number 1."""
        import app.rag.ocr as ocr_module

        doc, _ = _make_fitz_doc([""])  # empty page
        monkeypatch.setattr("fitz.open", lambda path: doc)
        monkeypatch.setattr(ocr_module, "ocr_page", lambda page, dpi=200: "Scanned text here")

        result = ocr_module.extract_pdf_with_ocr(str(tmp_path / "scan.pdf"))

        assert len(result) == 1
        assert result[0]["ocr"] is True
        assert result[0]["text"] == "Scanned text here"
        assert result[0]["page"] == 1

    def test_mixed_pages_handled_correctly(self, monkeypatch, tmp_path):
        """Native and image-only pages keep their order and 1-based page numbers."""
        import app.rag.ocr as ocr_module

        rich = "B" * 100
        doc, _ = _make_fitz_doc([rich, ""])
        monkeypatch.setattr("fitz.open", lambda path: doc)
        monkeypatch.setattr(ocr_module, "ocr_page", lambda page, dpi=200: "OCR result")

        result = ocr_module.extract_pdf_with_ocr(str(tmp_path / "mixed.pdf"))

        assert len(result) == 2
        assert result[0]["ocr"] is False
        assert result[0]["page"] == 1
        assert result[1]["ocr"] is True
        assert result[1]["page"] == 2

    def test_ocr_returning_empty_skips_page(self, monkeypatch, tmp_path):
        """A page whose OCR output is empty contributes nothing to the result."""
        import app.rag.ocr as ocr_module

        doc, _ = _make_fitz_doc([""])
        monkeypatch.setattr("fitz.open", lambda path: doc)
        ocr_stub = MagicMock(return_value="")
        monkeypatch.setattr(ocr_module, "ocr_page", ocr_stub)

        result = ocr_module.extract_pdf_with_ocr(str(tmp_path / "blank.pdf"))

        assert result == []
        # Guard against a vacuous pass: an empty result must come from the
        # OCR path actually running, not from the page never being visited.
        ocr_stub.assert_called()

    def test_ocr_import_error_skips_page_gracefully(self, monkeypatch, tmp_path):
        """An ImportError from OCR is swallowed and the page is dropped."""
        import app.rag.ocr as ocr_module

        doc, _ = _make_fitz_doc([""])
        monkeypatch.setattr("fitz.open", lambda path: doc)
        ocr_stub = MagicMock(side_effect=ImportError("no tesseract"))
        monkeypatch.setattr(ocr_module, "ocr_page", ocr_stub)

        result = ocr_module.extract_pdf_with_ocr(str(tmp_path / "fail.pdf"))

        assert result == []
        # Guard against a vacuous pass: the failing stub must have been hit.
        ocr_stub.assert_called()

    def test_ocr_exception_skips_page_gracefully(self, monkeypatch, tmp_path):
        """A RuntimeError from OCR is swallowed and the page is dropped."""
        import app.rag.ocr as ocr_module

        doc, _ = _make_fitz_doc([""])
        monkeypatch.setattr("fitz.open", lambda path: doc)
        ocr_stub = MagicMock(side_effect=RuntimeError("segfault"))
        monkeypatch.setattr(ocr_module, "ocr_page", ocr_stub)

        result = ocr_module.extract_pdf_with_ocr(str(tmp_path / "crash.pdf"))

        assert result == []
        # Guard against a vacuous pass: the failing stub must have been hit.
        ocr_stub.assert_called()


# ── extract_pdf fallback chain (chunker integration) ─────────────────────────

class TestExtractPdfOcrFallback:
    """Checks where OCR sits in ``chunker.extract_pdf``'s chain of extractors."""

    def test_ocr_called_when_all_extractors_return_empty(self, monkeypatch):
        """OCR output is returned when the other extractors fail or yield nothing."""
        import app.rag.chunker as chunker_module
        import app.rag.ocr as ocr_module

        ocr_chunk = {"text": "OCR text", "page": 1, "chunk_type": "text", "ocr": True}
        replacements = (
            # Two extractors blow up, the third comes back empty-handed.
            (chunker_module, "extract_pdf_with_unstructured", MagicMock(side_effect=Exception("unavailable"))),
            (chunker_module, "extract_pdf_with_tables", MagicMock(side_effect=Exception("unavailable"))),
            (chunker_module, "extract_pdf_with_pymupdf", MagicMock(return_value=[])),
            (ocr_module, "extract_pdf_with_ocr", MagicMock(return_value=[ocr_chunk])),
        )
        for target, attr_name, stub in replacements:
            monkeypatch.setattr(target, attr_name, stub)

        chunks = chunker_module.extract_pdf("dummy.pdf")

        assert len(chunks) == 1
        only_chunk = chunks[0]
        assert only_chunk["ocr"] is True
        assert only_chunk["text"] == "OCR text"

    def test_ocr_not_called_when_extractor_succeeds(self, monkeypatch):
        """OCR is left untouched when the first extractor already produced text."""
        import app.rag.chunker as chunker_module
        import app.rag.ocr as ocr_module

        native_chunk = {"text": "Native text", "page": 1, "chunk_type": "text"}
        unstructured_stub = MagicMock(return_value=[native_chunk])
        ocr_spy = MagicMock(return_value=[])
        monkeypatch.setattr(chunker_module, "extract_pdf_with_unstructured", unstructured_stub)
        monkeypatch.setattr(ocr_module, "extract_pdf_with_ocr", ocr_spy)

        chunks = chunker_module.extract_pdf("dummy.pdf")

        ocr_spy.assert_not_called()
        assert chunks[0]["text"] == "Native text"