Spaces:
Running
Running
| """ | |
| Tests for OCR fallback — issue #282. | |
| All external I/O (fitz, pytesseract, easyocr, Pillow) is mocked so the | |
| suite runs without Tesseract or any GPU dependency installed. | |
| """ | |
| import types | |
| from unittest.mock import MagicMock, patch, PropertyMock | |
| import pytest | |
| # ── helpers ────────────────────────────────────────────────────────────────── | |
| def _make_fitz_page(text: str = "") -> MagicMock: | |
| """Return a mock fitz.Page whose get_text() returns *text*.""" | |
| page = MagicMock() | |
| page.get_text.return_value = text | |
| pix = MagicMock() | |
| pix.tobytes.return_value = b"PNG_BYTES" | |
| page.get_pixmap.return_value = pix | |
| return page | |
def _make_fitz_doc(pages_text: list[str]) -> tuple[MagicMock, list[MagicMock]]:
    """Return a mock fitz document plus the mock pages it iterates over.

    Args:
        pages_text: One entry per page; each string becomes the value
            returned by that page's ``get_text()``.

    Returns:
        A ``(doc, pages)`` tuple. ``doc`` supports iteration, ``len()`` and
        use as a context manager (``__enter__`` returns ``doc`` itself and
        ``__exit__`` returns ``False``, so exceptions are not suppressed).
    """
    pages = [_make_fitz_page(t) for t in pages_text]
    doc = MagicMock()
    # Use side_effect so every ``iter(doc)`` gets a fresh iterator. A single
    # ``return_value=iter(pages)`` would be exhausted after the first loop and
    # any later iteration of the document would silently see zero pages.
    doc.__iter__ = MagicMock(side_effect=lambda: iter(pages))
    doc.__len__ = MagicMock(return_value=len(pages))
    doc.__enter__ = MagicMock(return_value=doc)
    doc.__exit__ = MagicMock(return_value=False)
    return doc, pages
| # ── _page_is_image_only ────────────────────────────────────────────────────── | |
class TestPageIsImageOnly:
    """Checks for ``_page_is_image_only`` around the ``MIN_TEXT_CHARS`` cut-off."""

    def test_empty_page_is_image_only(self):
        """A page with no native text is expected to be flagged."""
        from app.rag.ocr import _page_is_image_only

        assert _page_is_image_only(_make_fitz_page("")) is True

    def test_sparse_page_is_image_only(self):
        """A page with only a couple of characters is expected to be flagged."""
        from app.rag.ocr import _page_is_image_only

        assert _page_is_image_only(_make_fitz_page("hi")) is True

    def test_page_with_enough_text_is_not_image_only(self):
        """Fifty characters of native text is expected not to be flagged."""
        from app.rag.ocr import _page_is_image_only

        assert _page_is_image_only(_make_fitz_page("A" * 50)) is False

    def test_boundary_exactly_at_min_chars(self):
        """Exactly ``MIN_TEXT_CHARS`` characters is expected not to be flagged."""
        from app.rag.ocr import MIN_TEXT_CHARS, _page_is_image_only

        at_threshold = _make_fitz_page("A" * MIN_TEXT_CHARS)
        assert _page_is_image_only(at_threshold) is False

    def test_one_below_boundary_is_image_only(self):
        """One character short of ``MIN_TEXT_CHARS`` is expected to be flagged."""
        from app.rag.ocr import MIN_TEXT_CHARS, _page_is_image_only

        below_threshold = _make_fitz_page("A" * (MIN_TEXT_CHARS - 1))
        assert _page_is_image_only(below_threshold) is True
| # ── _render_page_to_image ──────────────────────────────────────────────────── | |
class TestRenderPageToImage:
    """Checks for ``_render_page_to_image`` against a mocked page."""

    def test_returns_png_bytes(self):
        """The bytes from the page's pixmap are returned after one render call."""
        from app.rag.ocr import _render_page_to_image

        fake_page = _make_fitz_page()

        png = _render_page_to_image(fake_page, dpi=72)

        assert png == b"PNG_BYTES"
        fake_page.get_pixmap.assert_called_once()
| # ── _ocr_with_tesseract ────────────────────────────────────────────────────── | |
class TestOcrWithTesseract:
    """Checks for ``_ocr_with_tesseract`` with pytesseract and Pillow faked."""

    def test_returns_extracted_text(self):
        """Text from ``image_to_string`` is expected back without padding."""
        from app.rag.ocr import _ocr_with_tesseract

        # Fake ``pytesseract`` module returning padded text, so the assertion
        # below also covers whitespace stripping.
        mock_pytesseract = types.ModuleType("pytesseract")
        mock_pytesseract.image_to_string = MagicMock(return_value="  Hello OCR  ")

        # Fake ``PIL`` package whose ``Image.open`` hands back a mock image.
        mock_pil_image = MagicMock()
        mock_pil_module = types.ModuleType("PIL")
        mock_pil_module.Image = MagicMock()
        mock_pil_module.Image.open = MagicMock(return_value=mock_pil_image)

        fake_modules = {
            "pytesseract": mock_pytesseract,
            "PIL": mock_pil_module,
            "PIL.Image": mock_pil_module.Image,
        }
        with patch.dict("sys.modules", fake_modules):
            result = _ocr_with_tesseract(b"PNG_BYTES")

        assert result == "Hello OCR"

    def test_raises_import_error_when_tesseract_missing(self):
        """An ``ImportError`` mentioning pytesseract is expected when imports fail."""
        from app.rag.ocr import _ocr_with_tesseract

        # A ``None`` entry in ``sys.modules`` makes importing that name raise
        # ImportError, simulating the packages not being installed.
        missing = {"pytesseract": None, "PIL": None, "PIL.Image": None}
        with patch.dict("sys.modules", missing):
            with pytest.raises(ImportError, match="pytesseract"):
                _ocr_with_tesseract(b"PNG_BYTES")
| # ── ocr_page ───────────────────────────────────────────────────────────────── | |
class TestOcrPage:
    """Checks that ``ocr_page`` dispatches according to ``OCR_BACKEND``."""

    def test_uses_tesseract_by_default(self, monkeypatch):
        """With the backend set to tesseract, the tesseract helper's text is returned."""
        import app.rag.ocr as ocr_module

        stubs = (
            ("OCR_BACKEND", "tesseract"),
            ("_render_page_to_image", lambda page, dpi: b"PNG"),
            ("_ocr_with_tesseract", lambda b: "tesseract text"),
        )
        for attr_name, replacement in stubs:
            monkeypatch.setattr(ocr_module, attr_name, replacement)

        text = ocr_module.ocr_page(_make_fitz_page())

        assert text == "tesseract text"

    def test_uses_easyocr_when_configured(self, monkeypatch):
        """With the backend set to easyocr, the easyocr helper's text is returned."""
        import app.rag.ocr as ocr_module

        stubs = (
            ("OCR_BACKEND", "easyocr"),
            ("_render_page_to_image", lambda page, dpi: b"PNG"),
            ("_ocr_with_easyocr", lambda b: "easyocr text"),
        )
        for attr_name, replacement in stubs:
            monkeypatch.setattr(ocr_module, attr_name, replacement)

        text = ocr_module.ocr_page(_make_fitz_page())

        assert text == "easyocr text"
| # ── extract_pdf_with_ocr ───────────────────────────────────────────────────── | |
class TestExtractPdfWithOcr:
    """Checks for ``extract_pdf_with_ocr`` with ``fitz.open`` and ``ocr_page`` stubbed.

    Every test replaces ``fitz.open`` with a callable returning a mock
    document, so no real PDF is read; the ``tmp_path`` file names are never
    created on disk.
    """

    def test_native_text_pages_skip_ocr(self, monkeypatch, tmp_path):
        """A page with plenty of native text is returned as-is, without OCR."""
        import app.rag.ocr as ocr_module

        rich_text = "A" * 100
        doc, _ = _make_fitz_doc([rich_text])
        monkeypatch.setattr("fitz.open", lambda path: doc)
        # Spy instead of a hand-rolled call list: records any invocation.
        ocr_spy = MagicMock(return_value="")
        monkeypatch.setattr(ocr_module, "ocr_page", ocr_spy)

        result = ocr_module.extract_pdf_with_ocr(str(tmp_path / "test.pdf"))

        assert len(result) == 1
        assert result[0]["ocr"] is False
        assert result[0]["text"] == rich_text.strip()
        ocr_spy.assert_not_called()

    def test_image_only_pages_trigger_ocr(self, monkeypatch, tmp_path):
        """An empty page is OCR'd and reported as page 1 with ``ocr`` set."""
        import app.rag.ocr as ocr_module

        doc, _ = _make_fitz_doc([""])  # empty page
        monkeypatch.setattr("fitz.open", lambda path: doc)
        monkeypatch.setattr(ocr_module, "ocr_page", lambda page, dpi=200: "Scanned text here")

        result = ocr_module.extract_pdf_with_ocr(str(tmp_path / "scan.pdf"))

        assert len(result) == 1
        assert result[0]["ocr"] is True
        assert result[0]["text"] == "Scanned text here"
        assert result[0]["page"] == 1

    def test_mixed_pages_handled_correctly(self, monkeypatch, tmp_path):
        """Native and image-only pages keep their order and page numbers."""
        import app.rag.ocr as ocr_module

        rich = "B" * 100
        doc, _ = _make_fitz_doc([rich, ""])
        monkeypatch.setattr("fitz.open", lambda path: doc)
        monkeypatch.setattr(ocr_module, "ocr_page", lambda page, dpi=200: "OCR result")

        result = ocr_module.extract_pdf_with_ocr(str(tmp_path / "mixed.pdf"))

        assert len(result) == 2
        assert result[0]["ocr"] is False
        assert result[0]["page"] == 1
        assert result[1]["ocr"] is True
        assert result[1]["page"] == 2

    def test_ocr_returning_empty_skips_page(self, monkeypatch, tmp_path):
        """A page whose OCR output is empty produces no entry."""
        import app.rag.ocr as ocr_module

        doc, _ = _make_fitz_doc([""])
        monkeypatch.setattr("fitz.open", lambda path: doc)
        monkeypatch.setattr(ocr_module, "ocr_page", lambda page, dpi=200: "")

        result = ocr_module.extract_pdf_with_ocr(str(tmp_path / "blank.pdf"))

        assert result == []

    def test_ocr_import_error_skips_page_gracefully(self, monkeypatch, tmp_path):
        """An ``ImportError`` from OCR is swallowed and the page is dropped."""
        import app.rag.ocr as ocr_module

        doc, _ = _make_fitz_doc([""])
        monkeypatch.setattr("fitz.open", lambda path: doc)
        failing_ocr = MagicMock(side_effect=ImportError("no tesseract"))
        monkeypatch.setattr(ocr_module, "ocr_page", failing_ocr)

        result = ocr_module.extract_pdf_with_ocr(str(tmp_path / "fail.pdf"))

        # Guard against a vacuous pass: the error path must actually have run.
        assert failing_ocr.called
        assert result == []

    def test_ocr_exception_skips_page_gracefully(self, monkeypatch, tmp_path):
        """A generic runtime failure from OCR is swallowed and the page is dropped."""
        import app.rag.ocr as ocr_module

        doc, _ = _make_fitz_doc([""])
        monkeypatch.setattr("fitz.open", lambda path: doc)
        failing_ocr = MagicMock(side_effect=RuntimeError("segfault"))
        monkeypatch.setattr(ocr_module, "ocr_page", failing_ocr)

        result = ocr_module.extract_pdf_with_ocr(str(tmp_path / "crash.pdf"))

        # Guard against a vacuous pass: the error path must actually have run.
        assert failing_ocr.called
        assert result == []
| # ── extract_pdf fallback chain (chunker integration) ───────────────────────── | |
class TestExtractPdfOcrFallback:
    """Checks how ``chunker.extract_pdf`` falls back to the OCR extractor."""

    def test_ocr_called_when_all_extractors_return_empty(self, monkeypatch):
        """OCR output is returned when the other extractors fail or yield nothing."""
        import app.rag.chunker as chunker_module
        import app.rag.ocr as ocr_module

        ocr_chunk = {"text": "OCR text", "page": 1, "chunk_type": "text", "ocr": True}

        # The first two extractors blow up; the third finds nothing.
        for extractor_name in ("extract_pdf_with_unstructured", "extract_pdf_with_tables"):
            monkeypatch.setattr(
                chunker_module,
                extractor_name,
                MagicMock(side_effect=Exception("unavailable")),
            )
        monkeypatch.setattr(
            chunker_module, "extract_pdf_with_pymupdf", MagicMock(return_value=[])
        )
        monkeypatch.setattr(
            ocr_module, "extract_pdf_with_ocr", MagicMock(return_value=[ocr_chunk])
        )

        chunks = chunker_module.extract_pdf("dummy.pdf")

        assert len(chunks) == 1
        first = chunks[0]
        assert first["ocr"] is True
        assert first["text"] == "OCR text"

    def test_ocr_not_called_when_extractor_succeeds(self, monkeypatch):
        """OCR is left untouched when the unstructured extractor returns content."""
        import app.rag.chunker as chunker_module
        import app.rag.ocr as ocr_module

        native_chunk = {"text": "Native text", "page": 1, "chunk_type": "text"}
        monkeypatch.setattr(
            chunker_module,
            "extract_pdf_with_unstructured",
            MagicMock(return_value=[native_chunk]),
        )
        ocr_spy = MagicMock(return_value=[])
        monkeypatch.setattr(ocr_module, "extract_pdf_with_ocr", ocr_spy)

        chunks = chunker_module.extract_pdf("dummy.pdf")

        ocr_spy.assert_not_called()
        assert chunks[0]["text"] == "Native text"