Spaces:

Param20h
/

PDF-Assit_RAG

Running

App Files Files Community

PDF-Assit_RAG / backend /tests /test_ocr.py

Param20h

deploy: pure backend API with keywords fix

7c46845 unverified 6 days ago

Raw

History Blame Contribute Delete

10.5 kB

	"""
	Tests for OCR fallback — issue #282.

	All external I/O (fitz, pytesseract, easyocr, Pillow) is mocked so the
	suite runs without Tesseract or any GPU dependency installed.
	"""
	import types
	from unittest.mock import MagicMock, patch, PropertyMock

	import pytest


	# ── helpers ──────────────────────────────────────────────────────────────────

	def _make_fitz_page(text: str = "") -> MagicMock:
	"""Return a mock fitz.Page whose get_text() returns text."""
	page = MagicMock()
	page.get_text.return_value = text
	pix = MagicMock()
	pix.tobytes.return_value = b"PNG_BYTES"
	page.get_pixmap.return_value = pix
	return page


	def _make_fitz_doc(pages_text: list[str]):
	"""Return a mock fitz document iterating over mock pages."""
	pages = [_make_fitz_page(t) for t in pages_text]
	doc = MagicMock()
	doc.__iter__ = MagicMock(return_value=iter(pages))
	doc.__len__ = MagicMock(return_value=len(pages))
	doc.__enter__ = MagicMock(return_value=doc)
	doc.__exit__ = MagicMock(return_value=False)
	return doc, pages


	# ── _page_is_image_only ───────────────────────────────────────────────────────

	class TestPageIsImageOnly:
	    """Expectations for ``_page_is_image_only`` around ``MIN_TEXT_CHARS``."""

	    def test_empty_page_is_image_only(self):
	        """A page with no text is expected to be reported as image-only."""
	        import app.rag.ocr as ocr

	        blank_page = _make_fitz_page("")
	        assert ocr._page_is_image_only(blank_page) is True

	    def test_sparse_page_is_image_only(self):
	        """A two-character page is expected to be reported as image-only."""
	        import app.rag.ocr as ocr

	        sparse_page = _make_fitz_page("hi")
	        assert ocr._page_is_image_only(sparse_page) is True

	    def test_page_with_enough_text_is_not_image_only(self):
	        """Fifty characters of text are expected to count as a native-text page."""
	        import app.rag.ocr as ocr

	        text_page = _make_fitz_page("A" * 50)
	        assert ocr._page_is_image_only(text_page) is False

	    def test_boundary_exactly_at_min_chars(self):
	        """Exactly ``MIN_TEXT_CHARS`` characters is expected to be enough text."""
	        import app.rag.ocr as ocr

	        boundary_text = "A" * ocr.MIN_TEXT_CHARS
	        assert ocr._page_is_image_only(_make_fitz_page(boundary_text)) is False

	    def test_one_below_boundary_is_image_only(self):
	        """One character short of ``MIN_TEXT_CHARS`` is expected to be image-only."""
	        import app.rag.ocr as ocr

	        short_text = "A" * (ocr.MIN_TEXT_CHARS - 1)
	        assert ocr._page_is_image_only(_make_fitz_page(short_text)) is True


	# ── _render_page_to_image ─────────────────────────────────────────────────────

	class TestRenderPageToImage:
	    """Expectations for ``_render_page_to_image``."""

	    def test_returns_png_bytes(self):
	        """The mocked pixmap bytes come back and one pixmap is requested."""
	        import app.rag.ocr as ocr

	        fake_page = _make_fitz_page()
	        rendered = ocr._render_page_to_image(fake_page, dpi=72)

	        assert rendered == b"PNG_BYTES"
	        fake_page.get_pixmap.assert_called_once()


	# ── _ocr_with_tesseract ───────────────────────────────────────────────────────

	class TestOcrWithTesseract:
	    """Tests for ``_ocr_with_tesseract`` with pytesseract/Pillow stubbed in ``sys.modules``."""

	    def test_returns_extracted_text(self):
	        """The text reported by the stubbed pytesseract is expected back, stripped."""
	        from app.rag.ocr import _ocr_with_tesseract

	        # Stand-in pytesseract module; padding checks that the result is stripped.
	        mock_pytesseract = types.ModuleType("pytesseract")
	        mock_pytesseract.image_to_string = MagicMock(return_value=" Hello OCR ")

	        # Stand-in PIL package exposing only Image.open.
	        mock_pil_image = MagicMock()
	        mock_pil_module = types.ModuleType("PIL")
	        mock_pil_module.Image = MagicMock()
	        mock_pil_module.Image.open = MagicMock(return_value=mock_pil_image)

	        with patch.dict(
	            "sys.modules",
	            {"pytesseract": mock_pytesseract, "PIL": mock_pil_module, "PIL.Image": mock_pil_module.Image},
	        ):
	            result = _ocr_with_tesseract(b"PNG_BYTES")

	        assert result == "Hello OCR"

	    def test_raises_import_error_when_tesseract_missing(self):
	        """An ImportError mentioning pytesseract is expected when the imports fail."""
	        from app.rag.ocr import _ocr_with_tesseract

	        # A None entry in sys.modules makes importing that name raise ImportError.
	        blocked_modules = {"pytesseract": None, "PIL": None, "PIL.Image": None}
	        with patch.dict("sys.modules", blocked_modules), pytest.raises(ImportError, match="pytesseract"):
	            _ocr_with_tesseract(b"PNG_BYTES")


	# ── ocr_page ─────────────────────────────────────────────────────────────────

	class TestOcrPage:
	    """Expectations for how ``ocr_page`` picks a backend from ``OCR_BACKEND``."""

	    def test_uses_tesseract_by_default(self, monkeypatch):
	        """With ``OCR_BACKEND`` set to tesseract, the tesseract stub's text is returned."""
	        import app.rag.ocr as ocr

	        def stub_render(page, dpi):
	            return b"PNG"

	        def stub_tesseract(b):
	            return "tesseract text"

	        monkeypatch.setattr(ocr, "OCR_BACKEND", "tesseract")
	        monkeypatch.setattr(ocr, "_render_page_to_image", stub_render)
	        monkeypatch.setattr(ocr, "_ocr_with_tesseract", stub_tesseract)

	        recognised = ocr.ocr_page(_make_fitz_page())
	        assert recognised == "tesseract text"

	    def test_uses_easyocr_when_configured(self, monkeypatch):
	        """With ``OCR_BACKEND`` set to easyocr, the easyocr stub's text is returned."""
	        import app.rag.ocr as ocr

	        def stub_render(page, dpi):
	            return b"PNG"

	        def stub_easyocr(b):
	            return "easyocr text"

	        monkeypatch.setattr(ocr, "OCR_BACKEND", "easyocr")
	        monkeypatch.setattr(ocr, "_render_page_to_image", stub_render)
	        monkeypatch.setattr(ocr, "_ocr_with_easyocr", stub_easyocr)

	        recognised = ocr.ocr_page(_make_fitz_page())
	        assert recognised == "easyocr text"


	# ── extract_pdf_with_ocr ─────────────────────────────────────────────────────

	class TestExtractPdfWithOcr:
	    """Expectations for ``extract_pdf_with_ocr`` with ``fitz.open`` and ``ocr_page`` stubbed."""

	    def test_native_text_pages_skip_ocr(self, monkeypatch, tmp_path):
	        """A text-rich page is expected to be returned as-is without calling OCR."""
	        import app.rag.ocr as ocr

	        native_text = "A" * 100
	        fake_doc, _pages = _make_fitz_doc([native_text])
	        ocr_invocations = []

	        def recording_ocr(page, dpi=200):
	            # Record the call so the test can prove OCR was never reached.
	            ocr_invocations.append(1)
	            return ""

	        monkeypatch.setattr("fitz.open", lambda path: fake_doc)
	        monkeypatch.setattr(ocr, "ocr_page", recording_ocr)

	        chunks = ocr.extract_pdf_with_ocr(str(tmp_path / "test.pdf"))

	        assert len(chunks) == 1
	        only_chunk = chunks[0]
	        assert only_chunk["ocr"] is False
	        assert only_chunk["text"] == native_text.strip()
	        assert ocr_invocations == []

	    def test_image_only_pages_trigger_ocr(self, monkeypatch, tmp_path):
	        """An empty page is expected to be filled from the OCR stub and flagged."""
	        import app.rag.ocr as ocr

	        # A single page with no extractable text.
	        fake_doc, _pages = _make_fitz_doc([""])

	        def stub_ocr(page, dpi=200):
	            return "Scanned text here"

	        monkeypatch.setattr("fitz.open", lambda path: fake_doc)
	        monkeypatch.setattr(ocr, "ocr_page", stub_ocr)

	        chunks = ocr.extract_pdf_with_ocr(str(tmp_path / "scan.pdf"))

	        assert len(chunks) == 1
	        only_chunk = chunks[0]
	        assert only_chunk["ocr"] is True
	        assert only_chunk["text"] == "Scanned text here"
	        assert only_chunk["page"] == 1

	    def test_mixed_pages_handled_correctly(self, monkeypatch, tmp_path):
	        """A text page followed by an empty page is expected to yield one native and one OCR chunk."""
	        import app.rag.ocr as ocr

	        native_text = "B" * 100
	        fake_doc, _pages = _make_fitz_doc([native_text, ""])

	        def stub_ocr(page, dpi=200):
	            return "OCR result"

	        monkeypatch.setattr("fitz.open", lambda path: fake_doc)
	        monkeypatch.setattr(ocr, "ocr_page", stub_ocr)

	        chunks = ocr.extract_pdf_with_ocr(str(tmp_path / "mixed.pdf"))

	        assert len(chunks) == 2
	        assert chunks[0]["ocr"] is False
	        assert chunks[0]["page"] == 1
	        assert chunks[1]["ocr"] is True
	        assert chunks[1]["page"] == 2

	    def test_ocr_returning_empty_skips_page(self, monkeypatch, tmp_path):
	        """A page whose OCR result is empty is expected to be left out."""
	        import app.rag.ocr as ocr

	        fake_doc, _pages = _make_fitz_doc([""])

	        def stub_ocr(page, dpi=200):
	            return ""

	        monkeypatch.setattr("fitz.open", lambda path: fake_doc)
	        monkeypatch.setattr(ocr, "ocr_page", stub_ocr)

	        chunks = ocr.extract_pdf_with_ocr(str(tmp_path / "blank.pdf"))
	        assert chunks == []

	    def test_ocr_import_error_skips_page_gracefully(self, monkeypatch, tmp_path):
	        """An ImportError from OCR is expected to drop the page instead of propagating."""
	        import app.rag.ocr as ocr

	        fake_doc, _pages = _make_fitz_doc([""])
	        failing_ocr = MagicMock(side_effect=ImportError("no tesseract"))
	        monkeypatch.setattr("fitz.open", lambda path: fake_doc)
	        monkeypatch.setattr(ocr, "ocr_page", failing_ocr)

	        chunks = ocr.extract_pdf_with_ocr(str(tmp_path / "fail.pdf"))
	        assert chunks == []

	    def test_ocr_exception_skips_page_gracefully(self, monkeypatch, tmp_path):
	        """A RuntimeError from OCR is expected to drop the page instead of propagating."""
	        import app.rag.ocr as ocr

	        fake_doc, _pages = _make_fitz_doc([""])
	        failing_ocr = MagicMock(side_effect=RuntimeError("segfault"))
	        monkeypatch.setattr("fitz.open", lambda path: fake_doc)
	        monkeypatch.setattr(ocr, "ocr_page", failing_ocr)

	        chunks = ocr.extract_pdf_with_ocr(str(tmp_path / "crash.pdf"))
	        assert chunks == []


	# ── extract_pdf fallback chain (chunker integration) ─────────────────────────

	class TestExtractPdfOcrFallback:
	    """Expectations for where OCR sits in the ``chunker.extract_pdf`` fallback chain."""

	    def test_ocr_called_when_all_extractors_return_empty(self, monkeypatch):
	        """OCR output is expected when the other extractors fail or return nothing."""
	        import app.rag.chunker as chunker
	        import app.rag.ocr as ocr

	        # The first two extractors blow up ...
	        for extractor_name in ("extract_pdf_with_unstructured", "extract_pdf_with_tables"):
	            monkeypatch.setattr(
	                chunker,
	                extractor_name,
	                MagicMock(side_effect=Exception("unavailable")),
	            )
	        # ... and the PyMuPDF extractor finds no chunks at all.
	        monkeypatch.setattr(chunker, "extract_pdf_with_pymupdf", MagicMock(return_value=[]))

	        ocr_chunk = {"text": "OCR text", "page": 1, "chunk_type": "text", "ocr": True}
	        monkeypatch.setattr(ocr, "extract_pdf_with_ocr", MagicMock(return_value=[ocr_chunk]))

	        chunks = chunker.extract_pdf("dummy.pdf")

	        assert len(chunks) == 1
	        assert chunks[0]["ocr"] is True
	        assert chunks[0]["text"] == "OCR text"

	    def test_ocr_not_called_when_extractor_succeeds(self, monkeypatch):
	        """OCR is expected to stay untouched when the first extractor returns chunks."""
	        import app.rag.chunker as chunker
	        import app.rag.ocr as ocr

	        native_chunk = {"text": "Native text", "page": 1, "chunk_type": "text"}
	        monkeypatch.setattr(
	            chunker,
	            "extract_pdf_with_unstructured",
	            MagicMock(return_value=[native_chunk]),
	        )
	        ocr_spy = MagicMock(return_value=[])
	        monkeypatch.setattr(ocr, "extract_pdf_with_ocr", ocr_spy)

	        chunks = chunker.extract_pdf("dummy.pdf")

	        ocr_spy.assert_not_called()
	        assert chunks[0]["text"] == "Native text"