Spaces:

seanpedrickcase
/

document_redaction

Running

document_redaction / test /test_package_api_smoke.py

Sync: Fixes to OCR reading order, minor Docker Compose changes, and minor skill changes. AWS Textract JSON is now cached when running examples through AWS.

35c27cc about 17 hours ago

raw

history blame contribute delete

5.09 kB

	"""
	Smoke tests aligned with quarto_site/python_package_usage.qmd examples.

	Keeps ``doc_redaction.api`` and ``merge_csv_files`` regressions from slipping in.
	"""

	from __future__ import annotations

	import os
	import shutil
	import tempfile
	from pathlib import Path

	import pytest

	# Two levels up from this file: the directory that contains this module,
	# then that directory's parent, which the tests treat as the repository root.
	_THIS_DIR = Path(__file__).resolve().parent
	REPO_ROOT = _THIS_DIR.parent


	@pytest.fixture
	def repo_root() -> Path:
	    """Expose the module-level ``REPO_ROOT`` path to tests as a fixture."""
	    return REPO_ROOT


	def test_merge_csv_files_accepts_str_paths(repo_root: Path, tmp_path: Path) -> None:
	    """Check that ``merge_csv_files`` takes plain ``str`` paths and yields one existing file."""
	    from tools.helper_functions import merge_csv_files

	    review_csvs = [
	        repo_root / relative_path
	        for relative_path in (
	            "doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv",
	            "doc_redaction/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv",
	        )
	    ]
	    # Both example inputs must be present before the merge is attempted.
	    assert all(csv_path.is_file() for csv_path in review_csvs)

	    # The output folder is passed as a string with a trailing path separator.
	    merged = merge_csv_files(
	        [str(csv_path) for csv_path in review_csvs],
	        output_folder=str(tmp_path) + os.sep,
	    )
	    assert len(merged) == 1
	    assert Path(merged[0]).is_file()


	def test_merge_csv_files_accepts_gradio_like_named_objects(
	    repo_root: Path, tmp_path: Path
	) -> None:
	    """Check that ``merge_csv_files`` accepts objects that carry their path in ``.name``.

	    Each input is wrapped in a tiny object whose only attribute is ``name``
	    (the file path as a string), instead of being passed as a bare string.
	    The test passes when exactly one output path is returned and that file
	    exists.
	    """
	    from tools.helper_functions import merge_csv_files

	    class _Named:
	        """Minimal stand-in object exposing only a ``name`` attribute."""

	        __slots__ = ("name",)

	        def __init__(self, path: str) -> None:
	            self.name = path

	    f1 = (
	        repo_root
	        / "doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv"
	    )
	    f2 = (
	        repo_root
	        / "doc_redaction/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv"
	    )
	    # Same precondition as the str-path test: fail here, with a clear
	    # assertion, if the example data is missing.
	    assert f1.is_file() and f2.is_file()

	    out = merge_csv_files(
	        [_Named(str(f1)), _Named(str(f2))], output_folder=str(tmp_path) + os.sep
	    )
	    # Check the result length before indexing so an empty result is reported
	    # as an assertion failure rather than an IndexError.
	    assert len(out) == 1
	    assert Path(out[0]).is_file()


	def test_combine_review_csvs_api(repo_root: Path, tmp_path: Path, monkeypatch) -> None:
	    """Check that ``combine_review_csvs`` returns a non-empty result whose first path exists."""
	    # The input paths below are relative, so run from the repository root.
	    monkeypatch.chdir(repo_root)
	    from doc_redaction.api import combine_review_csvs

	    review_csvs = [
	        "doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv",
	        "doc_redaction/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv",
	    ]
	    combined = combine_review_csvs(input_files=review_csvs, output_dir=str(tmp_path))

	    assert combined
	    assert Path(combined[0]).exists()


	def test_export_review_page_ocr_visualisation_dict_bbox(
	    repo_root: Path, monkeypatch
	) -> None:
	    """Check the OCR visualisation export when a word's bounding box is given as a dict.

	    Feeds a single line containing a single word and expects a non-empty
	    result whose first entry is an existing path.
	    """
	    # The image path below is relative, so run from the repository root.
	    monkeypatch.chdir(repo_root)
	    from doc_redaction.api import export_review_page_ocr_visualisation

	    # Build the nested OCR structure from the inside out.
	    bounding_box = {"left": 0.1, "top": 0.1, "width": 0.2, "height": 0.05}
	    word = {"text": "Example", "bounding_box": bounding_box, "conf": 0.99}
	    ocr_results = {"line_1": {"words": [word]}}

	    written = export_review_page_ocr_visualisation(
	        page_image_path="doc_redaction/example_data/example_complaint_letter.jpg",
	        ocr_results=ocr_results,
	        page_number=1,
	        doc_base_name="quarto_smoke_ocr_viz",
	    )

	    assert written
	    assert Path(written[0]).exists()


	def test_export_review_redaction_overlay_minimal(repo_root: Path, monkeypatch) -> None:
	    """Check the redaction overlay export with a single box.

	    Skipped when ``gradio_image_annotation_redaction`` cannot be imported.
	    """
	    pytest.importorskip(
	        "gradio_image_annotation_redaction",
	        reason="required by tools.redaction_review for overlay export",
	    )
	    # The image path below is relative, so run from the repository root.
	    monkeypatch.chdir(repo_root)
	    from doc_redaction.api import export_review_redaction_overlay

	    person_box = dict(
	        label="PERSON",
	        color="#ff0000",
	        xmin=0.1,
	        ymin=0.1,
	        xmax=0.4,
	        ymax=0.2,
	    )
	    written = export_review_redaction_overlay(
	        page_image_path="doc_redaction/example_data/example_complaint_letter.jpg",
	        boxes=[person_box],
	        page_number=1,
	        doc_base_name="quarto_smoke_overlay",
	    )

	    assert written
	    assert Path(written[0]).exists()


	def test_find_duplicate_pages_temp_output(repo_root: Path, monkeypatch) -> None:
	    """Check that ``find_duplicate_pages`` returns a list when writing to a scratch directory."""
	    # The input path below is relative, so run from the repository root.
	    monkeypatch.chdir(repo_root)
	    from doc_redaction.api import find_duplicate_pages

	    ocr_csv = "doc_redaction/example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv"
	    scratch_dir = tempfile.mkdtemp(prefix="doc_redaction_dup_pages_smoke_")
	    try:
	        result = find_duplicate_pages(
	            input_files=ocr_csv,
	            output_dir=scratch_dir,
	            similarity_threshold=0.95,
	        )
	        assert isinstance(result, list)
	    finally:
	        # Best-effort cleanup: removal errors are ignored so they cannot
	        # mask the outcome of the assertion above.
	        shutil.rmtree(scratch_dir, ignore_errors=True)


	def test_load_and_prepare_documents_or_data_notimplemented() -> None:
	    """Check that calling ``load_and_prepare_documents_or_data`` with no arguments raises ``NotImplementedError``."""
	    from doc_redaction.api import load_and_prepare_documents_or_data

	    # The import stays outside the context manager so only the call is
	    # expected to raise.
	    expect_not_implemented = pytest.raises(NotImplementedError)
	    with expect_not_implemented:
	        load_and_prepare_documents_or_data()


	def test_word_level_ocr_text_search_notimplemented() -> None:
	    """Check that calling ``word_level_ocr_text_search`` with no arguments raises ``NotImplementedError``."""
	    from doc_redaction.api import word_level_ocr_text_search

	    # The import stays outside the context manager so only the call is
	    # expected to raise.
	    expect_not_implemented = pytest.raises(NotImplementedError)
	    with expect_not_implemented:
	        word_level_ocr_text_search()