# document_redaction/test/test_package_api_smoke.py
# Last sync by seanpedrickcase (commit 35c27cc): fixes to OCR reading order,
# minor Docker Compose changes, and minor skill changes. AWS Textract JSON is
# now cached when running examples through AWS.
"""
Smoke tests aligned with quarto_site/python_package_usage.qmd examples.
Keeps ``doc_redaction.api`` and ``merge_csv_files`` regressions from slipping in.
"""
from __future__ import annotations
import os
import shutil
import tempfile
from pathlib import Path
import pytest
# Directory two levels above this file (the parent of the folder holding this
# test module); the tests below resolve example-data paths relative to it.
REPO_ROOT = Path(__file__).resolve().parent.parent
@pytest.fixture
def repo_root() -> Path:
    """Return the module-level ``REPO_ROOT`` path for use as a test fixture."""
    return REPO_ROOT
def test_merge_csv_files_accepts_str_paths(repo_root: Path, tmp_path: Path) -> None:
    """Check that ``merge_csv_files`` accepts plain ``str`` paths.

    Two example review CSVs are passed as strings; the call is expected to
    return exactly one path, and that path must point at an existing file.
    """
    from tools.helper_functions import merge_csv_files

    review_csvs = [
        repo_root
        / "doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv",
        repo_root
        / "doc_redaction/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv",
    ]
    # Precondition: the example data must be present before exercising the merge.
    for csv_path in review_csvs:
        assert csv_path.is_file()

    # The output folder is passed as a string ending in the OS path separator.
    destination = f"{tmp_path}{os.sep}"
    merged = merge_csv_files(
        [str(csv_path) for csv_path in review_csvs], output_folder=destination
    )

    assert len(merged) == 1
    assert Path(merged[0]).is_file()
def test_merge_csv_files_accepts_gradio_like_named_objects(
    repo_root: Path, tmp_path: Path
) -> None:
    """Check that ``merge_csv_files`` accepts objects exposing a ``name`` path.

    Each input is wrapped in a small object whose only attribute is ``name``
    (the file path as a string). The call must return a non-empty result whose
    first entry points at an existing file.
    """
    from tools.helper_functions import merge_csv_files

    class _Named:
        """Minimal stand-in object carrying a file path in ``name``."""

        __slots__ = ("name",)

        def __init__(self, path: str) -> None:
            self.name = path

    f1 = (
        repo_root
        / "doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv"
    )
    f2 = (
        repo_root
        / "doc_redaction/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv"
    )
    # Fail early with a clear assertion if the example data is missing,
    # instead of surfacing an error from inside merge_csv_files.
    assert f1.is_file() and f2.is_file()

    out = merge_csv_files(
        [_Named(str(f1)), _Named(str(f2))], output_folder=str(tmp_path) + os.sep
    )
    # Guard the index below so an empty result is reported as an assertion
    # failure rather than an IndexError.
    assert out
    assert Path(out[0]).is_file()
def test_combine_review_csvs_api(repo_root: Path, tmp_path: Path, monkeypatch) -> None:
    """Smoke-test ``doc_redaction.api.combine_review_csvs`` with relative paths.

    The working directory is switched to ``repo_root`` so the relative example
    paths resolve; output goes to ``tmp_path``. The result must be non-empty
    and its first entry must exist on disk.
    """
    monkeypatch.chdir(repo_root)
    from doc_redaction.api import combine_review_csvs

    review_csvs = [
        "doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv",
        "doc_redaction/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv",
    ]
    combined = combine_review_csvs(input_files=review_csvs, output_dir=str(tmp_path))

    assert combined
    assert Path(combined[0]).exists()
def test_export_review_page_ocr_visualisation_dict_bbox(
    repo_root: Path, monkeypatch
) -> None:
    """Smoke-test ``export_review_page_ocr_visualisation`` with a dict bounding box.

    A single-line, single-word OCR result is supplied whose ``bounding_box``
    is a dict with ``left``/``top``/``width``/``height`` keys. The call must
    return a non-empty result whose first entry exists on disk.
    """
    monkeypatch.chdir(repo_root)
    from doc_redaction.api import export_review_page_ocr_visualisation

    # Build the nested OCR structure from the inside out.
    word_box = {
        "left": 0.1,
        "top": 0.1,
        "width": 0.2,
        "height": 0.05,
    }
    word = {
        "text": "Example",
        "bounding_box": word_box,
        "conf": 0.99,
    }
    ocr_results = {"line_1": {"words": [word]}}

    exported = export_review_page_ocr_visualisation(
        page_image_path="doc_redaction/example_data/example_complaint_letter.jpg",
        ocr_results=ocr_results,
        page_number=1,
        doc_base_name="quarto_smoke_ocr_viz",
    )

    assert exported
    assert Path(exported[0]).exists()
def test_export_review_redaction_overlay_minimal(repo_root: Path, monkeypatch) -> None:
    """Smoke-test ``export_review_redaction_overlay`` with one redaction box.

    Skipped when ``gradio_image_annotation_redaction`` cannot be imported.
    Otherwise a single ``PERSON`` box is drawn over the example letter image;
    the call must return a non-empty result whose first entry exists on disk.
    """
    pytest.importorskip(
        "gradio_image_annotation_redaction",
        reason="required by tools.redaction_review for overlay export",
    )
    monkeypatch.chdir(repo_root)
    from doc_redaction.api import export_review_redaction_overlay

    person_box = {
        "label": "PERSON",
        "color": "#ff0000",
        "xmin": 0.1,
        "ymin": 0.1,
        "xmax": 0.4,
        "ymax": 0.2,
    }
    exported = export_review_redaction_overlay(
        page_image_path="doc_redaction/example_data/example_complaint_letter.jpg",
        boxes=[person_box],
        page_number=1,
        doc_base_name="quarto_smoke_overlay",
    )

    assert exported
    assert Path(exported[0]).exists()
def test_find_duplicate_pages_temp_output(repo_root: Path, monkeypatch) -> None:
    """Smoke-test ``find_duplicate_pages`` writing into a throwaway directory.

    Runs against the example OCR output CSV with a 0.95 similarity threshold
    and checks only that the return value is a ``list``. The scratch directory
    is removed afterwards whether or not the call succeeds.
    """
    monkeypatch.chdir(repo_root)
    from doc_redaction.api import find_duplicate_pages

    ocr_csv = "doc_redaction/example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv"
    scratch_dir = tempfile.mkdtemp(prefix="doc_redaction_dup_pages_smoke_")
    try:
        result = find_duplicate_pages(
            input_files=ocr_csv,
            output_dir=scratch_dir,
            similarity_threshold=0.95,
        )
        assert isinstance(result, list)
    finally:
        # Best-effort cleanup: errors while deleting are deliberately ignored.
        shutil.rmtree(scratch_dir, ignore_errors=True)
def test_load_and_prepare_documents_or_data_notimplemented() -> None:
    """Expect ``load_and_prepare_documents_or_data()`` to raise ``NotImplementedError``."""
    from doc_redaction.api import load_and_prepare_documents_or_data

    # Called with no arguments; the test passes only if the call raises.
    with pytest.raises(NotImplementedError):
        load_and_prepare_documents_or_data()
def test_word_level_ocr_text_search_notimplemented() -> None:
    """Expect ``word_level_ocr_text_search()`` to raise ``NotImplementedError``."""
    from doc_redaction.api import word_level_ocr_text_search

    # Called with no arguments; the test passes only if the call raises.
    with pytest.raises(NotImplementedError):
        word_level_ocr_text_search()