Sync: Fixes to OCR reading order, minor Docker Compose changes, minor skill changes. AWS Textract JSON is now cached when running examples through AWS.
35c27cc | """ | |
| Smoke tests aligned with quarto_site/python_package_usage.qmd examples. | |
| Keeps ``doc_redaction.api`` and ``merge_csv_files`` regressions from slipping in. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import shutil | |
| import tempfile | |
| from pathlib import Path | |
| import pytest | |
# Repository root: the parent of the directory that contains this test module.
_THIS_DIR = Path(__file__).resolve().parent
REPO_ROOT = _THIS_DIR.parent
@pytest.fixture
def repo_root() -> Path:
    """Provide the repository root directory to tests as a pytest fixture.

    Several tests in this module declare a ``repo_root`` parameter. pytest
    fills such parameters by fixture name, so this function has to be
    registered with ``@pytest.fixture``; an undecorated function is never
    injected.

    Returns:
        The module-level ``REPO_ROOT`` path.
    """
    return REPO_ROOT
def test_merge_csv_files_accepts_str_paths(repo_root: Path, tmp_path: Path) -> None:
    """Check that ``merge_csv_files`` handles inputs given as ``str`` paths.

    Two example review CSVs are merged into ``tmp_path``; the call must
    return exactly one path, and that path must be an existing file.
    """
    from tools.helper_functions import merge_csv_files

    review_csvs = [
        repo_root
        / "doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv",
        repo_root
        / "doc_redaction/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv",
    ]
    # Precondition: the example data must be present in the checkout.
    assert all(csv_path.is_file() for csv_path in review_csvs)

    # The output folder is passed as a string with a trailing path separator.
    destination = str(tmp_path) + os.sep
    merged = merge_csv_files(
        [str(csv_path) for csv_path in review_csvs],
        output_folder=destination,
    )

    assert len(merged) == 1
    assert Path(merged[0]).is_file()
def test_merge_csv_files_accepts_gradio_like_named_objects(
    repo_root: Path, tmp_path: Path
) -> None:
    """Check that ``merge_csv_files`` accepts objects exposing a ``name`` path.

    The inputs are minimal stand-ins that carry only a ``name`` attribute
    holding the file path (presumably mirroring Gradio upload objects;
    confirm against the callers of ``merge_csv_files``). The same two example
    files as the str-path test are merged into ``tmp_path``.
    """
    from tools.helper_functions import merge_csv_files

    class _Named:
        """Bare object whose only attribute is ``name`` (the file path)."""

        __slots__ = ("name",)

        def __init__(self, path: str) -> None:
            self.name = path

    f1 = (
        repo_root
        / "doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv"
    )
    f2 = (
        repo_root
        / "doc_redaction/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv"
    )
    # Precondition (same as the str-path test): missing example data should
    # fail here rather than surface as an error inside merge_csv_files.
    assert f1.is_file() and f2.is_file()

    out = merge_csv_files(
        [_Named(str(f1)), _Named(str(f2))], output_folder=str(tmp_path) + os.sep
    )

    # Check the result count before indexing so an empty result fails as an
    # assertion instead of an IndexError on out[0].
    assert len(out) == 1
    assert Path(out[0]).is_file()
def test_combine_review_csvs_api(repo_root: Path, tmp_path: Path, monkeypatch) -> None:
    """Smoke-test ``doc_redaction.api.combine_review_csvs`` on two example CSVs.

    The result must be non-empty and its first entry must exist on disk.
    """
    # The input paths below are relative, so run from the repository root.
    monkeypatch.chdir(repo_root)

    from doc_redaction.api import combine_review_csvs

    review_csvs = [
        "doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv",
        "doc_redaction/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv",
    ]
    combined = combine_review_csvs(input_files=review_csvs, output_dir=str(tmp_path))

    assert combined
    assert Path(combined[0]).exists()
def test_export_review_page_ocr_visualisation_dict_bbox(
    repo_root: Path, monkeypatch
) -> None:
    """Smoke-test ``export_review_page_ocr_visualisation`` with a dict bounding box.

    The OCR payload holds a single line with a single word whose
    ``bounding_box`` is a dict with ``left``/``top``/``width``/``height`` keys.
    The result must be non-empty and its first entry must exist on disk.
    """
    # The image path below is relative, so run from the repository root.
    monkeypatch.chdir(repo_root)

    from doc_redaction.api import export_review_page_ocr_visualisation

    # Build the nested payload inside-out: box -> word -> line -> page.
    word_box = {
        "left": 0.1,
        "top": 0.1,
        "width": 0.2,
        "height": 0.05,
    }
    word = {"text": "Example", "bounding_box": word_box, "conf": 0.99}
    page_ocr = {"line_1": {"words": [word]}}

    exported = export_review_page_ocr_visualisation(
        page_image_path="doc_redaction/example_data/example_complaint_letter.jpg",
        ocr_results=page_ocr,
        page_number=1,
        doc_base_name="quarto_smoke_ocr_viz",
    )

    assert exported
    assert Path(exported[0]).exists()
def test_export_review_redaction_overlay_minimal(repo_root: Path, monkeypatch) -> None:
    """Smoke-test ``export_review_redaction_overlay`` with one redaction box.

    Skipped when ``gradio_image_annotation_redaction`` cannot be imported.
    The result must be non-empty and its first entry must exist on disk.
    """
    pytest.importorskip(
        "gradio_image_annotation_redaction",
        reason="required by tools.redaction_review for overlay export",
    )
    # The image path below is relative, so run from the repository root.
    monkeypatch.chdir(repo_root)

    from doc_redaction.api import export_review_redaction_overlay

    person_box = {
        "label": "PERSON",
        "color": "#ff0000",
        "xmin": 0.1,
        "ymin": 0.1,
        "xmax": 0.4,
        "ymax": 0.2,
    }

    exported = export_review_redaction_overlay(
        page_image_path="doc_redaction/example_data/example_complaint_letter.jpg",
        boxes=[person_box],
        page_number=1,
        doc_base_name="quarto_smoke_overlay",
    )

    assert exported
    assert Path(exported[0]).exists()
def test_find_duplicate_pages_temp_output(repo_root: Path, monkeypatch) -> None:
    """Smoke-test ``find_duplicate_pages`` writing into a throwaway directory.

    Only the return type is checked: the call must return a ``list``. The
    temporary output directory is removed whether or not the call succeeds.
    """
    # The input path below is relative, so run from the repository root.
    monkeypatch.chdir(repo_root)

    from doc_redaction.api import find_duplicate_pages

    scratch_dir = tempfile.mkdtemp(prefix="doc_redaction_dup_pages_smoke_")
    try:
        results = find_duplicate_pages(
            input_files="doc_redaction/example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv",
            output_dir=scratch_dir,
            similarity_threshold=0.95,
        )
    finally:
        # Best-effort cleanup; removal errors are deliberately ignored.
        shutil.rmtree(scratch_dir, ignore_errors=True)

    assert isinstance(results, list)
def test_load_and_prepare_documents_or_data_notimplemented() -> None:
    """Check that calling ``load_and_prepare_documents_or_data`` with no
    arguments raises ``NotImplementedError``."""
    from doc_redaction.api import load_and_prepare_documents_or_data as stub

    with pytest.raises(NotImplementedError):
        stub()
def test_word_level_ocr_text_search_notimplemented() -> None:
    """Check that calling ``word_level_ocr_text_search`` with no arguments
    raises ``NotImplementedError``."""
    from doc_redaction.api import word_level_ocr_text_search as stub

    with pytest.raises(NotImplementedError):
        stub()