File size: 5,734 Bytes
"""
Smoke tests aligned with quarto_site/python_package_usage.qmd examples.
Keeps ``doc_redaction.api`` and ``merge_csv_files`` regressions from slipping in.
"""
from __future__ import annotations
import os
import shutil
import tempfile
from pathlib import Path
import pytest
# Directory one level above the folder containing this test file; the tests
# below use it as the repository root when locating bundled example data.
REPO_ROOT = Path(__file__).resolve().parent.parent
@pytest.fixture
def repo_root() -> Path:
    """Expose the module-level ``REPO_ROOT`` path to tests as a fixture."""
    root_dir = REPO_ROOT
    return root_dir
def test_merge_csv_files_accepts_str_paths(repo_root: Path, tmp_path: Path) -> None:
    """Merge two example review CSVs passed to ``merge_csv_files`` as ``str`` paths.

    Asserts that exactly one output path is returned and that it points at
    an existing file.
    """
    from tools.helper_functions import merge_csv_files

    partnership_csv = repo_root.joinpath(
        "doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv"
    )
    emails_csv = repo_root.joinpath(
        "doc_redaction/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv"
    )

    # Precondition: both bundled example inputs must be present on disk.
    assert all(csv_path.is_file() for csv_path in (partnership_csv, emails_csv))

    # The output folder is handed over as a string ending in the OS separator.
    destination = f"{tmp_path}{os.sep}"
    merged = merge_csv_files(
        [str(partnership_csv), str(emails_csv)],
        output_folder=destination,
    )

    assert len(merged) == 1
    merged_file = Path(merged[0])
    assert merged_file.is_file()
def test_merge_csv_files_accepts_gradio_like_named_objects(
    repo_root: Path, tmp_path: Path
) -> None:
    """Merge two example review CSVs passed as objects exposing a ``.name`` path.

    Each input is wrapped in a minimal stand-in object whose only attribute is
    ``name`` (the file path as a string). Asserts that the first returned path
    points at an existing file.
    """
    from tools.helper_functions import merge_csv_files

    class _Named:
        """Bare holder carrying a file path in its ``name`` attribute."""

        __slots__ = ("name",)

        def __init__(self, path: str) -> None:
            self.name = path

    example_csvs = (
        repo_root.joinpath(
            "doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv"
        ),
        repo_root.joinpath(
            "doc_redaction/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv"
        ),
    )
    wrapped_inputs = [_Named(str(csv_path)) for csv_path in example_csvs]

    # The output folder is handed over as a string ending in the OS separator.
    merged = merge_csv_files(wrapped_inputs, output_folder=f"{tmp_path}{os.sep}")

    first_output = Path(merged[0])
    assert first_output.is_file()
def test_combine_review_csvs_api(repo_root: Path, tmp_path: Path, monkeypatch) -> None:
    """Call ``combine_review_csvs`` with repo-relative input paths.

    The working directory is switched to the repository root first so the
    relative example paths resolve. Asserts a non-empty result whose first
    entry exists on disk.
    """
    monkeypatch.chdir(repo_root)

    from doc_redaction.api import combine_review_csvs

    relative_inputs = [
        "doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv",
        "doc_redaction/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv",
    ]
    combined = combine_review_csvs(
        input_files=relative_inputs,
        output_dir=str(tmp_path),
    )

    assert combined
    first_output = Path(combined[0])
    assert first_output.exists()
def test_export_review_page_ocr_visualisation_dict_bbox(
    repo_root: Path, monkeypatch
) -> None:
    """Export an OCR visualisation for one word whose bounding box is a dict.

    The OCR payload holds a single line with a single word; its
    ``bounding_box`` uses ``left``/``top``/``width``/``height`` keys. Asserts a
    non-empty result whose first entry exists on disk.
    """
    monkeypatch.chdir(repo_root)

    from doc_redaction.api import export_review_page_ocr_visualisation

    word_box = {
        "left": 0.1,
        "top": 0.1,
        "width": 0.2,
        "height": 0.05,
    }
    single_word = {
        "text": "Example",
        "bounding_box": word_box,
        "conf": 0.99,
    }
    ocr_results = {"line_1": {"words": [single_word]}}

    # NOTE(review): no output directory is passed here, so the destination is
    # whatever the API defaults to - confirm this does not leave files behind.
    exported = export_review_page_ocr_visualisation(
        page_image_path="doc_redaction/example_data/example_complaint_letter.jpg",
        ocr_results=ocr_results,
        page_number=1,
        doc_base_name="quarto_smoke_ocr_viz",
    )

    assert exported
    first_output = Path(exported[0])
    assert first_output.exists()
def test_export_review_redaction_overlay_minimal(repo_root: Path, monkeypatch) -> None:
    """Export a redaction overlay for a single ``PERSON`` box.

    Skipped when ``gradio_image_annotation_redaction`` cannot be imported.
    Asserts a non-empty result whose first entry exists on disk.
    """
    pytest.importorskip(
        "gradio_image_annotation_redaction",
        reason="required by tools.redaction_review for overlay export",
    )
    monkeypatch.chdir(repo_root)

    from doc_redaction.api import export_review_redaction_overlay

    person_box = {
        "label": "PERSON",
        "color": "#ff0000",
        "xmin": 0.1,
        "ymin": 0.1,
        "xmax": 0.4,
        "ymax": 0.2,
    }

    # NOTE(review): no output directory is passed here, so the destination is
    # whatever the API defaults to - confirm this does not leave files behind.
    exported = export_review_redaction_overlay(
        page_image_path="doc_redaction/example_data/example_complaint_letter.jpg",
        boxes=[person_box],
        page_number=1,
        doc_base_name="quarto_smoke_overlay",
    )

    assert exported
    first_output = Path(exported[0])
    assert first_output.exists()
def test_find_duplicate_pages_temp_output(repo_root: Path, monkeypatch) -> None:
    """Run ``find_duplicate_pages`` against a throwaway output directory.

    Only the return type is checked: the result must be a ``list``. The
    temporary directory is removed afterwards whether or not the call
    succeeds, and removal errors are ignored.
    """
    monkeypatch.chdir(repo_root)

    from doc_redaction.api import find_duplicate_pages

    scratch_dir = tempfile.mkdtemp(prefix="doc_redaction_dup_pages_smoke_")
    try:
        result = find_duplicate_pages(
            input_files="doc_redaction/example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv",
            output_dir=scratch_dir,
            similarity_threshold=0.95,
        )
        assert isinstance(result, list)
    finally:
        # Best-effort cleanup: a failed removal must not mask the test outcome.
        shutil.rmtree(scratch_dir, ignore_errors=True)
def test_load_and_prepare_documents_or_data_notimplemented() -> None:
    """Calling ``load_and_prepare_documents_or_data`` with no arguments must raise
    ``NotImplementedError``."""
    from doc_redaction.api import load_and_prepare_documents_or_data

    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        load_and_prepare_documents_or_data()
def test_word_level_ocr_text_search_requires_paths() -> None:
    """Search the Partnership word-level OCR CSV for the text ``"Partnership"``.

    Skipped when the Partnership example fixtures are not both present.
    Asserts that the result's ``match_count`` is at least one.
    """
    from tools.verify_redaction_coverage import run_word_level_ocr_text_search

    fixture_paths = _partnership_paths_optional()
    if not fixture_paths:
        pytest.skip("Partnership example fixtures not present")

    # Only the word-level OCR file is searched; the review CSV is unused here.
    _review_csv, words_csv = fixture_paths
    search_result = run_word_level_ocr_text_search(str(words_csv), "Partnership")

    assert search_result["match_count"] >= 1
def _partnership_paths_optional():
from pathlib import Path
root = Path(__file__).resolve().parent.parent
ex = root / "doc_redaction" / "example_data" / "example_outputs"
review = ex / "Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv"
words = (
ex / "Partnership-Agreement-Toolkit_0_0_ocr_results_with_words_local_ocr.csv"
)
if review.is_file() and words.is_file():
return review, words
return None
|