document_redaction / test /test_cli_api_output_paths.py
seanpedrickcase's picture
Sync: Merge pull request #199 from seanpedrick-case/startup_optimise
a2e06b3
Raw
History Blame Contribute Delete
5.32 kB
"""Tests for doc_redaction.cli_api output path discovery."""
from __future__ import annotations
import sys
import time
from pathlib import Path
import pytest
# Skip every test in this module when pikepdf cannot be imported.
pytest.importorskip("pikepdf")

# Put <repo>/agent-redact/pi at the front of sys.path (only if it is not
# already listed) before the project imports below are executed.
REPO_ROOT = Path(__file__).resolve().parent.parent
_PI_SRC = REPO_ROOT.joinpath("agent-redact", "pi")
if sys.path.count(str(_PI_SRC)) == 0:
    sys.path.insert(0, str(_PI_SRC))
from remote_redaction import ( # noqa: E402
discover_redaction_outputs,
resolve_redaction_output_paths,
)
from doc_redaction.cli_api import ( # noqa: E402
_run_cli,
_snapshot_files_newer_than,
)
def test_snapshot_files_newer_than_includes_overwritten_files(tmp_path: Path) -> None:
    """A file that existed before the cutoff but is rewritten after it is reported."""
    review_csv = tmp_path / "doc_review_file.csv"
    review_csv.write_text("old", encoding="utf-8")

    # Short pauses on both sides of the cutoff separate the first write
    # (before) from the rewrite (after).
    time.sleep(0.02)
    cutoff = time.time()
    time.sleep(0.02)

    review_csv.write_text("new", encoding="utf-8")

    snapshot = _snapshot_files_newer_than(str(tmp_path), cutoff)
    expected_entry = str(review_csv.resolve())
    assert expected_entry in snapshot
def test_run_cli_pins_session_hash_for_stateless_doc_redact(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """SESSION_OUTPUT_FOLDER + empty username must not generate two session hashes.

    The hash generator is replaced by one that hands out "abcd1234" first and
    "wxyz9876" second; the assertions then require that only the first value
    shows up in the username, the output folder and the returned path.
    """
    out_base = tmp_path / "output"
    out_base.mkdir()

    pending_hashes = ["abcd1234", "wxyz9876"]

    def _next_hash() -> str:
        # Each call consumes the next canned hash, so a second call is visible.
        return pending_hashes.pop(0)

    monkeypatch.setattr("cli_redact._generate_session_hash", _next_hash)

    # Imported after the hash generator has been patched.
    from cli_redact import get_username_and_folders

    output_dirs: list[str] = []
    seen_usernames: list[str] = []

    def _fake_cli_main(direct_mode_args: dict | None = None) -> None:
        """Replacement for ``cli_redact.main``: resolve the output folder, write one PDF."""
        assert direct_mode_args is not None
        username = str(direct_mode_args.get("username") or "")
        seen_usernames.append(username)
        # Only the second of the eight returned values is used here.
        _, resolved_out, _, _, _, _, _, _ = get_username_and_folders(
            username=seen_usernames[-1],
            output_folder_textbox=str(direct_mode_args["output_dir"]),
            input_folder_textbox=str(direct_mode_args.get("input_dir") or ""),
            session_output_folder=bool(direct_mode_args.get("save_to_user_folders")),
        )
        output_dirs.append(resolved_out)
        target_dir = Path(resolved_out)
        target_dir.mkdir(parents=True, exist_ok=True)
        (target_dir / "example_redacted.pdf").write_bytes(b"%PDF")

    monkeypatch.setattr("cli_redact.main", _fake_cli_main)

    cli_overrides = {
        "task": "redact",
        "input_file": ["example.pdf"],
        "save_to_user_folders": True,
    }
    paths = _run_cli(
        gradio_api_name="doc_redact",
        overrides=cli_overrides,
        output_dir=f"{out_base}/",
    )

    # Exactly one hash was consumed and it was used consistently everywhere.
    assert seen_usernames == ["abcd1234"]
    assert len(output_dirs) == 1
    assert "abcd1234" in output_dirs[0]
    assert len(paths) == 1
    only_path = paths[0]
    assert only_path.endswith("example_redacted.pdf")
    assert "abcd1234" in only_path
def test_run_cli_returns_touched_files_on_rerun(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """``_run_cli`` returns a file the run overwrote as well as one it created."""
    effective = tmp_path / "session_out"
    effective.mkdir(parents=True)
    review_csv = effective / "example_review_file.csv"
    redacted_pdf = effective / "example_redacted.pdf"

    # Already present before the run; the fake CLI below rewrites it.
    review_csv.write_text("from prior run", encoding="utf-8")

    def _fake_effective_output_dir(merged: dict) -> str:
        # Always point at the prepared folder, whatever arguments were merged.
        return str(effective)

    def _fake_cli_main(direct_mode_args: dict | None = None) -> None:
        # Overwrite the existing review file and add a new PDF next to it.
        review_csv.write_text("updated this run", encoding="utf-8")
        redacted_pdf.write_bytes(b"%PDF")

    monkeypatch.setattr(
        "doc_redaction.cli_api._effective_output_dir",
        _fake_effective_output_dir,
    )
    monkeypatch.setattr("cli_redact.main", _fake_cli_main)

    cli_overrides = {"task": "redact", "input_file": ["example.pdf"]}
    paths = _run_cli(
        gradio_api_name="doc_redact",
        overrides=cli_overrides,
        output_dir=str(tmp_path / "base_out"),
    )

    for produced in (review_csv, redacted_pdf):
        assert str(produced.resolve()) in paths
def test_resolve_redaction_output_paths_falls_back_to_discover(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A result tuple carrying an empty list still yields the review file on disk.

    The review CSV lives in a sub-folder of the patched output root and its
    name starts with the ``document_stem`` passed to the resolver.
    """
    output_root = tmp_path / "output"
    session_dir = output_root / "user_session"
    session_dir.mkdir(parents=True)
    review = session_dir / "example_of_emails_sent_review_file.csv"
    review.write_text("a,b\n", encoding="utf-8")

    def _fake_output_root() -> Path:
        return tmp_path / "output"

    monkeypatch.setattr(
        "remote_redaction.doc_redaction_output_root",
        _fake_output_root,
    )

    api_result = ([], "doc_redact completed")
    paths = resolve_redaction_output_paths(
        api_result,
        document_stem="example_of_emails_sent",
    )

    expected_entry = str(review.resolve())
    assert expected_entry in paths
def test_discover_redaction_outputs_respects_since(tmp_path: Path) -> None:
    """``discover_redaction_outputs`` honours its ``since`` cutoff.

    Two CSVs sharing the ``example_of_emails`` stem are written into the
    output root, one before the cutoff and one after it. Only the later one
    may appear in the result.
    """
    out_root = tmp_path / "output"
    out_root.mkdir()

    old = out_root / "example_of_emails_old.csv"
    old.write_text("old", encoding="utf-8")

    # Short pauses on both sides of the cutoff keep ``old`` before it and
    # ``new`` after it.
    time.sleep(0.02)
    since = time.time()
    time.sleep(0.02)

    new = out_root / "example_of_emails_new.csv"
    new.write_text("new", encoding="utf-8")

    # Redirect the output root through pytest's MonkeyPatch (same target string
    # as the resolve_redaction_output_paths test) instead of a hand-rolled
    # save/assign/restore; the context manager undoes the patch on exit,
    # including when the call below raises.
    with pytest.MonkeyPatch.context() as patcher:
        patcher.setattr(
            "remote_redaction.doc_redaction_output_root",
            lambda: out_root,
        )
        found = discover_redaction_outputs("example_of_emails", since=since)

    assert str(new.resolve()) in found
    assert str(old.resolve()) not in found