Spaces:
Running on Zero
Running on Zero
"""
CLI-first programmatic API surface.

These functions provide a minimal, runnable Python interface that mirrors the
Gradio `api_name` routes. Most of them execute the underlying workflow via the
CLI engine (`cli_redact.main(direct_mode_args=...)`); the remainder call helper
functions in `tools.*` directly.

Unless a function states otherwise, the return value is a list of output file
paths (for CLI-backed functions, the files newly created in `output_dir`).
"""
| from __future__ import annotations | |
| import os | |
| import tempfile | |
| from pathlib import Path | |
| from typing import Any, Iterable | |
| def _ensure_list(v: str | list[str] | tuple[str, ...]) -> list[str]: | |
| if isinstance(v, (list, tuple)): | |
| return [str(x) for x in v] | |
| return [str(v)] | |
def _snapshot_files(folder: str) -> set[str]:
    """Return the paths of every file currently found beneath ``folder``.

    The directory tree is walked recursively with ``os.walk``. A folder that
    does not exist produces an empty set.
    """
    base = Path(folder)
    if not base.exists():
        return set()
    return {
        str(Path(current_dir) / filename)
        for current_dir, _subdirs, filenames in os.walk(base)
        for filename in filenames
    }
def _default_output_dir(prefix: str) -> str:
    """Create a new temporary directory for outputs and return its path.

    The directory name starts with ``doc_redaction_<prefix>_``. The directory
    is created by ``tempfile.mkdtemp`` and is not removed by this module.
    """
    dir_prefix = f"doc_redaction_{prefix}_"
    return tempfile.mkdtemp(prefix=dir_prefix)
def _run_cli(
    *,
    gradio_api_name: str,
    overrides: dict[str, Any],
    output_dir: str | None,
) -> list[str]:
    """
    Run ``cli_redact.main`` with merged defaults and return newly created files.

    The CLI default arguments are fetched, updated with ``overrides``, and
    passed to the CLI entry point as ``direct_mode_args``.

    Args:
        gradio_api_name: Route name; used only as the prefix of the temporary
            output directory when no output directory is given.
        overrides: CLI argument values layered on top of the CLI defaults.
        output_dir: Directory the CLI should write to. When ``None``, an
            ``output_dir`` entry in ``overrides`` is used if present and
            truthy; otherwise a fresh temporary directory is created.

    Returns:
        Sorted paths of files that exist under the output directory after the
        run but did not exist before it. Files that already existed and were
        overwritten in place are not reported.
    """
    from cli_redact import get_cli_default_args_dict
    from cli_redact import main as cli_main

    merged = get_cli_default_args_dict()
    merged.update(overrides)

    if output_dir is None:
        # Honour an output directory passed through ``overrides`` rather than
        # silently replacing it with a temporary directory.
        output_dir = overrides.get("output_dir") or _default_output_dir(
            gradio_api_name
        )
    target_dir = str(output_dir)
    merged["output_dir"] = target_dir

    # Diff the directory contents around the run to find what was created.
    existing = _snapshot_files(target_dir)
    cli_main(direct_mode_args=merged)
    return sorted(_snapshot_files(target_dir) - existing)
| # --------------------------------------------------------------------------- | |
| # Implemented via CLI engine (matches agent_routes.py) | |
| # --------------------------------------------------------------------------- | |
def redact_document(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    ocr_method: str | None = None,
    pii_detector: str | None = None,
    instruction: str | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """
    Parity with Gradio `api_name='redact_document'`.

    Builds the argument set for CLI task `redact` and runs it via `_run_cli`.
    Optional arguments left as ``None`` are omitted so the CLI defaults apply;
    `instruction` is passed on as `custom_llm_instructions`. Entries in
    `overrides` are applied last and win over the named arguments.

    Returns the list of file paths reported by `_run_cli`.
    """
    cli_args: dict[str, Any] = {
        "task": "redact",
        "input_file": _ensure_list(input_files),
    }
    optional_args = {
        "ocr_method": ocr_method,
        "pii_detector": pii_detector,
        "custom_llm_instructions": instruction,
    }
    cli_args.update(
        {key: value for key, value in optional_args.items() if value is not None}
    )
    cli_args.update(overrides or {})
    return _run_cli(
        gradio_api_name="redact_document",
        overrides=cli_args,
        output_dir=output_dir,
    )
def redact_data(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    instruction: str | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Parity with Gradio `api_name='redact_data'` (same CLI task: `redact`).

    `instruction`, when given, is passed on as `custom_llm_instructions`.
    Entries in `overrides` are applied last. Returns the list of file paths
    reported by `_run_cli`.
    """
    base_args: dict[str, Any] = {
        "task": "redact",
        "input_file": _ensure_list(input_files),
    }
    if instruction is not None:
        base_args["custom_llm_instructions"] = instruction
    cli_args = {**base_args, **(overrides or {})}
    return _run_cli(
        gradio_api_name="redact_data",
        overrides=cli_args,
        output_dir=output_dir,
    )
def find_duplicate_pages(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    similarity_threshold: float | None = None,
    min_word_count: int | None = None,
    min_consecutive_pages: int | None = None,
    greedy_match: bool | None = None,
    combine_pages: bool | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Parity with Gradio `api_name='find_duplicate_pages'`.

    Runs CLI task `deduplicate` with `duplicate_type='pages'`. Optional
    arguments left as ``None`` are omitted. The two boolean flags are sent to
    the CLI as the strings "True" / "False". Entries in `overrides` are
    applied last. Returns the list of file paths reported by `_run_cli`.
    """
    cli_args: dict[str, Any] = {
        "task": "deduplicate",
        "duplicate_type": "pages",
        "input_file": _ensure_list(input_files),
    }
    passthrough_options = (
        ("similarity_threshold", similarity_threshold),
        ("min_word_count", min_word_count),
        ("min_consecutive_pages", min_consecutive_pages),
    )
    for key, value in passthrough_options:
        if value is not None:
            cli_args[key] = value
    flag_options = (
        ("greedy_match", greedy_match),
        ("combine_pages", combine_pages),
    )
    for key, flag in flag_options:
        if flag is not None:
            # The CLI receives these flags as text, not as booleans.
            cli_args[key] = str(bool(flag))
    cli_args.update(overrides or {})
    return _run_cli(
        gradio_api_name="find_duplicate_pages",
        overrides=cli_args,
        output_dir=output_dir,
    )
def find_duplicate_tabular(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    text_columns: list[str] | None = None,
    similarity_threshold: float | None = None,
    min_word_count: int | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Parity with Gradio `api_name='find_duplicate_tabular'`.

    Runs CLI task `deduplicate` with `duplicate_type='tabular'`. Optional
    arguments left as ``None`` are omitted; `text_columns` is copied into a
    new list. Entries in `overrides` are applied last. Returns the list of
    file paths reported by `_run_cli`.
    """
    cli_args: dict[str, Any] = {
        "task": "deduplicate",
        "duplicate_type": "tabular",
        "input_file": _ensure_list(input_files),
    }
    optional_args = {
        "text_columns": None if text_columns is None else list(text_columns),
        "similarity_threshold": similarity_threshold,
        "min_word_count": min_word_count,
    }
    for key, value in optional_args.items():
        if value is not None:
            cli_args[key] = value
    cli_args.update(overrides or {})
    return _run_cli(
        gradio_api_name="find_duplicate_tabular",
        overrides=cli_args,
        output_dir=output_dir,
    )
def summarise_document(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Parity with Gradio `api_name='summarise_document'` (CLI task: `summarise`).

    Entries in `overrides` are layered over the task and input-file arguments.
    Returns the list of file paths reported by `_run_cli`.
    """
    cli_args: dict[str, Any] = {
        "task": "summarise",
        "input_file": _ensure_list(input_files),
        **(overrides or {}),
    }
    return _run_cli(
        gradio_api_name="summarise_document",
        overrides=cli_args,
        output_dir=output_dir,
    )
def combine_review_pdfs(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Parity with Gradio `api_name='combine_review_pdfs'` (CLI task: `combine_review_pdfs`).

    Entries in `overrides` are layered over the task and input-file arguments.
    Returns the list of file paths reported by `_run_cli`.
    """
    cli_args: dict[str, Any] = {
        "task": "combine_review_pdfs",
        "input_file": _ensure_list(input_files),
        **(overrides or {}),
    }
    return _run_cli(
        gradio_api_name="combine_review_pdfs",
        overrides=cli_args,
        output_dir=output_dir,
    )
| # --------------------------------------------------------------------------- | |
| # Implemented without CLI (as per agent_routes.py) | |
| # --------------------------------------------------------------------------- | |
def combine_review_csvs(
    input_files: Iterable[str],
    *,
    output_dir: str | None = None,
) -> list[str]:
    """Parity with Gradio `api_name='combine_review_csvs'`.

    Merges the given review CSV files by calling
    ``tools.helper_functions.merge_csv_files`` directly (no CLI task).

    Args:
        input_files: Paths of the CSV files to merge. A single path given as
            a ``str`` or path-like object is treated as one file rather than
            being iterated character by character.
        output_dir: Folder to write into; falls back to
            ``tools.config.OUTPUT_FOLDER`` when not given. The folder is
            created if it does not exist.

    Returns:
        Whatever ``merge_csv_files`` returns (annotated here as a list of
        output paths).
    """
    from tools.config import OUTPUT_FOLDER
    from tools.helper_functions import merge_csv_files

    # A bare string is itself iterable; wrap it so it counts as one path,
    # consistent with the sibling functions that accept ``str | list[str]``.
    if isinstance(input_files, (str, os.PathLike)):
        csv_paths = [str(input_files)]
    else:
        csv_paths = [str(p) for p in input_files]

    out_dir = str(output_dir or OUTPUT_FOLDER)
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    # merge_csv_files is handed a folder string that ends in a separator.
    sep = "/" if not out_dir.endswith(("/", "\\")) else ""
    return merge_csv_files(csv_paths, output_folder=out_dir + sep)
def export_review_redaction_overlay(
    *,
    page_image_path: str,
    boxes: list[dict[str, Any]],
    page_number: int = 1,
    doc_base_name: str = "review",
    review_df_records: list[dict[str, Any]] | None = None,
    label_abbrev_chars: int | None = None,
) -> list[str]:
    """Same behaviour as Gradio ``api_name='page_redaction_review_image'``; Agent API route ``export_review_redaction_overlay``.

    Packs the page image path and boxes into an annotator dict, turns
    ``review_df_records`` into a DataFrame (empty when none are given), and
    calls ``visualise_review_redaction_boxes`` with the resolved
    ``OUTPUT_FOLDER`` as the output folder.

    Returns a one-element list holding the path returned by the helper, or an
    empty list when the helper returns a falsy value.
    """
    import pandas as pd
    from tools.config import OUTPUT_FOLDER
    from tools.redaction_review import visualise_review_redaction_boxes

    annotator_payload: dict[str, Any] = {"image": page_image_path, "boxes": boxes}
    if review_df_records:
        review_frame = pd.DataFrame(review_df_records)
    else:
        review_frame = pd.DataFrame()

    resolved_folder = Path(OUTPUT_FOLDER).expanduser().resolve()
    target_folder = str(resolved_folder)
    Path(target_folder).mkdir(parents=True, exist_ok=True)

    overlay_path = visualise_review_redaction_boxes(
        annotator_payload,
        review_df=review_frame,
        output_folder=target_folder,
        page_number=page_number,
        doc_base_name=doc_base_name,
        label_abbrev_chars=label_abbrev_chars,
    )
    if not overlay_path:
        return []
    return [overlay_path]
def export_review_page_ocr_visualisation(
    *,
    page_image_path: str,
    ocr_results: dict[str, Any],
    page_number: int = 1,
    doc_base_name: str = "review",
) -> list[str]:
    """Same behaviour as Gradio ``api_name='page_ocr_review_image'``; Agent API route ``export_review_page_ocr_visualisation``.

    Opens the page image, converts it to RGB, and passes it with
    ``ocr_results`` to ``visualise_ocr_words_bounding_boxes``. Output goes to
    the resolved ``OUTPUT_FOLDER`` with visualisation folder
    ``review_ocr_visualisations``.

    Args:
        page_image_path: Path of the page image to draw on.
        ocr_results: OCR results passed through unchanged to the helper.
        page_number: Page number used in the generated image name.
        doc_base_name: Base name used in the generated image name; an empty
            value falls back to ``"review"``.

    Returns:
        A list built from the value returned by the helper (annotated here as
        a list of paths).
    """
    from PIL import Image
    from tools.config import OUTPUT_FOLDER
    from tools.file_redaction import visualise_ocr_words_bounding_boxes

    out_dir = str(Path(OUTPUT_FOLDER).expanduser().resolve())
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    image_name = f"{str(doc_base_name or 'review')}_page{int(page_number)}.png"
    log_paths: list[str] = []

    # Close the source file handle as soon as the pixel data has been copied;
    # ``convert`` returns a new in-memory image independent of the file.
    with Image.open(page_image_path) as source_image:
        rgb_image = source_image.convert("RGB")

    log_paths = visualise_ocr_words_bounding_boxes(
        rgb_image,
        ocr_results,
        image_name=image_name,
        output_folder=out_dir,
        visualisation_folder="review_ocr_visualisations",
        add_legend=True,
        log_files_output_paths=log_paths,
    )
    return list(log_paths)
# ---------------------------------------------------------------------------
# Gradio-session-only route (no single CLI task), followed by headless
# wrappers around helpers in tools.*
# ---------------------------------------------------------------------------
def load_and_prepare_documents_or_data(*args: Any, **kwargs: Any) -> list[str]:
    """Placeholder for the Gradio route of the same name.

    Always raises ``NotImplementedError``; all arguments are ignored.
    """
    reason = "load_and_prepare_documents_or_data is Gradio-session-state driven and is not exposed as a single CLI task."
    raise NotImplementedError(reason)
def apply_review_redactions(
    pdf_path: str,
    review_csv_path: str,
    *,
    output_dir: str | None = None,
    input_dir: str | None = None,
    text_extract_method: str | None = None,
    efficient_ocr: bool | None = None,
) -> list[str]:
    """
    Headless parity with Gradio ``api_name='apply_review_redactions'``.

    Forwards every argument unchanged to
    ``tools.simplified_api.run_apply_review_redactions``.

    Returns the ``output_paths`` entry of the helper's result as a list
    (redacted PDF, review CSV, logs, etc.), or an empty list when that entry
    is missing or empty.
    """
    from tools.simplified_api import run_apply_review_redactions

    result = run_apply_review_redactions(
        pdf_path=pdf_path,
        review_csv_path=review_csv_path,
        output_dir=output_dir,
        input_dir=input_dir,
        text_extract_method=text_extract_method,
        efficient_ocr=efficient_ocr,
    )
    produced_paths = result.get("output_paths")
    if not produced_paths:
        return []
    return list(produced_paths)
def word_level_ocr_text_search(
    ocr_words_csv_path: str,
    search_text: str,
    *,
    similarity_threshold: float = 1.0,
    use_regex: bool = False,
    review_csv_path: str | None = None,
) -> dict:
    """Headless word-level OCR search against ``*_ocr_results_with_words_*.csv``.

    Thin wrapper: forwards all arguments to
    ``tools.verify_redaction_coverage.run_word_level_ocr_text_search`` and
    returns its result unchanged.
    """
    from tools.verify_redaction_coverage import run_word_level_ocr_text_search

    search_options = {
        "similarity_threshold": similarity_threshold,
        "use_regex": use_regex,
        "review_csv_path": review_csv_path,
    }
    search_result = run_word_level_ocr_text_search(
        ocr_words_csv_path,
        search_text,
        **search_options,
    )
    return search_result
def verify_redaction_coverage(
    review_csv_path: str,
    ocr_words_csv_path: str,
    *,
    must_redact: list[str] | None = None,
    must_not_redact: list[str] | None = None,
    redacted_pdf_path: str | None = None,
    total_pages: int | None = None,
    min_word_length: int = 3,
    sample_pixels: bool = False,
    auto_prune_suspicious: bool = False,
    pruned_output_path: str | None = None,
) -> dict:
    """Pass 1 programmatic coverage report (no VLM).

    Forwards all arguments to
    ``tools.simplified_api.run_verify_redaction_coverage``, which returns a
    three-item result. Only the first item (the report) is returned; the
    other two are discarded.
    """
    from tools.simplified_api import run_verify_redaction_coverage

    coverage_options = {
        "must_redact": must_redact,
        "must_not_redact": must_not_redact,
        "redacted_pdf_path": redacted_pdf_path,
        "total_pages": total_pages,
        "min_word_length": min_word_length,
        "sample_pixels": sample_pixels,
        "auto_prune_suspicious": auto_prune_suspicious,
        "pruned_output_path": pruned_output_path,
    }
    coverage_report, _second, _third = run_verify_redaction_coverage(
        review_csv_path,
        ocr_words_csv_path,
        **coverage_options,
    )
    return coverage_report
# Public API of this module: the names exported by ``from <module> import *``.
__all__ = [
    # Redaction and review
    "redact_document",
    "load_and_prepare_documents_or_data",
    "apply_review_redactions",
    "export_review_page_ocr_visualisation",
    "export_review_redaction_overlay",
    # Verification and search
    "word_level_ocr_text_search",
    "verify_redaction_coverage",
    # Tabular redaction, deduplication, summarisation, combining
    "redact_data",
    "find_duplicate_pages",
    "find_duplicate_tabular",
    "summarise_document",
    "combine_review_csvs",
    "combine_review_pdfs",
]