Spaces:

seanpedrickcase
/

document_redaction_vlm

Running on Zero

App Files Files Community

document_redaction_vlm / doc_redaction /cli_api.py

seanpedrickcase

Sync: Fixed minor s3 prefix construction issue

ee6490e 1 day ago

raw

history blame contribute delete

12.9 kB

	"""
	CLI-first programmatic API surface.

	These functions provide a minimal, runnable Python interface that mirrors the
	Gradio `api_name` routes, but executes the underlying workflows via the CLI
	engine (`cli_redact.main(direct_mode_args=...)`).

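	Most functions return a list of output file paths (for the CLI-backed ones,
	files created in `output_dir`); `word_level_ocr_text_search` and
	`verify_redaction_coverage` return a dict instead.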
	"""

	from __future__ import annotations

	import os
	import tempfile
	from pathlib import Path
	from typing import Any, Iterable


	def _ensure_list(v: str \| list[str] \| tuple[str, ...]) -> list[str]:
	if isinstance(v, (list, tuple)):
	return [str(x) for x in v]
	return [str(v)]


	def _snapshot_files(folder: str) -> set[str]:
	root = Path(folder)
	if not root.exists():
	return set()
	out: set[str] = set()
	for dirpath, _, filenames in os.walk(root):
	for name in filenames:
	out.add(str(Path(dirpath) / name))
	return out


	def _default_output_dir(prefix: str) -> str:
	return tempfile.mkdtemp(prefix=f"doc_redaction_{prefix}_")


	def _run_cli(
	    *,
	    gradio_api_name: str,
	    overrides: dict[str, Any],
	    output_dir: str | None,
	) -> list[str]:
	    """
	    Run ``cli_redact.main`` with merged defaults and return the files it wrote.

	    The CLI default arguments are fetched with ``get_cli_default_args_dict()``,
	    updated with ``overrides`` and handed to ``cli_redact.main`` as
	    ``direct_mode_args``.

	    Args:
	        gradio_api_name: Route name; only used to name the temporary output
	            directory when no output directory is supplied.
	        overrides: CLI argument values laid over the defaults.
	        output_dir: Directory passed to the CLI as ``output_dir``. When
	            ``None``, an ``"output_dir"`` entry in ``overrides`` is honoured;
	            failing that a fresh temporary directory is created.

	    Returns:
	        Sorted paths of the files under the output directory that are new, or
	        whose size / modification time changed, during the run.
	    """
	    from cli_redact import get_cli_default_args_dict
	    from cli_redact import main as cli_main

	    def _fingerprints(paths: set[str]) -> dict[str, tuple[int, int] | None]:
	        # (mtime_ns, size) per file; None when the entry cannot be stat'ed
	        # (for example a dangling symlink) so it is still tracked by name.
	        marks: dict[str, tuple[int, int] | None] = {}
	        for path in paths:
	            try:
	                info = os.stat(path)
	            except OSError:
	                marks[path] = None
	            else:
	                marks[path] = (info.st_mtime_ns, info.st_size)
	        return marks

	    merged = get_cli_default_args_dict()
	    merged.update(overrides)

	    if output_dir is None:
	        # A caller-supplied overrides["output_dir"] used to be silently
	        # replaced by a temp dir; respect it before falling back.
	        output_dir = overrides.get("output_dir") or _default_output_dir(
	            gradio_api_name
	        )
	    target = str(output_dir)
	    merged["output_dir"] = target

	    before = _fingerprints(_snapshot_files(target))
	    cli_main(direct_mode_args=merged)
	    after = _fingerprints(_snapshot_files(target))

	    # Comparing names alone misses outputs that overwrite an existing file
	    # (e.g. a second run into the same directory), so compare fingerprints.
	    return sorted(
	        path
	        for path, mark in after.items()
	        if path not in before or before[path] != mark
	    )


	# ---------------------------------------------------------------------------
	# Implemented via CLI engine (matches agent_routes.py)
	# ---------------------------------------------------------------------------


	def redact_document(
	    input_files: str | list[str],
	    *,
	    output_dir: str | None = None,
	    ocr_method: str | None = None,
	    pii_detector: str | None = None,
	    instruction: str | None = None,
	    overrides: dict[str, Any] | None = None,
	) -> list[str]:
	    """
	    Programmatic counterpart of Gradio ``api_name='redact_document'``.

	    Builds the argument set for CLI task ``redact`` and runs it through
	    ``_run_cli``.

	    Args:
	        input_files: One input path or a list of them.
	        output_dir: Forwarded to ``_run_cli`` as the output directory.
	        ocr_method: Sent as ``ocr_method`` when not ``None``.
	        pii_detector: Sent as ``pii_detector`` when not ``None``.
	        instruction: Sent as ``custom_llm_instructions`` when not ``None``.
	        overrides: Extra CLI arguments; applied last, so they win over the
	            values above.

	    Returns:
	        The list of file paths returned by ``_run_cli``.
	    """
	    optional_args = {
	        "ocr_method": ocr_method,
	        "pii_detector": pii_detector,
	        "custom_llm_instructions": instruction,
	    }
	    cli_args: dict[str, Any] = {
	        "task": "redact",
	        "input_file": _ensure_list(input_files),
	    }
	    # Only explicitly supplied options are sent; the rest keep CLI defaults.
	    for key, value in optional_args.items():
	        if value is not None:
	            cli_args[key] = value
	    if overrides:
	        cli_args.update(overrides)
	    return _run_cli(
	        gradio_api_name="redact_document",
	        overrides=cli_args,
	        output_dir=output_dir,
	    )


	def redact_data(
	    input_files: str | list[str],
	    *,
	    output_dir: str | None = None,
	    instruction: str | None = None,
	    overrides: dict[str, Any] | None = None,
	) -> list[str]:
	    """
	    Programmatic counterpart of Gradio ``api_name='redact_data'``.

	    Uses the same CLI task as ``redact_document`` (``redact``).

	    Args:
	        input_files: One input path or a list of them.
	        output_dir: Forwarded to ``_run_cli`` as the output directory.
	        instruction: Sent as ``custom_llm_instructions`` when not ``None``.
	        overrides: Extra CLI arguments; applied last.

	    Returns:
	        The list of file paths returned by ``_run_cli``.
	    """
	    cli_args: dict[str, Any] = {
	        "task": "redact",
	        "input_file": _ensure_list(input_files),
	    }
	    if instruction is not None:
	        cli_args["custom_llm_instructions"] = instruction
	    cli_args.update(overrides or {})
	    return _run_cli(
	        gradio_api_name="redact_data",
	        overrides=cli_args,
	        output_dir=output_dir,
	    )


	def find_duplicate_pages(
	    input_files: str | list[str],
	    *,
	    output_dir: str | None = None,
	    similarity_threshold: float | None = None,
	    min_word_count: int | None = None,
	    min_consecutive_pages: int | None = None,
	    greedy_match: bool | None = None,
	    combine_pages: bool | None = None,
	    overrides: dict[str, Any] | None = None,
	) -> list[str]:
	    """
	    Programmatic counterpart of Gradio ``api_name='find_duplicate_pages'``.

	    Runs CLI task ``deduplicate`` with ``duplicate_type='pages'``.

	    Args:
	        input_files: One input path or a list of them.
	        output_dir: Forwarded to ``_run_cli`` as the output directory.
	        similarity_threshold: Passed through unchanged when not ``None``.
	        min_word_count: Passed through unchanged when not ``None``.
	        min_consecutive_pages: Passed through unchanged when not ``None``.
	        greedy_match: Sent as the string ``"True"`` / ``"False"`` when not
	            ``None``.
	        combine_pages: Sent as the string ``"True"`` / ``"False"`` when not
	            ``None``.
	        overrides: Extra CLI arguments; applied last.

	    Returns:
	        The list of file paths returned by ``_run_cli``.
	    """
	    cli_args: dict[str, Any] = {
	        "task": "deduplicate",
	        "duplicate_type": "pages",
	        "input_file": _ensure_list(input_files),
	    }

	    passthrough = (
	        ("similarity_threshold", similarity_threshold),
	        ("min_word_count", min_word_count),
	        ("min_consecutive_pages", min_consecutive_pages),
	    )
	    for key, value in passthrough:
	        if value is not None:
	            cli_args[key] = value

	    # Boolean switches are handed to the CLI as the text "True" / "False".
	    switches = (
	        ("greedy_match", greedy_match),
	        ("combine_pages", combine_pages),
	    )
	    for key, flag in switches:
	        if flag is not None:
	            cli_args[key] = str(bool(flag))

	    if overrides:
	        cli_args.update(overrides)
	    return _run_cli(
	        gradio_api_name="find_duplicate_pages",
	        overrides=cli_args,
	        output_dir=output_dir,
	    )


	def find_duplicate_tabular(
	    input_files: str | list[str],
	    *,
	    output_dir: str | None = None,
	    text_columns: list[str] | None = None,
	    similarity_threshold: float | None = None,
	    min_word_count: int | None = None,
	    overrides: dict[str, Any] | None = None,
	) -> list[str]:
	    """
	    Parity with Gradio ``api_name='find_duplicate_tabular'``.

	    Runs CLI task ``deduplicate`` with ``duplicate_type='tabular'``.

	    Args:
	        input_files: One input path or a list of them.
	        output_dir: Forwarded to ``_run_cli`` as the output directory.
	        text_columns: Column names sent as ``text_columns`` when not ``None``.
	            A single column name given as a plain string is accepted and
	            treated as a one-element list.
	        similarity_threshold: Passed through unchanged when not ``None``.
	        min_word_count: Passed through unchanged when not ``None``.
	        overrides: Extra CLI arguments; applied last.

	    Returns:
	        The list of file paths returned by ``_run_cli``.
	    """
	    direct: dict[str, Any] = {
	        "task": "deduplicate",
	        "duplicate_type": "tabular",
	        "input_file": _ensure_list(input_files),
	    }
	    if text_columns is not None:
	        # list("name") would yield ['n', 'a', 'm', 'e']; keep a lone column
	        # name intact instead of splitting it into characters.
	        if isinstance(text_columns, str):
	            direct["text_columns"] = [text_columns]
	        else:
	            direct["text_columns"] = list(text_columns)
	    if similarity_threshold is not None:
	        direct["similarity_threshold"] = similarity_threshold
	    if min_word_count is not None:
	        direct["min_word_count"] = min_word_count
	    if overrides:
	        direct.update(overrides)
	    return _run_cli(
	        gradio_api_name="find_duplicate_tabular",
	        overrides=direct,
	        output_dir=output_dir,
	    )


	def summarise_document(
	    input_files: str | list[str],
	    *,
	    output_dir: str | None = None,
	    overrides: dict[str, Any] | None = None,
	) -> list[str]:
	    """
	    Programmatic counterpart of Gradio ``api_name='summarise_document'``.

	    Runs CLI task ``summarise``.

	    Args:
	        input_files: One input path or a list of them.
	        output_dir: Forwarded to ``_run_cli`` as the output directory.
	        overrides: Extra CLI arguments laid over the task and input files.

	    Returns:
	        The list of file paths returned by ``_run_cli``.
	    """
	    cli_args: dict[str, Any] = {}
	    cli_args["task"] = "summarise"
	    cli_args["input_file"] = _ensure_list(input_files)
	    cli_args.update(overrides or {})
	    return _run_cli(
	        gradio_api_name="summarise_document",
	        overrides=cli_args,
	        output_dir=output_dir,
	    )


	def combine_review_pdfs(
	    input_files: str | list[str],
	    *,
	    output_dir: str | None = None,
	    overrides: dict[str, Any] | None = None,
	) -> list[str]:
	    """
	    Programmatic counterpart of Gradio ``api_name='combine_review_pdfs'``.

	    Runs CLI task ``combine_review_pdfs``.

	    Args:
	        input_files: One input path or a list of them.
	        output_dir: Forwarded to ``_run_cli`` as the output directory.
	        overrides: Extra CLI arguments laid over the task and input files.

	    Returns:
	        The list of file paths returned by ``_run_cli``.
	    """
	    route = "combine_review_pdfs"
	    cli_args: dict[str, Any] = {
	        "task": "combine_review_pdfs",
	        "input_file": _ensure_list(input_files),
	    }
	    if overrides:
	        for key, value in dict(overrides).items():
	            cli_args[key] = value
	    return _run_cli(
	        gradio_api_name=route,
	        overrides=cli_args,
	        output_dir=output_dir,
	    )


	# ---------------------------------------------------------------------------
	# Implemented without CLI (as per agent_routes.py)
	# ---------------------------------------------------------------------------


	def combine_review_csvs(
	    input_files: Iterable[str],
	    *,
	    output_dir: str | None = None,
	) -> list[str]:
	    """
	    Parity with Gradio ``api_name='combine_review_csvs'``.

	    Calls ``tools.helper_functions.merge_csv_files`` directly rather than
	    going through the CLI.

	    Args:
	        input_files: Paths of the CSV files to merge. A single path given as a
	            plain string or ``os.PathLike`` is accepted and treated as a
	            one-element collection.
	        output_dir: Folder to write into; falls back to
	            ``tools.config.OUTPUT_FOLDER`` when ``None`` or empty. The folder
	            is created if missing.

	    Returns:
	        Whatever ``merge_csv_files`` returns (annotated here as a list of
	        paths).
	    """
	    from tools.config import OUTPUT_FOLDER
	    from tools.helper_functions import merge_csv_files

	    # A bare string is itself an Iterable[str]; iterating it would pass every
	    # single character on as a "file path".
	    if isinstance(input_files, (str, os.PathLike)):
	        input_files = [input_files]

	    out_dir = str(output_dir or OUTPUT_FOLDER)
	    Path(out_dir).mkdir(parents=True, exist_ok=True)
	    # merge_csv_files is given the folder with a trailing separator.
	    sep = "/" if not out_dir.endswith(("/", "\\")) else ""

	    return merge_csv_files(
	        [str(p) for p in input_files], output_folder=out_dir + sep
	    )


	def export_review_redaction_overlay(
	    *,
	    page_image_path: str,
	    boxes: list[dict[str, Any]],
	    page_number: int = 1,
	    doc_base_name: str = "review",
	    review_df_records: list[dict[str, Any]] | None = None,
	    label_abbrev_chars: int | None = None,
	) -> list[str]:
	    """
	    Counterpart of Gradio ``api_name='page_redaction_review_image'`` (Agent API
	    route ``export_review_redaction_overlay``).

	    Wraps ``tools.redaction_review.visualise_review_redaction_boxes``; output
	    goes to the resolved ``tools.config.OUTPUT_FOLDER``, which is created if
	    missing.

	    Args:
	        page_image_path: Stored under the ``"image"`` key of the annotator
	            dict given to the helper.
	        boxes: Stored under the ``"boxes"`` key of the annotator dict.
	        page_number: Forwarded to the helper.
	        doc_base_name: Forwarded to the helper.
	        review_df_records: Row records turned into a ``pandas.DataFrame``; an
	            empty frame is used when ``None`` or empty.
	        label_abbrev_chars: Forwarded to the helper.

	    Returns:
	        A one-element list with the helper's result, or an empty list when
	        that result is falsy.
	    """
	    import pandas as pd

	    from tools.config import OUTPUT_FOLDER
	    from tools.redaction_review import visualise_review_redaction_boxes

	    annotation_payload: dict[str, Any] = {
	        "image": page_image_path,
	        "boxes": boxes,
	    }
	    if review_df_records:
	        records_frame = pd.DataFrame(review_df_records)
	    else:
	        records_frame = pd.DataFrame()

	    target_dir = Path(OUTPUT_FOLDER).expanduser().resolve()
	    target_dir.mkdir(parents=True, exist_ok=True)

	    overlay_path = visualise_review_redaction_boxes(
	        annotation_payload,
	        review_df=records_frame,
	        output_folder=str(target_dir),
	        page_number=page_number,
	        doc_base_name=doc_base_name,
	        label_abbrev_chars=label_abbrev_chars,
	    )
	    if not overlay_path:
	        return []
	    return [overlay_path]


	def export_review_page_ocr_visualisation(
	    *,
	    page_image_path: str,
	    ocr_results: dict[str, Any],
	    page_number: int = 1,
	    doc_base_name: str = "review",
	) -> list[str]:
	    """
	    Same behaviour as Gradio ``api_name='page_ocr_review_image'``; Agent API
	    route ``export_review_page_ocr_visualisation``.

	    Wraps ``tools.file_redaction.visualise_ocr_words_bounding_boxes``; output
	    goes beneath the resolved ``tools.config.OUTPUT_FOLDER`` (created if
	    missing) with ``visualisation_folder="review_ocr_visualisations"``.

	    Args:
	        page_image_path: Image file opened with Pillow and converted to RGB.
	        ocr_results: Forwarded to the helper unchanged.
	        page_number: Used (as an ``int``) in the generated image name.
	        doc_base_name: Stem of the generated image name; ``"review"`` is used
	            when empty.

	    Returns:
	        A list copy of the paths returned by the helper.
	    """
	    from PIL import Image

	    from tools.config import OUTPUT_FOLDER
	    from tools.file_redaction import visualise_ocr_words_bounding_boxes

	    out_dir = str(Path(OUTPUT_FOLDER).expanduser().resolve())
	    Path(out_dir).mkdir(parents=True, exist_ok=True)

	    image_name = f"{str(doc_base_name or 'review')}_page{int(page_number)}.png"

	    # Image.open() keeps the underlying file open; the context manager closes
	    # it deterministically. convert() returns an independent in-memory copy,
	    # so the RGB image remains usable after the file is closed.
	    with Image.open(page_image_path) as source_image:
	        rgb_image = source_image.convert("RGB")

	    log_paths: list[str] = []
	    log_paths = visualise_ocr_words_bounding_boxes(
	        rgb_image,
	        ocr_results,
	        image_name=image_name,
	        output_folder=out_dir,
	        visualisation_folder="review_ocr_visualisations",
	        add_legend=True,
	        log_files_output_paths=log_paths,
	    )
	    return list(log_paths)


	# ---------------------------------------------------------------------------
	# Gradio-session-only (no single CLI task)
	# ---------------------------------------------------------------------------


	def load_and_prepare_documents_or_data(*args: Any, **kwargs: Any) -> list[str]:
	    """Placeholder for the Gradio-session-only route of the same name.

	    Accepts any positional and keyword arguments so that every call, however
	    it is spelled, fails with the same explanatory error.

	    Raises:
	        NotImplementedError: Always.
	    """
	    raise NotImplementedError(
	        "load_and_prepare_documents_or_data is Gradio-session-state driven and is not exposed as a single CLI task."
	    )


	def apply_review_redactions(
	    pdf_path: str,
	    review_csv_path: str,
	    *,
	    output_dir: str | None = None,
	    input_dir: str | None = None,
	    text_extract_method: str | None = None,
	    efficient_ocr: bool | None = None,
	) -> list[str]:
	    """
	    Headless counterpart of Gradio ``api_name='apply_review_redactions'``.

	    Forwards every argument by keyword to
	    ``tools.simplified_api.run_apply_review_redactions``.

	    Returns:
	        A list built from the ``"output_paths"`` entry of the helper's result
	        (redacted PDF, review CSV, logs, etc.); empty when that entry is
	        missing or falsy.
	    """
	    from tools.simplified_api import run_apply_review_redactions

	    call_kwargs = {
	        "pdf_path": pdf_path,
	        "review_csv_path": review_csv_path,
	        "output_dir": output_dir,
	        "input_dir": input_dir,
	        "text_extract_method": text_extract_method,
	        "efficient_ocr": efficient_ocr,
	    }
	    outcome = run_apply_review_redactions(**call_kwargs)
	    produced = outcome.get("output_paths")
	    return list(produced) if produced else []


	def word_level_ocr_text_search(
	    ocr_words_csv_path: str,
	    search_text: str,
	    *,
	    similarity_threshold: float = 1.0,
	    use_regex: bool = False,
	    review_csv_path: str | None = None,
	) -> dict:
	    """
	    Headless word-level OCR search against ``_ocr_results_with_words_.csv``.

	    Thin pass-through to
	    ``tools.verify_redaction_coverage.run_word_level_ocr_text_search``: the two
	    paths/texts go positionally, the remaining options by keyword.

	    Returns:
	        The helper's result, unchanged.
	    """
	    from tools.verify_redaction_coverage import run_word_level_ocr_text_search

	    search_options = {
	        "similarity_threshold": similarity_threshold,
	        "use_regex": use_regex,
	        "review_csv_path": review_csv_path,
	    }
	    result = run_word_level_ocr_text_search(
	        ocr_words_csv_path, search_text, **search_options
	    )
	    return result


	def verify_redaction_coverage(
	    review_csv_path: str,
	    ocr_words_csv_path: str,
	    *,
	    must_redact: list[str] | None = None,
	    must_not_redact: list[str] | None = None,
	    redacted_pdf_path: str | None = None,
	    total_pages: int | None = None,
	    min_word_length: int = 3,
	    sample_pixels: bool = False,
	    auto_prune_suspicious: bool = False,
	    pruned_output_path: str | None = None,
	) -> dict:
	    """
	    Pass 1 programmatic coverage report (no VLM).

	    Forwards all arguments to
	    ``tools.simplified_api.run_verify_redaction_coverage``, which returns a
	    three-item result.

	    Returns:
	        The first item of that result (the report); the other two items are
	        discarded.
	    """
	    from tools.simplified_api import run_verify_redaction_coverage

	    coverage_options = {
	        "must_redact": must_redact,
	        "must_not_redact": must_not_redact,
	        "redacted_pdf_path": redacted_pdf_path,
	        "total_pages": total_pages,
	        "min_word_length": min_word_length,
	        "sample_pixels": sample_pixels,
	        "auto_prune_suspicious": auto_prune_suspicious,
	        "pruned_output_path": pruned_output_path,
	    }
	    outcome = run_verify_redaction_coverage(
	        review_csv_path, ocr_words_csv_path, **coverage_options
	    )
	    # Exactly three items are expected; only the report is handed back.
	    coverage_report, _unused_second, _unused_third = outcome
	    return coverage_report


	# Names exported by a star-import of this module. The underscore-prefixed
	# helpers (_ensure_list, _snapshot_files, _default_output_dir, _run_cli) are
	# not listed.
	__all__ = [
	"redact_document",
	"load_and_prepare_documents_or_data",
	"apply_review_redactions",
	"export_review_page_ocr_visualisation",
	"export_review_redaction_overlay",
	"word_level_ocr_text_search",
	"verify_redaction_coverage",
	"redact_data",
	"find_duplicate_pages",
	"find_duplicate_tabular",
	"summarise_document",
	"combine_review_csvs",
	"combine_review_pdfs",
	]