Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

document_redaction / tools /preview_redaction_boxes.py

seanpedrickcase

Sync: further adjustments to simple text extraction with single column layouts

5c6e6ce 1 day ago

raw

history blame contribute delete

11.6 kB

	"""
	preview_redaction_boxes.py
	==========================
	Local-first coordinate preview tool for the Document Redaction app.

	Purpose
	-------
	Render proposed redaction boxes from a ``*_review_file.csv`` onto the
	original (un-redacted) PDF pages and save the result as PNG images.
	Because this runs entirely locally with PyMuPDF + Pillow, iteration is
	instantaneous — no server round-trip, no waiting for ``/review_apply``.

	Primary use-case
	----------------
	Called by agents or humans **between CSV edits and the API call to
	``/review_apply``**. Iterate until the preview looks right, *then*
	send to the server. This avoids the expensive cycle of:

	guess coordinates → apply → download → render → spot the miss → repeat

	Typical agent workflow
	----------------------
	1. Edit ``*_review_file_edited.csv`` (remove FPs, add signatures, etc.).
	2. Call ``preview_redaction_boxes(pdf_path, csv_path, out_dir)`` locally.
	3. Inspect the saved PNGs.
	4. If anything is wrong, adjust the CSV and go to step 2.
	5. Only when satisfied, call ``/review_apply`` on the server.

	API endpoint (server-side fallback)
	------------------------------------
	When the agent does not have a local copy of the original PDF,
	``preview_boxes_api()`` exposes the same logic as a short ``gr.api``
	endpoint registered as ``/preview_boxes`` in ``app.py``. The caller
	uploads the original PDF and the edited review CSV; the server returns a
	ZIP of preview PNGs.

	CLI usage
	---------
	python tools/preview_redaction_boxes.py original.pdf review_file.csv

	# Optional flags:
	python tools/preview_redaction_boxes.py original.pdf review_file.csv \\
	--out-dir output/preview \\
	--dpi 150 \\
	--max-width 1280 \\
	--grid # draw percentage-grid lines
	--pages 1,3,5 # only render specific pages (1-indexed)
	"""

	from __future__ import annotations

	import argparse
	import csv
	import zipfile
	from io import BytesIO
	from pathlib import Path
	from typing import Sequence

	import pymupdf
	from PIL import Image, ImageDraw, ImageFont

	# ── Colour palette per label type ──────────────────────────────────────────
	# Keyword → hex colour. Insertion order matters: ``_label_colour`` returns the
	# colour of the first keyword found inside the (upper-cased) label.
	_LABEL_COLOURS: dict[str, str] = dict(
	    PERSON="#e74c3c",  # red
	    SIGNATURE="#8e44ad",  # purple
	    LOCATION="#2980b9",  # blue
	    EMAIL_ADDRESS="#e67e22",  # orange
	    PHONE_NUMBER="#27ae60",  # green
	    CUSTOM="#f39c12",  # amber
	    DATE_TIME="#16a085",  # teal
	    ORG="#7f8c8d",  # grey
	)
	# Colour used when a label contains none of the keywords above.
	_DEFAULT_COLOUR = "#c0392b"

	# ── Grid style ─────────────────────────────────────────────────────────────
	_GRID_COLOUR = "#cc0000"
	# Spacing between grid lines, in percent of page height.
	_GRID_STEP = 5


	def _label_colour(label: str) -> str:
	    """Return the hex colour to use for *label*.

	    Each ``_LABEL_COLOURS`` key is tested, in insertion order, as a substring
	    of the upper-cased label; the first hit wins. A label containing none of
	    the keys gets ``_DEFAULT_COLOUR``.
	    """
	    return next(
	        (
	            palette_colour
	            for keyword, palette_colour in _LABEL_COLOURS.items()
	            if keyword in label.upper()
	        ),
	        _DEFAULT_COLOUR,
	    )


	def _load_font(size: int = 11) -> ImageFont.ImageFont:
	    """Return a PIL font; fall back to the default if no TTF is available.

	    The candidate TrueType files are tried in order and the first one that
	    loads at *size* is returned. ``OSError`` from a failed load moves on to
	    the next candidate.
	    """
	    candidates = ("DejaVuSans.ttf", "Arial.ttf", "LiberationSans-Regular.ttf")
	    for font_file in candidates:
	        try:
	            loaded = ImageFont.truetype(font_file, size)
	        except OSError:
	            continue
	        return loaded
	    return ImageFont.load_default()


	def _group_rows_by_page(csv_path: Path) -> dict[int, list[dict]]:
	    """Read the review CSV and bucket its rows by integer page number.

	    Rows whose ``page`` value cannot be converted to an integer are dropped.
	    """
	    with csv_path.open(newline="", encoding="utf-8-sig") as fh:
	        rows = list(csv.DictReader(fh))

	    grouped: dict[int, list[dict]] = {}
	    for row in rows:
	        try:
	            page_num = int(float(row.get("page", "0") or 0))
	        except (TypeError, ValueError, OverflowError):
	            # ValueError: non-numeric text or "nan"; OverflowError: "inf".
	            continue
	        grouped.setdefault(page_num, []).append(row)
	    return grouped


	def _row_box(
	    row: dict, width: int, height: int
	) -> tuple[float, float, float, float] | None:
	    """Convert a row's fractional box to pixel corners, or None if unusable."""
	    try:
	        xs = (float(row["xmin"]) * width, float(row["xmax"]) * width)
	        ys = (float(row["ymin"]) * height, float(row["ymax"]) * height)
	    except (KeyError, TypeError, ValueError):
	        # TypeError: csv.DictReader fills missing trailing cells with None.
	        return None
	    # Order the corners so a hand-edited row with min/max swapped is still
	    # drawn; Pillow's rectangle() may reject a second corner that precedes
	    # the first.
	    return min(xs), min(ys), max(xs), max(ys)


	def preview_redaction_boxes(
	    pdf_path: str | Path,
	    csv_path: str | Path,
	    out_dir: str | Path | None = None,
	    *,
	    dpi: int = 150,
	    max_width: int = 1280,
	    draw_grid: bool = True,
	    pages: Sequence[int] | None = None,
	) -> list[Path]:
	    """
	    Render proposed redaction boxes from csv_path onto the original PDF
	    at pdf_path and save one PNG per page to out_dir.

	    Box coordinates (``xmin``, ``ymin``, ``xmax``, ``ymax``) are read as
	    fractions of the page width/height and scaled to the rendered image.
	    Rows with a missing or non-numeric page or coordinate are skipped.

	    Parameters
	    ----------
	    pdf_path:
	        Path to the original (un-redacted) PDF.
	    csv_path:
	        Path to the ``*_review_file.csv`` (original or edited).
	    out_dir:
	        Directory for output PNGs. Defaults to a ``preview/`` subfolder
	        next to the CSV. Created if it does not exist.
	    dpi:
	        Render resolution. 150 is a good balance of speed vs. detail.
	        Use 200-300 for detailed inspection of small text.
	    max_width:
	        Downscale rendered pages to at most this width (pixels) before
	        drawing boxes, to keep file sizes manageable.
	    draw_grid:
	        If True, overlay horizontal lines at every _GRID_STEP percent of
	        page height with percentage labels so you can read off normalized
	        y-coordinates by eye.
	    pages:
	        If given, only render these 1-indexed page numbers. Numbers outside
	        the document's page range are ignored and duplicates are rendered
	        once. An empty or None value renders every page.

	    Returns
	    -------
	    list[Path]
	        Sorted list of saved PNG paths.
	    """
	    pdf_path = Path(pdf_path)
	    csv_path = Path(csv_path)

	    out_dir = csv_path.parent / "preview" if out_dir is None else Path(out_dir)
	    out_dir.mkdir(parents=True, exist_ok=True)

	    rows_by_page = _group_rows_by_page(csv_path)
	    font = _load_font(11)
	    saved: list[Path] = []

	    # The context manager closes the document even if rendering or saving
	    # a page raises.
	    with pymupdf.open(str(pdf_path)) as doc:
	        if pages:
	            # dict.fromkeys de-duplicates while keeping the caller's order.
	            page_range = [
	                p for p in dict.fromkeys(pages) if 1 <= p <= doc.page_count
	            ]
	        else:
	            page_range = range(1, doc.page_count + 1)

	        for page_num in page_range:
	            pix = doc[page_num - 1].get_pixmap(dpi=dpi)
	            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

	            # ── Downscale if needed ──────────────────────────────────────
	            if pix.width > max_width:
	                scale = max_width / pix.width
	                img = img.resize(
	                    (max_width, int(pix.height * scale)), Image.LANCZOS
	                )
	            draw_w, draw_h = img.size

	            # "RGBA" drawing mode lets the 8-digit hex fills blend with the page.
	            draw = ImageDraw.Draw(img, "RGBA")
	            page_rows = rows_by_page.get(page_num, [])

	            # ── Percentage grid ──────────────────────────────────────────
	            if draw_grid:
	                for pct in range(0, 101, _GRID_STEP):
	                    y = int(pct / 100 * draw_h)
	                    draw.line(
	                        [(0, y), (draw_w, y)], fill=_GRID_COLOUR + "55", width=1
	                    )
	                    draw.text(
	                        (3, max(0, y - 11)), f"{pct}%", fill=_GRID_COLOUR, font=font
	                    )

	            # ── Redaction boxes ──────────────────────────────────────────
	            for row in page_rows:
	                box = _row_box(row, draw_w, draw_h)
	                if box is None:
	                    continue
	                x0, y0, x1, y1 = box

	                # Missing, None or blank labels all fall back to CUSTOM.
	                label = row.get("label") or "CUSTOM"
	                colour = _label_colour(label)
	                text_snippet = (row.get("text", "") or "")[:30]

	                # Semi-transparent fill
	                draw.rectangle(
	                    [x0, y0, x1, y1], fill=colour + "33", outline=colour, width=2
	                )

	                # Label text
	                tag = f"{label}: {text_snippet}" if text_snippet else label
	                draw.text((x0 + 3, y0 + 2), tag, fill=colour, font=font)

	            # ── Legend (top-right corner) ────────────────────────────────
	            legend_labels = sorted({r.get("label") or "CUSTOM" for r in page_rows})
	            # Clamp so the legend stays on-canvas for pages narrower than 200 px.
	            lx, ly = max(0, draw_w - 200), 8
	            for lbl in legend_labels:
	                col = _label_colour(lbl)
	                draw.rectangle(
	                    [lx, ly, lx + 14, ly + 14], fill=col + "cc", outline=col, width=1
	                )
	                draw.text((lx + 18, ly + 1), lbl, fill=col, font=font)
	                ly += 17

	            out_path = out_dir / f"page_{page_num:03d}_preview.png"
	            img.save(out_path)
	            saved.append(out_path)

	    print(f"Saved {len(saved)} preview image(s) to: {out_dir}")
	    return sorted(saved)


	def preview_redaction_boxes_to_zip(
	    pdf_path: str | Path,
	    csv_path: str | Path,
	    *,
	    dpi: int = 150,
	    max_width: int = 1280,
	    draw_grid: bool = True,
	    pages: Sequence[int] | None = None,
	) -> bytes:
	    """
	    Render the previews like ``preview_redaction_boxes`` and return them
	    as the bytes of a single ZIP archive.

	    The PNGs are written to a temporary directory, added to an in-memory
	    deflate-compressed ZIP under their bare file names, and the directory
	    is removed afterwards. This is what the ``preview_boxes_api`` server
	    endpoint uses, so callers get every image in one response without a
	    shared filesystem.
	    """
	    import tempfile

	    archive = BytesIO()
	    with tempfile.TemporaryDirectory() as scratch_dir:
	        rendered = preview_redaction_boxes(
	            pdf_path,
	            csv_path,
	            out_dir=scratch_dir,
	            dpi=dpi,
	            max_width=max_width,
	            draw_grid=draw_grid,
	            pages=pages,
	        )
	        # The files must be zipped before the temporary directory is removed.
	        with zipfile.ZipFile(
	            archive, mode="w", compression=zipfile.ZIP_DEFLATED
	        ) as bundle:
	            for png_path in rendered:
	                bundle.write(png_path, arcname=Path(png_path).name)
	    return archive.getvalue()


	# ── CLI entry-point ─────────────────────────────────────────────────────────
	def _parse_pages(spec: str) -> list[int]:
	    """Parse a ``--pages`` value such as ``"1,3,5"`` into page numbers.

	    Empty tokens (trailing or doubled commas) are ignored. A token that is
	    not a positive integer raises ``argparse.ArgumentTypeError`` so argparse
	    reports it as a normal usage error instead of a traceback.
	    """
	    numbers: list[int] = []
	    for token in spec.split(","):
	        token = token.strip()
	        if not token:
	            continue
	        try:
	            number = int(token)
	        except ValueError:
	            raise argparse.ArgumentTypeError(
	                f"invalid page number: {token!r}"
	            ) from None
	        if number < 1:
	            raise argparse.ArgumentTypeError(
	                f"page numbers are 1-indexed, got {number}"
	            )
	        numbers.append(number)
	    return numbers


	def _main(argv: Sequence[str] | None = None) -> None:
	    """Command-line entry point: parse arguments and render the previews.

	    Parameters
	    ----------
	    argv:
	        Argument list to parse. ``None`` (the default) makes argparse read
	        ``sys.argv[1:]``, which is what the ``__main__`` guard relies on.
	    """
	    parser = argparse.ArgumentParser(
	        description="Render proposed redaction boxes from a review CSV onto the original PDF."
	    )
	    parser.add_argument("pdf", help="Path to the original (un-redacted) PDF")
	    parser.add_argument("csv", help="Path to the *_review_file.csv")
	    parser.add_argument(
	        "--out-dir",
	        default=None,
	        help="Output directory for PNGs (default: <csv-dir>/preview/)",
	    )
	    parser.add_argument(
	        "--dpi", type=int, default=150, help="Render DPI (default: 150)"
	    )
	    parser.add_argument(
	        "--max-width",
	        type=int,
	        default=1280,
	        help="Max image width in pixels (default: 1280)",
	    )
	    # --grid is on by default; --no-grid writes False to the same destination.
	    parser.add_argument(
	        "--grid",
	        action="store_true",
	        default=True,
	        help="Draw percentage grid (default: on)",
	    )
	    parser.add_argument(
	        "--no-grid", dest="grid", action="store_false", help="Disable percentage grid"
	    )
	    parser.add_argument(
	        "--pages",
	        type=_parse_pages,
	        default=None,
	        help="Comma-separated 1-indexed page numbers to render, e.g. 1,3,5 (default: all)",
	    )
	    args = parser.parse_args(argv)

	    preview_redaction_boxes(
	        args.pdf,
	        args.csv,
	        out_dir=args.out_dir,
	        dpi=args.dpi,
	        max_width=args.max_width,
	        draw_grid=args.grid,
	        # An empty list (e.g. --pages ",") means "all pages", like no flag at all.
	        pages=args.pages or None,
	    )


	if __name__ == "__main__":
	    _main()