""" preview_redaction_boxes.py ========================== Local-first coordinate preview tool for the Document Redaction app. Purpose ------- Render proposed redaction boxes from a ``*_review_file.csv`` onto the **original** (un-redacted) PDF pages and save the result as PNG images. Because this runs entirely locally with PyMuPDF + Pillow, iteration is instantaneous — no server round-trip, no waiting for ``/review_apply``. Primary use-case ---------------- Called by agents or humans **between CSV edits and the API call to ``/review_apply``**. Iterate until the preview looks right, *then* send to the server. This avoids the expensive cycle of: guess coordinates → apply → download → render → spot the miss → repeat Typical agent workflow ---------------------- 1. Edit ``*_review_file_edited.csv`` (remove FPs, add signatures, etc.). 2. Call ``preview_redaction_boxes(pdf_path, csv_path, out_dir)`` locally. 3. Inspect the saved PNGs. 4. If anything is wrong, adjust the CSV and go to step 2. 5. Only when satisfied, call ``/review_apply`` on the server. API endpoint (server-side fallback) ------------------------------------ When the agent does not have a local copy of the original PDF, ``preview_boxes_api()`` exposes the same logic as a short ``gr.api`` endpoint registered as ``/preview_boxes`` in ``app.py``. The caller uploads the original PDF and the edited review CSV; the server returns a ZIP of preview PNGs. CLI usage --------- python tools/preview_redaction_boxes.py original.pdf review_file.csv # Optional flags: python tools/preview_redaction_boxes.py original.pdf review_file.csv \\ --out-dir output/preview \\ --dpi 150 \\ --max-width 1280 \\ --grid # draw percentage-grid lines --pages 1,3,5 # only render specific pages (1-indexed) """ from __future__ import annotations import argparse import csv import zipfile from io import BytesIO from pathlib import Path from typing import Sequence import pymupdf from PIL import Image, ImageDraw, ImageFont # ── Colour palette per label type ────────────────────────────────────────── _LABEL_COLOURS: dict[str, str] = { "PERSON": "#e74c3c", # red "SIGNATURE": "#8e44ad", # purple "LOCATION": "#2980b9", # blue "EMAIL_ADDRESS": "#e67e22", # orange "PHONE_NUMBER": "#27ae60", # green "CUSTOM": "#f39c12", # amber "DATE_TIME": "#16a085", # teal "ORG": "#7f8c8d", # grey } _DEFAULT_COLOUR = "#c0392b" # ── Grid style ───────────────────────────────────────────────────────────── _GRID_COLOUR = "#cc0000" _GRID_STEP = 5 # percentage intervals def _label_colour(label: str) -> str: for key, colour in _LABEL_COLOURS.items(): if key in label.upper(): return colour return _DEFAULT_COLOUR def _load_font(size: int = 11) -> ImageFont.ImageFont: """Return a PIL font; fall back to the default if no TTF is available.""" for name in ("DejaVuSans.ttf", "Arial.ttf", "LiberationSans-Regular.ttf"): try: return ImageFont.truetype(name, size) except OSError: pass return ImageFont.load_default() def preview_redaction_boxes( pdf_path: str | Path, csv_path: str | Path, out_dir: str | Path | None = None, *, dpi: int = 150, max_width: int = 1280, draw_grid: bool = True, pages: Sequence[int] | None = None, ) -> list[Path]: """ Render proposed redaction boxes from *csv_path* onto the original PDF at *pdf_path* and save one PNG per page to *out_dir*. Parameters ---------- pdf_path: Path to the original (un-redacted) PDF. csv_path: Path to the ``*_review_file.csv`` (original or edited). out_dir: Directory for output PNGs. Defaults to a ``preview/`` subfolder next to the CSV. dpi: Render resolution. 150 is a good balance of speed vs. detail. Use 200-300 for detailed inspection of small text. max_width: Downscale rendered pages to at most this width (pixels) before drawing boxes, to keep file sizes manageable. draw_grid: If True, overlay horizontal lines at every *_GRID_STEP* percent of page height with percentage labels so you can read off normalized y-coordinates by eye. pages: If given, only render these 1-indexed page numbers. Useful when you are iterating on a single page and don't want to wait for the whole document. Returns ------- list[Path] Sorted list of saved PNG paths. """ pdf_path = Path(pdf_path) csv_path = Path(csv_path) if out_dir is None: out_dir = csv_path.parent / "preview" out_dir = Path(out_dir) out_dir.mkdir(parents=True, exist_ok=True) # ── Load CSV ──────────────────────────────────────────────────────────── with csv_path.open(newline="", encoding="utf-8-sig") as fh: rows = list(csv.DictReader(fh)) rows_by_page: dict[int, list[dict]] = {} for row in rows: try: page_num = int(float(row.get("page", "0") or 0)) except ValueError: continue rows_by_page.setdefault(page_num, []).append(row) # ── Render pages ──────────────────────────────────────────────────────── doc = pymupdf.open(str(pdf_path)) font = _load_font(11) saved: list[Path] = [] page_range = range(1, doc.page_count + 1) if pages: page_range = [p for p in pages if 1 <= p <= doc.page_count] for page_num in page_range: pix = doc[page_num - 1].get_pixmap(dpi=dpi) render_w, render_h = pix.width, pix.height img = Image.frombytes("RGB", [render_w, render_h], pix.samples) # ── Downscale if needed ────────────────────────────────────────── if render_w > max_width: scale = max_width / render_w img = img.resize((max_width, int(render_h * scale)), Image.LANCZOS) draw_w, draw_h = img.size draw = ImageDraw.Draw(img, "RGBA") # ── Percentage grid ────────────────────────────────────────────── if draw_grid: for pct in range(0, 101, _GRID_STEP): y = int(pct / 100 * draw_h) draw.line([(0, y), (draw_w, y)], fill=_GRID_COLOUR + "55", width=1) draw.text((3, max(0, y - 11)), f"{pct}%", fill=_GRID_COLOUR, font=font) # ── Redaction boxes ────────────────────────────────────────────── for row in rows_by_page.get(page_num, []): try: x0 = float(row["xmin"]) * draw_w y0 = float(row["ymin"]) * draw_h x1 = float(row["xmax"]) * draw_w y1 = float(row["ymax"]) * draw_h except (KeyError, ValueError): continue label = row.get("label", "CUSTOM") colour = _label_colour(label) text_snippet = (row.get("text", "") or "")[:30] # Semi-transparent fill draw.rectangle( [x0, y0, x1, y1], fill=colour + "33", outline=colour, width=2 ) # Label text tag = f"{label}: {text_snippet}" if text_snippet else label draw.text((x0 + 3, y0 + 2), tag, fill=colour, font=font) # ── Legend (top-right corner) ──────────────────────────────────── legend_labels = sorted( {r.get("label", "CUSTOM") for r in rows_by_page.get(page_num, [])} ) lx, ly = draw_w - 200, 8 for lbl in legend_labels: col = _label_colour(lbl) draw.rectangle( [lx, ly, lx + 14, ly + 14], fill=col + "cc", outline=col, width=1 ) draw.text((lx + 18, ly + 1), lbl, fill=col, font=font) ly += 17 out_path = out_dir / f"page_{page_num:03d}_preview.png" img.save(out_path) saved.append(out_path) doc.close() print(f"Saved {len(saved)} preview image(s) to: {out_dir}") return sorted(saved) def preview_redaction_boxes_to_zip( pdf_path: str | Path, csv_path: str | Path, *, dpi: int = 150, max_width: int = 1280, draw_grid: bool = True, pages: Sequence[int] | None = None, ) -> bytes: """ Same as ``preview_redaction_boxes`` but returns a ZIP of PNGs as bytes. Used by the ``preview_boxes_api`` server endpoint so callers receive all preview images in a single response without needing a shared filesystem. """ import tempfile with tempfile.TemporaryDirectory() as tmp: paths = preview_redaction_boxes( pdf_path, csv_path, out_dir=tmp, dpi=dpi, max_width=max_width, draw_grid=draw_grid, pages=pages, ) buf = BytesIO() with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: for p in paths: zf.write(p, arcname=Path(p).name) return buf.getvalue() # ── CLI entry-point ───────────────────────────────────────────────────────── def _main() -> None: parser = argparse.ArgumentParser( description="Render proposed redaction boxes from a review CSV onto the original PDF." ) parser.add_argument("pdf", help="Path to the original (un-redacted) PDF") parser.add_argument("csv", help="Path to the *_review_file.csv") parser.add_argument( "--out-dir", default=None, help="Output directory for PNGs (default: /preview/)", ) parser.add_argument( "--dpi", type=int, default=150, help="Render DPI (default: 150)" ) parser.add_argument( "--max-width", type=int, default=1280, help="Max image width in pixels (default: 1280)", ) parser.add_argument( "--grid", action="store_true", default=True, help="Draw percentage grid (default: on)", ) parser.add_argument( "--no-grid", dest="grid", action="store_false", help="Disable percentage grid" ) parser.add_argument( "--pages", default=None, help="Comma-separated 1-indexed page numbers to render, e.g. 1,3,5 (default: all)", ) args = parser.parse_args() pages = None if args.pages: pages = [int(p.strip()) for p in args.pages.split(",")] preview_redaction_boxes( args.pdf, args.csv, out_dir=args.out_dir, dpi=args.dpi, max_width=args.max_width, draw_grid=args.grid, pages=pages, ) if __name__ == "__main__": _main()