datamatters24 committed on
Commit
63eed31
·
verified ·
1 Parent(s): bd76614

Upload notebooks/04_forensic/41_redaction_detection.ipynb with huggingface_hub

Browse files
notebooks/04_forensic/41_redaction_detection.ipynb ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 41 - Redaction Detection\n",
8
+ "\n",
9
+ "Pipeline notebook that detects redacted (blacked-out) regions in PDF pages using\n",
10
+ "PyMuPDF rendering and OpenCV contour analysis.\n",
11
+ "\n",
12
+ "**Page features:** `redaction_count`, `redaction_area_pct`\n",
13
+ "\n",
14
+ "**Document features:** `total_redactions`, `has_redactions` (1.0 / 0.0)\n",
15
+ "\n",
16
+ "Uses `joblib.Parallel(n_jobs=12)` for parallel processing."
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {
23
+ "tags": [
24
+ "parameters"
25
+ ]
26
+ },
27
+ "outputs": [],
28
+ "source": [
29
# Parameters (papermill-injected; this cell is tagged "parameters")
source_section = None        # limit the run to one collection; None processes all
batch_size = 500             # number of documents handled per parallel batch
min_black_area_pct = 0.5     # minimum dark-area percentage considered interesting
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "import sys, warnings, time\n",
42
+ "sys.path.insert(0, '/opt/epstein_env/research')\n",
43
+ "warnings.filterwarnings('ignore')\n",
44
+ "\n",
45
+ "import fitz\n",
46
+ "import cv2\n",
47
+ "import numpy as np\n",
48
+ "import pandas as pd\n",
49
+ "from pathlib import Path\n",
50
+ "from joblib import Parallel, delayed\n",
51
+ "\n",
52
+ "from research_lib.config import RAW_DIR, COLLECTIONS\n",
53
+ "from research_lib.db import fetch_df, upsert_feature, get_conn\n",
54
+ "from research_lib.incremental import (\n",
55
+ " start_run, finish_run, get_unprocessed_documents, get_processed_doc_ids\n",
56
+ ")\n",
57
+ "\n",
58
+ "print('Libraries loaded.')"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "metadata": {},
65
+ "outputs": [],
66
+ "source": [
67
# ---- Redaction detection function ----
def detect_redactions(pdf_path, dpi=150, black_thresh=30, min_area=500, min_rect=0.85):
    """Detect blacked-out rectangular regions in a PDF.

    Renders each page with PyMuPDF at *dpi*, inverse-thresholds near-black
    pixels (grey level <= *black_thresh*), and counts external contours that
    are large enough (area >= *min_area*), nearly rectangular (contour area /
    bounding-box area >= *min_rect*), and not extremely elongated (aspect
    ratio within [0.1, 10]).

    Parameters
    ----------
    pdf_path : str or Path
        Path to the PDF file to analyze.
    dpi : int
        Render resolution; 150 dpi trades speed for detection accuracy.
    black_thresh : int
        Grey-level cutoff below which a pixel counts as "black".
    min_area : float
        Minimum contour area (pixels at the render resolution) to consider.
    min_rect : float
        Minimum rectangularity (fill ratio of the bounding box).

    Returns
    -------
    list[dict]
        One dict per rendered page with keys ``page_number`` (1-based),
        ``redaction_count`` and ``redaction_area_pct``. If the PDF cannot
        be opened or rendering fails part-way, the pages processed so far
        (possibly an empty list) are returned -- the pipeline treats
        unreadable files as "no redactions detected".
    """
    results = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        for page_num in range(len(doc)):
            page = doc[page_num]
            # Scale matrix: render at `dpi` instead of PDF's 72-dpi base.
            mat = fitz.Matrix(dpi / 72, dpi / 72)
            pix = page.get_pixmap(matrix=mat)
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
            if pix.n == 4:
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)
            elif pix.n == 3:
                img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            # Inverse threshold: near-black pixels become white foreground.
            _, binary = cv2.threshold(img, black_thresh, 255, cv2.THRESH_BINARY_INV)
            contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            redaction_count = 0
            redaction_area = 0
            page_area = pix.h * pix.w

            for cnt in contours:
                area = cv2.contourArea(cnt)
                if area < min_area:
                    continue
                x, y, w, h = cv2.boundingRect(cnt)
                rect_area = w * h
                rectangularity = area / rect_area if rect_area > 0 else 0
                # +1 in the denominator avoids division by zero on degenerate boxes.
                aspect = max(w, h) / (min(w, h) + 1)
                if rectangularity >= min_rect and 0.1 <= aspect <= 10:
                    redaction_count += 1
                    redaction_area += area

            results.append({
                'page_number': page_num + 1,
                'redaction_count': redaction_count,
                'redaction_area_pct': (redaction_area / page_area * 100) if page_area > 0 else 0,
            })
    except Exception:
        # Deliberate best-effort swallow: one corrupt/unreadable PDF must not
        # kill the batch pipeline; partial per-page results are kept.
        pass
    finally:
        # Always release the document handle; the original version leaked it
        # whenever an exception fired after fitz.open() succeeded.
        if doc is not None:
            doc.close()
    return results

print('Detection function defined.')
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "metadata": {},
121
+ "outputs": [],
122
+ "source": [
123
# ---- Identify unprocessed documents ----
PIPELINE = 'redaction_detection'

# Register this run so incremental bookkeeping can resume or skip later.
run_params = {
    'batch_size': batch_size,
    'min_black_area_pct': min_black_area_pct,
}
run_id = start_run(PIPELINE, source_section=source_section, parameters=run_params)

# Any document lacking a stored 'total_redactions' feature still needs work.
docs_df = get_unprocessed_documents(
    PIPELINE,
    source_section=source_section,
    feature_table='document_features',
    feature_name='total_redactions',
)
print(f'Documents to process: {len(docs_df)}')
if not docs_df.empty:
    print(docs_df['source_section'].value_counts().to_string())
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": null,
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
# ---- Process documents in batches with parallel rendering ----
def process_one_document(row):
    """Render one document and return (doc_id, page_results)."""
    # Prefer the stored file_path; fall back to RAW_DIR/<section>/<filename>.
    candidate = None
    if 'file_path' in row and row['file_path']:
        candidate = Path(row['file_path'])
    if candidate is None or not candidate.exists():
        candidate = RAW_DIR / row['source_section'] / row['filename']
        if not candidate.exists():
            # File not found anywhere -- report an empty page list.
            return (row['id'], [])
    return (row['id'], detect_redactions(str(candidate)))
156
+ "\n",
157
total_processed = 0
num_batches = -(-len(docs_df) // batch_size)  # ceiling division

for b in range(num_batches):
    lo = b * batch_size
    hi = min(lo + batch_size, len(docs_df))
    chunk = docs_df.iloc[lo:hi]
    print(f'\nBatch {b + 1}/{num_batches}: documents {lo}-{hi - 1}')

    # Render the whole batch in parallel worker processes.
    started = time.time()
    rendered = Parallel(n_jobs=12, backend='loky')(
        delayed(process_one_document)(r) for _, r in chunk.iterrows()
    )
    print(f' Rendered in {time.time() - started:.1f}s')

    # Feature rows grouped by feature name for one bulk upsert each.
    page_feats = {'redaction_count': [], 'redaction_area_pct': []}
    doc_feats = {'total_redactions': [], 'has_redactions': []}

    for doc_id, pages in rendered:
        if not pages:
            # No pages rendered -- store zeros so the doc still counts as processed.
            doc_feats['total_redactions'].append((doc_id, 'total_redactions', 0.0, None))
            doc_feats['has_redactions'].append((doc_id, 'has_redactions', 0.0, None))
            continue

        # Map 1-based page numbers to page-table ids for this document.
        id_df = fetch_df(
            'SELECT id, page_number FROM pages WHERE document_id = %s ORDER BY page_number',
            [doc_id],
        )
        by_page = dict(zip(id_df['page_number'], id_df['id']))

        doc_total = 0
        for pr in pages:
            pid = by_page.get(pr['page_number'])
            if pid is None:
                continue
            page_feats['redaction_count'].append(
                (int(pid), 'redaction_count', float(pr['redaction_count']), None))
            page_feats['redaction_area_pct'].append(
                (int(pid), 'redaction_area_pct', float(pr['redaction_area_pct']), None))
            doc_total += pr['redaction_count']

        doc_feats['total_redactions'].append((doc_id, 'total_redactions', float(doc_total), None))
        doc_feats['has_redactions'].append(
            (doc_id, 'has_redactions', 1.0 if doc_total > 0 else 0.0, None))

    # Upsert page features
    for label, rows in page_feats.items():
        if rows:
            n = upsert_feature('page_features', ['page_id', 'feature_name'],
                               ['feature_value', 'feature_json'], rows)
            print(f' page_features: {n} rows for {label}')

    # Upsert document features
    for label, rows in doc_feats.items():
        if rows:
            n = upsert_feature('document_features', ['document_id', 'feature_name'],
                               ['feature_value', 'feature_json'], rows)
            print(f' document_features: {n} rows for {label}')

    total_processed += len(chunk)

finish_run(run_id, documents_processed=total_processed)
print(f'\nRun {run_id} complete: {total_processed} documents processed.')
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": null,
230
+ "metadata": {},
231
+ "outputs": [],
232
+ "source": [
233
# ---- Print stats ----
# Per-collection counts of documents flagged has_redactions = 1.0.
by_section = fetch_df("""
    SELECT
        d.source_section,
        COUNT(DISTINCT df.document_id) AS docs_with_redactions
    FROM document_features df
    JOIN documents d ON d.id = df.document_id
    WHERE df.feature_name = 'has_redactions' AND df.feature_value = 1.0
    GROUP BY d.source_section
    ORDER BY docs_with_redactions DESC
""")

# Overall count of documents with at least one detected redaction.
total_df = fetch_df("""
    SELECT COUNT(*) AS cnt FROM document_features
    WHERE feature_name = 'total_redactions' AND feature_value > 0
""")

print('Documents with redactions by collection:')
print(by_section.to_string(index=False))
print(f"\nTotal documents with at least one redaction: {total_df['cnt'].iloc[0]}")
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "code",
257
+ "execution_count": null,
258
+ "metadata": {},
259
+ "outputs": [],
260
+ "source": [
261
# ---- Top 20 most redacted documents ----
query = """
    SELECT df.document_id, d.source_section, d.filename, df.feature_value AS total_redactions
    FROM document_features df
    JOIN documents d ON d.id = df.document_id
    WHERE df.feature_name = 'total_redactions'
    ORDER BY df.feature_value DESC
    LIMIT 20
"""
most_redacted = fetch_df(query)
print('Top 20 most redacted documents:')
print(most_redacted.to_string(index=False))
272
+ ]
273
+ }
274
+ ],
275
+ "metadata": {
276
+ "kernelspec": {
277
+ "display_name": "Python 3",
278
+ "language": "python",
279
+ "name": "python3"
280
+ },
281
+ "language_info": {
282
+ "name": "python",
283
+ "version": "3.10.0"
284
+ }
285
+ },
286
+ "nbformat": 4,
287
+ "nbformat_minor": 5
288
+ }