datamatters24 committed on
Commit
63eed31
·
verified ·
1 Parent(s): bd76614

Upload notebooks/04_forensic/41_redaction_detection.ipynb with huggingface_hub

Browse files
notebooks/04_forensic/41_redaction_detection.ipynb ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 41 - Redaction Detection\n",
8
+ "\n",
9
+ "Pipeline notebook that detects redacted (blacked-out) regions in PDF pages using\n",
10
+ "PyMuPDF rendering and OpenCV contour analysis.\n",
11
+ "\n",
12
+ "**Page features:** `redaction_count`, `redaction_area_pct`\n",
13
+ "\n",
14
+ "**Document features:** `total_redactions`, `has_redactions` (1.0 / 0.0)\n",
15
+ "\n",
16
+ "Uses `joblib.Parallel(n_jobs=12)` for parallel processing."
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {
23
+ "tags": [
24
+ "parameters"
25
+ ]
26
+ },
27
+ "outputs": [],
28
+ "source": [
29
# Parameters (papermill-injected; this cell is tagged "parameters")
source_section = None        # limit the run to one collection; None processes all
batch_size = 500             # number of documents handled per parallel batch
min_black_area_pct = 0.5     # minimum dark-area percentage considered interesting
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "import sys, warnings, time\n",
42
+ "sys.path.insert(0, '/opt/epstein_env/research')\n",
43
+ "warnings.filterwarnings('ignore')\n",
44
+ "\n",
45
+ "import fitz\n",
46
+ "import cv2\n",
47
+ "import numpy as np\n",
48
+ "import pandas as pd\n",
49
+ "from pathlib import Path\n",
50
+ "from joblib import Parallel, delayed\n",
51
+ "\n",
52
+ "from research_lib.config import RAW_DIR, COLLECTIONS\n",
53
+ "from research_lib.db import fetch_df, upsert_feature, get_conn\n",
54
+ "from research_lib.incremental import (\n",
55
+ " start_run, finish_run, get_unprocessed_documents, get_processed_doc_ids\n",
56
+ ")\n",
57
+ "\n",
58
+ "print('Libraries loaded.')"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "metadata": {},
65
+ "outputs": [],
66
+ "source": [
67
# ---- Redaction detection function ----
def detect_redactions(pdf_path, dpi=150, black_thresh=30, min_area=500, min_rect=0.85):
    """Detect blacked-out rectangular regions in a PDF.

    Renders each page with PyMuPDF at *dpi*, inverse-thresholds near-black
    pixels (grey level <= *black_thresh*), and counts external contours that
    are large enough (area >= *min_area*), nearly rectangular (contour area /
    bounding-box area >= *min_rect*), and not extremely elongated (aspect
    ratio within [0.1, 10]).

    Parameters
    ----------
    pdf_path : str or Path
        Path to the PDF file to analyze.
    dpi : int
        Render resolution; 150 dpi trades speed for detection accuracy.
    black_thresh : int
        Grey-level cutoff below which a pixel counts as "black".
    min_area : float
        Minimum contour area (pixels at the render resolution) to consider.
    min_rect : float
        Minimum rectangularity (fill ratio of the bounding box).

    Returns
    -------
    list[dict]
        One dict per rendered page with keys ``page_number`` (1-based),
        ``redaction_count`` and ``redaction_area_pct``. If the PDF cannot
        be opened or rendering fails part-way, the pages processed so far
        (possibly an empty list) are returned -- the pipeline treats
        unreadable files as "no redactions detected".
    """
    results = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        for page_num in range(len(doc)):
            page = doc[page_num]
            # Scale matrix: render at `dpi` instead of PDF's 72-dpi base.
            mat = fitz.Matrix(dpi / 72, dpi / 72)
            pix = page.get_pixmap(matrix=mat)
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
            if pix.n == 4:
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)
            elif pix.n == 3:
                img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            # Inverse threshold: near-black pixels become white foreground.
            _, binary = cv2.threshold(img, black_thresh, 255, cv2.THRESH_BINARY_INV)
            contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            redaction_count = 0
            redaction_area = 0
            page_area = pix.h * pix.w

            for cnt in contours:
                area = cv2.contourArea(cnt)
                if area < min_area:
                    continue
                x, y, w, h = cv2.boundingRect(cnt)
                rect_area = w * h
                rectangularity = area / rect_area if rect_area > 0 else 0
                # +1 in the denominator avoids division by zero on degenerate boxes.
                aspect = max(w, h) / (min(w, h) + 1)
                if rectangularity >= min_rect and 0.1 <= aspect <= 10:
                    redaction_count += 1
                    redaction_area += area

            results.append({
                'page_number': page_num + 1,
                'redaction_count': redaction_count,
                'redaction_area_pct': (redaction_area / page_area * 100) if page_area > 0 else 0,
            })
    except Exception:
        # Deliberate best-effort swallow: one corrupt/unreadable PDF must not
        # kill the batch pipeline; partial per-page results are kept.
        pass
    finally:
        # Always release the document handle; the original version leaked it
        # whenever an exception fired after fitz.open() succeeded.
        if doc is not None:
            doc.close()
    return results

print('Detection function defined.')
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "metadata": {},
121
+ "outputs": [],
122
+ "source": [
123
# ---- Identify unprocessed documents ----
PIPELINE = 'redaction_detection'

# Register this run so incremental bookkeeping can resume or skip later.
run_params = {
    'batch_size': batch_size,
    'min_black_area_pct': min_black_area_pct,
}
run_id = start_run(PIPELINE, source_section=source_section, parameters=run_params)

# Any document lacking a stored 'total_redactions' feature still needs work.
docs_df = get_unprocessed_documents(
    PIPELINE,
    source_section=source_section,
    feature_table='document_features',
    feature_name='total_redactions',
)
print(f'Documents to process: {len(docs_df)}')
if not docs_df.empty:
    print(docs_df['source_section'].value_counts().to_string())
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": null,
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
# ---- Process documents in batches with parallel rendering ----
def process_one_document(row):
    """Render one document and return (doc_id, page_results)."""
    # Prefer the stored file_path; fall back to RAW_DIR/<section>/<filename>.
    candidate = None
    if 'file_path' in row and row['file_path']:
        candidate = Path(row['file_path'])
    if candidate is None or not candidate.exists():
        candidate = RAW_DIR / row['source_section'] / row['filename']
        if not candidate.exists():
            # File not found anywhere -- report an empty page list.
            return (row['id'], [])
    return (row['id'], detect_redactions(str(candidate)))
156
+ "\n",
157
total_processed = 0
num_batches = -(-len(docs_df) // batch_size)  # ceiling division

for b in range(num_batches):
    lo = b * batch_size
    hi = min(lo + batch_size, len(docs_df))
    chunk = docs_df.iloc[lo:hi]
    print(f'\nBatch {b + 1}/{num_batches}: documents {lo}-{hi - 1}')

    # Render the whole batch in parallel worker processes.
    started = time.time()
    rendered = Parallel(n_jobs=12, backend='loky')(
        delayed(process_one_document)(r) for _, r in chunk.iterrows()
    )
    print(f' Rendered in {time.time() - started:.1f}s')

    # Feature rows grouped by feature name for one bulk upsert each.
    page_feats = {'redaction_count': [], 'redaction_area_pct': []}
    doc_feats = {'total_redactions': [], 'has_redactions': []}

    for doc_id, pages in rendered:
        if not pages:
            # No pages rendered -- store zeros so the doc still counts as processed.
            doc_feats['total_redactions'].append((doc_id, 'total_redactions', 0.0, None))
            doc_feats['has_redactions'].append((doc_id, 'has_redactions', 0.0, None))
            continue

        # Map 1-based page numbers to page-table ids for this document.
        id_df = fetch_df(
            'SELECT id, page_number FROM pages WHERE document_id = %s ORDER BY page_number',
            [doc_id],
        )
        by_page = dict(zip(id_df['page_number'], id_df['id']))

        doc_total = 0
        for pr in pages:
            pid = by_page.get(pr['page_number'])
            if pid is None:
                continue
            page_feats['redaction_count'].append(
                (int(pid), 'redaction_count', float(pr['redaction_count']), None))
            page_feats['redaction_area_pct'].append(
                (int(pid), 'redaction_area_pct', float(pr['redaction_area_pct']), None))
            doc_total += pr['redaction_count']

        doc_feats['total_redactions'].append((doc_id, 'total_redactions', float(doc_total), None))
        doc_feats['has_redactions'].append(
            (doc_id, 'has_redactions', 1.0 if doc_total > 0 else 0.0, None))

    # Upsert page features
    for label, rows in page_feats.items():
        if rows:
            n = upsert_feature('page_features', ['page_id', 'feature_name'],
                               ['feature_value', 'feature_json'], rows)
            print(f' page_features: {n} rows for {label}')

    # Upsert document features
    for label, rows in doc_feats.items():
        if rows:
            n = upsert_feature('document_features', ['document_id', 'feature_name'],
                               ['feature_value', 'feature_json'], rows)
            print(f' document_features: {n} rows for {label}')

    total_processed += len(chunk)

finish_run(run_id, documents_processed=total_processed)
print(f'\nRun {run_id} complete: {total_processed} documents processed.')
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": null,
230
+ "metadata": {},
231
+ "outputs": [],
232
+ "source": [
233
# ---- Print stats ----
# Per-collection counts of documents flagged has_redactions = 1.0.
by_section = fetch_df("""
    SELECT
        d.source_section,
        COUNT(DISTINCT df.document_id) AS docs_with_redactions
    FROM document_features df
    JOIN documents d ON d.id = df.document_id
    WHERE df.feature_name = 'has_redactions' AND df.feature_value = 1.0
    GROUP BY d.source_section
    ORDER BY docs_with_redactions DESC
""")

# Overall count of documents with at least one detected redaction.
total_df = fetch_df("""
    SELECT COUNT(*) AS cnt FROM document_features
    WHERE feature_name = 'total_redactions' AND feature_value > 0
""")

print('Documents with redactions by collection:')
print(by_section.to_string(index=False))
print(f"\nTotal documents with at least one redaction: {total_df['cnt'].iloc[0]}")
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "code",
257
+ "execution_count": null,
258
+ "metadata": {},
259
+ "outputs": [],
260
+ "source": [
261
# ---- Top 20 most redacted documents ----
query = """
    SELECT df.document_id, d.source_section, d.filename, df.feature_value AS total_redactions
    FROM document_features df
    JOIN documents d ON d.id = df.document_id
    WHERE df.feature_name = 'total_redactions'
    ORDER BY df.feature_value DESC
    LIMIT 20
"""
most_redacted = fetch_df(query)
print('Top 20 most redacted documents:')
print(most_redacted.to_string(index=False))
272
+ ]
273
+ }
274
+ ],
275
+ "metadata": {
276
+ "kernelspec": {
277
+ "display_name": "Python 3",
278
+ "language": "python",
279
+ "name": "python3"
280
+ },
281
+ "language_info": {
282
+ "name": "python",
283
+ "version": "3.10.0"
284
+ }
285
+ },
286
+ "nbformat": 4,
287
+ "nbformat_minor": 5
288
+ }