datamatters24 committed on
Commit
bd76614
·
verified ·
1 Parent(s): cbbd15e

Upload notebooks/04_forensic/44_forensic_dashboard.ipynb with huggingface_hub

Browse files
notebooks/04_forensic/44_forensic_dashboard.ipynb ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 44 - Forensic Dashboard\n",
8
+ "\n",
9
+ "Interactive dashboard summarizing forensic analysis results:\n",
10
+ "- OCR confidence by collection (heatmap)\n",
11
+ "- Redaction counts per collection (bar chart)\n",
12
+ "- Top 20 most redacted documents (table)\n",
13
+ "- Classification stamp distribution (pie chart)\n",
14
+ "- Documents with lowest OCR confidence (table)"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "import sys, warnings, json\n",
24
+ "sys.path.insert(0, '/opt/epstein_env/research')\n",
25
+ "warnings.filterwarnings('ignore')\n",
26
+ "\n",
27
+ "import pandas as pd\n",
28
+ "import numpy as np\n",
29
+ "import matplotlib.pyplot as plt\n",
30
+ "import seaborn as sns\n",
31
+ "\n",
32
+ "from research_lib.config import COLLECTIONS, COLLECTION_LABELS\n",
33
+ "from research_lib.db import fetch_df\n",
34
+ "from research_lib.plotting import (\n",
35
+ " set_style, save_fig, COLLECTION_COLORS, collection_color,\n",
36
+ ")\n",
37
+ "\n",
38
+ "set_style()\n",
39
+ "print('Libraries loaded.')"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "# ---- Load document features ----\n",
49
+ "conf_df = fetch_df(\"\"\"\n",
50
+ " SELECT df.document_id, d.source_section, d.filename,\n",
51
+ " df.feature_value AS avg_ocr_confidence\n",
52
+ " FROM document_features df\n",
53
+ " JOIN documents d ON d.id = df.document_id\n",
54
+ " WHERE df.feature_name = 'avg_ocr_confidence'\n",
55
+ "\"\"\")\n",
56
+ "\n",
57
+ "redact_df = fetch_df(\"\"\"\n",
58
+ " SELECT df.document_id, d.source_section, d.filename,\n",
59
+ " df.feature_value AS total_redactions\n",
60
+ " FROM document_features df\n",
61
+ " JOIN documents d ON d.id = df.document_id\n",
62
+ " WHERE df.feature_name = 'total_redactions'\n",
63
+ "\"\"\")\n",
64
+ "\n",
65
+ "stamp_df = fetch_df(\"\"\"\n",
66
+ " SELECT df.document_id, d.source_section,\n",
67
+ " df.feature_json AS stamps\n",
68
+ " FROM document_features df\n",
69
+ " JOIN documents d ON d.id = df.document_id\n",
70
+ " WHERE df.feature_name = 'classification_stamps'\n",
71
+ "\"\"\")\n",
72
+ "\n",
73
+ "print(f'Confidence data: {len(conf_df)} docs')\n",
74
+ "print(f'Redaction data: {len(redact_df)} docs')\n",
75
+ "print(f'Stamp data: {len(stamp_df)} docs')"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "metadata": {},
82
+ "outputs": [],
83
+ "source": [
84
+ "# ---- Heatmap: OCR Confidence by Collection ----\n",
85
+ "if not conf_df.empty:\n",
86
+ " # Create confidence bands per collection\n",
87
+ " bins = [0, 20, 40, 60, 80, 100]\n",
88
+ " labels = ['0-20', '20-40', '40-60', '60-80', '80-100']\n",
89
+ " conf_df['conf_band'] = pd.cut(\n",
90
+ " conf_df['avg_ocr_confidence'], bins=bins, labels=labels, include_lowest=True\n",
91
+ " )\n",
92
+ " pivot = conf_df.groupby(['source_section', 'conf_band']).size().unstack(fill_value=0)\n",
93
+ " # Normalize to percentages\n",
94
+ " pivot_pct = pivot.div(pivot.sum(axis=1), axis=0) * 100\n",
95
+ "\n",
96
+ " fig, ax = plt.subplots(figsize=(12, 8))\n",
97
+ " sns.heatmap(\n",
98
+ " pivot_pct, annot=True, fmt='.1f', cmap='RdYlGn',\n",
99
+ " cbar_kws={'label': '% of Documents'}, ax=ax,\n",
100
+ " )\n",
101
+ " ax.set_title('OCR Confidence Distribution by Collection (%)')\n",
102
+ " ax.set_xlabel('Confidence Band')\n",
103
+ " ax.set_ylabel('Collection')\n",
104
+ " plt.tight_layout()\n",
105
+ " save_fig(fig, 'forensic_ocr_heatmap')\n",
106
+ " plt.show()\n",
107
+ "else:\n",
108
+ " print('No OCR confidence data available.')"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": null,
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "# ---- Bar chart: documents with redactions per collection ----\n",
118
+ "if not redact_df.empty:\n",
119
+ " redacted_only = redact_df[redact_df['total_redactions'] > 0]\n",
120
+ " by_collection = (\n",
121
+ " redacted_only.groupby('source_section')\n",
122
+ " .size()\n",
123
+ " .reset_index(name='docs_with_redactions')\n",
124
+ " .sort_values('docs_with_redactions', ascending=False)\n",
125
+ " )\n",
126
+ "\n",
127
+ " colors = [collection_color(s) for s in by_collection['source_section']]\n",
128
+ "\n",
129
+ " fig, ax = plt.subplots(figsize=(12, 6))\n",
130
+ " ax.bar(by_collection['source_section'], by_collection['docs_with_redactions'], color=colors)\n",
131
+ " ax.set_title('Documents with Detected Redactions by Collection')\n",
132
+ " ax.set_ylabel('Number of Documents')\n",
133
+ " plt.xticks(rotation=45, ha='right')\n",
134
+ " plt.tight_layout()\n",
135
+ " save_fig(fig, 'forensic_redactions_by_collection')\n",
136
+ " plt.show()\n",
137
+ "else:\n",
138
+ " print('No redaction data available.')"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": null,
144
+ "metadata": {},
145
+ "outputs": [],
146
+ "source": [
147
+ "# ---- Table: Top 20 most redacted documents ----\n",
148
+ "if not redact_df.empty:\n",
149
+ " top_redacted = (\n",
150
+ " redact_df.nlargest(20, 'total_redactions')\n",
151
+ " [['document_id', 'source_section', 'filename', 'total_redactions']]\n",
152
+ " )\n",
153
+ " print('Top 20 Most Redacted Documents:')\n",
154
+ " print(top_redacted.to_string(index=False))\n",
155
+ "else:\n",
156
+ " print('No redaction data available.')"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "metadata": {},
163
+ "outputs": [],
164
+ "source": [
165
+ "# ---- Pie chart: classification stamp distribution ----\n",
166
+ "if not stamp_df.empty:\n",
167
+ " # Parse stamps JSON and flatten\n",
168
+ " all_stamps = []\n",
169
+ " for _, row in stamp_df.iterrows():\n",
170
+ " stamps = row['stamps']\n",
171
+ " if isinstance(stamps, str):\n",
172
+ " stamps = json.loads(stamps)\n",
173
+ " if stamps:\n",
174
+ " all_stamps.extend(stamps)\n",
175
+ "\n",
176
+ " if all_stamps:\n",
177
+ " stamp_counts = pd.Series(all_stamps).value_counts()\n",
178
+ "\n",
179
+ " fig, ax = plt.subplots(figsize=(10, 8))\n",
180
+ " ax.pie(\n",
181
+ " stamp_counts.values,\n",
182
+ " labels=stamp_counts.index,\n",
183
+ " autopct='%1.1f%%',\n",
184
+ " startangle=140,\n",
185
+ " colors=sns.color_palette('Set2', len(stamp_counts)),\n",
186
+ " )\n",
187
+ " ax.set_title('Classification Stamp Distribution')\n",
188
+ " plt.tight_layout()\n",
189
+ " save_fig(fig, 'forensic_stamps_pie')\n",
190
+ " plt.show()\n",
191
+ " else:\n",
192
+ " print('No classification stamps found in any documents.')\n",
193
+ "else:\n",
194
+ " print('No stamp data available.')"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": null,
200
+ "metadata": {},
201
+ "outputs": [],
202
+ "source": [
203
+ "# ---- Table: Documents with lowest OCR confidence ----\n",
204
+ "if not conf_df.empty:\n",
205
+ " lowest = (\n",
206
+ " conf_df.nsmallest(20, 'avg_ocr_confidence')\n",
207
+ " [['document_id', 'source_section', 'filename', 'avg_ocr_confidence']]\n",
208
+ " )\n",
209
+ " print('Documents with Lowest OCR Confidence:')\n",
210
+ " print(lowest.to_string(index=False))\n",
211
+ "else:\n",
212
+ " print('No OCR confidence data available.')"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": null,
218
+ "metadata": {},
219
+ "outputs": [],
220
+ "source": [
221
+ "# ---- Combined forensic risk score ----\n",
222
+ "# Quick summary combining redactions + low confidence\n",
223
+ "if not conf_df.empty and not redact_df.empty:\n",
224
+ " merged = conf_df[['document_id', 'source_section', 'filename', 'avg_ocr_confidence']].merge(\n",
225
+ " redact_df[['document_id', 'total_redactions']],\n",
226
+ " on='document_id', how='left',\n",
227
+ " )\n",
228
+ " merged['total_redactions'] = merged['total_redactions'].fillna(0)\n",
229
+ "\n",
230
+ " # Flag: low confidence + has redactions\n",
231
+ " flagged = merged[\n",
232
+ " (merged['avg_ocr_confidence'] < 40) & (merged['total_redactions'] > 0)\n",
233
+ " ].sort_values('avg_ocr_confidence')\n",
234
+ "\n",
235
+ " print(f'\\nDocuments with BOTH low confidence (<40) AND redactions: {len(flagged)}')\n",
236
+ " if len(flagged) > 0:\n",
237
+ " print(flagged.head(20).to_string(index=False))"
238
+ ]
239
+ }
240
+ ],
241
+ "metadata": {
242
+ "kernelspec": {
243
+ "display_name": "Python 3",
244
+ "language": "python",
245
+ "name": "python3"
246
+ },
247
+ "language_info": {
248
+ "name": "python",
249
+ "version": "3.10.0"
250
+ }
251
+ },
252
+ "nbformat": 4,
253
+ "nbformat_minor": 5
254
+ }