datamatters24 committed on
Commit
eddc997
·
verified ·
1 Parent(s): 7b1c599

Upload notebooks/05_cross_analysis/52_summary_dashboard.ipynb with huggingface_hub

Browse files
notebooks/05_cross_analysis/52_summary_dashboard.ipynb ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 52 - Summary Dashboard\n",
8
+ "\n",
9
+ "Master overview dashboard pulling from all analysis tables.\n",
10
+ "\n",
11
+ "- Collection stats (from `collection_stats` materialized view)\n",
12
+ "- Top entities across all collections\n",
13
+ "- Top topics per collection\n",
14
+ "- Forensic alerts: most redacted, lowest confidence, classification stamps\n",
15
+ "- Entity network summary: most connected nodes, largest communities\n",
16
+ "- Key metrics as a printable report"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "import sys, warnings, json\n",
26
+ "sys.path.insert(0, '/opt/epstein_env/research')\n",
27
+ "warnings.filterwarnings('ignore')\n",
28
+ "\n",
29
+ "import pandas as pd\n",
30
+ "import numpy as np\n",
31
+ "import matplotlib.pyplot as plt\n",
32
+ "import seaborn as sns\n",
33
+ "from datetime import datetime\n",
34
+ "\n",
35
+ "from research_lib.config import COLLECTIONS, COLLECTION_LABELS\n",
36
+ "from research_lib.db import fetch_df, fetch_all\n",
37
+ "from research_lib.plotting import (\n",
38
+ " set_style, save_fig, COLLECTION_COLORS, collection_color,\n",
39
+ ")\n",
40
+ "\n",
41
+ "set_style()\n",
42
+ "print('Summary Dashboard loaded.')\n",
43
+ "print(f'Report generated: {datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")}')"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "markdown",
48
+ "metadata": {},
49
+ "source": [
50
+ "## 1. Collection Statistics"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "metadata": {},
57
+ "outputs": [],
58
+ "source": [
59
+ "# ---- Collection stats from materialized view ----\n",
60
+ "try:\n",
61
+ " collection_stats = fetch_df('SELECT * FROM collection_stats ORDER BY source_section')\n",
62
+ " print('Collection Statistics:')\n",
63
+ " print(collection_stats.to_string(index=False))\n",
64
+ "except Exception as e:\n",
65
+ " print(f'collection_stats view not available: {e}')\n",
66
+ " # Fallback: compute from documents table\n",
67
+ " collection_stats = fetch_df(\"\"\"\n",
68
+ " SELECT source_section,\n",
69
+ " COUNT(*) AS document_count,\n",
70
+ " SUM(page_count) AS total_pages\n",
71
+ " FROM documents\n",
72
+ " GROUP BY source_section\n",
73
+ " ORDER BY source_section\n",
74
+ " \"\"\")\n",
75
+ " print('Collection Statistics (from documents table):')\n",
76
+ " print(collection_stats.to_string(index=False))"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "markdown",
81
+ "metadata": {},
82
+ "source": [
83
+ "## 2. Top Entities Across All Collections"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": null,
89
+ "metadata": {},
90
+ "outputs": [],
91
+ "source": [
92
+ "# ---- Top 10 entities across all collections ----\n",
93
+ "top_entities = fetch_df(\"\"\"\n",
94
+ " SELECT entity_text, entity_type,\n",
95
+ " COUNT(*) AS mention_count,\n",
96
+ " COUNT(DISTINCT document_id) AS doc_count,\n",
97
+ " COUNT(DISTINCT d.source_section) AS collection_count\n",
98
+ " FROM entities e\n",
99
+ " JOIN documents d ON d.id = e.document_id\n",
100
+ " WHERE e.entity_type IN ('PERSON', 'ORG', 'GPE')\n",
101
+ " GROUP BY entity_text, entity_type\n",
102
+ " ORDER BY doc_count DESC\n",
103
+ " LIMIT 10\n",
104
+ "\"\"\")\n",
105
+ "\n",
106
+ "print('Top 10 Entities (by document count):')\n",
107
+ "print(top_entities.to_string(index=False))"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "markdown",
112
+ "metadata": {},
113
+ "source": [
114
+ "## 3. Top Topics Per Collection"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "metadata": {},
121
+ "outputs": [],
122
+ "source": [
123
+ "# ---- Topics per collection ----\n",
124
+ "try:\n",
125
+ " topics_df = fetch_df(\"\"\"\n",
126
+ " SELECT source_section, topic_label, document_count, top_words\n",
127
+ " FROM topics\n",
128
+ " WHERE topic_label IS NOT NULL\n",
129
+ " ORDER BY source_section, document_count DESC\n",
130
+ " \"\"\")\n",
131
+ "\n",
132
+ " if not topics_df.empty:\n",
133
+ " for section in topics_df['source_section'].unique():\n",
134
+ " section_topics = topics_df[topics_df['source_section'] == section].head(5)\n",
135
+ " label = COLLECTION_LABELS.get(section, section)\n",
136
+ " print(f'\\n--- {label} ---')\n",
137
+ " print(section_topics[['topic_label', 'document_count']].to_string(index=False))\n",
138
+ " else:\n",
139
+ " print('No topic data available.')\n",
140
+ "except Exception as e:\n",
141
+ " print(f'Topics table not available: {e}')"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "markdown",
146
+ "metadata": {},
147
+ "source": [
148
+ "## 4. Forensic Alerts"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": null,
154
+ "metadata": {},
155
+ "outputs": [],
156
+ "source": [
157
+ "# ---- Documents with most redactions (top 10 by total_redactions feature) ----\n",
158
+ "try:\n",
159
+ "    most_redacted = fetch_df(\"\"\"\n",
160
+ "        SELECT df.document_id, d.source_section, d.filename,\n",
161
+ "               df.feature_value AS total_redactions\n",
162
+ "        FROM document_features df\n",
163
+ "        JOIN documents d ON d.id = df.document_id\n",
164
+ "        WHERE df.feature_name = 'total_redactions' AND df.feature_value > 0\n",
165
+ "        ORDER BY df.feature_value DESC\n",
166
+ "        LIMIT 10\n",
167
+ "    \"\"\")\n",
168
+ "    print('ALERT: Most Redacted Documents')\n",
169
+ "    if not most_redacted.empty:\n",
170
+ "        print(most_redacted.to_string(index=False))\n",
171
+ "    else:\n",
172
+ "        print(' No redacted documents found.')\n",
173
+ "except Exception as e:  # report the cause so a broken query is not mistaken for missing data\n",
174
+ "    print(f' Redaction data not available: {e}')"
175
+ ]
176
+ },
177
+ {
178
+ "cell_type": "code",
179
+ "execution_count": null,
180
+ "metadata": {},
181
+ "outputs": [],
182
+ "source": [
183
+ "# ---- Documents with lowest OCR confidence (bottom 10 by avg_ocr_confidence feature) ----\n",
184
+ "try:\n",
185
+ "    lowest_conf = fetch_df(\"\"\"\n",
186
+ "        SELECT df.document_id, d.source_section, d.filename,\n",
187
+ "               df.feature_value AS avg_ocr_confidence\n",
188
+ "        FROM document_features df\n",
189
+ "        JOIN documents d ON d.id = df.document_id\n",
190
+ "        WHERE df.feature_name = 'avg_ocr_confidence'\n",
191
+ "        ORDER BY df.feature_value ASC\n",
192
+ "        LIMIT 10\n",
193
+ "    \"\"\")\n",
194
+ "    print('\\nALERT: Lowest OCR Confidence Documents')\n",
195
+ "    if not lowest_conf.empty:\n",
196
+ "        print(lowest_conf.to_string(index=False))\n",
197
+ "    else:\n",
198
+ "        print(' No OCR confidence data found.')\n",
199
+ "except Exception as e:  # report the cause so a broken query is not mistaken for missing data\n",
200
+ "    print(f' OCR confidence data not available: {e}')"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": null,
206
+ "metadata": {},
207
+ "outputs": [],
208
+ "source": [
209
+ "# ---- Classification stamps summary (counts each stamp value across documents) ----\n",
210
+ "try:\n",
211
+ "    stamps_df = fetch_df(\"\"\"\n",
212
+ "        SELECT df.feature_json AS stamps, d.source_section\n",
213
+ "        FROM document_features df\n",
214
+ "        JOIN documents d ON d.id = df.document_id\n",
215
+ "        WHERE df.feature_name = 'classification_stamps'\n",
216
+ "          AND df.feature_json IS NOT NULL\n",
217
+ "          AND df.feature_json != '[]'\n",
218
+ "    \"\"\")\n",
219
+ "\n",
220
+ "    all_stamps = []\n",
221
+ "    for _, row in stamps_df.iterrows():\n",
222
+ "        s = row['stamps']\n",
223
+ "        if isinstance(s, str):  # value may arrive as raw JSON text rather than a decoded list\n",
224
+ "            s = json.loads(s)\n",
225
+ "        if s:\n",
226
+ "            all_stamps.extend(s)\n",
227
+ "\n",
228
+ "    print('\\nALERT: Classification Stamps Found')\n",
229
+ "    if all_stamps:\n",
230
+ "        stamp_counts = pd.Series(all_stamps).value_counts()\n",
231
+ "        print(stamp_counts.to_string())\n",
232
+ "        print(f'\\nTotal documents with stamps: {len(stamps_df)}')\n",
233
+ "    else:\n",
234
+ "        print(' No stamps found.')\n",
235
+ "except Exception as e:  # covers query and JSON-decoding failures alike, so show which one occurred\n",
236
+ "    print(f' Stamp data not available: {e}')"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "markdown",
241
+ "metadata": {},
242
+ "source": [
243
+ "## 5. Entity Network Summary"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": null,
249
+ "metadata": {},
250
+ "outputs": [],
251
+ "source": [
252
+ "# ---- Most connected entities (by relationship count, both edge sides summed) ----\n",
253
+ "try:\n",
254
+ "    connected = fetch_df(\"\"\"\n",
255
+ "        SELECT entity, SUM(sub.relationship_count) AS relationship_count FROM (\n",
256
+ "            SELECT entity_a AS entity, COUNT(*) AS relationship_count\n",
257
+ "            FROM entity_relationships\n",
258
+ "            GROUP BY entity_a\n",
259
+ "            UNION ALL\n",
260
+ "            SELECT entity_b AS entity, COUNT(*) AS relationship_count\n",
261
+ "            FROM entity_relationships\n",
262
+ "            GROUP BY entity_b\n",
263
+ "        ) sub\n",
264
+ "        GROUP BY entity\n",
265
+ "        ORDER BY SUM(sub.relationship_count) DESC\n",
266
+ "        LIMIT 20\n",
267
+ "    \"\"\")\n",
268
+ "    print('Most Connected Entities (by relationship count):')\n",
269
+ "    if not connected.empty:\n",
270
+ "        print(connected.to_string(index=False))\n",
271
+ "    else:\n",
272
+ "        print(' No entity relationships found.')\n",
273
+ "except Exception as e:  # report the cause; a silent handler previously hid an invalid GROUP BY here\n",
274
+ "    print(f' Entity relationship data not available: {e}')"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": null,
280
+ "metadata": {},
281
+ "outputs": [],
282
+ "source": [
283
+ "# ---- Community summary (if available): ten largest communities by member count ----\n",
284
+ "try:\n",
285
+ "    communities = fetch_df(\"\"\"\n",
286
+ "        SELECT feature_json->>'community' AS community,\n",
287
+ "               COUNT(*) AS member_count\n",
288
+ "        FROM document_features\n",
289
+ "        WHERE feature_name = 'community_id'\n",
290
+ "        GROUP BY feature_json->>'community'\n",
291
+ "        ORDER BY member_count DESC\n",
292
+ "        LIMIT 10\n",
293
+ "    \"\"\")\n",
294
+ "    if not communities.empty:\n",
295
+ "        print('\\nLargest Entity Communities:')\n",
296
+ "        print(communities.to_string(index=False))\n",
297
+ "except Exception as e:  # community data may not be available yet; say so instead of passing silently\n",
298
+ "    print(f' Community data not available: {e}')"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "markdown",
303
+ "metadata": {},
304
+ "source": [
305
+ "## 6. Key Metrics Report"
306
+ ]
307
+ },
308
+ {
309
+ "cell_type": "code",
310
+ "execution_count": null,
311
+ "metadata": {},
312
+ "outputs": [],
313
+ "source": [
314
+ "# ---- Printable summary report ----\n",
315
+ "print('=' * 70)\n",
316
+ "print('RESEARCH ANALYSIS -- KEY METRICS REPORT')\n",
317
+ "print(f'Generated: {datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")}')\n",
318
+ "print('=' * 70)\n",
319
+ "\n",
320
+ "# Document counts (core tables; a failure here is allowed to stop the report)\n",
321
+ "doc_count = fetch_df('SELECT COUNT(*) AS cnt FROM documents')\n",
322
+ "page_count = fetch_df('SELECT COUNT(*) AS cnt FROM pages')\n",
323
+ "entity_count = fetch_df('SELECT COUNT(*) AS cnt FROM entities')\n",
324
+ "\n",
325
+ "print(f\"\\nTotal Documents: {doc_count['cnt'].iloc[0]:>10,}\")\n",
326
+ "print(f\"Total Pages: {page_count['cnt'].iloc[0]:>10,}\")\n",
327
+ "print(f\"Total Entities: {entity_count['cnt'].iloc[0]:>10,}\")\n",
328
+ "\n",
329
+ "# Feature counts (optional section: report why it is missing rather than dropping it silently)\n",
330
+ "try:\n",
331
+ "    feat_counts = fetch_df(\"\"\"\n",
332
+ "        SELECT feature_name, COUNT(*) AS cnt\n",
333
+ "        FROM document_features\n",
334
+ "        GROUP BY feature_name\n",
335
+ "        ORDER BY cnt DESC\n",
336
+ "    \"\"\")\n",
337
+ "    print('\\nDocument Features Computed:')\n",
338
+ "    for _, r in feat_counts.iterrows():\n",
339
+ "        print(f\" {r['feature_name']:<30} {r['cnt']:>8,}\")\n",
340
+ "except Exception as e:\n",
341
+ "    print(f' Feature counts not available: {e}')\n",
342
+ "\n",
343
+ "# Topic counts (optional section)\n",
344
+ "try:\n",
345
+ "    topic_count = fetch_df('SELECT COUNT(*) AS cnt FROM topics')\n",
346
+ "    print(f\"\\nTopics Discovered: {topic_count['cnt'].iloc[0]:>10,}\")\n",
347
+ "except Exception as e:\n",
348
+ "    print(f' Topic count not available: {e}')\n",
349
+ "\n",
350
+ "# Duplicate pairs (optional section)\n",
351
+ "try:\n",
352
+ "    dup_count = fetch_df('SELECT COUNT(*) AS cnt FROM duplicate_pairs')\n",
353
+ "    print(f\"Duplicate Pairs: {dup_count['cnt'].iloc[0]:>10,}\")\n",
354
+ "except Exception as e:\n",
355
+ "    print(f' Duplicate-pair count not available: {e}')\n",
356
+ "\n",
357
+ "# Pipeline runs (optional section)\n",
358
+ "try:\n",
359
+ "    runs = fetch_df(\"\"\"\n",
360
+ "        SELECT pipeline_name, status, COUNT(*) AS runs,\n",
361
+ "               SUM(documents_processed) AS total_docs_processed\n",
362
+ "        FROM analysis_runs\n",
363
+ "        GROUP BY pipeline_name, status\n",
364
+ "        ORDER BY pipeline_name\n",
365
+ "    \"\"\")\n",
366
+ "    print('\\nPipeline Runs:')\n",
367
+ "    print(runs.to_string(index=False))\n",
368
+ "except Exception as e:\n",
369
+ "    print(f' Pipeline run data not available: {e}')\n",
370
+ "\n",
371
+ "print('\\n' + '=' * 70)\n",
372
+ "print('END OF REPORT')\n",
373
+ "print('=' * 70)"
374
+ ]
375
+ }
376
+ ],
377
+ "metadata": {
378
+ "kernelspec": {
379
+ "display_name": "Python 3",
380
+ "language": "python",
381
+ "name": "python3"
382
+ },
383
+ "language_info": {
384
+ "name": "python",
385
+ "version": "3.10.0"
386
+ }
387
+ },
388
+ "nbformat": 4,
389
+ "nbformat_minor": 5
390
+ }