datamatters24 committed on
Commit
7b1c599
·
verified ·
1 Parent(s): caff04b

Upload notebooks/05_cross_analysis/51_cross_collection.ipynb with huggingface_hub

Browse files
notebooks/05_cross_analysis/51_cross_collection.ipynb ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 51 - Cross-Collection Entity Analysis\n",
8
+ "\n",
9
+ "Pipeline notebook that identifies entities appearing across multiple document collections.\n",
10
+ "\n",
11
+ "- Finds entities that appear in 2+ collections\n",
12
+ "- Computes Jaccard similarity of entity sets between collection pairs\n",
13
+ "- Identifies top \"bridge\" entities (those appearing in the most collections)\n",
14
+ "- Stores cross-collection entity relationships"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {
21
+ "tags": [
22
+ "parameters"
23
+ ]
24
+ },
25
+ "outputs": [],
26
+ "source": [
27
+ "# Parameters\n",
28
+ "# No filtering -- always processes all collections"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "import sys, warnings\n",
38
+ "sys.path.insert(0, '/opt/epstein_env/research')\n",
39
+ "warnings.filterwarnings('ignore')\n",
40
+ "\n",
41
+ "import pandas as pd\n",
42
+ "import numpy as np\n",
43
+ "import matplotlib.pyplot as plt\n",
44
+ "import seaborn as sns\n",
45
+ "from itertools import combinations\n",
46
+ "\n",
47
+ "from research_lib.config import COLLECTIONS, COLLECTION_LABELS\n",
48
+ "from research_lib.db import fetch_df, bulk_insert, get_conn\n",
49
+ "from research_lib.incremental import start_run, finish_run\n",
50
+ "from research_lib.plotting import set_style, save_fig, COLLECTION_COLORS\n",
51
+ "\n",
52
+ "set_style()\n",
53
+ "print('Libraries loaded.')"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": null,
59
+ "metadata": {},
60
+ "outputs": [],
61
+ "source": [
62
+ "# ---- Start run ----\n",
63
+ "PIPELINE = 'cross_collection'\n",
64
+ "run_id = start_run(PIPELINE, source_section=None)"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": null,
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "# ---- Query distinct entities per collection ----\n",
74
+ "entity_df = fetch_df(\"\"\"\n",
75
+ " SELECT DISTINCT\n",
76
+ " e.entity_text,\n",
77
+ " e.entity_type,\n",
78
+ " d.source_section\n",
79
+ " FROM entities e\n",
80
+ " JOIN documents d ON d.id = e.document_id\n",
81
+ " WHERE e.entity_type IN ('PERSON', 'ORG', 'GPE')\n",
82
+ " ORDER BY e.entity_text\n",
83
+ "\"\"\")\n",
84
+ "\n",
85
+ "print(f'Total entity-collection pairs: {len(entity_df)}')\n",
86
+ "print(f'Unique entities: {entity_df[\"entity_text\"].nunique()}')\n",
87
+ "print(f'Collections represented: {entity_df[\"source_section\"].nunique()}')"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": null,
93
+ "metadata": {},
94
+ "outputs": [],
95
+ "source": [
96
+ "# ---- Build entity sets per collection ----\n",
97
+ "collection_entities = {}\n",
98
+ "for section in entity_df['source_section'].unique():\n",
99
+ " mask = entity_df['source_section'] == section\n",
100
+ " entities = set(\n",
101
+ " entity_df.loc[mask].apply(\n",
102
+ " lambda r: (r['entity_text'].lower(), r['entity_type']), axis=1\n",
103
+ " )\n",
104
+ " )\n",
105
+ " collection_entities[section] = entities\n",
106
+ " print(f'{section}: {len(entities)} unique entities')"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": null,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "# ---- Find entities appearing in 2+ collections ----\n",
116
+ "entity_collections = {}\n",
117
+ "for section, ents in collection_entities.items():\n",
118
+ " for ent in ents:\n",
119
+ " if ent not in entity_collections:\n",
120
+ " entity_collections[ent] = set()\n",
121
+ " entity_collections[ent].add(section)\n",
122
+ "\n",
123
+ "# Filter to multi-collection entities\n",
124
+ "bridge_entities = {\n",
125
+ " ent: cols for ent, cols in entity_collections.items() if len(cols) >= 2\n",
126
+ "}\n",
127
+ "\n",
128
+ "print(f'Entities appearing in 2+ collections: {len(bridge_entities)}')\n",
129
+ "print(f'Total unique entities: {len(entity_collections)}')\n",
130
+ "print(f'Bridge ratio: {len(bridge_entities)/max(len(entity_collections),1)*100:.1f}%')"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "# ---- Jaccard similarity between collection pairs ----\n",
140
+ "collections_present = sorted(collection_entities.keys())\n",
141
+ "n = len(collections_present)\n",
142
+ "jaccard_matrix = pd.DataFrame(\n",
143
+ " np.zeros((n, n)), index=collections_present, columns=collections_present\n",
144
+ ")\n",
145
+ "\n",
146
+ "for c1, c2 in combinations(collections_present, 2):\n",
147
+ " s1 = collection_entities[c1]\n",
148
+ " s2 = collection_entities[c2]\n",
149
+ " intersection = len(s1 & s2)\n",
150
+ " union = len(s1 | s2)\n",
151
+ " jaccard = intersection / union if union > 0 else 0\n",
152
+ " jaccard_matrix.loc[c1, c2] = jaccard\n",
153
+ " jaccard_matrix.loc[c2, c1] = jaccard\n",
154
+ "\n",
155
+ "# Diagonal = 1.0\n",
156
+ "np.fill_diagonal(jaccard_matrix.values, 1.0)\n",
157
+ "\n",
158
+ "print('Entity Overlap Matrix (Jaccard Similarity):')\n",
159
+ "print(jaccard_matrix.round(4).to_string())"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": null,
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": [
168
+ "# ---- Heatmap of Jaccard similarities ----\n",
169
+ "fig, ax = plt.subplots(figsize=(12, 10))\n",
170
+ "mask = np.triu(np.ones_like(jaccard_matrix, dtype=bool), k=1)\n",
171
+ "sns.heatmap(\n",
172
+ " jaccard_matrix, annot=True, fmt='.3f', cmap='YlOrRd',\n",
173
+ " mask=mask, square=True, ax=ax,\n",
174
+ " cbar_kws={'label': 'Jaccard Similarity'},\n",
175
+ ")\n",
176
+ "ax.set_title('Entity Overlap Between Collections (Jaccard Similarity)')\n",
177
+ "plt.tight_layout()\n",
178
+ "save_fig(fig, 'cross_collection_jaccard')\n",
179
+ "plt.show()"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": null,
185
+ "metadata": {},
186
+ "outputs": [],
187
+ "source": [
188
+ "# ---- Top 50 bridge entities ----\n",
189
+ "bridge_list = [\n",
190
+ " {\n",
191
+ " 'entity_text': ent[0],\n",
192
+ " 'entity_type': ent[1],\n",
193
+ " 'collection_count': len(cols),\n",
194
+ " 'collections': ', '.join(sorted(cols)),\n",
195
+ " }\n",
196
+ " for ent, cols in bridge_entities.items()\n",
197
+ "]\n",
198
+ "bridge_df = pd.DataFrame(bridge_list).sort_values(\n",
199
+ " ['collection_count', 'entity_text'], ascending=[False, True]\n",
200
+ ")\n",
201
+ "\n",
202
+ "print('Top 50 Bridge Entities (appearing in most collections):')\n",
203
+ "print(bridge_df.head(50).to_string(index=False))"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": null,
209
+ "metadata": {},
210
+ "outputs": [],
211
+ "source": [
212
+ "# ---- Store cross-collection entity relationships ----\n",
213
 + "# For each bridge entity, record one cross-collection relationship row. NOTE(review): the inner pair loop never uses c1/c2, so it appends identical tuples -- confirm whether per-pair rows were intended\n",
214
+ "relationship_rows = []\n",
215
+ "for ent, cols in bridge_entities.items():\n",
216
+ " entity_text, entity_type = ent\n",
217
+ " for c1, c2 in combinations(sorted(cols), 2):\n",
218
+ " relationship_rows.append((\n",
219
+ " entity_text, # entity_a\n",
220
+ " entity_text, # entity_b (same entity)\n",
221
+ " 'cross_collection', # relationship_type\n",
222
+ " None, # source_section (NULL for cross-collection)\n",
223
+ " 1.0, # weight\n",
224
+ " ))\n",
225
+ "\n",
226
 + "# Deduplicate (rows for a given entity are identical, so this keeps exactly one row per bridge entity)\n",
227
+ "relationship_rows = list(set(relationship_rows))\n",
228
+ "\n",
229
+ "if relationship_rows:\n",
230
+ " n = bulk_insert(\n",
231
+ " 'entity_relationships',\n",
232
+ " ['entity_a', 'entity_b', 'relationship_type', 'source_section', 'weight'],\n",
233
+ " relationship_rows,\n",
234
+ " )\n",
235
+ " print(f'Inserted {n} cross-collection entity_relationships')\n",
236
+ "\n",
237
+ "finish_run(run_id, documents_processed=entity_df['source_section'].nunique())\n",
238
+ "print(f'Run {run_id} complete.')"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": null,
244
+ "metadata": {},
245
+ "outputs": [],
246
+ "source": [
247
+ "# ---- Bar chart: bridge entities by type ----\n",
248
+ "if len(bridge_df) > 0:\n",
249
+ " by_type = bridge_df['entity_type'].value_counts()\n",
250
+ "\n",
251
+ " fig, ax = plt.subplots(figsize=(8, 5))\n",
252
+ " ax.bar(by_type.index, by_type.values, color='#9333ea')\n",
253
+ " ax.set_title('Bridge Entities by Type')\n",
254
+ " ax.set_ylabel('Count')\n",
255
+ " plt.tight_layout()\n",
256
+ " save_fig(fig, 'cross_collection_bridge_types')\n",
257
+ " plt.show()"
258
+ ]
259
+ }
260
+ ],
261
+ "metadata": {
262
+ "kernelspec": {
263
+ "display_name": "Python 3",
264
+ "language": "python",
265
+ "name": "python3"
266
+ },
267
+ "language_info": {
268
+ "name": "python",
269
+ "version": "3.10.0"
270
+ }
271
+ },
272
+ "nbformat": 4,
273
+ "nbformat_minor": 5
274
+ }