Upload notebooks/05_cross_analysis/51_cross_collection.ipynb with huggingface_hub
Browse files
notebooks/05_cross_analysis/51_cross_collection.ipynb
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# 51 - Cross-Collection Entity Analysis\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"Pipeline notebook that identifies entities appearing across multiple document collections.\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"- Finds entities that appear in 2+ collections\n",
|
| 12 |
+
"- Computes Jaccard similarity of entity sets between collection pairs\n",
|
| 13 |
+
"- Identifies top \"bridge\" entities (those appearing in the most collections)\n",
|
| 14 |
+
"- Stores cross-collection entity relationships"
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": null,
|
| 20 |
+
"metadata": {
|
| 21 |
+
"tags": [
|
| 22 |
+
"parameters"
|
| 23 |
+
]
|
| 24 |
+
},
|
| 25 |
+
"outputs": [],
|
| 26 |
+
"source": [
|
| 27 |
+
"# Parameters\n",
|
| 28 |
+
"# No filtering -- always processes all collections"
|
| 29 |
+
]
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"cell_type": "code",
|
| 33 |
+
"execution_count": null,
|
| 34 |
+
"metadata": {},
|
| 35 |
+
"outputs": [],
|
| 36 |
+
"source": [
|
| 37 |
+
"import sys, warnings\n",
|
| 38 |
+
"sys.path.insert(0, '/opt/epstein_env/research')\n",
|
| 39 |
+
"warnings.filterwarnings('ignore')\n",
|
| 40 |
+
"\n",
|
| 41 |
+
"import pandas as pd\n",
|
| 42 |
+
"import numpy as np\n",
|
| 43 |
+
"import matplotlib.pyplot as plt\n",
|
| 44 |
+
"import seaborn as sns\n",
|
| 45 |
+
"from itertools import combinations\n",
|
| 46 |
+
"\n",
|
| 47 |
+
"from research_lib.config import COLLECTIONS, COLLECTION_LABELS\n",
|
| 48 |
+
"from research_lib.db import fetch_df, bulk_insert, get_conn\n",
|
| 49 |
+
"from research_lib.incremental import start_run, finish_run\n",
|
| 50 |
+
"from research_lib.plotting import set_style, save_fig, COLLECTION_COLORS\n",
|
| 51 |
+
"\n",
|
| 52 |
+
"set_style()\n",
|
| 53 |
+
"print('Libraries loaded.')"
|
| 54 |
+
]
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"cell_type": "code",
|
| 58 |
+
"execution_count": null,
|
| 59 |
+
"metadata": {},
|
| 60 |
+
"outputs": [],
|
| 61 |
+
"source": [
|
| 62 |
+
"# ---- Start run ----\n",
|
| 63 |
+
"PIPELINE = 'cross_collection'\n",
|
| 64 |
+
"run_id = start_run(PIPELINE, source_section=None)"
|
| 65 |
+
]
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"cell_type": "code",
|
| 69 |
+
"execution_count": null,
|
| 70 |
+
"metadata": {},
|
| 71 |
+
"outputs": [],
|
| 72 |
+
"source": [
|
| 73 |
+
"# ---- Query distinct PERSON/ORG/GPE entities per collection ----\n",
|
| 74 |
+
"entity_df = fetch_df(\"\"\"\n",
|
| 75 |
+
" SELECT DISTINCT\n",
|
| 76 |
+
" e.entity_text,\n",
|
| 77 |
+
" e.entity_type,\n",
|
| 78 |
+
" d.source_section\n",
|
| 79 |
+
" FROM entities e\n",
|
| 80 |
+
" JOIN documents d ON d.id = e.document_id\n",
|
| 81 |
+
" WHERE e.entity_type IN ('PERSON', 'ORG', 'GPE')\n",
|
| 82 |
+
" ORDER BY e.entity_text\n",
|
| 83 |
+
"\"\"\")\n",
|
| 84 |
+
"\n",
|
| 85 |
+
"print(f'Total entity-collection pairs: {len(entity_df)}')\n",
|
| 86 |
+
"print(f'Unique entities: {entity_df[\"entity_text\"].nunique()}')\n",
|
| 87 |
+
"print(f'Collections represented: {entity_df[\"source_section\"].nunique()}')"
|
| 88 |
+
]
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"cell_type": "code",
|
| 92 |
+
"execution_count": null,
|
| 93 |
+
"metadata": {},
|
| 94 |
+
"outputs": [],
|
| 95 |
+
"source": [
|
| 96 |
+
"# ---- Build sets of (lowercased entity_text, entity_type) per collection ----\n",
|
| 97 |
+
"collection_entities = {}\n",
|
| 98 |
+
"for section in entity_df['source_section'].unique():\n",
|
| 99 |
+
" mask = entity_df['source_section'] == section\n",
|
| 100 |
+
" entities = set(\n",
|
| 101 |
+
" entity_df.loc[mask].apply(\n",
|
| 102 |
+
" lambda r: (r['entity_text'].lower(), r['entity_type']), axis=1\n",
|
| 103 |
+
" )\n",
|
| 104 |
+
" )\n",
|
| 105 |
+
" collection_entities[section] = entities\n",
|
| 106 |
+
" print(f'{section}: {len(entities)} unique entities')"
|
| 107 |
+
]
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"cell_type": "code",
|
| 111 |
+
"execution_count": null,
|
| 112 |
+
"metadata": {},
|
| 113 |
+
"outputs": [],
|
| 114 |
+
"source": [
|
| 115 |
+
"# ---- Find entities appearing in 2+ collections ----\n",
|
| 116 |
+
"entity_collections = {}\n",
|
| 117 |
+
"for section, ents in collection_entities.items():\n",
|
| 118 |
+
" for ent in ents:\n",
|
| 119 |
+
" if ent not in entity_collections:\n",
|
| 120 |
+
" entity_collections[ent] = set()\n",
|
| 121 |
+
" entity_collections[ent].add(section)\n",
|
| 122 |
+
"\n",
|
| 123 |
+
"# Filter to multi-collection entities\n",
|
| 124 |
+
"bridge_entities = {\n",
|
| 125 |
+
" ent: cols for ent, cols in entity_collections.items() if len(cols) >= 2\n",
|
| 126 |
+
"}\n",
|
| 127 |
+
"\n",
|
| 128 |
+
"print(f'Entities appearing in 2+ collections: {len(bridge_entities)}')\n",
|
| 129 |
+
"print(f'Total unique entities: {len(entity_collections)}')\n",
|
| 130 |
+
"print(f'Bridge ratio: {len(bridge_entities)/max(len(entity_collections),1)*100:.1f}%')"
|
| 131 |
+
]
|
| 132 |
+
},
|
| 133 |
+
{
|
| 134 |
+
"cell_type": "code",
|
| 135 |
+
"execution_count": null,
|
| 136 |
+
"metadata": {},
|
| 137 |
+
"outputs": [],
|
| 138 |
+
"source": [
|
| 139 |
+
"# ---- Jaccard similarity between collection pairs ----\n",
|
| 140 |
+
"collections_present = sorted(collection_entities.keys())\n",
|
| 141 |
+
"n = len(collections_present)\n",
|
| 142 |
+
"jaccard_matrix = pd.DataFrame(\n",
|
| 143 |
+
" np.zeros((n, n)), index=collections_present, columns=collections_present\n",
|
| 144 |
+
")\n",
|
| 145 |
+
"\n",
|
| 146 |
+
"for c1, c2 in combinations(collections_present, 2):\n",
|
| 147 |
+
" s1 = collection_entities[c1]\n",
|
| 148 |
+
" s2 = collection_entities[c2]\n",
|
| 149 |
+
" intersection = len(s1 & s2)\n",
|
| 150 |
+
" union = len(s1 | s2)\n",
|
| 151 |
+
" jaccard = intersection / union if union > 0 else 0\n",
|
| 152 |
+
" jaccard_matrix.loc[c1, c2] = jaccard\n",
|
| 153 |
+
" jaccard_matrix.loc[c2, c1] = jaccard\n",
|
| 154 |
+
"\n",
|
| 155 |
+
"# Diagonal = 1.0\n",
|
| 156 |
+
"np.fill_diagonal(jaccard_matrix.values, 1.0)\n",
|
| 157 |
+
"\n",
|
| 158 |
+
"print('Entity Overlap Matrix (Jaccard Similarity):')\n",
|
| 159 |
+
"print(jaccard_matrix.round(4).to_string())"
|
| 160 |
+
]
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"cell_type": "code",
|
| 164 |
+
"execution_count": null,
|
| 165 |
+
"metadata": {},
|
| 166 |
+
"outputs": [],
|
| 167 |
+
"source": [
|
| 168 |
+
"# ---- Heatmap of Jaccard similarities (upper triangle masked; lower triangle and diagonal shown) ----\n",
|
| 169 |
+
"fig, ax = plt.subplots(figsize=(12, 10))\n",
|
| 170 |
+
"mask = np.triu(np.ones_like(jaccard_matrix, dtype=bool), k=1)\n",
|
| 171 |
+
"sns.heatmap(\n",
|
| 172 |
+
" jaccard_matrix, annot=True, fmt='.3f', cmap='YlOrRd',\n",
|
| 173 |
+
" mask=mask, square=True, ax=ax,\n",
|
| 174 |
+
" cbar_kws={'label': 'Jaccard Similarity'},\n",
|
| 175 |
+
")\n",
|
| 176 |
+
"ax.set_title('Entity Overlap Between Collections (Jaccard Similarity)')\n",
|
| 177 |
+
"plt.tight_layout()\n",
|
| 178 |
+
"save_fig(fig, 'cross_collection_jaccard')\n",
|
| 179 |
+
"plt.show()"
|
| 180 |
+
]
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"cell_type": "code",
|
| 184 |
+
"execution_count": null,
|
| 185 |
+
"metadata": {},
|
| 186 |
+
"outputs": [],
|
| 187 |
+
"source": [
|
| 188 |
+
"# ---- Top 50 bridge entities ----\n",
|
| 189 |
+
"bridge_list = [\n",
|
| 190 |
+
" {\n",
|
| 191 |
+
" 'entity_text': ent[0],\n",
|
| 192 |
+
" 'entity_type': ent[1],\n",
|
| 193 |
+
" 'collection_count': len(cols),\n",
|
| 194 |
+
" 'collections': ', '.join(sorted(cols)),\n",
|
| 195 |
+
" }\n",
|
| 196 |
+
" for ent, cols in bridge_entities.items()\n",
|
| 197 |
+
"]\n",
|
| 198 |
+
"bridge_df = pd.DataFrame(bridge_list).sort_values(\n",
|
| 199 |
+
" ['collection_count', 'entity_text'], ascending=[False, True]\n",
|
| 200 |
+
")\n",
|
| 201 |
+
"\n",
|
| 202 |
+
"print('Top 50 Bridge Entities (appearing in most collections):')\n",
|
| 203 |
+
"print(bridge_df.head(50).to_string(index=False))"
|
| 204 |
+
]
|
| 205 |
+
},
|
| 206 |
+
{
|
| 207 |
+
"cell_type": "code",
|
| 208 |
+
"execution_count": null,
|
| 209 |
+
"metadata": {},
|
| 210 |
+
"outputs": [],
|
| 211 |
+
"source": [
|
| 212 |
+
"# ---- Store cross-collection entity relationships ----\n",
|
| 213 |
+
"# For each bridge entity, emit one self-referencing row (entity_a == entity_b) per pair of collections it bridges; the collection pair itself is not stored in the row\n",
|
| 214 |
+
"relationship_rows = []\n",
|
| 215 |
+
"for ent, cols in bridge_entities.items():\n",
|
| 216 |
+
" entity_text, entity_type = ent\n",
|
| 217 |
+
" for c1, c2 in combinations(sorted(cols), 2):\n",
|
| 218 |
+
" relationship_rows.append((\n",
|
| 219 |
+
" entity_text, # entity_a\n",
|
| 220 |
+
" entity_text, # entity_b (same entity)\n",
|
| 221 |
+
" 'cross_collection', # relationship_type\n",
|
| 222 |
+
" None, # source_section (NULL for cross-collection)\n",
|
| 223 |
+
" 1.0, # weight\n",
|
| 224 |
+
" ))\n",
|
| 225 |
+
"\n",
|
| 226 |
+
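"# Deduplicate: every collection pair of an entity yields an identical tuple, so this leaves one row per distinct entity_text\n",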
|
| 227 |
+
"relationship_rows = list(set(relationship_rows))\n",
|
| 228 |
+
"\n",
|
| 229 |
+
"if relationship_rows:\n",
|
| 230 |
+
" n = bulk_insert(\n",
|
| 231 |
+
" 'entity_relationships',\n",
|
| 232 |
+
" ['entity_a', 'entity_b', 'relationship_type', 'source_section', 'weight'],\n",
|
| 233 |
+
" relationship_rows,\n",
|
| 234 |
+
" )\n",
|
| 235 |
+
" print(f'Inserted {n} cross-collection entity_relationships')\n",
|
| 236 |
+
"\n",
|
| 237 |
+
"finish_run(run_id, documents_processed=entity_df['source_section'].nunique())\n",
|
| 238 |
+
"print(f'Run {run_id} complete.')"
|
| 239 |
+
]
|
| 240 |
+
},
|
| 241 |
+
{
|
| 242 |
+
"cell_type": "code",
|
| 243 |
+
"execution_count": null,
|
| 244 |
+
"metadata": {},
|
| 245 |
+
"outputs": [],
|
| 246 |
+
"source": [
|
| 247 |
+
"# ---- Bar chart: bridge entities by type ----\n",
|
| 248 |
+
"if len(bridge_df) > 0:\n",
|
| 249 |
+
" by_type = bridge_df['entity_type'].value_counts()\n",
|
| 250 |
+
"\n",
|
| 251 |
+
" fig, ax = plt.subplots(figsize=(8, 5))\n",
|
| 252 |
+
" ax.bar(by_type.index, by_type.values, color='#9333ea')\n",
|
| 253 |
+
" ax.set_title('Bridge Entities by Type')\n",
|
| 254 |
+
" ax.set_ylabel('Count')\n",
|
| 255 |
+
" plt.tight_layout()\n",
|
| 256 |
+
" save_fig(fig, 'cross_collection_bridge_types')\n",
|
| 257 |
+
" plt.show()"
|
| 258 |
+
]
|
| 259 |
+
}
|
| 260 |
+
],
|
| 261 |
+
"metadata": {
|
| 262 |
+
"kernelspec": {
|
| 263 |
+
"display_name": "Python 3",
|
| 264 |
+
"language": "python",
|
| 265 |
+
"name": "python3"
|
| 266 |
+
},
|
| 267 |
+
"language_info": {
|
| 268 |
+
"name": "python",
|
| 269 |
+
"version": "3.10.0"
|
| 270 |
+
}
|
| 271 |
+
},
|
| 272 |
+
"nbformat": 4,
|
| 273 |
+
"nbformat_minor": 5
|
| 274 |
+
}
|