Upload notebooks/05_cross_analysis/51_cross_collection.ipynb with huggingface_hub

Browse files

Files changed (1) hide show

notebooks/05_cross_analysis/51_cross_collection.ipynb +274 -0

notebooks/05_cross_analysis/51_cross_collection.ipynb ADDED Viewed

	@@ -0,0 +1,274 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 51 - Cross-Collection Entity Analysis\n",
+    "\n",
+    "Pipeline notebook that identifies entities appearing across multiple document collections.\n",
+    "\n",
+    "- Finds entities that appear in 2+ collections\n",
+    "- Computes Jaccard similarity of entity sets between collection pairs\n",
+    "- Identifies top \"bridge\" entities (those appearing in the most collections)\n",
+    "- Stores cross-collection entity relationships"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Parameters\n",
+    "# No filtering -- always processes all collections"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys, warnings\n",
+    "sys.path.insert(0, '/opt/epstein_env/research')\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from itertools import combinations\n",
+    "\n",
+    "from research_lib.config import COLLECTIONS, COLLECTION_LABELS\n",
+    "from research_lib.db import fetch_df, bulk_insert, get_conn\n",
+    "from research_lib.incremental import start_run, finish_run\n",
+    "from research_lib.plotting import set_style, save_fig, COLLECTION_COLORS\n",
+    "\n",
+    "set_style()\n",
+    "print('Libraries loaded.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Start run ----\n",
+    "PIPELINE = 'cross_collection'\n",
+    "run_id = start_run(PIPELINE, source_section=None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Query distinct entities per collection ----\n",
+    "entity_df = fetch_df(\"\"\"\n",
+    "    SELECT DISTINCT\n",
+    "        e.entity_text,\n",
+    "        e.entity_type,\n",
+    "        d.source_section\n",
+    "    FROM entities e\n",
+    "    JOIN documents d ON d.id = e.document_id\n",
+    "    WHERE e.entity_type IN ('PERSON', 'ORG', 'GPE')\n",
+    "    ORDER BY e.entity_text\n",
+    "\"\"\")\n",
+    "\n",
+    "print(f'Total entity-collection pairs: {len(entity_df)}')\n",
+    "print(f'Unique entities: {entity_df[\"entity_text\"].nunique()}')\n",
+    "print(f'Collections represented: {entity_df[\"source_section\"].nunique()}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Build entity sets per collection ----\n",
+    "# Each collection maps to a set of (lower-cased entity_text, entity_type) keys.\n",
+    "#\n",
+    "# Rows with a NULL section or NULL text are dropped first. A NULL section never\n",
+    "# matches an equality mask, and DataFrame.apply(axis=1) on the resulting empty\n",
+    "# frame returns a DataFrame, so set(...) would collect column names instead of\n",
+    "# entity tuples. A NULL entity_text would fail on .lower().\n",
+    "entity_rows = entity_df.dropna(subset=['entity_text', 'source_section'])\n",
+    "\n",
+    "collection_entities = {}\n",
+    "# sort=False keeps collections in first-seen order, like Series.unique().\n",
+    "for section, section_rows in entity_rows.groupby('source_section', sort=False):\n",
+    "    # Column-wise zip replaces the row-by-row apply; the resulting keys are the same.\n",
+    "    entities = set(\n",
+    "        zip(section_rows['entity_text'].str.lower(), section_rows['entity_type'])\n",
+    "    )\n",
+    "    collection_entities[section] = entities\n",
+    "    print(f'{section}: {len(entities)} unique entities')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Find entities appearing in 2+ collections ----\n",
+    "# Invert the mapping: entity key -> set of collections that contain it.\n",
+    "entity_collections = {}\n",
+    "for section, ents in collection_entities.items():\n",
+    "    for ent in ents:\n",
+    "        entity_collections.setdefault(ent, set()).add(section)\n",
+    "\n",
+    "# Keep only the entities seen in at least two collections.\n",
+    "bridge_entities = {}\n",
+    "for ent, cols in entity_collections.items():\n",
+    "    if len(cols) >= 2:\n",
+    "        bridge_entities[ent] = cols\n",
+    "\n",
+    "print(f'Entities appearing in 2+ collections: {len(bridge_entities)}')\n",
+    "print(f'Total unique entities: {len(entity_collections)}')\n",
+    "# max(..., 1) guards the division when no entities were loaded at all.\n",
+    "print(f'Bridge ratio: {len(bridge_entities)/max(len(entity_collections),1)*100:.1f}%')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Jaccard similarity between collection pairs ----\n",
+    "# J(A, B) = |A & B| / |A | B| over the per-collection entity-key sets.\n",
+    "collections_present = sorted(collection_entities.keys())\n",
+    "n = len(collections_present)\n",
+    "\n",
+    "# Fill a plain NumPy array and wrap it in a DataFrame afterwards. Writing through\n",
+    "# DataFrame.values (as np.fill_diagonal(df.values, ...) does) is not guaranteed\n",
+    "# to reach the frame, and the array is read-only under pandas Copy-on-Write.\n",
+    "# Starting from the identity matrix gives the 1.0 diagonal up front.\n",
+    "jaccard_values = np.eye(n)\n",
+    "\n",
+    "for (i, c1), (j, c2) in combinations(enumerate(collections_present), 2):\n",
+    "    s1 = collection_entities[c1]\n",
+    "    s2 = collection_entities[c2]\n",
+    "    intersection = len(s1 & s2)\n",
+    "    union = len(s1 | s2)\n",
+    "    # Two empty sets have no defined overlap; report 0 rather than divide by zero.\n",
+    "    jaccard = intersection / union if union > 0 else 0.0\n",
+    "    # The measure is symmetric, so mirror each value across the diagonal.\n",
+    "    jaccard_values[i, j] = jaccard\n",
+    "    jaccard_values[j, i] = jaccard\n",
+    "\n",
+    "jaccard_matrix = pd.DataFrame(\n",
+    "    jaccard_values, index=collections_present, columns=collections_present\n",
+    ")\n",
+    "\n",
+    "print('Entity Overlap Matrix (Jaccard Similarity):')\n",
+    "print(jaccard_matrix.round(4).to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Heatmap of Jaccard similarities ----\n",
+    "# The matrix is symmetric, so hide the strict upper triangle (k=1) and draw\n",
+    "# only the lower triangle together with the diagonal.\n",
+    "upper_triangle = np.triu(np.ones(jaccard_matrix.shape, dtype=bool), k=1)\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(12, 10))\n",
+    "sns.heatmap(\n",
+    "    jaccard_matrix,\n",
+    "    mask=upper_triangle,\n",
+    "    annot=True,\n",
+    "    fmt='.3f',\n",
+    "    cmap='YlOrRd',\n",
+    "    square=True,\n",
+    "    cbar_kws={'label': 'Jaccard Similarity'},\n",
+    "    ax=ax,\n",
+    ")\n",
+    "ax.set_title('Entity Overlap Between Collections (Jaccard Similarity)')\n",
+    "plt.tight_layout()\n",
+    "save_fig(fig, 'cross_collection_jaccard')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Top 50 bridge entities ----\n",
+    "# One record per bridge entity key: its text, its type, how many collections\n",
+    "# contain it, and a sorted, comma-separated list of those collections.\n",
+    "bridge_list = [\n",
+    "    {\n",
+    "        'entity_text': ent[0],\n",
+    "        'entity_type': ent[1],\n",
+    "        'collection_count': len(cols),\n",
+    "        'collections': ', '.join(sorted(cols)),\n",
+    "    }\n",
+    "    for ent, cols in bridge_entities.items()\n",
+    "]\n",
+    "\n",
+    "# Pass columns= so the schema survives an empty result. Without it, an empty\n",
+    "# list gives a column-less frame and sort_values raises KeyError.\n",
+    "bridge_df = pd.DataFrame(\n",
+    "    bridge_list,\n",
+    "    columns=['entity_text', 'entity_type', 'collection_count', 'collections'],\n",
+    ").sort_values(\n",
+    "    # Most widely shared entities first; ties are broken alphabetically.\n",
+    "    ['collection_count', 'entity_text'], ascending=[False, True]\n",
+    ")\n",
+    "\n",
+    "print('Top 50 Bridge Entities (appearing in most collections):')\n",
+    "print(bridge_df.head(50).to_string(index=False))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Store cross-collection entity relationships ----\n",
+    "# One self-referencing row per bridge entity text, laid out as\n",
+    "#   (entity_a, entity_b, relationship_type, source_section, weight).\n",
+    "# The row carries neither collection names nor entity_type, so looping over\n",
+    "# collection pairs only produced identical tuples that were then deduplicated.\n",
+    "# Collecting the distinct texts directly yields the same rows, and sorting makes\n",
+    "# the insert order deterministic.\n",
+    "# NOTE(review): re-running the notebook calls bulk_insert again with the same\n",
+    "# rows -- confirm how entity_relationships handles duplicates.\n",
+    "bridge_texts = sorted({entity_text for entity_text, _ in bridge_entities})\n",
+    "\n",
+    "relationship_rows = [\n",
+    "    (\n",
+    "        entity_text,         # entity_a\n",
+    "        entity_text,         # entity_b (same entity)\n",
+    "        'cross_collection',  # relationship_type\n",
+    "        None,                # source_section (NULL for cross-collection)\n",
+    "        1.0,                 # weight\n",
+    "    )\n",
+    "    for entity_text in bridge_texts\n",
+    "]\n",
+    "\n",
+    "if relationship_rows:\n",
+    "    # Use a separate name: `n` already holds the collection count from the\n",
+    "    # Jaccard cell and should not be overwritten here.\n",
+    "    n_inserted = bulk_insert(\n",
+    "        'entity_relationships',\n",
+    "        ['entity_a', 'entity_b', 'relationship_type', 'source_section', 'weight'],\n",
+    "        relationship_rows,\n",
+    "    )\n",
+    "    print(f'Inserted {n_inserted} cross-collection entity_relationships')\n",
+    "else:\n",
+    "    print('No bridge entities found; nothing inserted.')\n",
+    "\n",
+    "# Close the run opened at the top; the count passed is the number of collections.\n",
+    "finish_run(run_id, documents_processed=entity_df['source_section'].nunique())\n",
+    "print(f'Run {run_id} complete.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Bar chart: bridge entities by type ----\n",
+    "# Skip the figure entirely when there are no bridge entities to count.\n",
+    "if not bridge_df.empty:\n",
+    "    type_counts = bridge_df['entity_type'].value_counts()\n",
+    "\n",
+    "    fig, ax = plt.subplots(figsize=(8, 5))\n",
+    "    ax.bar(type_counts.index, type_counts.values, color='#9333ea')\n",
+    "    ax.set_ylabel('Count')\n",
+    "    ax.set_title('Bridge Entities by Type')\n",
+    "    plt.tight_layout()\n",
+    "    save_fig(fig, 'cross_collection_bridge_types')\n",
+    "    plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}