Upload notebooks/02_entity_network/20_entity_cooccurrence.ipynb with huggingface_hub
notebooks/02_entity_network/20_entity_cooccurrence.ipynb
ADDED
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 20 - Entity Co-occurrence Analysis\n",
    "\n",
    "Pipeline notebook for computing entity co-occurrence from page-level entity extractions.\n",
    "\n",
    "For each page, computes all entity pairs (entity_a < entity_b) and aggregates co-occurrence\n",
    "counts across the corpus. Results are stored in the `entity_relationships` table.\n",
    "\n",
    "**Incremental**: Only processes documents not yet in entity_relationships."
   ]
  },
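  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Toy illustration (not part of the pipeline run below): how per-page pairs are formed with the\n",
    "canonical `entity_a < entity_b` ordering before counts are aggregated. The names are made up."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy illustration of the per-page pair logic used below -- safe to skip.\n",
    "from collections import Counter\n",
    "from itertools import combinations\n",
    "\n",
    "toy_page = [('Alice', 'PERSON'), ('Acme Corp', 'ORG'), ('Paris', 'GPE')]\n",
    "\n",
    "toy_counter = Counter()\n",
    "for (text_a, type_a), (text_b, type_b) in combinations(toy_page, 2):\n",
    "    if text_a > text_b:  # canonical ordering: keep entity_a < entity_b\n",
    "        text_a, text_b = text_b, text_a\n",
    "    toy_counter[(text_a, text_b)] += 1\n",
    "\n",
    "toy_counter  # three unordered pairs, each seen once on this page"
   ]
  },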
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Parameters\n",
    "source_section = None\n",
    "min_count = 3\n",
    "batch_size = 10000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.insert(0, '/opt/epstein_env/research')\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from collections import Counter\n",
    "from itertools import combinations\n",
    "from tqdm.auto import tqdm\n",
    "\n",
    "from research_lib.db import fetch_df, fetch_all, bulk_insert, get_conn\n",
    "from research_lib.incremental import start_run, finish_run, get_processed_doc_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start incremental run\n",
    "run_id = start_run(\n",
    "    'entity_cooccurrence',\n",
    "    source_section=source_section,\n",
    "    parameters={'min_count': min_count, 'batch_size': batch_size},\n",
    ")\n",
    "print(f'Started run {run_id}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get already-processed document IDs from entity_relationships\n",
    "processed_ids = get_processed_doc_ids(\n",
    "    'entity_cooccurrence',\n",
    "    feature_table='entity_relationships',\n",
    ")\n",
    "print(f'Already processed: {len(processed_ids)} documents')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build query for unprocessed entities\n",
    "where_clauses = []\n",
    "params = []\n",
    "\n",
    "if source_section:\n",
    "    where_clauses.append('d.source_section = %s')\n",
    "    params.append(source_section)\n",
    "\n",
    "if processed_ids:\n",
    "    where_clauses.append(f'e.document_id NOT IN ({\",\".join(str(i) for i in processed_ids)})')\n",
    "\n",
    "where_sql = ('WHERE ' + ' AND '.join(where_clauses)) if where_clauses else ''\n",
    "\n",
    "count_sql = f\"\"\"\n",
    "    SELECT COUNT(DISTINCT e.document_id)\n",
    "    FROM entities e\n",
    "    JOIN documents d ON d.id = e.document_id\n",
    "    {where_sql}\n",
    "\"\"\"\n",
    "total_docs = fetch_all(count_sql, params or None)[0]['count']\n",
    "print(f'Documents to process: {total_docs}')"
   ]
  },
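  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional sanity check (an addition, not required by the pipeline): print the dynamically\n",
    "assembled filter so the effective WHERE clause and bind parameters can be inspected before\n",
    "the batch loop runs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: inspect the dynamically built filter before running the batch loop.\n",
    "print('WHERE clause:', where_sql or '(none)')\n",
    "print('Bind parameters:', params)\n",
    "print('Excluded documents:', len(processed_ids))"
   ]
  },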
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Process entities in batches, computing co-occurrence pairs\n",
    "pair_counter = Counter()\n",
    "pair_types = {}  # (entity_a, entity_b) -> (type_a, type_b)\n",
    "doc_ids_processed = set()\n",
    "offset = 0\n",
    "pending_df = None  # rows of a page that may continue into the next batch\n",
    "\n",
    "while True:\n",
    "    sql = f\"\"\"\n",
    "        SELECT e.document_id, e.page_number, e.entity_text, e.entity_type\n",
    "        FROM entities e\n",
    "        JOIN documents d ON d.id = e.document_id\n",
    "        {where_sql}\n",
    "        ORDER BY e.document_id, e.page_number\n",
    "        LIMIT %s OFFSET %s\n",
    "    \"\"\"\n",
    "    batch_params = (params or []) + [batch_size, offset]\n",
    "    batch_df = fetch_df(sql, batch_params)\n",
    "    offset += batch_size\n",
    "    last_batch = len(batch_df) < batch_size\n",
    "\n",
    "    # Re-attach rows held back from the previous batch so a page split across\n",
    "    # batch boundaries is paired as a whole.\n",
    "    if pending_df is not None:\n",
    "        batch_df = pd.concat([pending_df, batch_df], ignore_index=True)\n",
    "        pending_df = None\n",
    "\n",
    "    if batch_df.empty:\n",
    "        break\n",
    "\n",
    "    # Hold back the final page of a non-final batch: its remaining entities\n",
    "    # may arrive in the next batch, and pairs across the boundary would be lost.\n",
    "    if not last_batch:\n",
    "        last_doc = batch_df['document_id'].iloc[-1]\n",
    "        last_page = batch_df['page_number'].iloc[-1]\n",
    "        is_last_page = (batch_df['document_id'] == last_doc) & (batch_df['page_number'] == last_page)\n",
    "        pending_df = batch_df[is_last_page]\n",
    "        batch_df = batch_df[~is_last_page]\n",
    "\n",
    "    # Group by (document_id, page_number) and compute pairs\n",
    "    for (doc_id, page_num), group in batch_df.groupby(['document_id', 'page_number']):\n",
    "        entities = list(zip(group['entity_text'], group['entity_type']))\n",
    "        doc_ids_processed.add(doc_id)\n",
    "\n",
    "        for (text_a, type_a), (text_b, type_b) in combinations(entities, 2):\n",
    "            # Canonical ordering: alphabetical by entity text\n",
    "            if text_a > text_b:\n",
    "                text_a, text_b = text_b, text_a\n",
    "                type_a, type_b = type_b, type_a\n",
    "\n",
    "            pair = (text_a, text_b)\n",
    "            pair_counter[pair] += 1\n",
    "            if pair not in pair_types:\n",
    "                pair_types[pair] = (type_a, type_b)\n",
    "\n",
    "    print(f'  Processed batch at offset {offset}, running pairs: {len(pair_counter)}')\n",
    "\n",
    "print(f'\\nTotal unique pairs found: {len(pair_counter)}')\n",
    "print(f'Documents processed: {len(doc_ids_processed)}')"
   ]
  },
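  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional sanity check (an addition, not required by the pipeline): the most frequent raw pairs\n",
    "before the `min_count` filter is applied."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: the ten most frequent unfiltered pairs, as a quick sanity check.\n",
    "for (entity_a, entity_b), n in pair_counter.most_common(10):\n",
    "    print(f'{entity_a} <-> {entity_b}: {n}')"
   ]
  },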
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter by min_count and prepare rows for insertion\n",
    "rows = []\n",
    "for (entity_a, entity_b), count in pair_counter.items():\n",
    "    if count >= min_count:\n",
    "        type_a, type_b = pair_types[(entity_a, entity_b)]\n",
    "        rows.append((\n",
    "            entity_a,\n",
    "            type_a,\n",
    "            entity_b,\n",
    "            type_b,\n",
    "            count,\n",
    "            source_section,\n",
    "        ))\n",
    "\n",
    "print(f'Pairs after filtering (min_count={min_count}): {len(rows)}')"
   ]
  },
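  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional inspection (an addition, not required by the pipeline): view the prepared rows as a\n",
    "DataFrame, highest co-occurrence counts first, before they are written to the database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: preview the rows that will be inserted, highest counts first.\n",
    "preview = pd.DataFrame(\n",
    "    rows,\n",
    "    columns=['entity_a', 'entity_a_type', 'entity_b', 'entity_b_type',\n",
    "             'co_occurrence_count', 'source_section'],\n",
    ")\n",
    "preview.sort_values('co_occurrence_count', ascending=False).head(10)"
   ]
  },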
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Insert into entity_relationships\n",
    "if rows:\n",
    "    inserted = bulk_insert(\n",
    "        'entity_relationships',\n",
    "        ['entity_a', 'entity_a_type', 'entity_b', 'entity_b_type', 'co_occurrence_count', 'source_section'],\n",
    "        rows,\n",
    "        on_conflict='DO NOTHING',\n",
    "    )\n",
    "    print(f'Inserted {inserted} rows into entity_relationships')\n",
    "else:\n",
    "    print('No rows to insert.')"
   ]
  },
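  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Background sketch (an assumption, not the actual `research_lib.db` code): a `bulk_insert` with\n",
    "`on_conflict='DO NOTHING'` is assumed to behave roughly like the psycopg2 pattern below;\n",
    "`get_conn()` is assumed to return a psycopg2-style connection."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch only -- NOT the real research_lib.db.bulk_insert.\n",
    "# Assumes get_conn() returns a psycopg2-compatible connection.\n",
    "from psycopg2.extras import execute_values\n",
    "\n",
    "def bulk_insert_sketch(table, columns, values, on_conflict='DO NOTHING'):\n",
    "    sql = (\n",
    "        f'INSERT INTO {table} ({\", \".join(columns)}) '\n",
    "        f'VALUES %s ON CONFLICT {on_conflict}'\n",
    "    )\n",
    "    with get_conn() as conn, conn.cursor() as cur:\n",
    "        execute_values(cur, sql, values)\n",
    "        return cur.rowcount  # approximate: rowcount of the last executed page"
   ]
  },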
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Finish run\n",
    "finish_run(run_id, documents_processed=len(doc_ids_processed))\n",
    "print(f'Run {run_id} completed.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary statistics\n",
    "print('=== Co-occurrence Summary ===')\n",
    "print(f'Total documents processed this run: {len(doc_ids_processed)}')\n",
    "print(f'Total unique pairs (before filtering): {len(pair_counter)}')\n",
    "print(f'Pairs stored (min_count >= {min_count}): {len(rows)}')\n",
    "\n",
    "if rows:\n",
    "    counts = [r[4] for r in rows]\n",
    "    print(f'Co-occurrence count range: {min(counts)} - {max(counts)}')\n",
    "    print(f'Mean co-occurrence count: {np.mean(counts):.1f}')\n",
    "    print(f'Median co-occurrence count: {np.median(counts):.1f}')\n",
    "\n",
    "    # Top 20 pairs\n",
    "    print('\\nTop 20 co-occurring entity pairs:')\n",
    "    top_pairs = sorted(rows, key=lambda x: x[4], reverse=True)[:20]\n",
    "    for entity_a, type_a, entity_b, type_b, count, _ in top_pairs:\n",
    "        print(f'  {entity_a} ({type_a}) <-> {entity_b} ({type_b}): {count}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}