Upload notebooks/02_entity_network/22_network_analysis.ipynb with huggingface_hub

Browse files

Files changed (1) hide show

notebooks/02_entity_network/22_network_analysis.ipynb +238 -0

notebooks/02_entity_network/22_network_analysis.ipynb ADDED Viewed

	@@ -0,0 +1,238 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 22 - Network Analysis\n",
+    "\n",
+    "Pipeline notebook for building and analyzing the entity co-occurrence network.\n",
+    "\n",
+    "Loads entity relationships into a NetworkX graph, computes centrality metrics,\n",
+    "runs Louvain community detection, and exports the graph as JSON for visualization."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Parameters\n",
+    "# This cell carries the `parameters` tag; the values below are the defaults.\n",
+    "\n",
+    "# Optional source_section filter for the edge query; a falsy value loads every section.\n",
+    "source_section = None\n",
+    "# Minimum co_occurrence_count a relationship row needs to be loaded as an edge.\n",
+    "min_edge_weight = 5\n",
+    "# Entity types accepted for both endpoints of a relationship.\n",
+    "entity_types = ['PERSON', 'ORG']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(0, '/opt/epstein_env/research')\n",
+    "\n",
+    "import json\n",
+    "import networkx as nx\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "from research_lib.db import fetch_df, fetch_all\n",
+    "from research_lib.export import export_network_json\n",
+    "from research_lib.incremental import start_run, finish_run"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Record the start of this run; the returned id is handed to finish_run at the end\n",
+    "run_parameters = dict(min_edge_weight=min_edge_weight, entity_types=entity_types)\n",
+    "run_id = start_run('network_analysis', source_section=source_section, parameters=run_parameters)\n",
+    "print(f'Started run {run_id}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load entity relationships from DB, filtered by edge weight and entity types\n",
+    "\n",
+    "# Normalise entity_types to a list: a bare string means a single type, and any other\n",
+    "# iterable (tuple, set, ...) is accepted so the list concatenation below cannot fail.\n",
+    "if isinstance(entity_types, str):\n",
+    "    entity_type_list = [entity_types]\n",
+    "else:\n",
+    "    entity_type_list = list(entity_types)\n",
+    "\n",
+    "if not entity_type_list:\n",
+    "    # An empty list would render as `IN ()`, a syntax error in most SQL dialects.\n",
+    "    raise ValueError('entity_types must contain at least one entity type')\n",
+    "\n",
+    "# One %s placeholder per entity type; the values themselves travel in `params`,\n",
+    "# so no user-supplied value is ever interpolated into the SQL text.\n",
+    "type_placeholders = ','.join(['%s'] * len(entity_type_list))\n",
+    "\n",
+    "where_clauses = [\n",
+    "    'co_occurrence_count >= %s',\n",
+    "    f'entity_a_type IN ({type_placeholders})',\n",
+    "    f'entity_b_type IN ({type_placeholders})',\n",
+    "]\n",
+    "# Parameter order must match placeholder order: threshold, types for a, types for b\n",
+    "params = [min_edge_weight, *entity_type_list, *entity_type_list]\n",
+    "\n",
+    "# Optional restriction to a single source section\n",
+    "if source_section:\n",
+    "    where_clauses.append('source_section = %s')\n",
+    "    params.append(source_section)\n",
+    "\n",
+    "sql = f\"\"\"\n",
+    "    SELECT entity_a, entity_a_type, entity_b, entity_b_type, co_occurrence_count\n",
+    "    FROM entity_relationships\n",
+    "    WHERE {' AND '.join(where_clauses)}\n",
+    "    ORDER BY co_occurrence_count DESC\n",
+    "\"\"\"\n",
+    "edges_df = fetch_df(sql, params)\n",
+    "print(f'Loaded {len(edges_df)} edges')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build an undirected NetworkX graph, one edge per relationship row\n",
+    "G = nx.Graph()\n",
+    "\n",
+    "for rel in edges_df.itertuples(index=False):\n",
+    "    # Register both endpoints (a first, then b) with their label and entity type\n",
+    "    endpoints = (\n",
+    "        (rel.entity_a, rel.entity_a_type),\n",
+    "        (rel.entity_b, rel.entity_b_type),\n",
+    "    )\n",
+    "    for name, kind in endpoints:\n",
+    "        G.add_node(name, label=name, type=kind)\n",
+    "    # The co-occurrence count becomes the edge weight\n",
+    "    G.add_edge(rel.entity_a, rel.entity_b, weight=rel.co_occurrence_count)\n",
+    "\n",
+    "print(f'Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Compute centrality metrics\n",
+    "print('Computing degree centrality...')\n",
+    "degree_cent = nx.degree_centrality(G)\n",
+    "\n",
+    "print('Computing betweenness centrality...')\n",
+    "# Betweenness is built on weighted shortest paths, so NetworkX reads the edge\n",
+    "# attribute as a distance. Co-occurrence counts are strengths (higher = closer),\n",
+    "# so run it on a copy whose edges carry the reciprocal of the count. Working on a\n",
+    "# copy keeps the extra `distance` attribute out of G and therefore out of the export.\n",
+    "G_dist = G.copy()\n",
+    "for _, _, edge_attrs in G_dist.edges(data=True):\n",
+    "    strength = float(edge_attrs['weight'])\n",
+    "    # A non-positive count carries no association: make the edge useless as a shortcut\n",
+    "    edge_attrs['distance'] = 1.0 / strength if strength > 0 else float('inf')\n",
+    "betweenness_cent = nx.betweenness_centrality(G_dist, weight='distance')\n",
+    "\n",
+    "print('Computing PageRank...')\n",
+    "# PageRank treats the weight as link strength, so the raw count is the right input here\n",
+    "pagerank = nx.pagerank(G, weight='weight')\n",
+    "\n",
+    "# Store as node attributes\n",
+    "for node in G.nodes:\n",
+    "    G.nodes[node]['degree_centrality'] = degree_cent[node]\n",
+    "    G.nodes[node]['betweenness_centrality'] = betweenness_cent[node]\n",
+    "    G.nodes[node]['pagerank'] = pagerank[node]\n",
+    "    G.nodes[node]['centrality'] = pagerank[node]  # used by export_network_json\n",
+    "\n",
+    "print('Centrality metrics computed.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Partition the graph with the Louvain method (fixed seed for repeatable output)\n",
+    "print('Running Louvain community detection...')\n",
+    "communities = nx.community.louvain_communities(G, weight='weight', seed=42)\n",
+    "\n",
+    "# Map each node to the index of the community containing it, then store it on the graph\n",
+    "community_of = {\n",
+    "    member: index\n",
+    "    for index, members in enumerate(communities)\n",
+    "    for member in members\n",
+    "}\n",
+    "nx.set_node_attributes(G, community_of, 'community')\n",
+    "\n",
+    "print(f'Found {len(communities)} communities')\n",
+    "\n",
+    "# Largest communities first\n",
+    "comm_sizes = sorted(map(len, communities), reverse=True)\n",
+    "print(f'Community sizes (top 10): {comm_sizes[:10]}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the graph to JSON; the file name carries the section, or 'all' when unfiltered\n",
+    "section_label = source_section if source_section else 'all'\n",
+    "filename = 'network_{}.json'.format(section_label)\n",
+    "output_path = export_network_json(G, filename, max_nodes=500)\n",
+    "print(f'Network exported to: {output_path}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Print top 20 entities by PageRank\n",
+    "print('\\n=== Top 20 Entities by PageRank ===')\n",
+    "# Stable descending sort: entities with equal scores keep their order in `pagerank`\n",
+    "ranked_entities = sorted(pagerank, key=pagerank.get, reverse=True)[:20]\n",
+    "for position, name in enumerate(ranked_entities, start=1):\n",
+    "    attrs = G.nodes[name]\n",
+    "    kind = attrs['type']\n",
+    "    score = pagerank[name]\n",
+    "    n_links = G.degree(name)\n",
+    "    group = attrs['community']\n",
+    "    # Fixed-width columns, separated by single spaces\n",
+    "    columns = [\n",
+    "        f'{position:2d}. {name:40s}',\n",
+    "        f'type={kind:8s}',\n",
+    "        f'PR={score:.6f}',\n",
+    "        f'degree={n_links:4d}',\n",
+    "        f'community={group}',\n",
+    "    ]\n",
+    "    print(' '.join(columns))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Summary of the graph's size and connectivity, then close the run\n",
+    "node_count = G.number_of_nodes()\n",
+    "\n",
+    "print('\\n=== Network Analysis Summary ===')\n",
+    "print(f'Nodes: {node_count}')\n",
+    "print(f'Edges: {G.number_of_edges()}')\n",
+    "print(f'Communities: {len(communities)}')\n",
+    "print(f'Density: {nx.density(G):.6f}')\n",
+    "if node_count == 0:\n",
+    "    # nx.is_connected raises NetworkXPointlessConcept on a graph with no nodes;\n",
+    "    # skip the connectivity statistics so the run is still marked as finished.\n",
+    "    print('Graph is empty; connectivity statistics skipped.')\n",
+    "elif nx.is_connected(G):\n",
+    "    # Diameter and average path length are only defined for a connected graph\n",
+    "    print(f'Diameter: {nx.diameter(G)}')\n",
+    "    print(f'Avg shortest path: {nx.average_shortest_path_length(G):.2f}')\n",
+    "else:\n",
+    "    components = list(nx.connected_components(G))\n",
+    "    print(f'Connected components: {len(components)}')\n",
+    "    largest = max(components, key=len)\n",
+    "    print(f'Largest component: {len(largest)} nodes')\n",
+    "\n",
+    "# Finish run (the node count is reported as documents_processed)\n",
+    "finish_run(run_id, documents_processed=node_count)\n",
+    "print(f'\\nRun {run_id} completed.')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}