datamatters24 committed on
Commit
07ab7bf
·
verified ·
1 Parent(s): d491771

Upload notebooks/02_entity_network/22_network_analysis.ipynb with huggingface_hub

Browse files
notebooks/02_entity_network/22_network_analysis.ipynb ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 22 - Network Analysis\n",
8
+ "\n",
9
+ "Pipeline notebook for building and analyzing the entity co-occurrence network.\n",
10
+ "\n",
11
+ "Loads entity relationships into a NetworkX graph, computes centrality metrics,\n",
12
+ "runs Louvain community detection, and exports the graph as JSON for visualization."
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": null,
18
+ "metadata": {
19
+ "tags": [
20
+ "parameters"
21
+ ]
22
+ },
23
+ "outputs": [],
24
+ "source": [
25
+ "# Parameters (papermill cell): source_section filters by section; min_edge_weight prunes weak edges; entity_types limits node types\n",
26
+ "source_section = None\n",
27
+ "min_edge_weight = 5\n",
28
+ "entity_types = [\"PERSON\", \"ORG\"]"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "import sys\n",
38
+ "sys.path.insert(0, '/opt/epstein_env/research')\n",
39
+ "# path above makes research_lib importable; NOTE(review): json, pandas, numpy and fetch_all appear unused in this notebook\n",
40
+ "import json\n",
41
+ "import networkx as nx\n",
42
+ "import pandas as pd\n",
43
+ "import numpy as np\n",
44
+ "\n",
45
+ "from research_lib.db import fetch_df, fetch_all\n",
46
+ "from research_lib.export import export_network_json\n",
47
+ "from research_lib.incremental import start_run, finish_run"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "# Register this run with the incremental tracker (research_lib.incremental); run_id is closed out in the summary cell\n",
57
+ "run_id = start_run(\n",
58
+ " 'network_analysis',\n",
59
+ " source_section=source_section,\n",
60
+ " parameters={\n",
61
+ " 'min_edge_weight': min_edge_weight,\n",
62
+ " 'entity_types': entity_types,\n",
63
+ " },\n",
64
+ ")\n",
65
+ "print(f'Started run {run_id}')"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": null,
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "# Load entity relationships from DB, filtered by edge weight and entity types\n",
75
+ "type_placeholders = ','.join(['%s'] * len(entity_types))\n",
76
+ "# f-strings interpolate only the %s placeholder lists; every value binds via params (no injection)\n",
77
+ "where_clauses = [\n",
78
+ " 'co_occurrence_count >= %s',\n",
79
+ " f'entity_a_type IN ({type_placeholders})',\n",
80
+ " f'entity_b_type IN ({type_placeholders})',\n",
81
+ "]\n",
82
+ "params = [min_edge_weight] + entity_types + entity_types\n",
83
+ "\n",
84
+ "if source_section:\n",
85
+ " where_clauses.append('source_section = %s')\n",
86
+ " params.append(source_section)\n",
87
+ "# NOTE(review): %s paramstyle assumes a DB-API driver like psycopg2 — confirm in research_lib.db\n",
88
+ "sql = f\"\"\"\n",
89
+ " SELECT entity_a, entity_a_type, entity_b, entity_b_type, co_occurrence_count\n",
90
+ " FROM entity_relationships\n",
91
+ " WHERE {' AND '.join(where_clauses)}\n",
92
+ " ORDER BY co_occurrence_count DESC\n",
93
+ "\"\"\"\n",
94
+ "edges_df = fetch_df(sql, params)\n",
95
+ "print(f'Loaded {len(edges_df)} edges')"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
+ "# Build the undirected entity co-occurrence graph from the edge table\n",
105
+ "G = nx.Graph()\n",
106
+ "\n",
107
+ "for _, row in edges_df.iterrows():\n",
108
+ " # Add nodes with type attributes\n",
109
+ " G.add_node(row['entity_a'], label=row['entity_a'], type=row['entity_a_type'])\n",
110
+ " G.add_node(row['entity_b'], label=row['entity_b'], type=row['entity_b_type'])\n",
111
+ " # int(): DataFrame counts arrive as numpy ints, which stdlib json.dumps rejects — cast before export\n",
112
+ " G.add_edge(row['entity_a'], row['entity_b'], weight=int(row['co_occurrence_count']))\n",
113
+ "\n",
114
+ "print(f'Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges')"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "metadata": {},
121
+ "outputs": [],
122
+ "source": [
123
+ "# Compute centrality metrics\n",
124
+ "print('Computing degree centrality...')\n",
125
+ "degree_cent = nx.degree_centrality(G)\n",
126
+ "print('Computing betweenness centrality...')\n",
127
+ "# Betweenness treats `weight` as a distance, but co_occurrence_count is a similarity:\n",
128
+ "# invert it so strongly linked pairs count as close. PageRank below wants raw strength.\n",
129
+ "nx.set_edge_attributes(G, {(u, v): 1.0 / d['weight'] for u, v, d in G.edges(data=True)}, 'distance')\n",
130
+ "betweenness_cent = nx.betweenness_centrality(G, weight='distance')\n",
131
+ "print('Computing PageRank...')\n",
132
+ "pagerank = nx.pagerank(G, weight='weight')\n",
133
+ "\n",
134
+ "# Store as node attributes\n",
135
+ "for node in G.nodes:\n",
136
+ " G.nodes[node]['degree_centrality'] = degree_cent[node]\n",
137
+ " G.nodes[node]['betweenness_centrality'] = betweenness_cent[node]\n",
138
+ " G.nodes[node]['pagerank'] = pagerank[node]\n",
139
+ " G.nodes[node]['centrality'] = pagerank[node] # used by export_network_json\n",
140
+ "print('Centrality metrics computed.')"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": null,
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "# Louvain community detection (fixed seed keeps partitions reproducible across runs)\n",
150
+ "print('Running Louvain community detection...')\n",
151
+ "communities = nx.community.louvain_communities(G, weight='weight', seed=42)\n",
152
+ "\n",
153
+ "# Tag each node with its community id; Louvain returns a partition covering every node\n",
154
+ "for comm_id, community in enumerate(communities):\n",
155
+ " for node in community:\n",
156
+ " G.nodes[node]['community'] = comm_id\n",
157
+ "\n",
158
+ "print(f'Found {len(communities)} communities')\n",
159
+ "\n",
160
+ "# Community size distribution, largest first\n",
161
+ "comm_sizes = sorted([len(c) for c in communities], reverse=True)\n",
162
+ "print(f'Community sizes (top 10): {comm_sizes[:10]}')"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": null,
168
+ "metadata": {},
169
+ "outputs": [],
170
+ "source": [
171
+ "# Export graph as JSON for visualization; max_nodes=500 caps the export — presumably keeps top-'centrality' nodes (verify in export_network_json)\n",
172
+ "section_label = source_section or 'all'\n",
173
+ "filename = f'network_{section_label}.json'\n",
174
+ "output_path = export_network_json(G, filename, max_nodes=500)\n",
175
+ "print(f'Network exported to: {output_path}')"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": null,
181
+ "metadata": {},
182
+ "outputs": [],
183
+ "source": [
184
+ "# Top 20 entities by PageRank; assumes the Louvain cell already set each node's 'community' attr\n",
185
+ "print('\\n=== Top 20 Entities by PageRank ===')\n",
186
+ "top_pr = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:20]\n",
187
+ "for rank, (entity, pr_score) in enumerate(top_pr, 1):\n",
188
+ " node_data = G.nodes[entity]\n",
189
+ " print(\n",
190
+ " f'{rank:2d}. {entity:40s} '\n",
191
+ " f'type={node_data[\"type\"]:8s} '\n",
192
+ " f'PR={pr_score:.6f} '\n",
193
+ " f'degree={G.degree(entity):4d} '\n",
194
+ " f'community={node_data[\"community\"]}'\n",
195
+ " )"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": null,
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": [
204
+ "# Summary\n",
205
+ "print('\\n=== Network Analysis Summary ===')\n",
206
+ "print(f'Nodes: {G.number_of_nodes()}')\n",
207
+ "print(f'Edges: {G.number_of_edges()}')\n",
208
+ "print(f'Communities: {len(communities)}')\n",
209
+ "print(f'Density: {nx.density(G):.6f}')\n",
210
+ "# nx.is_connected raises NetworkXPointlessConcept on a null graph — guard on node count\n",
211
+ "if G.number_of_nodes() > 0 and nx.is_connected(G):\n",
212
+ " print(f'Diameter: {nx.diameter(G)}')\n",
213
+ " print(f'Avg shortest path: {nx.average_shortest_path_length(G):.2f}')\n",
214
+ "else:\n",
215
+ " components = list(nx.connected_components(G))\n",
216
+ " print(f'Connected components: {len(components)}')\n",
217
+ " largest = max(components, key=len) if components else set()\n",
218
+ " print(f'Largest component: {len(largest)} nodes')\n",
219
+ "# Finish run\n",
220
+ "finish_run(run_id, documents_processed=G.number_of_nodes())\n",
221
+ "print(f'\\nRun {run_id} completed.')"
222
+ ]
223
+ }
224
+ ],
225
+ "metadata": {
226
+ "kernelspec": {
227
+ "display_name": "Python 3",
228
+ "language": "python",
229
+ "name": "python3"
230
+ },
231
+ "language_info": {
232
+ "name": "python",
233
+ "version": "3.10.0"
234
+ }
235
+ },
236
+ "nbformat": 4,
237
+ "nbformat_minor": 5
238
+ }