datamatters24 committed on
Commit
43cc579
·
verified ·
1 Parent(s): 9027112

Upload ml/10_entity_network.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. ml/10_entity_network.py +241 -0
ml/10_entity_network.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phase 5: Entity Network Analysis
4
+
5
+ 1. Entity resolution: group similar PERSON names using fuzzy matching
6
+ 2. Co-occurrence: count entity pairs that appear in the same document
7
+ 3. Store in entity_aliases and entity_relationships tables
8
+
9
+ Focuses on PERSON and ORG entities that appear in 3+ documents.
10
+
11
+ Runs on: Hetzner CPU
12
+ """
13
+
14
+ import logging
15
+ from collections import defaultdict
16
+ from difflib import SequenceMatcher
17
+
18
+ import psycopg2
19
+ import psycopg2.extras
20
+
21
+ from db import get_conn
22
+
23
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s")
24
+ log = logging.getLogger(__name__)
25
+
26
+ MIN_DOCS = 3 # minimum documents for an entity to be included
27
+ FUZZY_THRESHOLD = 0.88 # SequenceMatcher ratio for alias detection
28
+ MAX_ENTITIES_PER_DOC = 50 # limit entity pairs per document
29
+ BATCH_SIZE = 1000
30
+
31
+
32
def get_frequent_entities(conn, entity_type, min_docs=MIN_DOCS):
    """Return (entity_text, doc_count) rows for frequent entities.

    Selects entities of *entity_type* whose name is 3-100 characters long
    and that appear in at least *min_docs* distinct documents, ordered by
    document count descending (most frequent first).
    """
    query = """
            SELECT entity_text, COUNT(DISTINCT document_id) as doc_count
            FROM entities
            WHERE entity_type = %s
            AND LENGTH(entity_text) >= 3
            AND LENGTH(entity_text) <= 100
            GROUP BY entity_text
            HAVING COUNT(DISTINCT document_id) >= %s
            ORDER BY doc_count DESC
        """
    with conn.cursor() as cur:
        cur.execute(query, (entity_type, min_docs))
        rows = cur.fetchall()
    return rows
46
+
47
+
48
def resolve_entities(entities):
    """Group near-duplicate entity names via fuzzy string matching.

    *entities* is a sequence of (name, doc_count) pairs. Names are
    processed most-frequent-first, so the most common spelling of each
    group becomes its canonical form.

    Returns (canonical_map, groups): canonical_map maps every name
    (canonicals included) to its canonical form; groups maps each
    canonical name to the list of its aliases.
    """
    doc_counts = {name: count for name, count in entities}

    # Most frequent names first, so they become the canonical spellings.
    ordered = sorted((pair[0] for pair in entities),
                     key=lambda n: doc_counts.get(n, 0), reverse=True)

    canonical_map = {}  # alias -> canonical
    groups = {}         # canonical -> [aliases]

    for name in ordered:
        if name in canonical_map:
            continue

        lowered = name.lower().strip()
        match = None
        match_ratio = 0.0

        # Compare against the canonical names chosen so far.
        for canonical in groups:
            cand = canonical.lower().strip()

            # Cheap length filter before doing any real string work.
            if abs(len(lowered) - len(cand)) > max(len(lowered), len(cand)) * 0.3:
                continue

            # Substring containment is a fast, strong similarity signal.
            if lowered in cand or cand in lowered:
                ratio = 0.92
            else:
                ratio = SequenceMatcher(None, lowered, cand).ratio()

            if ratio >= FUZZY_THRESHOLD and ratio > match_ratio:
                match_ratio = ratio
                match = canonical

        if match:
            canonical_map[name] = match
            groups[match].append(name)
        else:
            groups[name] = []
            canonical_map[name] = name

    return canonical_map, groups
94
+
95
+
96
def store_aliases(conn, canonical_map, entity_type):
    """Persist alias -> canonical mappings into the entity_aliases table.

    Only true aliases (names differing from their canonical form) are
    written; a fixed confidence of 0.9 is recorded. On conflict the
    canonical_name is refreshed. Returns the number of rows written.
    """
    rows = [
        (canonical, alias, entity_type, 0.9)
        for alias, canonical in canonical_map.items()
        if alias != canonical
    ]

    if not rows:
        return 0

    sql = """INSERT INTO entity_aliases (canonical_name, alias_name, entity_type, confidence)
               VALUES (%s, %s, %s, %s)
               ON CONFLICT (alias_name, entity_type) DO UPDATE SET
                   canonical_name = EXCLUDED.canonical_name"""
    with conn.cursor() as cur:
        psycopg2.extras.execute_batch(cur, sql, rows, page_size=1000)
        conn.commit()
    return len(rows)
118
+
119
+
120
def build_cooccurrence(conn, entity_type, canonical_map):
    """Count entity-pair co-occurrence per source_section and store it.

    For every section, fetches the distinct entities of *entity_type* per
    document, resolves them through *canonical_map*, and counts each
    unordered pair once per document. Pairs co-occurring in 2+ documents
    are upserted into entity_relationships. Returns the total number of
    relationship rows written.
    """
    log.info(f"Building co-occurrence for {entity_type}...")

    # Get all sections
    with conn.cursor() as cur:
        cur.execute("SELECT DISTINCT source_section FROM documents ORDER BY source_section")
        sections = [r[0] for r in cur.fetchall()]

    total_rels = 0

    for section in sections:
        log.info(f" Processing section: {section}")

        # Entities per document for this section. Docs with fewer than 2
        # distinct entities are skipped — they cannot form a pair.
        with conn.cursor() as cur:
            cur.execute("""
                SELECT e.document_id, array_agg(DISTINCT e.entity_text) as entities
                FROM entities e
                JOIN documents d ON d.id = e.document_id
                WHERE e.entity_type = %s AND d.source_section = %s
                AND LENGTH(e.entity_text) >= 3
                GROUP BY e.document_id
                HAVING COUNT(DISTINCT e.entity_text) >= 2
            """, (entity_type, section))
            doc_entities = cur.fetchall()

        if not doc_entities:
            continue

        # Count co-occurrences. Because the per-document entity list is
        # de-duplicated below, each pair increments 'count' at most once
        # per document — so 'count' is also the number of distinct
        # documents containing the pair.
        pair_counts = defaultdict(lambda: {'count': 0, 'docs': set()})

        for doc_id, ent_list in doc_entities:
            # Resolve to canonical names and de-duplicate.
            resolved = list(set(canonical_map.get(e, e) for e in ent_list))
            resolved.sort()

            # Cap entities per document to bound the O(n^2) pair expansion.
            if len(resolved) > MAX_ENTITIES_PER_DOC:
                resolved = resolved[:MAX_ENTITIES_PER_DOC]

            for i in range(len(resolved)):
                for j in range(i + 1, len(resolved)):
                    key = (resolved[i], resolved[j])
                    pair_counts[key]['count'] += 1
                    # 'docs' holds only a handful of sample ids; it is NOT
                    # a full document set (capped at 10 to bound memory).
                    if len(pair_counts[key]['docs']) < 10:
                        pair_counts[key]['docs'].add(doc_id)

        # Filter: keep pairs with 2+ co-occurrences
        significant = {k: v for k, v in pair_counts.items() if v['count'] >= 2}

        if not significant:
            continue

        # Insert
        rows = []
        for (ea, eb), data in significant.items():
            sample_ids = sorted(data['docs'])[:5]
            rows.append((
                ea, entity_type, eb, entity_type,
                # FIX: document_count previously used len(data['docs']),
                # which is capped at 10 samples and understated the count
                # for frequent pairs. Since each pair is counted once per
                # document, 'count' is the true document count.
                data['count'], data['count'],
                section, sample_ids,
            ))

        with conn.cursor() as cur:
            psycopg2.extras.execute_batch(
                cur,
                """INSERT INTO entity_relationships
                    (entity_a, entity_a_type, entity_b, entity_b_type,
                     co_occurrence_count, document_count, source_section, sample_doc_ids)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (entity_a, entity_a_type, entity_b, entity_b_type, source_section)
                    DO UPDATE SET
                        co_occurrence_count = EXCLUDED.co_occurrence_count,
                        document_count = EXCLUDED.document_count,
                        sample_doc_ids = EXCLUDED.sample_doc_ids""",
                rows,
                page_size=500,
            )
            conn.commit()
            total_rels += len(rows)
            log.info(f" {section}: {len(rows)} relationships ({len(doc_entities)} docs)")

    return total_rels
205
+
206
+
207
def main():
    """Run entity resolution and co-occurrence analysis for PERSON and ORG."""
    conn = get_conn()
    try:
        for entity_type in ['PERSON', 'ORG']:
            log.info(f"=== Processing {entity_type} entities ===")

            # Step 1: restrict to frequent entities (noise reduction).
            entities = get_frequent_entities(conn, entity_type)
            log.info(f"Found {len(entities)} frequent {entity_type} entities (>= {MIN_DOCS} docs)")

            if not entities:
                continue

            # Step 2: entity resolution. Fuzzy matching is O(n^2) in the
            # worst case, so it is skipped on very large entity sets.
            if len(entities) <= 50000:  # Only fuzzy match if manageable
                log.info("Running entity resolution...")
                canonical_map, groups = resolve_entities(entities)
                alias_count = sum(1 for a, c in canonical_map.items() if a != c)
                log.info(f"Found {alias_count} aliases across {len(groups)} canonical entities")
                stored = store_aliases(conn, canonical_map, entity_type)
                log.info(f"Stored {stored} alias mappings")
            else:
                log.info(f"Too many entities ({len(entities)}) for fuzzy matching, using exact names")
                canonical_map = {e[0]: e[0] for e in entities}

            # Step 3: co-occurrence network.
            total_rels = build_cooccurrence(conn, entity_type, canonical_map)
            log.info(f"Total {entity_type} relationships: {total_rels}")
    finally:
        # FIX: previously the connection leaked if any step raised;
        # always release it.
        conn.close()
    log.info("Done.")


if __name__ == "__main__":
    main()