""" Network Analyzer — builds entity co-occurrence graphs using NetworkX. Generates data structures for frontend network visualization. """ from collections import Counter from itertools import combinations from typing import List, Dict, Optional from .models import EntityResult, NetworkData, NetworkNode, NetworkEdge class NetworkAnalyzer: """Builds co-occurrence networks from NER results.""" def build_network( self, documents_entities: List[List[EntityResult]], min_frequency: int = 2, top_n_nodes: int = 50, ) -> NetworkData: """ Build a co-occurrence network from entity results across multiple documents. Args: documents_entities: List of entity lists (one per document) min_frequency: Minimum entity frequency to include as a node top_n_nodes: Maximum number of nodes to include """ # Count entity frequencies entity_counter = Counter() entity_types = {} for doc_entities in documents_entities: for ent in doc_entities: key = ent.word.strip() if key: entity_counter[key] += 1 entity_types[key] = ent.entity_group # Filter by minimum frequency and take top N top_entities = { word for word, count in entity_counter.most_common(top_n_nodes) if count >= min_frequency } if not top_entities: return NetworkData() # Count co-occurrences (entities appearing in the same document) edge_counter = Counter() for doc_entities in documents_entities: doc_words = list({ ent.word.strip() for ent in doc_entities if ent.word.strip() in top_entities }) for a, b in combinations(sorted(doc_words), 2): edge_counter[(a, b)] += 1 # Build nodes nodes = [] for word in top_entities: nodes.append(NetworkNode( id=word, label=word, entity_type=entity_types.get(word, "MISC"), frequency=entity_counter[word], )) # Build edges edges = [] for (source, target), weight in edge_counter.items(): if weight >= 1: edges.append(NetworkEdge( source=source, target=target, weight=weight, )) return NetworkData(nodes=nodes, edges=edges) def get_entity_stats( self, documents_entities: List[List[EntityResult]], top_n: int = 20 ) -> Dict[str, List[Dict]]: """ Get top entities by type (PER, ORG, LOC). Returns: {"PER": [{"word": ..., "count": ...}], "ORG": [...], ...} """ by_type: Dict[str, Counter] = {} for doc_entities in documents_entities: for ent in doc_entities: etype = ent.entity_group if etype not in by_type: by_type[etype] = Counter() by_type[etype][ent.word.strip()] += 1 result = {} for etype, counter in by_type.items(): result[etype] = [ {"word": word, "count": count} for word, count in counter.most_common(top_n) ] return result