NLP-intelligence / nlp_core /network_analyzer.py
Nomio4640's picture
first init
ae91091
"""
Network Analyzer — builds entity co-occurrence graphs from NER results.
The graph is assembled with plain counters; NetworkX is not imported by this module.
Generates data structures for frontend network visualization.
"""
from collections import Counter
from itertools import combinations
from typing import List, Dict, Optional
from .models import EntityResult, NetworkData, NetworkNode, NetworkEdge
class NetworkAnalyzer:
    """Builds co-occurrence networks and per-type entity statistics from NER results."""

    def build_network(
        self,
        documents_entities: List[List["EntityResult"]],
        min_frequency: int = 2,
        top_n_nodes: int = 50,
    ) -> "NetworkData":
        """
        Build a co-occurrence network from entity results across multiple documents.

        Two entities are linked when they appear in the same document; the edge
        weight is the number of documents in which both appear.

        Args:
            documents_entities: List of entity lists (one per document).
            min_frequency: Minimum number of occurrences (summed over all
                documents) an entity needs to become a node.
            top_n_nodes: Maximum number of nodes to include.

        Returns:
            A NetworkData holding the nodes (most frequent first) and edges,
            or an empty NetworkData() when no entity qualifies.
        """
        # Count how often each (whitespace-stripped) entity occurs. Blank words
        # are ignored. If the same word is seen with several entity groups,
        # the last one encountered is kept.
        entity_counter = Counter()
        entity_types = {}
        for doc_entities in documents_entities:
            for ent in doc_entities:
                key = ent.word.strip()
                if key:
                    entity_counter[key] += 1
                    entity_types[key] = ent.entity_group

        # Keep the ranking as a list so node order is deterministic
        # (frequency-descending; equal counts keep first-seen order).
        # Iterating a set of strings would reorder nodes between runs.
        ranked_words = [
            word
            for word, count in entity_counter.most_common(top_n_nodes)
            if count >= min_frequency
        ]
        if not ranked_words:
            return NetworkData()
        selected = set(ranked_words)  # O(1) membership tests below

        # Count co-occurrences. Each document contributes at most once per
        # pair, and sorting the words gives every pair a canonical (a, b) key.
        edge_counter = Counter()
        for doc_entities in documents_entities:
            stripped_words = (ent.word.strip() for ent in doc_entities)
            doc_words = sorted({word for word in stripped_words if word in selected})
            for pair in combinations(doc_words, 2):
                edge_counter[pair] += 1

        nodes = [
            NetworkNode(
                id=word,
                label=word,
                entity_type=entity_types.get(word, "MISC"),
                frequency=entity_counter[word],
            )
            for word in ranked_words
        ]

        # Every key in edge_counter was incremented at least once, so no
        # weight filter is needed here.
        edges = [
            NetworkEdge(source=source, target=target, weight=weight)
            for (source, target), weight in edge_counter.items()
        ]

        return NetworkData(nodes=nodes, edges=edges)

    def get_entity_stats(
        self, documents_entities: List[List["EntityResult"]], top_n: int = 20
    ) -> Dict[str, List[Dict]]:
        """
        Get top entities by type (e.g. PER, ORG, LOC).

        Args:
            documents_entities: List of entity lists (one per document).
            top_n: Maximum number of entities to report per type.

        Returns:
            {"PER": [{"word": ..., "count": ...}], "ORG": [...], ...} with each
            list ordered from most to least frequent. Words are stripped of
            surrounding whitespace; blank words are skipped, matching
            build_network.
        """
        by_type: Dict[str, Counter] = {}
        for doc_entities in documents_entities:
            for ent in doc_entities:
                word = ent.word.strip()
                if not word:
                    # A blank word is not a real entity; do not report it.
                    continue
                by_type.setdefault(ent.entity_group, Counter())[word] += 1

        return {
            etype: [
                {"word": word, "count": count}
                for word, count in counter.most_common(top_n)
            ]
            for etype, counter in by_type.items()
        }