# NOTE: scraped page-status chrome ("Spaces: Sleeping") — not part of the module.
"""
Network Analyzer — builds entity co-occurrence graphs from NER output.
Generates data structures for frontend network visualization.
"""
from collections import Counter
from itertools import combinations
from typing import Dict, List, Optional

from .models import EntityResult, NetworkData, NetworkEdge, NetworkNode
class NetworkAnalyzer:
    """Builds entity co-occurrence networks and per-type frequency stats from NER results."""

    def build_network(
        self,
        documents_entities: List[List[EntityResult]],
        min_frequency: int = 2,
        top_n_nodes: int = 50,
        min_edge_weight: int = 1,
    ) -> NetworkData:
        """
        Build a co-occurrence network from entity results across multiple documents.

        Two entities co-occur when they appear in the same document; an edge's
        weight is the number of documents containing both (repeated mentions
        within one document count once).

        Args:
            documents_entities: List of entity lists (one per document).
            min_frequency: Minimum entity frequency to include as a node.
            top_n_nodes: Maximum number of nodes to include.
            min_edge_weight: Minimum co-occurrence count for an edge to be kept
                (default 1 keeps every observed pair, matching prior behavior).

        Returns:
            NetworkData with one node per retained entity and one edge per
            qualifying pair; an empty NetworkData if no entity qualifies.
        """
        # Count entity frequencies and remember each entity's type.
        # NOTE: if the same surface form appears with several types, the
        # last occurrence wins (same as the original implementation).
        entity_counter: Counter = Counter()
        entity_types: Dict[str, str] = {}
        for doc_entities in documents_entities:
            for ent in doc_entities:
                key = ent.word.strip()
                if key:  # skip whitespace-only extraction artifacts
                    entity_counter[key] += 1
                    entity_types[key] = ent.entity_group

        # Keep the top-N most frequent entities that meet the frequency floor.
        top_entities = {
            word
            for word, count in entity_counter.most_common(top_n_nodes)
            if count >= min_frequency
        }
        if not top_entities:
            return NetworkData()

        # Count co-occurrences: each document contributes at most 1 to every
        # pair of retained entities it mentions.
        edge_counter: Counter = Counter()
        for doc_entities in documents_entities:
            doc_words = {ent.word.strip() for ent in doc_entities}
            doc_words &= top_entities  # also drops "" left by whitespace-only words
            # sorted() canonicalizes pair order so (x, y) and (y, x)
            # accumulate into the same edge key.
            for a, b in combinations(sorted(doc_words), 2):
                edge_counter[(a, b)] += 1

        nodes = [
            NetworkNode(
                id=word,
                label=word,
                entity_type=entity_types.get(word, "MISC"),
                frequency=entity_counter[word],
            )
            for word in top_entities
        ]
        edges = [
            NetworkEdge(source=source, target=target, weight=weight)
            for (source, target), weight in edge_counter.items()
            if weight >= min_edge_weight
        ]
        return NetworkData(nodes=nodes, edges=edges)

    def get_entity_stats(
        self, documents_entities: List[List[EntityResult]], top_n: int = 20
    ) -> Dict[str, List[Dict]]:
        """
        Get top entities grouped by type (e.g. PER, ORG, LOC).

        Args:
            documents_entities: List of entity lists (one per document).
            top_n: Maximum number of entities returned per type.

        Returns:
            {"PER": [{"word": ..., "count": ...}, ...], "ORG": [...], ...}
        """
        by_type: Dict[str, Counter] = {}
        for doc_entities in documents_entities:
            for ent in doc_entities:
                word = ent.word.strip()
                if not word:
                    # Skip whitespace-only words, consistent with build_network.
                    continue
                by_type.setdefault(ent.entity_group, Counter())[word] += 1
        return {
            etype: [
                {"word": word, "count": count}
                for word, count in counter.most_common(top_n)
            ]
            for etype, counter in by_type.items()
        }