Upload app.py
Browse files
app.py
CHANGED
|
@@ -310,6 +310,67 @@ def build_entity_graph(entities_by_doc: List[List[Tuple[str, str]]],
|
|
| 310 |
"top_entities": [{"entity": e[0][0], "type": e[0][1], "docs": e[1]} for e in top_entities]
|
| 311 |
}
|
| 312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
|
| 314 |
# ==============================================================================
|
| 315 |
# PIPELINE DE PROCESSAMENTO DE DADOS
|
|
@@ -843,7 +904,11 @@ async def entity_graph_api(job_id: str = Form(...)):
|
|
| 843 |
for ents in entities_by_doc
|
| 844 |
]
|
| 845 |
|
| 846 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 847 |
|
| 848 |
return graph_data
|
| 849 |
|
|
|
|
| 310 |
"top_entities": [{"entity": e[0][0], "type": e[0][1], "docs": e[1]} for e in top_entities]
|
| 311 |
}
|
| 312 |
|
| 313 |
+
def build_entity_to_entity_graph(entities_by_doc: List[List[Tuple[str, str]]]) -> Dict[str, Any]:
    """Build a co-occurrence graph whose nodes are entities.

    Two entities are connected when they appear in the same document; the
    edge weight is the number of documents in which both appear. Only
    entities present in at least two documents become nodes.

    Args:
        entities_by_doc: One list per document, each holding
            ``(entity_text, entity_type)`` tuples. Duplicates inside a
            document are counted once.

    Returns:
        A dict with:
            ``nodes``: node dicts (``id``, ``entity``, ``type``, ``docs``,
                and initial ``x``/``y``/``z`` coordinates), ordered by
                descending document count.
            ``edges``: at most the 200 heaviest edges, ordered by
                descending weight.
            ``node_count``: number of nodes.
            ``edge_count``: total number of edges between kept nodes,
                counted before the 200-edge truncation.
    """
    cooccurrence = defaultdict(int)
    entity_doc_count = defaultdict(int)

    for entities in entities_by_doc:
        # Deduplicate per document, then sort by string form. Sorting once
        # gives every pair a canonical (first, second) orientation and makes
        # dict insertion order independent of set iteration order, so node
        # ids and tie-breaking are reproducible across runs.
        unique_entities = sorted(set(entities), key=str)

        # Count documents per entity.
        for ent in unique_entities:
            entity_doc_count[ent] += 1

        # Count every unordered pair of distinct entities in this document.
        for pos, first in enumerate(unique_entities):
            for second in unique_entities[pos + 1:]:
                cooccurrence[(first, second)] += 1

    # Keep only entities seen in >= 2 documents, most frequent first.
    # sorted() is stable, so ties keep first-seen order.
    ranked = sorted(entity_doc_count.items(), key=lambda item: item[1], reverse=True)
    kept = [(entity, count) for entity, count in ranked if count >= 2]

    nodes = []
    entity_to_id = {}
    for idx, (entity, count) in enumerate(kept):
        # Spread the kept nodes evenly over the full circle of radius 2;
        # dividing by the number of kept nodes (not all entities) avoids
        # bunching them on a small arc.
        angle = 2 * np.pi * idx / len(kept)
        entity_to_id[entity] = idx
        nodes.append({
            "id": idx,
            "entity": entity[0],
            "type": entity[1],
            "docs": count,
            "x": float(2 * np.cos(angle)),
            "y": float(2 * np.sin(angle)),
            "z": 0,
        })

    # Build edges between kept nodes only.
    entity_edges = [
        {
            "source": entity_to_id[ent1],
            "target": entity_to_id[ent2],
            "weight": weight,
            "source_entity": ent1[0],
            "target_entity": ent2[0],
        }
        for (ent1, ent2), weight in cooccurrence.items()
        if ent1 in entity_to_id and ent2 in entity_to_id
    ]

    # Heaviest edges first.
    entity_edges.sort(key=lambda edge: edge["weight"], reverse=True)

    return {
        "nodes": nodes,
        "edges": entity_edges[:200],  # cap payload at the 200 strongest edges
        "node_count": len(nodes),
        "edge_count": len(entity_edges),
    }
|
| 373 |
+
|
| 374 |
|
| 375 |
# ==============================================================================
|
| 376 |
# PIPELINE DE PROCESSAMENTO DE DADOS
|
|
|
|
| 904 |
for ents in entities_by_doc
|
| 905 |
]
|
| 906 |
|
| 907 |
+
# Adicionar grafo entidade-entidade
|
| 908 |
+
entity_network = build_entity_to_entity_graph(entities_by_doc)
|
| 909 |
+
graph_data["entity_network"] = entity_network
|
| 910 |
+
|
| 911 |
+
logging.info(f"Grafo construído: {graph_data['edge_count']} arestas doc-doc, {entity_network['node_count']} nós entidade")
|
| 912 |
|
| 913 |
return graph_data
|
| 914 |
|