Madras1 committed on
Commit
a9b7135
·
verified ·
1 Parent(s): f565049

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -1
app.py CHANGED
@@ -310,6 +310,67 @@ def build_entity_graph(entities_by_doc: List[List[Tuple[str, str]]],
310
  "top_entities": [{"entity": e[0][0], "type": e[0][1], "docs": e[1]} for e in top_entities]
311
  }
312
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
 
314
  # ==============================================================================
315
  # PIPELINE DE PROCESSAMENTO DE DADOS
@@ -843,7 +904,11 @@ async def entity_graph_api(job_id: str = Form(...)):
843
  for ents in entities_by_doc
844
  ]
845
 
846
- logging.info(f"Grafo construído: {graph_data['edge_count']} arestas, {len(graph_data['top_entities'])} entidades principais")
 
 
 
 
847
 
848
  return graph_data
849
 
 
310
  "top_entities": [{"entity": e[0][0], "type": e[0][1], "docs": e[1]} for e in top_entities]
311
  }
312
 
313
def build_entity_to_entity_graph(
    entities_by_doc: List[List[Tuple[str, str]]],
    *,
    min_docs: int = 2,
    max_edges: int = 200,
) -> Dict[str, Any]:
    """Build a graph whose nodes are entities and whose edges are co-occurrences.

    Two entities are connected when they appear in the same document; the
    edge weight is the number of documents in which both appear.

    Args:
        entities_by_doc: One list per document of ``(entity_text, entity_type)``
            tuples. Duplicates inside a single document are counted once.
        min_docs: Minimum number of documents an entity must appear in to
            become a node (default ``2``, the original threshold).
        max_edges: Maximum number of edges returned, strongest first
            (default ``200``, the original limit).

    Returns:
        A dict with:
            ``nodes``: list of ``{"id", "entity", "type", "docs", "x", "y", "z"}``,
                ordered by document count (descending);
            ``edges``: at most ``max_edges`` dicts of
                ``{"source", "target", "weight", "source_entity", "target_entity"}``,
                ordered by weight (descending);
            ``node_count``: number of nodes;
            ``edge_count``: total number of edges between kept nodes, counted
                BEFORE the ``max_edges`` truncation.
    """
    # Local import: the block cannot assume the module imports itertools.
    from itertools import combinations

    cooccurrence = defaultdict(int)
    entity_doc_count = defaultdict(int)

    for entities in entities_by_doc:
        # Sorting the de-duplicated entities by their string form makes the
        # pair orientation canonical (same key regardless of the order the
        # entities were extracted in) and keeps iteration deterministic,
        # which plain set ordering is not under string hash randomization.
        unique_entities = sorted(set(entities), key=str)

        # Count documents per entity.
        for ent in unique_entities:
            entity_doc_count[ent] += 1

        # Count co-occurring pairs; combinations() of a sorted list yields
        # each unordered pair exactly once, already in canonical order.
        for pair in combinations(unique_entities, 2):
            cooccurrence[pair] += 1

    # Keep only entities present in enough documents, most frequent first.
    # Ties are broken by the entity's string form so ids are reproducible.
    kept = sorted(
        (
            (entity, count)
            for entity, count in entity_doc_count.items()
            if count >= min_docs
        ),
        key=lambda item: (-item[1], str(item[0])),
    )

    # Spread the initial positions over the full circle using the number of
    # nodes actually kept; dividing by the total entity count would squeeze
    # the kept nodes into a small arc.
    slots = max(len(kept), 1)
    nodes = []
    entity_to_id = {}
    for idx, (entity, count) in enumerate(kept):
        angle = 2 * np.pi * idx / slots
        entity_to_id[entity] = idx
        nodes.append({
            "id": idx,
            "entity": entity[0],
            "type": entity[1],
            "docs": count,
            # Initial position on a circle of radius 2 in the z=0 plane.
            "x": float(2 * np.cos(angle)),
            "y": float(2 * np.sin(angle)),
            "z": 0,
        })

    # Build edges only between entities that survived the node filter.
    entity_edges = [
        {
            "source": entity_to_id[ent1],
            "target": entity_to_id[ent2],
            "weight": weight,
            "source_entity": ent1[0],
            "target_entity": ent2[0],
        }
        for (ent1, ent2), weight in cooccurrence.items()
        if ent1 in entity_to_id and ent2 in entity_to_id
    ]

    # Strongest co-occurrences first (stable sort keeps insertion order on ties).
    entity_edges.sort(key=lambda edge: edge["weight"], reverse=True)

    return {
        "nodes": nodes,
        "edges": entity_edges[:max(max_edges, 0)],  # only the strongest edges
        "node_count": len(nodes),
        "edge_count": len(entity_edges),
    }
373
+
374
 
375
  # ==============================================================================
376
  # PIPELINE DE PROCESSAMENTO DE DADOS
 
904
  for ents in entities_by_doc
905
  ]
906
 
907
+ # Adicionar grafo entidade-entidade
908
+ entity_network = build_entity_to_entity_graph(entities_by_doc)
909
+ graph_data["entity_network"] = entity_network
910
+
911
+ logging.info(f"Grafo construído: {graph_data['edge_count']} arestas doc-doc, {entity_network['node_count']} nós entidade")
912
 
913
  return graph_data
914