Madras1 committed on
Commit
f565049
·
verified ·
1 Parent(s): cd8a5cd

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -1
app.py CHANGED
@@ -211,7 +211,62 @@ def extract_entities(textos: List[str]) -> List[List[Tuple[str, str]]]:
211
  if len(ent.text.strip()) > 2 and ent.label_ in ("PERSON", "PER", "ORG", "GPE", "LOC")]
212
  entities_by_doc.append(entities)
213
 
214
- return entities_by_doc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
  def build_entity_graph(entities_by_doc: List[List[Tuple[str, str]]],
217
  positions: List[Dict]) -> Dict[str, Any]:
 
211
  if len(ent.text.strip()) > 2 and ent.label_ in ("PERSON", "PER", "ORG", "GPE", "LOC")]
212
  entities_by_doc.append(entities)
213
 
214
+ # Normalizar entidades para deduplicação
215
+ return normalize_entities(entities_by_doc)
216
+
217
def normalize_entities(entities_by_doc: List[List[Tuple[str, str]]]) -> List[List[Tuple[str, str]]]:
    """Collapse name variants of the same entity onto one canonical form.

    Within each entity type, a name that is a substring of a longer name is
    treated as a variant of it (e.g. "trump" -> "donald trump"). The longest
    containing name is always chosen as the canonical form, so chains such as
    "c" -> "b c" -> "a b c" all resolve to "a b c". Names of different types
    are never merged with each other.

    Args:
        entities_by_doc: One list of ``(text, type)`` pairs per document.

    Returns:
        A new list with one entry per input document. Each entry holds the
        canonical ``(text, type)`` pairs in order of first appearance, with
        duplicates inside the same document removed.
    """
    # Collect the distinct surface forms seen for each entity type.
    all_entities = defaultdict(set)
    for entities in entities_by_doc:
        for text, etype in entities:
            all_entities[etype].add(text)

    # Maps every (text, type) pair to its canonical (text, type) pair.
    normalization_map = {}

    for etype, entity_set in all_entities.items():
        # Longest first, so a name is only ever reached unmapped when no
        # longer name contains it. The text itself breaks length ties, which
        # keeps the result independent of set iteration order.
        ordered = sorted(entity_set, key=lambda name: (-len(name), name))

        for index, entity in enumerate(ordered):
            key = (entity, etype)
            if key in normalization_map:
                # Already attached to a longer canonical name.
                continue

            normalization_map[key] = key
            for other in ordered[index + 1:]:
                # Plain substring containment; the empty string is excluded
                # because it is trivially contained in every name.
                # setdefault keeps the first (longest) container when several
                # longer names contain the same short one.
                if other and other in entity:
                    normalization_map.setdefault((other, etype), key)

    # Rewrite each document with canonical names, dropping repeats.
    normalized_docs = []
    for entities in entities_by_doc:
        normalized = []
        seen = set()
        for text, etype in entities:
            canonical = normalization_map.get((text, etype), (text, etype))
            if canonical not in seen:
                seen.add(canonical)
                normalized.append(canonical)
        normalized_docs.append(normalized)

    logging.info(f"Normalização: {len(all_entities)} tipos, mapa com {len(normalization_map)} entradas")
    return normalized_docs
270
 
271
  def build_entity_graph(entities_by_doc: List[List[Tuple[str, str]]],
272
  positions: List[Dict]) -> Dict[str, Any]: