| import numpy as np |
| from sklearn.cluster import DBSCAN |
| from langchain_huggingface import HuggingFaceEmbeddings |
| from collections import Counter |
|
|
class EntityResolver:
    """Merge near-duplicate entity names found in graph triples.

    Entity names are embedded with a HuggingFace sentence-embedding model and
    grouped with DBSCAN using cosine distance. Every name in a group is then
    rewritten to a single canonical spelling (the longest variant).
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", similarity_threshold=0.85):
        """Load the embedding model and derive the DBSCAN radius.

        Args:
            model_name: Name of the embedding model handed to
                ``HuggingFaceEmbeddings``.
            similarity_threshold: How close two embeddings must be (0-1) to
                be treated as the same entity. It is converted into the
                DBSCAN ``eps`` distance as ``1 - similarity_threshold``.
        """
        print("🧩 Inizializzazione Entity Resolver (DBSCAN)...")
        self.embedding_model = HuggingFaceEmbeddings(model_name=model_name)

        # DBSCAN works on distances, the caller thinks in similarities.
        self.eps = 1 - similarity_threshold

    @staticmethod
    def _collect_entities(triples):
        """Return the distinct subject/object names in first-seen order."""
        # dict.fromkeys de-duplicates while keeping insertion order; a plain
        # set would make the order (and therefore tie-breaking between
        # equally long variants) depend on string hash randomization.
        return list(
            dict.fromkeys(
                name for t in triples for name in (t.subject, t.object)
            )
        )

    @staticmethod
    def _choose_canonical(variants):
        """Return the longest variant; ties go to the earliest one listed."""
        # max() returns the first maximal element, so together with the
        # first-seen ordering of `variants` the choice is deterministic.
        return max(variants, key=len)

    def resolve_entities(self, triples):
        """Normalize entity names across a list of triples.

        Args:
            triples: Objects exposing writable ``subject`` and ``object``
                attributes holding entity names (GraphTriple, per the
                original documentation).

        Returns:
            ``[]`` for empty input; the input list itself when fewer than two
            distinct entities exist; otherwise a new list containing the same
            triple objects. NOTE: the triples are updated in place, so the
            caller's objects are modified as well.
        """
        if not triples:
            return []

        unique_entities = self._collect_entities(triples)
        print(f" Analisi di {len(unique_entities)} entità uniche per deduplica...")

        # Clustering needs at least two candidates to compare.
        if len(unique_entities) < 2:
            return triples

        embeddings = self.embedding_model.embed_documents(unique_entities)
        X = np.array(embeddings)

        # min_samples=1 lets a single name form its own cluster, so names
        # without a close neighbour are kept as singletons.
        clustering = DBSCAN(eps=self.eps, min_samples=1, metric='cosine').fit(X)
        labels = clustering.labels_

        # Group the entity names by cluster label, keeping first-seen order.
        cluster_map = {}
        for entity, label in zip(unique_entities, labels):
            cluster_map.setdefault(label, []).append(entity)

        # Only clusters with several spellings need a replacement entry;
        # everything else falls through the .get() default below.
        entity_replacement_map = {}
        for variants in cluster_map.values():
            if len(variants) > 1:
                canonical = self._choose_canonical(variants)
                print(f" ✨ Deduplica: {variants} -> '{canonical}'")
                for v in variants:
                    entity_replacement_map[v] = canonical

        resolved_triples = []
        for t in triples:
            t.subject = entity_replacement_map.get(t.subject, t.subject)
            t.object = entity_replacement_map.get(t.object, t.object)
            resolved_triples.append(t)

        return resolved_triples