Spaces:
Sleeping
Sleeping
| import numpy as np | |
| from sentence_transformers import SentenceTransformer, util | |
| from nltk import sent_tokenize | |
| class EmbeddingSimilarity: | |
| """ | |
| Uses a sentence‑transformer model to compute semantic similarity between | |
| the mark and a list of descriptive terms, and also between mark and goods. | |
| The model is loaded lazily on first use to avoid import‑time network issues. | |
| """ | |
| def __init__(self, model_name='all-MiniLM-L6-v2'): | |
| self.model_name = model_name | |
| self._model = None | |
| self.class_centroids = {} | |
| def model(self): | |
| """Lazy load the sentence transformer model.""" | |
| if self._model is None: | |
| print(f"Loading embedding model: {self.model_name}") | |
| self._model = SentenceTransformer(self.model_name) | |
| print("✅ Embedding model loaded.") | |
| return self._model | |
| def encode(self, text): | |
| """Return embedding for a single text.""" | |
| return self.model.encode(text, convert_to_tensor=True) | |
| def similarity(self, emb1, emb2): | |
| """Cosine similarity between two embeddings.""" | |
| return float(util.cos_sim(emb1, emb2)[0][0]) | |
| def max_similarity_to_terms(self, mark, descriptive_terms): | |
| """ | |
| Compute the maximum cosine similarity between the mark embedding | |
| and each individual descriptive term's embedding. | |
| """ | |
| if not descriptive_terms: | |
| return 0.0 | |
| mark_emb = self.encode(mark) | |
| term_embs = self.encode(descriptive_terms) | |
| sims = util.cos_sim(mark_emb, term_embs)[0] | |
| return float(sims.max()) | |
| def similarity_to_class_centroid(self, mark, class_terms): | |
| """ | |
| Pre‑compute centroid for a class (average of all term embeddings) | |
| and compare mark against it. (Useful for speed when class_terms are static.) | |
| """ | |
| if not class_terms: | |
| return 0.0 | |
| # For simplicity, compute on the fly; you can cache the centroid per class. | |
| term_embs = self.encode(class_terms) | |
| centroid = term_embs.mean(axis=0) | |
| mark_emb = self.encode(mark) | |
| return self.similarity(mark_emb, centroid) | |
| def similarity_to_goods(self, mark, goods): | |
| """ | |
| Compute similarity between mark and goods using the bi‑encoder. | |
| This is a fast alternative to the cross‑encoder. | |
| """ | |
| if not goods: | |
| return 0.0 | |
| mark_emb = self.encode(mark) | |
| goods_emb = self.encode(goods) | |
| return self.similarity(mark_emb, goods_emb) | |
| def similarity_to_goods_segments(self, mark, goods): | |
| """ | |
| Split goods into sentences and take the maximum similarity. | |
| """ | |
| if not goods: | |
| return 0.0 | |
| sentences = sent_tokenize(goods) | |
| if not sentences: | |
| return 0.0 | |
| mark_emb = self.encode(mark) | |
| sent_embs = self.encode(sentences) | |
| sims = util.cos_sim(mark_emb, sent_embs)[0] | |
| return float(sims.max()) |