File size: 3,004 Bytes
049099a
 
 
 
 
 
 
 
5ef3419
049099a
 
 
5ef3419
 
049099a
 
5ef3419
 
 
 
 
 
 
 
 
049099a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ef3419
049099a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import numpy as np
from sentence_transformers import SentenceTransformer, util
from nltk import sent_tokenize

class EmbeddingSimilarity:
    """
    Uses a sentence-transformer model to compute semantic similarity between
    the mark and a list of descriptive terms, and also between mark and goods.

    The model is loaded lazily on first use to avoid import-time network
    issues.  Class centroids are cached in ``self.class_centroids`` so that
    repeated comparisons against the same static term list only pay the
    embedding cost once.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model_name = model_name
        self._model = None  # populated on first access via the `model` property
        # Cache: tuple(class_terms) -> centroid tensor, filled lazily by
        # similarity_to_class_centroid().
        self.class_centroids = {}

    @property
    def model(self):
        """Lazy-load and return the sentence-transformer model."""
        if self._model is None:
            print(f"Loading embedding model: {self.model_name}")
            self._model = SentenceTransformer(self.model_name)
            print("✅ Embedding model loaded.")
        return self._model

    def encode(self, text):
        """Return the embedding tensor for a single text (or a list of texts)."""
        return self.model.encode(text, convert_to_tensor=True)

    def similarity(self, emb1, emb2):
        """Cosine similarity between two embeddings, as a plain float."""
        return float(util.cos_sim(emb1, emb2)[0][0])

    def max_similarity_to_terms(self, mark, descriptive_terms):
        """
        Compute the maximum cosine similarity between the mark embedding
        and each individual descriptive term's embedding.

        Returns 0.0 when `descriptive_terms` is empty.
        """
        if not descriptive_terms:
            return 0.0
        mark_emb = self.encode(mark)
        term_embs = self.encode(descriptive_terms)
        sims = util.cos_sim(mark_emb, term_embs)[0]
        return float(sims.max())

    def similarity_to_class_centroid(self, mark, class_terms):
        """
        Compare the mark against the centroid (mean embedding) of a class's
        terms.  Centroids are cached in ``self.class_centroids`` per unique
        term sequence, so repeated calls with the same static `class_terms`
        skip re-encoding.  (Useful for speed when class_terms are static.)

        Returns 0.0 when `class_terms` is empty.
        """
        if not class_terms:
            return 0.0
        # Lists aren't hashable; a tuple preserves order and makes a cache key.
        key = tuple(class_terms)
        centroid = self.class_centroids.get(key)
        if centroid is None:
            term_embs = self.encode(class_terms)
            centroid = term_embs.mean(dim=0)  # `dim` is the torch-native kwarg
            self.class_centroids[key] = centroid
        mark_emb = self.encode(mark)
        return self.similarity(mark_emb, centroid)

    def similarity_to_goods(self, mark, goods):
        """
        Compute similarity between mark and goods using the bi-encoder.
        This is a fast alternative to the cross-encoder.

        Returns 0.0 when `goods` is falsy.
        """
        if not goods:
            return 0.0
        mark_emb = self.encode(mark)
        goods_emb = self.encode(goods)
        return self.similarity(mark_emb, goods_emb)

    def similarity_to_goods_segments(self, mark, goods):
        """
        Split goods into sentences (NLTK) and return the maximum similarity
        between the mark and any single sentence.

        Returns 0.0 when `goods` is falsy or tokenizes to no sentences.
        """
        if not goods:
            return 0.0
        sentences = sent_tokenize(goods)
        if not sentences:
            return 0.0
        mark_emb = self.encode(mark)
        sent_embs = self.encode(sentences)
        sims = util.cos_sim(mark_emb, sent_embs)[0]
        return float(sims.max())