davidtran999 committed on
Commit
57b3892
·
verified ·
1 Parent(s): 8b3d510

Upload backend/core/embeddings.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. backend/core/embeddings.py +307 -0
backend/core/embeddings.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Vector embeddings utilities for semantic search.
3
+ """
4
+ import os
5
+ from typing import List, Optional, Union, Dict
6
+ import numpy as np
7
+ from pathlib import Path
8
+
9
+ try:
10
+ from sentence_transformers import SentenceTransformer
11
+ SENTENCE_TRANSFORMERS_AVAILABLE = True
12
+ except ImportError:
13
+ SENTENCE_TRANSFORMERS_AVAILABLE = False
14
+ SentenceTransformer = None
15
+
16
# Registry of known embedding models: short name -> Hugging Face model id.
# Entries are grouped by size/quality tier; the dict order is the order in
# which compare_models() and list_available_models() report them.
AVAILABLE_MODELS = {
    # Fast models (384 dim) - Good for production
    "paraphrase-multilingual": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",  # Fast, 384 dim

    # High quality models (768 dim) - Better accuracy
    "multilingual-mpnet": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",  # High quality, 768 dim, recommended
    "vietnamese-sbert": "keepitreal/vietnamese-sbert-v2",  # Vietnamese-specific (may require auth)

    # E5 family - highest quality of the multilingual options, but slower
    "multilingual-e5-large": "intfloat/multilingual-e5-large",  # Very high quality, 1024 dim, large model
    "multilingual-e5-base": "intfloat/multilingual-e5-base",  # High quality, 768 dim, balanced

    # Vietnamese-specific models (if available)
    "vietnamese-embedding": "dangvantuan/vietnamese-embedding",  # Vietnamese-specific (if available)
    "vietnamese-bi-encoder": "bkai-foundation-models/vietnamese-bi-encoder",  # Vietnamese bi-encoder (if available)
}

# Default embedding model (can be overridden via the EMBEDDING_MODEL env var).
# multilingual-mpnet is the built-in default: better quality than MiniLM while
# still a reasonable size. EMBEDDING_MODEL accepts short names and full paths:
#   - EMBEDDING_MODEL=multilingual-mpnet (short name)
#   - EMBEDDING_MODEL=sentence-transformers/paraphrase-multilingual-mpnet-base-v2 (full path)
#   - EMBEDDING_MODEL=/path/to/local/model (local model path)
#   - EMBEDDING_MODEL=username/private-model (private HF model, requires HF_TOKEN)
#
# An empty or whitespace-only EMBEDDING_MODEL is treated as "not set": an empty
# name would otherwise be interpreted as a local path, and Path("") resolves to
# the current working directory.
DEFAULT_MODEL_NAME = (
    (os.environ.get("EMBEDDING_MODEL") or "").strip()
    or AVAILABLE_MODELS["multilingual-mpnet"]
)
# Smaller model that get_embedding_model() tries when the requested one fails to load.
FALLBACK_MODEL_NAME = AVAILABLE_MODELS["paraphrase-multilingual"]

# Single-slot cache: the most recently loaded model and the resolved name it was loaded under.
_model_cache: Optional["SentenceTransformer"] = None
_cached_model_name: Optional[str] = None
52
+
53
+
54
def _load_sentence_transformer(resolved_model_name: str) -> "SentenceTransformer":
    """Instantiate a model from a local directory or, failing that, by Hugging Face id."""
    model_path = Path(resolved_model_name)
    if model_path.exists() and model_path.is_dir():
        print(f"Loading local model from: {resolved_model_name}")
        return SentenceTransformer(str(model_path))

    # Hugging Face model (public or private); a token is passed only when one is configured.
    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
    model_kwargs = {}
    if hf_token:
        print(f"Using Hugging Face token for model: {resolved_model_name}")
        model_kwargs["token"] = hf_token
    return SentenceTransformer(resolved_model_name, **model_kwargs)


def _dimension_suffix(model) -> str:
    """Return ' (dimension: N)' for log messages, or '' if the probe encode fails."""
    try:
        dim = len(model.encode("test", show_progress_bar=False))
    except Exception:
        # The dimension is informational only; a failed probe must not discard a loaded model.
        return ""
    return f" (dimension: {dim})"


def get_embedding_model(model_name: Optional[str] = None, force_reload: bool = False) -> Optional["SentenceTransformer"]:
    """
    Get or load embedding model instance.

    The most recently loaded model is kept in a module-level single-slot cache
    and returned directly when the same resolved name is requested again.
    If loading the requested model fails, FALLBACK_MODEL_NAME is tried instead.
    Progress and errors are reported with print().

    Args:
        model_name: Name of the model to load. Can be:
            - Full model name (e.g., "keepitreal/vietnamese-sbert-v2")
            - Short name (e.g., "vietnamese-sbert")
            - Path to a local model directory
            - None (uses DEFAULT_MODEL_NAME from env or default)
        force_reload: Force reload model even if cached.

    Returns:
        SentenceTransformer instance or None if not available.
    """
    global _model_cache, _cached_model_name

    if not SENTENCE_TRANSFORMERS_AVAILABLE:
        print("Warning: sentence-transformers not installed. Install with: pip install sentence-transformers")
        return None

    # Resolve model name (short names map to full ids; anything else is used as given).
    resolved_model_name = str(model_name or DEFAULT_MODEL_NAME or "").strip()
    resolved_model_name = AVAILABLE_MODELS.get(resolved_model_name, resolved_model_name)
    if not resolved_model_name:
        # An empty name must not reach Path(): Path("") is the current directory
        # and would be mistaken for a local model folder.
        resolved_model_name = AVAILABLE_MODELS.get("multilingual-mpnet", FALLBACK_MODEL_NAME)

    # Return cached model if same model and not forcing reload
    if _model_cache is not None and _cached_model_name == resolved_model_name and not force_reload:
        return _model_cache

    # Load new model
    try:
        print(f"Loading embedding model: {resolved_model_name}")
        loaded = _load_sentence_transformer(resolved_model_name)
    except Exception as e:
        print(f"❌ Error loading model {resolved_model_name}: {e}")
    else:
        _model_cache = loaded
        _cached_model_name = resolved_model_name
        print(f"✅ Successfully loaded model: {resolved_model_name}{_dimension_suffix(loaded)}")
        return loaded

    if resolved_model_name == FALLBACK_MODEL_NAME:
        # The model that just failed *is* the fallback; nothing else to try.
        return None

    print(f"Trying fallback model: {FALLBACK_MODEL_NAME}")
    if _model_cache is not None and _cached_model_name == FALLBACK_MODEL_NAME and not force_reload:
        # The fallback is already in memory from an earlier failure; reuse it
        # instead of re-instantiating it on every call.
        return _model_cache

    try:
        # No token is passed here, so the fallback still loads when the token
        # (or the `token` keyword itself) was the reason the first attempt failed.
        fallback = SentenceTransformer(FALLBACK_MODEL_NAME)
    except Exception as e2:
        print(f"❌ Error loading fallback model: {e2}")
        return None

    _model_cache = fallback
    _cached_model_name = FALLBACK_MODEL_NAME
    print(f"✅ Successfully loaded fallback model: {FALLBACK_MODEL_NAME}{_dimension_suffix(fallback)}")
    return fallback
126
+
127
+
128
def list_available_models() -> Dict[str, str]:
    """
    Return the registry of known embedding models.

    Returns:
        A new dictionary mapping short names to full model names; mutating
        it does not affect AVAILABLE_MODELS.
    """
    return dict(AVAILABLE_MODELS)
136
+
137
+
138
def compare_models(texts: List[str], model_names: Optional[List[str]] = None) -> Dict[str, Dict[str, Union[str, int, float]]]:
    """
    Compare different embedding models on sample texts.

    Each model is loaded with force_reload=True, which replaces the module's
    cached model; after this call the cache holds the last model loaded.

    Args:
        texts: List of sample texts to test.
        model_names: List of short model names (keys of AVAILABLE_MODELS) to
            compare. If None, compares all available models. Names that are
            not keys of AVAILABLE_MODELS are skipped.

    Returns:
        Dictionary keyed by short model name. Each value is either
        {"error": message} or a dictionary with:
            - model_name: Full model name
            - dimension: Embedding dimension
            - encoding_time: Time to encode texts (seconds)
            - avg_similarity: Average pairwise cosine similarity between texts
        Models that could not be loaded at all are omitted.
    """
    import time
    from itertools import combinations

    if model_names is None:
        model_names = list(AVAILABLE_MODELS.keys())

    results = {}

    for model_key in model_names:
        if model_key not in AVAILABLE_MODELS:
            print(f"Skipping unknown model key: {model_key}")
            continue

        model_name = AVAILABLE_MODELS[model_key]
        try:
            model = get_embedding_model(model_name, force_reload=True)
            if model is None:
                continue

            # get_embedding_model() silently substitutes the fallback model when
            # the requested one fails to load. Benchmarking that substitute
            # would report the fallback's numbers under this model's key.
            if _cached_model_name != model_name:
                message = f"model could not be loaded (fallback {_cached_model_name} was loaded instead)"
                print(f"Error comparing model {model_key}: {message}")
                results[model_key] = {"error": message}
                continue

            # Get dimension (served from the cache populated just above)
            dim = get_embedding_dimension(model_name)

            # Measure encoding time with a monotonic clock
            start_time = time.perf_counter()
            embeddings = generate_embeddings_batch(texts, model=model)
            encoding_time = time.perf_counter() - start_time

            # Calculate average similarity over every unordered pair of texts
            similarities = [
                cosine_similarity(first, second)
                for first, second in combinations(embeddings, 2)
                if first is not None and second is not None
            ]
            avg_similarity = sum(similarities) / len(similarities) if similarities else 0.0

            results[model_key] = {
                "model_name": model_name,
                "dimension": dim,
                "encoding_time": encoding_time,
                "avg_similarity": avg_similarity,
            }
        except Exception as e:
            print(f"Error comparing model {model_key}: {e}")
            results[model_key] = {"error": str(e)}

    return results
198
+
199
+
200
def generate_embedding(text: str, model: Optional["SentenceTransformer"] = None) -> Optional[np.ndarray]:
    """
    Generate embedding vector for a single text.

    Args:
        text: Input text to embed. Empty, whitespace-only, or non-string
            values are rejected.
        model: SentenceTransformer instance. If None, uses default model.

    Returns:
        Normalized embedding vector as returned by model.encode(), or None if
        the text was rejected, no model is available, or encoding failed.
    """
    # Non-string input would raise on .strip() before reaching the guarded
    # encode call; treat it like any other unusable text.
    if not isinstance(text, str) or not text.strip():
        return None

    if model is None:
        model = get_embedding_model()
        if model is None:
            return None

    try:
        return model.encode(text, normalize_embeddings=True, show_progress_bar=False)
    except Exception as e:
        # Best-effort API: report the failure and signal it with None.
        print(f"Error generating embedding: {e}")
        return None
226
+
227
+
228
def generate_embeddings_batch(
    texts: List[str],
    model: Optional["SentenceTransformer"] = None,
    batch_size: int = 32,
    show_progress_bar: bool = True,
) -> List[Optional[np.ndarray]]:
    """
    Generate embeddings for a batch of texts.

    Entries that are empty, whitespace-only, or not strings are not sent to
    the model and yield None, matching generate_embedding(). This also keeps
    a single bad entry from failing the whole batch.

    Args:
        texts: List of input texts.
        model: SentenceTransformer instance. If None, uses default model.
        batch_size: Batch size for processing.
        show_progress_bar: Whether the model should display a progress bar
            while encoding. Defaults to True.

    Returns:
        List with one item per input text, in input order: a normalized
        embedding, or None where the text was unusable. All items are None
        if no model is available or encoding failed.
    """
    if not texts:
        return []

    results: List[Optional[np.ndarray]] = [None] * len(texts)

    # Positions of the entries that are actually worth encoding.
    usable = [idx for idx, item in enumerate(texts) if isinstance(item, str) and item.strip()]
    if not usable:
        return results

    if model is None:
        model = get_embedding_model()
        if model is None:
            return results

    try:
        vectors = model.encode(
            [texts[idx] for idx in usable],
            batch_size=batch_size,
            normalize_embeddings=True,
            show_progress_bar=show_progress_bar,
            convert_to_numpy=True,
        )
    except Exception as e:
        # Best-effort API: report the failure and signal it with None entries.
        print(f"Error generating batch embeddings: {e}")
        return results

    # Scatter the encoded rows back to their original positions.
    for idx, vector in zip(usable, vectors):
        results[idx] = vector
    return results
261
+
262
+
263
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """
    Calculate cosine similarity between two vectors.

    Args:
        vec1: First vector.
        vec2: Second vector.

    Returns:
        Cosine similarity score in the range [-1, 1]: 1 for vectors pointing
        the same way, 0 for orthogonal vectors, -1 for opposite vectors.
        Returns 0.0 if either vector is None or has zero length.
    """
    if vec1 is None or vec2 is None:
        return 0.0

    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        # Direction is undefined for a zero vector; avoid dividing by zero.
        return 0.0

    score = np.dot(vec1, vec2) / (norm1 * norm2)
    # Floating-point rounding can land marginally outside [-1, 1]
    # (e.g. 1.0000001 for identical float32 vectors); clamp to the valid range.
    return float(np.clip(score, -1.0, 1.0))
285
+
286
+
287
def get_embedding_dimension(model_name: Optional[str] = None) -> int:
    """
    Get embedding dimension for a model.

    The model is obtained through get_embedding_model(), so it is loaded
    (and cached) if it is not already.

    Args:
        model_name: Model name. If None, uses default.

    Returns:
        Embedding dimension or 0 if unknown.
    """
    model = get_embedding_model(model_name)
    if model is None:
        return 0

    # Prefer the dimension the model reports about itself: it needs no forward pass.
    reported = getattr(model, "get_sentence_embedding_dimension", None)
    if callable(reported):
        try:
            dimension = reported()
        except Exception:
            dimension = None  # fall through to the probe below
        if dimension:
            return int(dimension)

    # Otherwise measure it by encoding a dummy text
    try:
        dummy_embedding = model.encode("test", show_progress_bar=False)
        return len(dummy_embedding)
    except Exception:
        return 0
307
+