davidtran999 committed on
Commit
b05e16e
·
verified ·
1 Parent(s): a7e982f

Upload backend/hue_portal/core/embeddings.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. backend/hue_portal/core/embeddings.py +42 -16
backend/hue_portal/core/embeddings.py CHANGED
@@ -25,6 +25,7 @@ AVAILABLE_MODELS = {
25
  "vietnamese-sbert": "keepitreal/vietnamese-sbert-v2", # Vietnamese-specific (may require auth)
26
 
27
  # Very high quality models (1024+ dim) - Best accuracy but slower
 
28
  "multilingual-e5-large": "intfloat/multilingual-e5-large", # Very high quality, 1024 dim, large model
29
  "multilingual-e5-base": "intfloat/multilingual-e5-base", # High quality, 768 dim, balanced
30
 
@@ -34,17 +35,18 @@ AVAILABLE_MODELS = {
34
  }
35
 
36
  # Default embedding model for Vietnamese (can be overridden via env var)
37
- # Use multilingual-e5-base as default for HF Space - good balance of quality and speed
38
- # 768 dimensions, faster than e5-large (1024 dim), better quality than MiniLM (384 dim)
39
  # Can be set via EMBEDDING_MODEL env var (supports both short names and full model paths)
40
  # Examples:
 
41
  # - EMBEDDING_MODEL=multilingual-e5-base (uses short name)
42
  # - EMBEDDING_MODEL=intfloat/multilingual-e5-base (full path)
43
  # - EMBEDDING_MODEL=/path/to/local/model (local model path)
44
  # - EMBEDDING_MODEL=username/private-model (private HF model, requires HF_TOKEN)
45
  DEFAULT_MODEL_NAME = os.environ.get(
46
  "EMBEDDING_MODEL",
47
- AVAILABLE_MODELS.get("multilingual-e5-base", "intfloat/multilingual-e5-base")
48
  )
49
  FALLBACK_MODEL_NAME = AVAILABLE_MODELS.get("paraphrase-multilingual", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
50
 
@@ -268,14 +270,28 @@ def generate_embedding(text: str, model: Optional[SentenceTransformer] = None) -
268
  return None
269
 
270
  try:
271
- embedding = model.encode(text, normalize_embeddings=True, show_progress_bar=False)
272
- return embedding
 
 
 
 
 
 
 
 
 
 
273
  except Exception as e:
274
- print(f"Error generating embedding: {e}")
275
  return None
276
 
277
 
278
- def generate_embeddings_batch(texts: List[str], model: Optional[SentenceTransformer] = None, batch_size: int = 32) -> List[Optional[np.ndarray]]:
 
 
 
 
279
  """
280
  Generate embeddings for a batch of texts.
281
 
@@ -297,16 +313,26 @@ def generate_embeddings_batch(texts: List[str], model: Optional[SentenceTransfor
297
  return [None] * len(texts)
298
 
299
  try:
300
- embeddings = model.encode(
301
- texts,
302
- batch_size=batch_size,
303
- normalize_embeddings=True,
304
- show_progress_bar=True,
305
- convert_to_numpy=True
306
- )
307
- return [emb for emb in embeddings]
 
 
 
 
 
 
 
 
 
 
308
  except Exception as e:
309
- print(f"Error generating batch embeddings: {e}")
310
  return [None] * len(texts)
311
 
312
 
 
25
  "vietnamese-sbert": "keepitreal/vietnamese-sbert-v2", # Vietnamese-specific (may require auth)
26
 
27
  # Very high quality models (1024+ dim) - Best accuracy but slower
28
+ "bge-m3": "BAAI/bge-m3", # Best for Vietnamese, 1024 dim, supports dense+sparse+multi-vector
29
  "multilingual-e5-large": "intfloat/multilingual-e5-large", # Very high quality, 1024 dim, large model
30
  "multilingual-e5-base": "intfloat/multilingual-e5-base", # High quality, 768 dim, balanced
31
 
 
35
  }
36
 
37
  # Default embedding model for Vietnamese (can be overridden via env var)
38
+ # Use bge-m3 as default - best for Vietnamese legal documents (1024 dim)
39
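+ # Alternative: EMBEDDING_MODEL=multilingual-e5-base (768 dim, good balance). NOTE(review): FALLBACK_MODEL_NAME below resolves to the "paraphrase-multilingual" entry, not multilingual-e5-base - confirm which fallback is intended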
40
  # Can be set via EMBEDDING_MODEL env var (supports both short names and full model paths)
41
  # Examples:
42
+ # - EMBEDDING_MODEL=bge-m3 (uses short name, recommended for Vietnamese)
43
  # - EMBEDDING_MODEL=multilingual-e5-base (uses short name)
44
  # - EMBEDDING_MODEL=intfloat/multilingual-e5-base (full path)
45
  # - EMBEDDING_MODEL=/path/to/local/model (local model path)
46
  # - EMBEDDING_MODEL=username/private-model (private HF model, requires HF_TOKEN)
47
  DEFAULT_MODEL_NAME = os.environ.get(
48
  "EMBEDDING_MODEL",
49
+ AVAILABLE_MODELS.get("bge-m3", "BAAI/bge-m3") # BGE-M3 is default, no fallback
50
  )
51
  FALLBACK_MODEL_NAME = AVAILABLE_MODELS.get("paraphrase-multilingual", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
52
 
 
270
  return None
271
 
272
  try:
273
+ import sys
274
+ # Increase recursion limit temporarily for model.encode
275
+ old_limit = sys.getrecursionlimit()
276
+ try:
277
+ sys.setrecursionlimit(5000) # Increase limit for model.encode
278
+ embedding = model.encode(text, normalize_embeddings=True, show_progress_bar=False, convert_to_numpy=True)
279
+ return embedding
280
+ finally:
281
+ sys.setrecursionlimit(old_limit) # Restore original limit
282
+ except RecursionError as e:
283
+ print(f"Error generating embedding (recursion): {e}", flush=True)
284
+ return None
285
  except Exception as e:
286
+ print(f"Error generating embedding: {e}", flush=True)
287
  return None
288
 
289
 
290
+ def generate_embeddings_batch(texts: List[str], model: Optional[SentenceTransformer] = None, batch_size: Optional[int] = None) -> List[Optional[np.ndarray]]:
291
+ # Get batch_size from env var or use default (balance speed and RAM)
292
+ # Smaller batch = faster, larger batch = more RAM usage
293
+ if batch_size is None:
294
+ batch_size = int(os.environ.get("EMBEDDING_BATCH_SIZE", "128")) # Reduced from 256 for speed
295
  """
296
  Generate embeddings for a batch of texts.
297
 
 
313
  return [None] * len(texts)
314
 
315
  try:
316
+ import sys
317
+ # Increase recursion limit temporarily for model.encode
318
+ old_limit = sys.getrecursionlimit()
319
+ try:
320
+ sys.setrecursionlimit(5000) # Increase limit for model.encode
321
+ embeddings = model.encode(
322
+ texts,
323
+ batch_size=batch_size,
324
+ normalize_embeddings=True,
325
+ show_progress_bar=False,
326
+ convert_to_numpy=True
327
+ )
328
+ return [emb for emb in embeddings]
329
+ finally:
330
+ sys.setrecursionlimit(old_limit) # Restore original limit
331
+ except RecursionError as e:
332
+ print(f"Error generating batch embeddings (recursion): {e}", flush=True)
333
+ return [None] * len(texts)
334
  except Exception as e:
335
+ print(f"Error generating batch embeddings: {e}", flush=True)
336
  return [None] * len(texts)
337
 
338