Shubham170793 committed on
Commit 3a56dbd · verified · 1 Parent(s): cd266a5

Update src/embeddings.py

Files changed (1)
  1. src/embeddings.py +33 -11
src/embeddings.py CHANGED
@@ -1,10 +1,13 @@
-# ----------------------------
-# Hugging Face cache bootstrap
-# ----------------------------
+# ==========================================================
+# 📘 embeddings.py - optimized for Hugging Face + FAISS + E5
+# ==========================================================
 import os
 import numpy as np
 from sentence_transformers import SentenceTransformer
 
+# ----------------------------
+# Hugging Face Cache Bootstrap
+# ----------------------------
 CACHE_DIR = "/tmp/hf_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
 
@@ -16,11 +19,11 @@ os.environ["HF_MODULES_CACHE"] = CACHE_DIR
 print(f"✅ Using Hugging Face cache at {CACHE_DIR}")
 
 # ----------------------------
-# Load embedding model once (with fallback)
+# Load Embedding Model (E5 with fallback)
 # ----------------------------
 try:
     _model = SentenceTransformer(
-        "intfloat/e5-small-v2",  # ✅ Better for document QA retrieval
+        "intfloat/e5-small-v2",  # ✅ Trained for retrieval-augmented QA
         cache_folder=CACHE_DIR
     )
     print("✅ Loaded model: intfloat/e5-small-v2")
@@ -33,20 +36,39 @@ except Exception as e:
     print("✅ Loaded fallback model: all-MiniLM-L6-v2")
 
 # ----------------------------
-# Function: generate embeddings
+# Function: Generate Embeddings
 # ----------------------------
 def generate_embeddings(chunks: list) -> list:
     """
-    Generate normalized embeddings for a list of text chunks.
-    Normalization improves FAISS retrieval accuracy (cosine-based).
+    📌 Generate normalized embeddings for a list of text chunks.
+
+    Args:
+        chunks (list): List of text chunks.
+
+    Returns:
+        list: List of normalized embedding vectors (Python lists).
+
+    Notes:
+        - Prefixing chunks with 'passage:' improves retrieval accuracy for E5.
+        - normalize_embeddings=True ensures cosine-similarity consistency.
+        - Works efficiently even for large PDFs.
     """
-    # Add semantic prefix for e5 model to help it distinguish queries vs passages
+    if not chunks:
+        print("⚠️ No chunks provided for embedding generation.")
+        return []
+
+    # Step 1: Prefix each chunk for semantic clarity (per E5 training)
     prepared_chunks = [f"passage: {chunk.strip()}" for chunk in chunks]
 
+    # Step 2: Encode with normalization for cosine similarity
     vectors = _model.encode(
         prepared_chunks,
         convert_to_numpy=True,
-        normalize_embeddings=True  # ✅ ensures better FAISS matching
+        normalize_embeddings=True  # ✅ Makes FAISS IndexFlatIP accurate
    )
 
-    return vectors.tolist()
+    # Step 3: Convert to Python list for FAISS / JSON compatibility
+    embeddings = vectors.tolist()
+
+    print(f"✅ Generated {len(embeddings)} embeddings.")
+    return embeddings
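
Note: the second hunk header shows os.environ["HF_MODULES_CACHE"] = CACHE_DIR as context, so the cache variables are exported before any model download. The surrounding env-var block (old lines 11-15) is elided from this diff; the sketch below is only a typical shape for such a block, not the commit's actual lines.

# Illustrative sketch of the elided cache bootstrap (assumed, not from this commit).
# These assignments must happen before sentence_transformers downloads anything,
# so that every cache write lands in the writable /tmp/hf_cache directory.
os.environ["HF_HOME"] = CACHE_DIR
os.environ["SENTENCE_TRANSFORMERS_HOME"] = CACHE_DIR
os.environ["HF_MODULES_CACHE"] = CACHE_DIR  # the one assignment visible in the hunk header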
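
Note: the "passage: " prefix in generate_embeddings() is one half of E5's asymmetric scheme; at query time the model expects a "query: " prefix. A minimal query-side sketch, assuming it sits next to _model in src/embeddings.py; generate_query_embedding is a hypothetical helper, not part of this commit.

# Hypothetical counterpart to generate_embeddings() (not in this commit).
# E5 was trained with "query: " on search queries and "passage: " on documents,
# so both sides must be prefixed consistently for retrieval to work well.
def generate_query_embedding(query: str) -> list:
    vector = _model.encode(
        [f"query: {query.strip()}"],
        convert_to_numpy=True,
        normalize_embeddings=True,  # keep query vectors on the unit sphere too
    )
    return vector[0].tolist()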
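
Note: normalize_embeddings=True matters because faiss.IndexFlatIP ranks by inner product, and on unit-length vectors inner product equals cosine similarity. A minimal end-to-end sketch under that assumption; the import path and sample texts are illustrative, and it reuses the hypothetical generate_query_embedding sketched above.

import numpy as np
import faiss  # assumed dependency; FAISS itself is not touched by this commit

# Illustrative import path; generate_query_embedding is the hypothetical helper above.
from src.embeddings import generate_embeddings, generate_query_embedding

chunks = ["FAISS indexes dense vectors.", "E5 produces the embeddings."]
passages = np.asarray(generate_embeddings(chunks), dtype="float32")

index = faiss.IndexFlatIP(passages.shape[1])  # inner product == cosine on normalized vectors
index.add(passages)

query = np.asarray([generate_query_embedding("what indexes the vectors?")], dtype="float32")
scores, ids = index.search(query, 1)
print(chunks[ids[0][0]], float(scores[0][0]))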