Shubham170793 committed on
Commit
abc5c49
·
verified ·
1 Parent(s): a27756e

Update src/qa.py

Browse files
Files changed (1) hide show
  1. src/qa.py +31 -24
src/qa.py CHANGED
@@ -76,55 +76,62 @@ try:
76
  except Exception as e:
77
  print(f"⚠️ Gen AI Hub setup failed: {e}")
78
  chat_llm = None
79
-
80
  # ==========================================================
81
- # 4️⃣ Embedding Cache Manager (Chunk-Aware)
82
  # ==========================================================
83
  CACHE_EMB_DIR = "/tmp/embed_cache"
84
  os.makedirs(CACHE_EMB_DIR, exist_ok=True)
85
 
86
  def _hash_name(file_name: str, chunk_size: int, overlap: int, num_chunks: int):
87
- """Generate unique hash based on file name and chunking config."""
88
  combo = f"{file_name}_{chunk_size}_{overlap}_{num_chunks}"
89
  return hashlib.md5(combo.encode()).hexdigest()[:8]
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
def cache_embeddings(file_name: str, chunks, embed_func, chunk_size: int = None, overlap: int = None):
    """
    Load cached embeddings if available for the same document and configuration.
    If not found, compute and store embeddings for reuse.

    Args:
        file_name: Source document path; only its basename appears in the cache file name.
        chunks: Sequence of text chunks to embed.
        embed_func: Callable mapping ``chunks`` to their embeddings.
        chunk_size: Chunking size used to build ``chunks``; defaults to 1000 when None.
        overlap: Chunk overlap used to build ``chunks``; defaults to 100 when None.

    Returns:
        The embeddings, either unpickled from the cache or freshly computed.
    """
    # Resolve defaults ONCE so the hash and the file name always agree.
    # Previously chunk_size=None hashed as 1000 but named the file "_csNone_",
    # so equivalent configurations produced different cache files.
    cs = chunk_size or 1000
    ov = overlap or 100
    base_name = os.path.basename(file_name)
    cache_key = _hash_name(file_name, cs, ov, len(chunks))
    cache_file = f"{base_name}_cs{cs}_ov{ov}_{cache_key}.pkl"
    cache_path = os.path.join(CACHE_EMB_DIR, cache_file)

    if os.path.exists(cache_path):
        print(f"🧠 Loaded cached embeddings for {os.path.basename(file_name)} ({chunk_size}/{overlap})")
        with open(cache_path, "rb") as f:
            # NOTE(review): pickle.load is only safe because this cache dir is
            # written solely by this process — confirm /tmp/embed_cache is not shared.
            return pickle.load(f)

    print(f"💡 No cache found for {os.path.basename(file_name)} ({chunk_size}/{overlap}). Generating new embeddings...")
    embeddings = embed_func(chunks)
    with open(cache_path, "wb") as f:
        pickle.dump(embeddings, f)
    print(f"💾 Cached embeddings saved as {cache_file}")
    return embeddings
111
 
112
def embed_chunks(chunks, batch_size=32):
    """
    Batch-encode text chunks for speed.

    NOTE(review): relies on module-level ``_query_model`` (an encoder with a
    sentence-transformers style ``encode``) and ``np`` — confirm both are
    defined earlier in the file.
    """
    embeddings = []
    start = 0
    while start < len(chunks):
        # The "passage: " prefix matches the encoder's expected input format.
        prefixed = [f"passage: {text}" for text in chunks[start:start + batch_size]]
        encoded = _query_model.encode(
            prefixed,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False
        )
        embeddings.extend(encoded)
        start += batch_size
    print(f"⚡ Embedded {len(embeddings)} chunks in batches of {batch_size}")
    return np.array(embeddings)
128
 
129
  # ==========================================================
130
  # 5️⃣ Prompt Templates
 
76
  except Exception as e:
77
  print(f"⚠️ Gen AI Hub setup failed: {e}")
78
  chat_llm = None
 
79
  # ==========================================================
80
+ # 4️⃣ Embedding Cache Manager (Chunk-Aware + Auto-Cleanup)
81
  # ==========================================================
82
  CACHE_EMB_DIR = "/tmp/embed_cache"
83
  os.makedirs(CACHE_EMB_DIR, exist_ok=True)
84
 
85
  def _hash_name(file_name: str, chunk_size: int, overlap: int, num_chunks: int):
86
+ """Generate unique short hash for a file + chunking configuration."""
87
  combo = f"{file_name}_{chunk_size}_{overlap}_{num_chunks}"
88
  return hashlib.md5(combo.encode()).hexdigest()[:8]
89
 
90
def _clean_old_caches(base_name: str, keep_latest: int = 3):
    """
    Retains only the latest few embedding cache files for a given document.
    Prevents /tmp from filling with redundant embeddings.

    Args:
        base_name: Basename prefix identifying the document's cache files.
        keep_latest: Number of most-recently-modified cache files to keep.
    """
    files = []
    for fname in os.listdir(CACHE_EMB_DIR):
        if not fname.startswith(base_name):
            continue
        try:
            # getmtime can race with a concurrent delete between listdir and
            # stat — skip files that vanished instead of crashing cleanup.
            files.append((os.path.getmtime(os.path.join(CACHE_EMB_DIR, fname)), fname))
        except OSError:
            continue
    if len(files) > keep_latest:
        files.sort(reverse=True)  # newest first
        for _, old_file in files[keep_latest:]:
            try:
                os.remove(os.path.join(CACHE_EMB_DIR, old_file))
                print(f"🧹 Removed old cache: {old_file}")
            except OSError:
                # Best-effort cleanup: a missing or locked file is not fatal.
                pass
108
+
109
def cache_embeddings(file_name: str, chunks, embed_func, chunk_size: int = None, overlap: int = None):
    """
    Loads cached embeddings if the same document+config already exists.
    Otherwise generates new embeddings and prunes older cache versions.

    Args:
        file_name: Source document path; only its basename appears in the cache file name.
        chunks: Sequence of text chunks to embed.
        embed_func: Callable mapping ``chunks`` to their embeddings.
        chunk_size: Chunking size used to build ``chunks``; defaults to 1000 when None.
        overlap: Chunk overlap used to build ``chunks``; defaults to 100 when None.

    Returns:
        The embeddings, either unpickled from the cache or freshly computed.
    """
    # Resolve defaults ONCE so the hash and the file name always agree.
    # Previously chunk_size=None hashed as 1000 but named the file "_csNone_",
    # so equivalent configurations produced different cache files.
    cs = chunk_size or 1000
    ov = overlap or 100
    base_name = os.path.basename(file_name)
    cache_key = _hash_name(file_name, cs, ov, len(chunks))
    cache_file = f"{base_name}_cs{cs}_ov{ov}_{cache_key}.pkl"
    cache_path = os.path.join(CACHE_EMB_DIR, cache_file)

    # --- Use existing cache if available ---
    if os.path.exists(cache_path):
        print(f"🧠 Loaded cached embeddings for {base_name} ({chunk_size}/{overlap})")
        with open(cache_path, "rb") as f:
            # NOTE(review): pickle.load is only safe because this cache dir is
            # written solely by this process — confirm /tmp/embed_cache is not shared.
            return pickle.load(f)

    # --- Otherwise compute embeddings and cache them ---
    print(f"💡 No cache found for {base_name} ({chunk_size}/{overlap}). Generating new embeddings...")
    embeddings = embed_func(chunks)
    with open(cache_path, "wb") as f:
        pickle.dump(embeddings, f)
    print(f"💾 Cached embeddings saved as {cache_file}")

    # --- Clean up older cache versions for this document ---
    _clean_old_caches(base_name, keep_latest=3)
    return embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
  # ==========================================================
137
  # 5️⃣ Prompt Templates