Update src/qa.py
Browse files
src/qa.py
CHANGED
|
@@ -76,55 +76,62 @@ try:
|
|
| 76 |
except Exception as e:
|
| 77 |
print(f"⚠️ Gen AI Hub setup failed: {e}")
|
| 78 |
chat_llm = None
|
| 79 |
-
|
| 80 |
# ==========================================================
|
| 81 |
-
# 4️⃣ Embedding Cache Manager (Chunk-Aware)
|
| 82 |
# ==========================================================
|
| 83 |
CACHE_EMB_DIR = "/tmp/embed_cache"
|
| 84 |
os.makedirs(CACHE_EMB_DIR, exist_ok=True)
|
| 85 |
|
| 86 |
def _hash_name(file_name: str, chunk_size: int, overlap: int, num_chunks: int):
|
| 87 |
-
"""Generate unique hash
|
| 88 |
combo = f"{file_name}_{chunk_size}_{overlap}_{num_chunks}"
|
| 89 |
return hashlib.md5(combo.encode()).hexdigest()[:8]
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
def cache_embeddings(file_name: str, chunks, embed_func, chunk_size: int = None, overlap: int = None):
|
| 92 |
"""
|
| 93 |
-
|
| 94 |
-
|
| 95 |
"""
|
| 96 |
cache_key = _hash_name(file_name, chunk_size or 1000, overlap or 100, len(chunks))
|
| 97 |
cache_file = f"{os.path.basename(file_name)}_cs{chunk_size}_ov{overlap}_{cache_key}.pkl"
|
| 98 |
cache_path = os.path.join(CACHE_EMB_DIR, cache_file)
|
|
|
|
| 99 |
|
|
|
|
| 100 |
if os.path.exists(cache_path):
|
| 101 |
-
print(f"🧠 Loaded cached embeddings for {
|
| 102 |
with open(cache_path, "rb") as f:
|
| 103 |
return pickle.load(f)
|
| 104 |
|
| 105 |
-
|
|
|
|
| 106 |
embeddings = embed_func(chunks)
|
| 107 |
with open(cache_path, "wb") as f:
|
| 108 |
pickle.dump(embeddings, f)
|
| 109 |
print(f"💾 Cached embeddings saved as {cache_file}")
|
| 110 |
-
return embeddings
|
| 111 |
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
"""
|
| 116 |
-
all_embeddings = []
|
| 117 |
-
for i in range(0, len(chunks), batch_size):
|
| 118 |
-
batch = [f"passage: {c}" for c in chunks[i:i+batch_size]]
|
| 119 |
-
batch_embs = _query_model.encode(
|
| 120 |
-
batch,
|
| 121 |
-
convert_to_numpy=True,
|
| 122 |
-
normalize_embeddings=True,
|
| 123 |
-
show_progress_bar=False
|
| 124 |
-
)
|
| 125 |
-
all_embeddings.extend(batch_embs)
|
| 126 |
-
print(f"⚡ Embedded {len(all_embeddings)} chunks in batches of {batch_size}")
|
| 127 |
-
return np.array(all_embeddings)
|
| 128 |
|
| 129 |
# ==========================================================
|
| 130 |
# 5️⃣ Prompt Templates
|
|
|
|
| 76 |
except Exception as e:
|
| 77 |
print(f"⚠️ Gen AI Hub setup failed: {e}")
|
| 78 |
chat_llm = None
|
|
|
|
| 79 |
# ==========================================================
# 4️⃣ Embedding Cache Manager (Chunk-Aware + Auto-Cleanup)
# ==========================================================
# Per-boot scratch directory; cached embeddings are fully re-derivable,
# so losing /tmp on reboot only costs a recompute.
CACHE_EMB_DIR = "/tmp/embed_cache"
os.makedirs(CACHE_EMB_DIR, exist_ok=True)  # idempotent: safe on repeat imports
|
| 84 |
|
| 85 |
def _hash_name(file_name: str, chunk_size: int, overlap: int, num_chunks: int):
|
| 86 |
+
"""Generate unique short hash for a file + chunking configuration."""
|
| 87 |
combo = f"{file_name}_{chunk_size}_{overlap}_{num_chunks}"
|
| 88 |
return hashlib.md5(combo.encode()).hexdigest()[:8]
|
| 89 |
|
| 90 |
+
def _clean_old_caches(base_name: str, keep_latest: int = 3):
    """
    Retains only the latest few embedding cache files for a given document.
    Prevents /tmp from filling with redundant embeddings.

    Args:
        base_name: Basename of the source document whose caches to prune.
        keep_latest: Number of most-recent cache files to preserve.
    """
    # Match only THIS document's cache files. Cache names are
    # "<base>_cs<size>_ov<overlap>_<hash>.pkl"; matching on the full
    # "<base>_cs" prefix prevents deleting caches of another document
    # whose name merely starts with base_name (e.g. "report" vs "report2").
    prefix = f"{base_name}_cs"
    files = []
    for fname in os.listdir(CACHE_EMB_DIR):
        if not (fname.startswith(prefix) and fname.endswith(".pkl")):
            continue
        try:
            mtime = os.path.getmtime(os.path.join(CACHE_EMB_DIR, fname))
        except OSError:
            # File vanished between listdir() and stat(); nothing to prune.
            continue
        files.append((mtime, fname))
    if len(files) > keep_latest:
        files.sort(reverse=True)  # newest first
        for _, old_file in files[keep_latest:]:
            try:
                os.remove(os.path.join(CACHE_EMB_DIR, old_file))
                print(f"🧹 Removed old cache: {old_file}")
            except OSError:
                # Best-effort cleanup: another process may have removed it.
                pass
|
| 108 |
+
|
| 109 |
def cache_embeddings(file_name: str, chunks, embed_func, chunk_size: int = None, overlap: int = None):
    """
    Loads cached embeddings if the same document+config already exists.
    Otherwise generates new embeddings and prunes older cache versions.

    Args:
        file_name: Path (or name) of the source document.
        chunks: Sequence of text chunks to embed.
        embed_func: Callable mapping the chunk list to its embeddings.
        chunk_size: Chunk size used to produce `chunks`; defaults to 1000.
        overlap: Chunk overlap used to produce `chunks`; defaults to 100.

    Returns:
        The embeddings for `chunks`, from cache when available.
    """
    # Normalize the defaults ONCE so the hash and the human-readable filename
    # agree — previously the filename could read "csNone_ovNone" while the
    # hash was computed from 1000/100.
    cs = chunk_size or 1000
    ov = overlap or 100
    cache_key = _hash_name(file_name, cs, ov, len(chunks))
    base_name = os.path.basename(file_name)
    cache_file = f"{base_name}_cs{cs}_ov{ov}_{cache_key}.pkl"
    cache_path = os.path.join(CACHE_EMB_DIR, cache_file)

    # --- Use existing cache if available ---
    if os.path.exists(cache_path):
        try:
            with open(cache_path, "rb") as f:
                embeddings = pickle.load(f)
            print(f"🧠 Loaded cached embeddings for {base_name} ({cs}/{ov})")
            return embeddings
        except (pickle.UnpicklingError, EOFError, OSError):
            # Corrupted or truncated cache: fall through and regenerate
            # instead of crashing the QA pipeline.
            print(f"⚠️ Cache for {base_name} unreadable; regenerating...")

    # --- Otherwise compute embeddings and cache them ---
    print(f"💡 No cache found for {base_name} ({cs}/{ov}). Generating new embeddings...")
    embeddings = embed_func(chunks)
    with open(cache_path, "wb") as f:
        pickle.dump(embeddings, f)
    print(f"💾 Cached embeddings saved as {cache_file}")

    # --- Clean up older cache versions for this document ---
    _clean_old_caches(base_name, keep_latest=3)
    return embeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
# ==========================================================
|
| 137 |
# 5️⃣ Prompt Templates
|