Update src/qa.py
Browse files
src/qa.py
CHANGED
|
@@ -76,55 +76,62 @@ try:
|
|
| 76 |
except Exception as e:
|
| 77 |
print(f"⚠️ Gen AI Hub setup failed: {e}")
|
| 78 |
chat_llm = None
|
| 79 |
-
|
| 80 |
# ==========================================================
|
| 81 |
-
# 4️⃣ Embedding Cache Manager (Chunk-Aware)
|
| 82 |
# ==========================================================
|
| 83 |
CACHE_EMB_DIR = "/tmp/embed_cache"
|
| 84 |
os.makedirs(CACHE_EMB_DIR, exist_ok=True)
|
| 85 |
|
| 86 |
def _hash_name(file_name: str, chunk_size: int, overlap: int, num_chunks: int):
|
| 87 |
-
"""Generate unique hash
|
| 88 |
combo = f"{file_name}_{chunk_size}_{overlap}_{num_chunks}"
|
| 89 |
return hashlib.md5(combo.encode()).hexdigest()[:8]
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
def cache_embeddings(file_name: str, chunks, embed_func, chunk_size: int = None, overlap: int = None):
|
| 92 |
"""
|
| 93 |
-
|
| 94 |
-
|
| 95 |
"""
|
| 96 |
cache_key = _hash_name(file_name, chunk_size or 1000, overlap or 100, len(chunks))
|
| 97 |
cache_file = f"{os.path.basename(file_name)}_cs{chunk_size}_ov{overlap}_{cache_key}.pkl"
|
| 98 |
cache_path = os.path.join(CACHE_EMB_DIR, cache_file)
|
|
|
|
| 99 |
|
|
|
|
| 100 |
if os.path.exists(cache_path):
|
| 101 |
-
print(f"🧠 Loaded cached embeddings for {
|
| 102 |
with open(cache_path, "rb") as f:
|
| 103 |
return pickle.load(f)
|
| 104 |
|
| 105 |
-
|
|
|
|
| 106 |
embeddings = embed_func(chunks)
|
| 107 |
with open(cache_path, "wb") as f:
|
| 108 |
pickle.dump(embeddings, f)
|
| 109 |
print(f"💾 Cached embeddings saved as {cache_file}")
|
| 110 |
-
return embeddings
|
| 111 |
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
"""
|
| 116 |
-
all_embeddings = []
|
| 117 |
-
for i in range(0, len(chunks), batch_size):
|
| 118 |
-
batch = [f"passage: {c}" for c in chunks[i:i+batch_size]]
|
| 119 |
-
batch_embs = _query_model.encode(
|
| 120 |
-
batch,
|
| 121 |
-
convert_to_numpy=True,
|
| 122 |
-
normalize_embeddings=True,
|
| 123 |
-
show_progress_bar=False
|
| 124 |
-
)
|
| 125 |
-
all_embeddings.extend(batch_embs)
|
| 126 |
-
print(f"⚡ Embedded {len(all_embeddings)} chunks in batches of {batch_size}")
|
| 127 |
-
return np.array(all_embeddings)
|
| 128 |
|
| 129 |
# ==========================================================
|
| 130 |
# 5️⃣ Prompt Templates
|
|
|
|
| 76 |
except Exception as e:
|
| 77 |
print(f"⚠️ Gen AI Hub setup failed: {e}")
|
| 78 |
chat_llm = None
|
|
|
|
| 79 |
# ==========================================================
# 4️⃣ Embedding Cache Manager (Chunk-Aware + Auto-Cleanup)
# ==========================================================
# Per-boot scratch directory; cached embeddings are fully re-derivable,
# so losing /tmp on reboot only costs a recompute.
CACHE_EMB_DIR = "/tmp/embed_cache"
os.makedirs(CACHE_EMB_DIR, exist_ok=True)  # idempotent: safe on repeat imports
|
| 84 |
|
| 85 |
def _hash_name(file_name: str, chunk_size: int, overlap: int, num_chunks: int):
|
| 86 |
+
"""Generate unique short hash for a file + chunking configuration."""
|
| 87 |
combo = f"{file_name}_{chunk_size}_{overlap}_{num_chunks}"
|
| 88 |
return hashlib.md5(combo.encode()).hexdigest()[:8]
|
| 89 |
|
| 90 |
+
def _clean_old_caches(base_name: str, keep_latest: int = 3):
    """
    Retains only the latest few embedding cache files for a given document.
    Prevents /tmp from filling with redundant embeddings.

    Args:
        base_name: Basename of the source document whose caches to prune.
        keep_latest: Number of most-recent cache files to preserve.
    """
    # Match only THIS document's cache files. Cache names are
    # "<base>_cs<size>_ov<overlap>_<hash>.pkl"; matching on the full
    # "<base>_cs" prefix prevents deleting caches of another document
    # whose name merely starts with base_name (e.g. "report" vs "report2").
    prefix = f"{base_name}_cs"
    files = []
    for fname in os.listdir(CACHE_EMB_DIR):
        if not (fname.startswith(prefix) and fname.endswith(".pkl")):
            continue
        try:
            mtime = os.path.getmtime(os.path.join(CACHE_EMB_DIR, fname))
        except OSError:
            # File vanished between listdir() and stat(); nothing to prune.
            continue
        files.append((mtime, fname))
    if len(files) > keep_latest:
        files.sort(reverse=True)  # newest first
        for _, old_file in files[keep_latest:]:
            try:
                os.remove(os.path.join(CACHE_EMB_DIR, old_file))
                print(f"🧹 Removed old cache: {old_file}")
            except OSError:
                # Best-effort cleanup: another process may have removed it.
                pass
|
| 108 |
+
|
| 109 |
def cache_embeddings(file_name: str, chunks, embed_func, chunk_size: int = None, overlap: int = None):
    """
    Loads cached embeddings if the same document+config already exists.
    Otherwise generates new embeddings and prunes older cache versions.

    Args:
        file_name: Path (or name) of the source document.
        chunks: Sequence of text chunks to embed.
        embed_func: Callable mapping the chunk list to its embeddings.
        chunk_size: Chunk size used to produce `chunks`; defaults to 1000.
        overlap: Chunk overlap used to produce `chunks`; defaults to 100.

    Returns:
        The embeddings for `chunks`, from cache when available.
    """
    # Normalize the defaults ONCE so the hash and the human-readable filename
    # agree — previously the filename could read "csNone_ovNone" while the
    # hash was computed from 1000/100.
    cs = chunk_size or 1000
    ov = overlap or 100
    cache_key = _hash_name(file_name, cs, ov, len(chunks))
    base_name = os.path.basename(file_name)
    cache_file = f"{base_name}_cs{cs}_ov{ov}_{cache_key}.pkl"
    cache_path = os.path.join(CACHE_EMB_DIR, cache_file)

    # --- Use existing cache if available ---
    if os.path.exists(cache_path):
        try:
            with open(cache_path, "rb") as f:
                embeddings = pickle.load(f)
            print(f"🧠 Loaded cached embeddings for {base_name} ({cs}/{ov})")
            return embeddings
        except (pickle.UnpicklingError, EOFError, OSError):
            # Corrupted or truncated cache: fall through and regenerate
            # instead of crashing the QA pipeline.
            print(f"⚠️ Cache for {base_name} unreadable; regenerating...")

    # --- Otherwise compute embeddings and cache them ---
    print(f"💡 No cache found for {base_name} ({cs}/{ov}). Generating new embeddings...")
    embeddings = embed_func(chunks)
    with open(cache_path, "wb") as f:
        pickle.dump(embeddings, f)
    print(f"💾 Cached embeddings saved as {cache_file}")

    # --- Clean up older cache versions for this document ---
    _clean_old_caches(base_name, keep_latest=3)
    return embeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
# ==========================================================
|
| 137 |
# 5️⃣ Prompt Templates
|