Shubham170793 committed on
Commit
fce2fdd
Β·
verified Β·
1 Parent(s): a5483fe

Update src/qa.py

Browse files
Files changed (1) hide show
  1. src/qa.py +15 -11
src/qa.py CHANGED
@@ -3,7 +3,7 @@ qa.py β€” GPT-4o (SAP Gen AI Hub) + ReRank Retrieval
3
  --------------------------------------------------
4
  βœ… Semantic retrieval (FAISS + cosine re-rank + neighbor fill)
5
  βœ… Bullet-aware similarity boost for procedural chunks
6
- βœ… Embedding caching (per PDF)
7
  βœ… Smart factual mode (fast)
8
  βœ… Deep reasoning mode (ChatGPT-like)
9
  βœ… genai_generate() helper for suggestions
@@ -78,31 +78,35 @@ except Exception as e:
78
  chat_llm = None
79
 
80
# ==========================================================
# 4️⃣ Embedding Cache Manager
# ==========================================================
CACHE_EMB_DIR = "/tmp/embed_cache"
os.makedirs(CACHE_EMB_DIR, exist_ok=True)


def _hash_name(file_name: str):
    """Return the MD5 hex digest of the PDF file name, used as the cache key."""
    digest = hashlib.md5(file_name.encode())
    return digest.hexdigest()


def cache_embeddings(file_name: str, chunks, embed_func):
    """
    Checks if cached embeddings exist for a PDF; if not, compute and save.

    Returns the pickled embeddings when a cache file for this document name
    already exists; otherwise calls ``embed_func(chunks)``, persists the
    result under ``CACHE_EMB_DIR``, and returns it.
    """
    pkl_path = os.path.join(CACHE_EMB_DIR, f"{_hash_name(file_name)}.pkl")

    # Cache hit: reuse the previously computed embeddings.
    if os.path.exists(pkl_path):
        print(f"🧠 Loaded cached embeddings for {file_name}")
        with open(pkl_path, "rb") as fh:
            return pickle.load(fh)

    # Cache miss: compute, persist for next time, then return.
    print(f"πŸ’‘ No cache found for {file_name}. Generating embeddings...")
    result = embed_func(chunks)
    with open(pkl_path, "wb") as fh:
        pickle.dump(result, fh)
    print(f"πŸ’Ύ Cached embeddings saved for {file_name}")
    return result
107
 
108
  def embed_chunks(chunks, batch_size=32):
 
3
  --------------------------------------------------
4
  βœ… Semantic retrieval (FAISS + cosine re-rank + neighbor fill)
5
  βœ… Bullet-aware similarity boost for procedural chunks
6
+ βœ… Embedding caching (per PDF + chunk config aware)
7
  βœ… Smart factual mode (fast)
8
  βœ… Deep reasoning mode (ChatGPT-like)
9
  βœ… genai_generate() helper for suggestions
 
78
  chat_llm = None
79
 
80
# ==========================================================
# 4️⃣ Embedding Cache Manager (Chunk-Aware)
# ==========================================================
CACHE_EMB_DIR = "/tmp/embed_cache"
os.makedirs(CACHE_EMB_DIR, exist_ok=True)

# Defaults applied when the caller does not pass a chunking configuration.
_DEFAULT_CHUNK_SIZE = 1000
_DEFAULT_OVERLAP = 100


def _hash_name(file_name: str, chunk_size: int, overlap: int, num_chunks: int):
    """Generate a short unique hash based on file name and chunking config."""
    combo = f"{file_name}_{chunk_size}_{overlap}_{num_chunks}"
    return hashlib.md5(combo.encode()).hexdigest()[:8]


def cache_embeddings(file_name: str, chunks, embed_func, chunk_size=None, overlap=None):
    """
    Load cached embeddings if available for the same document and configuration.
    If not found, compute and store embeddings for reuse.

    Args:
        file_name: Path or name of the source document.
        chunks: Sequence of text chunks to embed.
        embed_func: Callable mapping ``chunks`` to embeddings.
        chunk_size: Chunk size used to produce ``chunks`` (default 1000).
        overlap: Chunk overlap used to produce ``chunks`` (default 100).

    Returns:
        The embeddings, loaded from cache or freshly computed.
    """
    # Normalize the config ONCE so hash and file name always agree.
    # Bug fixed: previously the hash used defaulted values while the file
    # name embedded the raw ones, so chunk_size=None vs chunk_size=1000
    # produced different cache files (e.g. "..._csNone_ovNone_...") for
    # identical content — a guaranteed cache miss on every such call.
    # `is not None` (rather than `or`) also keeps an explicit 0 intact.
    cs = chunk_size if chunk_size is not None else _DEFAULT_CHUNK_SIZE
    ov = overlap if overlap is not None else _DEFAULT_OVERLAP

    base = os.path.basename(file_name)
    cache_key = _hash_name(file_name, cs, ov, len(chunks))
    cache_file = f"{base}_cs{cs}_ov{ov}_{cache_key}.pkl"
    cache_path = os.path.join(CACHE_EMB_DIR, cache_file)

    if os.path.exists(cache_path):
        print(f"🧠 Loaded cached embeddings for {base} ({cs}/{ov})")
        with open(cache_path, "rb") as f:
            return pickle.load(f)

    print(f"πŸ’‘ No cache found for {base} ({cs}/{ov}). Generating new embeddings...")
    embeddings = embed_func(chunks)
    with open(cache_path, "wb") as f:
        pickle.dump(embeddings, f)
    print(f"πŸ’Ύ Cached embeddings saved as {cache_file}")
    return embeddings
111
 
112
  def embed_chunks(chunks, batch_size=32):