Shubham170793 committed on
Commit 197e569 · verified · 1 Parent(s): f384f96

Update src/qa.py

Files changed (1)
  1. src/qa.py +120 -116
src/qa.py CHANGED
@@ -1,11 +1,10 @@
  """
- qa.py — Phi-2 Hybrid (Fast + Reasoning) with Rerank & Similarity Filtering
- --------------------------------------------------------------------------
- ✅ Optimized for Hugging Face Spaces & Streamlit
- intfloat/e5-small-v2 for embeddings
- microsoft/phi-2 for generation (fast CPU-optimized)
- Re-ranking + minimum similarity threshold for clean retrieval
- ✅ reasoning_mode toggle for deeper answers
  """

  import os
@@ -15,10 +14,10 @@ from sklearn.metrics.pairwise import cosine_similarity
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
  import torch

- print("✅ qa.py (Phi-2 Hybrid + Rerank + Similarity Filter) loaded from:", __file__)

  # ==========================================================
- # 1️⃣ Hugging Face Cache Setup
  # ==========================================================
  CACHE_DIR = "/tmp/hf_cache"
  os.makedirs(CACHE_DIR, exist_ok=True)
@@ -31,169 +30,174 @@ os.environ.update({
  print(f"✅ Using Hugging Face cache at {CACHE_DIR}")

  # ==========================================================
- # 2️⃣ Embedding Model (E5-small-v2)
  # ==========================================================
  try:
      _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
      print("✅ Loaded embedding model: intfloat/e5-small-v2")
  except Exception as e:
-     print(f"⚠️ Embedding model load failed ({e}), falling back to MiniLM.")
      _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)

  # ==========================================================
- # 3️⃣ LLM Setup — Phi-2 (Fast)
  # ==========================================================
  MODEL_NAME = "microsoft/phi-2"
  print(f"✅ Loading LLM: {MODEL_NAME}")

- try:
-     _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
-     _model = AutoModelForCausalLM.from_pretrained(
-         MODEL_NAME,
-         cache_dir=CACHE_DIR,
-         torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
-         low_cpu_mem_usage=True
-     )
-
-     _answer_model = pipeline(
-         "text-generation",
-         model=_model,
-         tokenizer=_tokenizer,
-         device_map="auto"
-     )
-     print("✅ Phi-2 text-generation pipeline ready.")
- except Exception as e:
-     print(f"⚠️ Phi-2 load failed: {e}")
-     _answer_model = None

  # ==========================================================
- # 4️⃣ Prompt Templates
  # ==========================================================
  STRICT_PROMPT = (
-     "You are an assistant for enterprise documentation.\n"
-     "Answer the question based ONLY on the context below.\n"
-     "If the answer is not in the context, reply exactly:\n"
      "'I don't know based on the provided document.'\n\n"
      "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
  )

  REASONING_PROMPT = (
-     "You are an expert enterprise assistant.\n"
-     "Carefully reason about the following context and provide a detailed, step-by-step answer.\n"
-     "If the context does not provide enough information, you may make cautious inferences based on logical reasoning.\n"
-     "However, always note when you are inferring beyond the text.\n\n"
-     "Context:\n{context}\n\nQuestion: {query}\n\nReasoning and Answer:"
  )

  # ==========================================================
- # 5️⃣ Retrieve Chunks FAISS + Re-rank + Similarity Filter
  # ==========================================================
- def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5, min_similarity: float = 0.6):
-     """
-     Retrieves top-K relevant chunks with re-ranking and similarity threshold filtering.
-     Steps:
-     1️⃣ Use FAISS to get approximate top candidates.
-     2️⃣ Re-rank them by cosine similarity with the query.
-     3️⃣ Filter out low-similarity chunks below min_similarity.
-     """
      if not index or not chunks:
          return []

-     try:
-         # --- Encode query ---
-         q_emb = _query_model.encode(
-             [f"query: {query.strip()}"],
-             convert_to_numpy=True,
-             normalize_embeddings=True
-         )[0]
-
-         # --- FAISS initial retrieval ---
-         distances, indices = index.search(np.array([q_emb]).astype("float32"), top_k * 3)
-         retrieved = [chunks[i] for i in indices[0]]
-
-         # --- Compute re-ranking similarity scores ---
-         doc_embs = _query_model.encode(
-             [f"passage: {c}" for c in retrieved],
-             convert_to_numpy=True,
-             normalize_embeddings=True
-         )
-         sims = cosine_similarity([q_emb], doc_embs)[0]
-
-         # --- Combine and sort by similarity ---
-         scored = sorted(zip(retrieved, sims), key=lambda x: x[1], reverse=True)
-
-         # --- Apply minimum similarity filter ---
-         filtered = [(chunk, score) for chunk, score in scored if score >= min_similarity]
-
-         # --- Select final top_k results ---
-         final_chunks = [chunk for chunk, _ in filtered[:top_k]]
-
-         print(f"✅ Retrieved {len(final_chunks)} chunks (min sim={min_similarity})")
-         return final_chunks
-
-     except Exception as e:
-         print(f"⚠️ Retrieval error: {e}")
-         return []

  # ==========================================================
- # 6️⃣ Answer Generation (Fast / Reasoning Hybrid)
  # ==========================================================
  def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = False):
-     """
-     Generates concise or reasoning-rich answers using Phi-2.
-     reasoning_mode=True → longer, more explanatory (slower)
-     reasoning_mode=False → short factual (fast)
-     """
      if not retrieved_chunks:
          return "Sorry, I couldn’t find relevant information in the document."

      context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
-     prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(context=context, query=query)

      try:
          result = _answer_model(
              prompt,
-             max_new_tokens=200 if reasoning_mode else 120,
-             temperature=0.6 if reasoning_mode else 0.2,
              do_sample=reasoning_mode,
              pad_token_id=_tokenizer.eos_token_id,
          )
-         answer = result[0]["generated_text"].strip()
-
-         if "Answer:" in answer:
-             answer = answer.split("Answer:")[-1].strip()
-
-         return answer
-
      except Exception as e:
          print(f"⚠️ Generation failed: {e}")
          return "⚠️ Error: Could not generate an answer."

  # ==========================================================
- # 7️⃣ Local Test (Optional)
  # ==========================================================
  if __name__ == "__main__":
      from vectorstore import build_faiss_index
-     import faiss

      dummy_chunks = [
          "Step 1: Open the dashboard and navigate to reports.",
          "Step 2: Click 'Export' to download a CSV summary.",
-         "Step 3: Review the generated report in your downloads folder."
      ]
-
      embeddings = [
-         _query_model.encode([f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True)[0]
-         for chunk in dummy_chunks
      ]

-     dim = embeddings[0].shape[0]
-     index = faiss.IndexFlatL2(dim)
-     index.add(np.array(embeddings).astype("float32"))
-
-     query = "How to export a report?"
-     retrieved = retrieve_chunks(query, index, dummy_chunks, top_k=3, min_similarity=0.6)
-
-     print("\n🔍 Retrieved chunks:", retrieved)
-     print("\n💬 FAST Answer:", generate_answer(query, retrieved, reasoning_mode=False))
-     print("\n🧠 REASONING Answer:", generate_answer(query, retrieved, reasoning_mode=True))
  """
+ qa.py — Phi-2 FAST + RERANKED RETRIEVAL
+ --------------------------------------
+ Uses:
+ intfloat/e5-small-v2 embeddings
+ microsoft/phi-2 generation
+ Optimized for: speed, factual accuracy, and semantic retrieval on Hugging Face Spaces
  """

  import os

  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
  import torch

+ print("✅ qa.py (Phi-2 FAST + ReRank) loaded from:", __file__)

  # ==========================================================
+ # 1️⃣ Cache Setup (Hugging Face /tmp cache)
  # ==========================================================
  CACHE_DIR = "/tmp/hf_cache"
  os.makedirs(CACHE_DIR, exist_ok=True)

  print(f"✅ Using Hugging Face cache at {CACHE_DIR}")

  # ==========================================================
+ # 2️⃣ Embedding Model
  # ==========================================================
  try:
      _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
      print("✅ Loaded embedding model: intfloat/e5-small-v2")
  except Exception as e:
+     print(f"⚠️ Embedding load failed ({e}), falling back to MiniLM")
      _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
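
# Note: e5 models are trained with "query: "/"passage: " input prefixes, so text
# should be encoded with the matching prefix (as retrieve_chunks and the local
# test below do); the MiniLM fallback simply treats those prefixes as plain text.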

  # ==========================================================
+ # 3️⃣ Phi-2 LLM Setup
  # ==========================================================
  MODEL_NAME = "microsoft/phi-2"
  print(f"✅ Loading LLM: {MODEL_NAME}")

+ _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
+ _model = AutoModelForCausalLM.from_pretrained(
+     MODEL_NAME,
+     cache_dir=CACHE_DIR,
+     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.bfloat16,
+     low_cpu_mem_usage=True,
+ ).to("cpu")
+
+ _answer_model = pipeline(
+     "text-generation",
+     model=_model,
+     tokenizer=_tokenizer,
+     device=-1,
+     model_kwargs={"torch_dtype": torch.bfloat16, "low_cpu_mem_usage": True},
+ )
+ print("✅ Phi-2 text-generation pipeline ready (optimized).")
 
 
 

  # ==========================================================
+ # 4️⃣ Prompt Templates
  # ==========================================================
  STRICT_PROMPT = (
+     "You are an enterprise documentation assistant.\n"
+     "Answer factually using ONLY the context below.\n"
+     "If the answer isn’t present, reply exactly:\n"
      "'I don't know based on the provided document.'\n\n"
      "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
  )

  REASONING_PROMPT = (
+     "You are an expert enterprise assistant with reasoning ability.\n"
+     "Think carefully about the context and question.\n"
+     "Use world knowledge and inference if necessary, but prefer factual accuracy.\n"
+     "If the document lacks the answer, say:\n"
+     "'I don't know based on the provided document.'\n\n"
+     "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
  )

  # ==========================================================
+ # 5️⃣ Retrieve Chunks (FAISS + Rerank + Neighbor Expansion)
  # ==========================================================
+ def retrieve_chunks(
+     query: str,
+     index,
+     chunks: list,
+     top_k: int = 3,
+     topn_candidates: int = 20,
+     neighbor_threshold: float = 0.68,
+     expansion_window: int = 1,
+     max_context_chunks: int = 6,
+ ):
+     """Retrieve semantically relevant chunks with reranking and neighbor expansion."""
      if not index or not chunks:
          return []

+     # 1️⃣ Encode query (normalized)
+     query_emb = _query_model.encode(
+         [f"query: {query.strip()}"],
+         convert_to_numpy=True,
+         normalize_embeddings=True
+     )[0].astype("float32")
+
+     # 2️⃣ FAISS search (initial candidates)
+     topn_candidates = min(topn_candidates, getattr(index, "ntotal", topn_candidates))
+     _, candidate_ids = index.search(np.array([query_emb]).astype("float32"), topn_candidates)
+     candidate_ids = [int(i) for i in candidate_ids[0] if i != -1]
+
+     # 3️⃣ Re-encode candidate chunks (with the e5 "passage: " prefix, matching
+     # how the index embeddings are built) and compute cosine similarities
+     candidate_texts = [chunks[i] for i in candidate_ids]
+     candidate_vecs = np.array([
+         _query_model.encode([f"passage: {t}"], convert_to_numpy=True, normalize_embeddings=True)[0]
+         for t in candidate_texts
+     ])
+     sims = cosine_similarity([query_emb], candidate_vecs)[0]
+     sorted_idx = np.argsort(sims)[::-1]
+     reranked_ids = [candidate_ids[i] for i in sorted_idx]
+
+     # 4️⃣ Select top-k base chunks
+     selected, selected_set = [], set()
+     for rid in reranked_ids:
+         if len(selected) >= top_k:
+             break
+         selected.append(rid)
+         selected_set.add(rid)
+
+     # 5️⃣ Conditional neighbor expansion
+     final_order = list(selected)
+     for base_id in selected:
+         if len(final_order) >= max_context_chunks:
+             break
+         for offset in range(1, expansion_window + 1):
+             for neighbor in (base_id - offset, base_id + offset):
+                 if neighbor < 0 or neighbor >= len(chunks) or neighbor in selected_set:
+                     continue
+                 # Check semantic closeness
+                 neighbor_vec = _query_model.encode([f"passage: {chunks[neighbor]}"], convert_to_numpy=True, normalize_embeddings=True)[0]
+                 sim = float(cosine_similarity([query_emb], [neighbor_vec])[0][0])
+                 if sim >= neighbor_threshold:
+                     final_order.append(neighbor)
+                     selected_set.add(neighbor)
+                 if len(final_order) >= max_context_chunks:
+                     break
+             if len(final_order) >= max_context_chunks:
+                 break
+
+     return [chunks[i] for i in final_order]
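
# Illustrative call, assuming an index and chunk list built as in the local test
# below: retrieve_chunks("How do I export a report?", index, chunks) returns up
# to max_context_chunks strings reranked by cosine similarity, with adjacent
# chunks appended only when they clear neighbor_threshold.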

  # ==========================================================
+ # 6️⃣ Answer Generation
  # ==========================================================
  def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = False):
+     """Generate concise, factual or reasoning-based answers using Phi-2."""
      if not retrieved_chunks:
          return "Sorry, I couldn’t find relevant information in the document."

      context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
+     prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(
+         context=context, query=query
+     )

      try:
          result = _answer_model(
              prompt,
+             max_new_tokens=180 if reasoning_mode else 120,
+             temperature=0.6 if reasoning_mode else 0.3,
              do_sample=reasoning_mode,
+             early_stopping=True,
              pad_token_id=_tokenizer.eos_token_id,
          )
+         text = result[0]["generated_text"].strip()
+         return text.split("Answer:")[-1].strip() if "Answer:" in text else text
      except Exception as e:
          print(f"⚠️ Generation failed: {e}")
          return "⚠️ Error: Could not generate an answer."

  # ==========================================================
+ # 7️⃣ Local Test
  # ==========================================================
  if __name__ == "__main__":
      from vectorstore import build_faiss_index

      dummy_chunks = [
          "Step 1: Open the dashboard and navigate to reports.",
          "Step 2: Click 'Export' to download a CSV summary.",
+         "Step 3: Review the generated report in your downloads folder.",
+         "Appendix: Communication user creation steps are explained later in this guide."
      ]

      embeddings = [
+         _query_model.encode([f"passage: {c}"], convert_to_numpy=True, normalize_embeddings=True)[0]
+         for c in dummy_chunks
      ]
+     index = build_faiss_index(embeddings)

+     query = "How do I create a communication user?"
+     retrieved = retrieve_chunks(query, index, dummy_chunks)
+     print("🔍 Retrieved:", retrieved)
+     print("💬 Answer:", generate_answer(query, retrieved))