"""
qa.py - Phi-2 FAST + ReRank (stable) - prefer semantic ranking, neighbor-fill last-resort
---------------------------------------------------------------------------------------
- Uses intfloat/e5-small-v2 for embeddings
- Uses microsoft/phi-2 for generation
- Re-ranks candidate pool from FAISS then picks top_k by true cosine similarity
- Neighbor expansion only if not enough high-sim items
- Logs chunk indices + similarity scores for debugging
"""
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
print("β
qa.py (Phi-2 FAST + ReRank stable) loaded from:", __file__)
# ---------------------------
# Cache
# ---------------------------
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ.update({
"HF_HOME": CACHE_DIR,
"TRANSFORMERS_CACHE": CACHE_DIR,
"HF_DATASETS_CACHE": CACHE_DIR,
"HF_MODULES_CACHE": CACHE_DIR
})
print(f"β
Using Hugging Face cache at {CACHE_DIR}")
# ---------------------------
# Embeddings
# ---------------------------
try:
    _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
    print("✅ Loaded embedding model: intfloat/e5-small-v2")
except Exception as e:
    print(f"⚠️ Embedding load failed ({e}), falling back to MiniLM")
    _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
# ---------------------------
# Phi-2 model
# ---------------------------
MODEL_NAME = "microsoft/phi-2"
print(f"β
Loading LLM: {MODEL_NAME}")
_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    cache_dir=CACHE_DIR,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.bfloat16,
    low_cpu_mem_usage=True,
).to("cpu")
_answer_model = pipeline(
"text-generation",
model=_model,
tokenizer=_tokenizer,
device=-1,
model_kwargs={"torch_dtype": torch.bfloat16, "low_cpu_mem_usage": True},
)
print("β
Phi-2 text-generation pipeline ready (optimized).")
# ---------------------------
# Prompts
# ---------------------------
STRICT_PROMPT = (
"You are an enterprise documentation assistant.\n"
"Use ONLY the CONTEXT chunks below to answer the QUESTION.\n"
"Cite the chunk number(s) you used, e.g. [Chunk 3].\n"
"If the document does not contain the answer, reply exactly:\n"
"\"I don't know based on the provided document.\"\n\n"
"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
)
REASONING_PROMPT = (
"You are an expert enterprise assistant with reasoning capacity.\n"
"Prefer the provided CONTEXT but you may cautiously infer when reasonable.\n"
"If you infer, say so and prefer facts from the document.\n"
"If the document lacks the answer, say:\n"
"\"I don't know based on the provided document.\"\n\n"
"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
)
# ---------------------------
# Retrieval: FAISS -> rerank -> neighbor fill (last resort)
# ---------------------------
def retrieve_chunks(query: str, index, chunks: list, top_k: int = 3, min_similarity: float = 0.55, candidate_multiplier: int = 4):
"""
Steps:
1. Encode query (E5 style).
2. Run FAISS search for k*candidate_multiplier candidates.
3. Re-embed those candidate texts and compute cosine similarity with query embedding.
4. Sort by similarity and pick top_k where similarity >= min_similarity.
5. If fewer than top_k passed threshold, fill remaining slots by:
- selecting neighboring chunks around the *highest-scoring* chunk(s),
but only if absolutely necessary (keeps noise low).
Returns: ordered list of chunks (strings)
Also prints indices + similarity scores for debugging.
"""
    if not index or not chunks:
        return []
try:
        # 1. Encode the query.
        q_emb = _query_model.encode(
            [f"query: {query.strip()}"],
            convert_to_numpy=True,
            normalize_embeddings=True
        )[0]
        # 2. FAISS initial retrieval (get a larger candidate pool).
        num_candidates = max(top_k * candidate_multiplier, top_k + 2)
        distances, indices = index.search(np.array([q_emb]).astype("float32"), num_candidates)
        # Protective dedupe and clamp: drop FAISS padding (-1) and any out-of-range ids,
        # keeping first occurrences so the FAISS order is preserved.
        candidate_indices = [int(i) for i in indices[0] if 0 <= i < len(chunks)]
        candidate_indices = list(dict.fromkeys(candidate_indices))
        # 3. Re-embed candidate texts and compute true cosine similarity.
        candidate_texts = [chunks[i] for i in candidate_indices]
        # Encode passages (the "passage: " prefix helps query/passage alignment).
        doc_embs = _query_model.encode(
            [f"passage: {c}" for c in candidate_texts],
            convert_to_numpy=True,
            normalize_embeddings=True
        )
        sims = cosine_similarity([q_emb], doc_embs)[0]
        # Pair up indices and sims, then sort descending by similarity.
        paired = [(candidate_indices[i], float(sims[i])) for i in range(len(candidate_indices))]
        paired_sorted = sorted(paired, key=lambda x: x[1], reverse=True)
        # Debug print: top candidates and their similarity.
        print("🔍 Candidate ranking (index : sim):")
        for idx, sim in paired_sorted[: min(len(paired_sorted), top_k * 3)]:
            print(f"  - Chunk {idx} : {sim:.4f}")
        # 4. Pick those meeting the threshold (paired_sorted is already ordered by similarity).
        selected = [idx for idx, sim in paired_sorted if sim >= min_similarity]
        selected = selected[:top_k]
        # 5. If not enough, fill by neighbors around the highest-scoring items.
        if len(selected) < top_k:
            # Use the highest-scoring indices as anchor(s).
            anchors = [idx for idx, _ in paired_sorted[:3]]  # top 3 anchors
            expanded = []
            for a in anchors:
                # Neighbors ordered by proximity: a, a-1, a+1, a-2, a+2 ...
                if a not in expanded:
                    expanded.append(a)
                offset = 1
                while len(expanded) < top_k and offset < 5:
                    for cand in (a - offset, a + offset):
                        if 0 <= cand < len(chunks) and cand not in expanded:
                            expanded.append(cand)
                        if len(expanded) >= top_k:
                            break
                    offset += 1
                if len(expanded) >= top_k:
                    break
            # Final selection: keep the previously selected chunks first (ordered by
            # similarity), then add neighbors from the expansion, preserving order.
            final_order = []
            for idx, _sim in paired_sorted:
                if idx in selected and idx not in final_order:
                    final_order.append(idx)
            for idx in expanded:
                if idx not in final_order:
                    final_order.append(idx)
            selected = final_order[:top_k]
        # Final chunk strings (ordered by the selected list).
        final_chunks = [chunks[i] for i in selected]
        print(f"✅ retrieve_chunks: returning {len(final_chunks)} chunks (top_k={top_k}, min_sim={min_similarity})")
        print(f"   chunk indices: {selected}")
        # The indices could also be returned if the UI needs to display chunk numbers.
        return final_chunks
    except Exception as e:
        print(f"⚠️ Retrieval error: {e}")
        return []
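# Sketch only (hypothetical helper, not used elsewhere in this file): if the UI needs
# the chunk numbers mentioned above, a thin wrapper can map the returned texts back to
# their positions in the original chunk list.
def retrieve_chunks_with_indices(query: str, index, chunks: list, top_k: int = 3,
                                 min_similarity: float = 0.55):
    texts = retrieve_chunks(query, index, chunks, top_k=top_k, min_similarity=min_similarity)
    # list.index returns the first occurrence, which is sufficient for illustration.
    return [(chunks.index(t), t) for t in texts]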
# ---------------------------
# Answer generation
# ---------------------------
def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = False):
"""
- reasoning_mode=False => strict factual, deterministic
- reasoning_mode=True => allow cautious inference (slower / longer)
"""
    if not retrieved_chunks:
        return "Sorry, I couldn't find relevant information in the document."
    # Add chunk headings so the model can cite them if needed.
    context_lines = []
    for i, chunk in enumerate(retrieved_chunks, start=1):
        # Use [Chunk i] markers - the LLM will echo them when asked to cite sources.
        context_lines.append(f"[Chunk {i}]: {chunk.strip()}")
    context = "\n".join(context_lines)
    prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(
        context=context, query=query
    )
try:
        # Deterministic in strict mode; sampling only in reasoning mode.
        if reasoning_mode:
            max_new_tokens = 220
            temp = 0.6
            do_sample = True
        else:
            max_new_tokens = 140
            temp = 0.0
            do_sample = False
        result = _answer_model(
            prompt,
            max_new_tokens=max_new_tokens,
            temperature=temp,
            do_sample=do_sample,
            early_stopping=True,
            pad_token_id=_tokenizer.eos_token_id,
        )
        text = result[0].get("generated_text", "").strip()
        # Remove the prompt echo if present.
        if "Answer:" in text:
            out = text.split("Answer:")[-1].strip()
        else:
            out = text
        # Enforce the exact fallback phrase if the model paraphrases a missing answer.
        if not reasoning_mode and ("i don't know" in out.lower() or "not present" in out.lower()):
            return "I don't know based on the provided document."
        return out
    except Exception as e:
        print(f"⚠️ Generation failed: {e}")
        return "⚠️ Error: Could not generate an answer."
# ---------------------------
# Local debug main
# ---------------------------
if __name__ == "__main__":
    from vectorstore import build_faiss_index
    dummy_chunks = [
        "Step 1: Open the dashboard and navigate to reports.",
        "Step 2: Click 'Export' to download a CSV summary.",
        "Step 3: Review the generated report in your downloads folder.",
        "Appendix: Communication user creation steps are explained later in this guide."
    ]
    embeddings = [
        _query_model.encode([f"passage: {c}"], convert_to_numpy=True, normalize_embeddings=True)[0]
        for c in dummy_chunks
    ]
    index = build_faiss_index(embeddings)
    query = "How do I create a communication user?"
    retrieved = retrieve_chunks(query, index, dummy_chunks, top_k=3, min_similarity=0.55)
    print("📚 Retrieved:", retrieved)
    print("💬 Answer:", generate_answer(query, retrieved, reasoning_mode=False))