"""
qa.py — Optimized Phi-2 Retrieval + Generation
----------------------------------------------
Uses:
• intfloat/e5-small-v2 for embeddings
• microsoft/phi-2 for reasoning-rich generation (small enough to run on CPU)
Optimized for: speed + stability in Streamlit / Hugging Face Spaces
"""
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
print("✅ qa.py (Phi-2 optimized fast) loaded from:", __file__)
# ==========================================================
# 1️⃣ Cache Setup
# ==========================================================
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
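# Keep all Hugging Face downloads in a writable temp directory; the explicit
# cache_dir / cache_folder arguments below point at the same location.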
os.environ.update({
"HF_HOME": CACHE_DIR,
"TRANSFORMERS_CACHE": CACHE_DIR,
"HF_DATASETS_CACHE": CACHE_DIR,
"HF_MODULES_CACHE": CACHE_DIR
})
# ==========================================================
# 2️⃣ Embedding Model
# ==========================================================
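# e5 models expect "query: " / "passage: " prefixes at encode time (see retrieve_chunks and the
# test block below); if the e5 download fails we fall back to MiniLM, which simply treats the
# prefixes as ordinary text.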
try:
_query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
print("✅ Loaded embedding model: intfloat/e5-small-v2")
except Exception as e:
print(f"⚠️ Fallback to MiniLM due to {e}")
_query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
# ==========================================================
# 3️⃣ Phi-2 LLM Setup (bfloat16 on CPU)
# ==========================================================
try:
    MODEL_NAME = "microsoft/phi-2"
    print(f"✅ Loading LLM: {MODEL_NAME} (bfloat16, CPU)")
    _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
    # ✅ Load weights in bfloat16: roughly half the memory of float32, and better supported on CPU than float16
    _model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        cache_dir=CACHE_DIR,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
    ).to("cpu")
    # ✅ Create the generation pipeline once and keep it in memory.
    # (dtype and memory options are already applied to _model above, so no model_kwargs are needed)
    _answer_model = pipeline(
        "text-generation",
        model=_model,
        tokenizer=_tokenizer,
        device=-1,  # run on CPU
    )
print("✅ Phi-2 text-generation pipeline ready (optimized).")
except Exception as e:
print(f"⚠️ Phi-2 load failed: {e}")
_answer_model = None
# ==========================================================
# 4️⃣ Prompt Template
# ==========================================================
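# Grounded-QA prompt: the model is instructed to answer strictly from the retrieved context and to
# return a fixed refusal string when the context does not contain the answer.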
PROMPT_TEMPLATE = (
"You are an expert assistant for enterprise document understanding.\n"
"Use ONLY the context below to answer the question clearly and factually.\n"
"If the context doesn’t contain the answer, reply exactly:\n"
"'I don't know based on the provided document.'\n\n"
"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
)
# ==========================================================
# 5️⃣ Retrieve Top-K Chunks
# ==========================================================
def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5):
"""Efficient FAISS retrieval using cosine similarity."""
if not index or not chunks:
return []
    try:
        # e5 models expect a "query: " prefix on queries (passages use "passage: ").
        q_emb = _query_model.encode([f"query: {query.strip()}"], convert_to_numpy=True, normalize_embeddings=True)[0]
        # Over-fetch (top_k * 2) so the neighbor expansion below still gives enough coverage.
        distances, indices = index.search(np.array([q_emb]).astype("float32"), top_k * 2)
        selected = set()
        for idx in indices[0]:
            if idx < 0:  # FAISS pads results with -1 when the index holds fewer vectors than requested
                continue
            # Include the chunk before and after each hit to preserve surrounding context.
            for i in range(max(0, idx - 1), min(len(chunks), idx + 2)):
                selected.add(i)
        ordered_chunks = [chunks[i] for i in sorted(selected)]
        return ordered_chunks
except Exception as e:
print(f"⚠️ Retrieval error: {e}")
return []
# ==========================================================
# 6️⃣ Answer Generation (fast)
# ==========================================================
def generate_answer(query: str, retrieved_chunks: list):
"""Generate concise, grounded answers using Phi-2."""
    if not retrieved_chunks:
        return "Sorry, I couldn’t find relevant information in the document."
    if _answer_model is None:
        return "⚠️ Error: The language model failed to load."
    context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
    prompt = PROMPT_TEMPLATE.format(context=context, query=query)
    try:
        # ✅ Cap new tokens to keep CPU inference time reasonable
        result = _answer_model(
            prompt,
            max_new_tokens=120,  # shorter completions finish faster
            do_sample=False,     # greedy decoding for deterministic, grounded answers
            pad_token_id=_tokenizer.eos_token_id,
        )
answer = result[0]["generated_text"].strip()
# Clean excessive prompt echo
if "Answer:" in answer:
answer = answer.split("Answer:")[-1].strip()
return answer
except Exception as e:
print(f"⚠️ Generation failed: {e}")
return "⚠️ Error: Could not generate an answer at the moment."
# ==========================================================
# 7️⃣ Local Test
# ==========================================================
if __name__ == "__main__":
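    # Smoke test: embed three dummy chunks, build a small FAISS index, and run one query end to end.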
from vectorstore import build_faiss_index
dummy_chunks = [
"Step 1: Open the dashboard and navigate to reports.",
"Step 2: Click 'Export' to download a CSV summary.",
"Step 3: Review the generated report in your downloads folder."
]
embeddings = [
_query_model.encode([f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True)[0]
for chunk in dummy_chunks
]
index = build_faiss_index(embeddings)
query = "What are the steps to export a report?"
retrieved = retrieve_chunks(query, index, dummy_chunks)
print("🔍 Retrieved:", retrieved)
print("💬 Answer:", generate_answer(query, retrieved))