"""
qa.py — Optimized Phi-2 Retrieval + Generation
----------------------------------------------
Uses:
• intfloat/e5-small-v2 for embeddings
• microsoft/phi-2 for reasoning-rich generation (small enough to run on CPU)
Optimized for: speed + stability in Streamlit / Hugging Face Spaces
"""
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
print("✅ qa.py (Phi-2 optimized fast) loaded from:", __file__)
# ==========================================================
# 1️⃣ Cache Setup
# ==========================================================
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
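# Keep all Hugging Face downloads in a writable temp directory; the explicit
# cache_dir / cache_folder arguments below point at the same location.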
os.environ.update({
"HF_HOME": CACHE_DIR,
"TRANSFORMERS_CACHE": CACHE_DIR,
"HF_DATASETS_CACHE": CACHE_DIR,
"HF_MODULES_CACHE": CACHE_DIR
})
# ==========================================================
# 2️⃣ Embedding Model
# ==========================================================
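# e5 models expect "query: " / "passage: " prefixes at encode time (see retrieve_chunks and the
# test block below); if the e5 download fails we fall back to MiniLM, which simply treats the
# prefixes as ordinary text.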
try:
_query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
print("✅ Loaded embedding model: intfloat/e5-small-v2")
except Exception as e:
print(f"⚠️ Fallback to MiniLM due to {e}")
_query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
# ==========================================================
# 3️⃣ Phi-2 LLM Setup (bfloat16 on CPU)
# ==========================================================
try:
    MODEL_NAME = "microsoft/phi-2"
    print(f"✅ Loading LLM: {MODEL_NAME} (bfloat16, CPU)")
    _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
    # ✅ Load weights in bfloat16: roughly half the memory of float32, and better supported on CPU than float16
    _model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        cache_dir=CACHE_DIR,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
    ).to("cpu")
    # ✅ Create the generation pipeline once and keep it in memory.
    # (dtype and memory options are already applied to _model above, so no model_kwargs are needed)
    _answer_model = pipeline(
        "text-generation",
        model=_model,
        tokenizer=_tokenizer,
        device=-1,  # run on CPU
    )
print("✅ Phi-2 text-generation pipeline ready (optimized).")
except Exception as e:
print(f"⚠️ Phi-2 load failed: {e}")
_answer_model = None
# ==========================================================
# 4️⃣ Prompt Template
# ==========================================================
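# Grounded-QA prompt: the model is instructed to answer strictly from the retrieved context and to
# return a fixed refusal string when the context does not contain the answer.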
PROMPT_TEMPLATE = (
"You are an expert assistant for enterprise document understanding.\n"
"Use ONLY the context below to answer the question clearly and factually.\n"
"If the context doesn’t contain the answer, reply exactly:\n"
"'I don't know based on the provided document.'\n\n"
"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
)
# ==========================================================
# 5️⃣ Retrieve Top-K Chunks
# ==========================================================
def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5):
"""Efficient FAISS retrieval using cosine similarity."""
if not index or not chunks:
return []
    try:
        # e5 models expect a "query: " prefix on queries (passages use "passage: ").
        q_emb = _query_model.encode([f"query: {query.strip()}"], convert_to_numpy=True, normalize_embeddings=True)[0]
        # Over-fetch (top_k * 2) so the neighbor expansion below still gives enough coverage.
        distances, indices = index.search(np.array([q_emb]).astype("float32"), top_k * 2)
        selected = set()
        for idx in indices[0]:
            if idx < 0:  # FAISS pads results with -1 when the index holds fewer vectors than requested
                continue
            # Include the chunk before and after each hit to preserve surrounding context.
            for i in range(max(0, idx - 1), min(len(chunks), idx + 2)):
                selected.add(i)
        ordered_chunks = [chunks[i] for i in sorted(selected)]
        return ordered_chunks
except Exception as e:
print(f"⚠️ Retrieval error: {e}")
return []
# ==========================================================
# 6️⃣ Answer Generation (fast)
# ==========================================================
def generate_answer(query: str, retrieved_chunks: list):
"""Generate concise, grounded answers using Phi-2."""
    if not retrieved_chunks:
        return "Sorry, I couldn’t find relevant information in the document."
    if _answer_model is None:
        return "⚠️ Error: The language model failed to load."
    context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
    prompt = PROMPT_TEMPLATE.format(context=context, query=query)
    try:
        # ✅ Cap new tokens to keep CPU inference time reasonable
        result = _answer_model(
            prompt,
            max_new_tokens=120,  # shorter completions finish faster
            do_sample=False,     # greedy decoding for deterministic, grounded answers
            pad_token_id=_tokenizer.eos_token_id,
        )
answer = result[0]["generated_text"].strip()
# Clean excessive prompt echo
if "Answer:" in answer:
answer = answer.split("Answer:")[-1].strip()
return answer
except Exception as e:
print(f"⚠️ Generation failed: {e}")
return "⚠️ Error: Could not generate an answer at the moment."
# ==========================================================
# 7️⃣ Local Test
# ==========================================================
if __name__ == "__main__":
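    # Smoke test: embed three dummy chunks, build a small FAISS index, and run one query end to end.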
from vectorstore import build_faiss_index
dummy_chunks = [
"Step 1: Open the dashboard and navigate to reports.",
"Step 2: Click 'Export' to download a CSV summary.",
"Step 3: Review the generated report in your downloads folder."
]
embeddings = [
_query_model.encode([f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True)[0]
for chunk in dummy_chunks
]
index = build_faiss_index(embeddings)
query = "What are the steps to export a report?"
retrieved = retrieve_chunks(query, index, dummy_chunks)
print("🔍 Retrieved:", retrieved)
print("💬 Answer:", generate_answer(query, retrieved))