"""
qa.py — Phi-2 Fast + Smart Reasoning Mode (Hybrid)
-------------------------------------------------
✅ Uses intfloat/e5-small-v2 for embeddings
✅ Uses microsoft/phi-2 (fast CPU quantized)
✅ Reasoning Mode toggle integrated cleanly
✅ Retrieval unaffected by reasoning mode
"""
import os
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
print("✅ qa.py (Phi-2 Hybrid) loaded from:", __file__)
# ==========================================================
# 1️⃣ Cache Setup
# ==========================================================
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ.update({
    "HF_HOME": CACHE_DIR,
    "TRANSFORMERS_CACHE": CACHE_DIR,
    "HF_DATASETS_CACHE": CACHE_DIR,
    "HF_MODULES_CACHE": CACHE_DIR,
})
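# Redirecting every Hugging Face cache to /tmp keeps model downloads working on
# hosts where the default home-directory cache is not writable (e.g. Spaces).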
# ==========================================================
# 2️⃣ Embedding Model
# ==========================================================
try:
    _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
    print("✅ Loaded embedding model: intfloat/e5-small-v2")
except Exception as e:
    print(f"⚠️ Failed to load e5-small-v2 ({e}); falling back to MiniLM")
    _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
# ==========================================================
# 3️⃣ Phi-2 LLM Setup (Optimized for CPU)
# ==========================================================
try:
    MODEL_NAME = "microsoft/phi-2"
    print(f"✅ Loading LLM: {MODEL_NAME} (optimized for reasoning)")
    _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
    _model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        cache_dir=CACHE_DIR,
        torch_dtype=torch.bfloat16,  # always runs on CPU below, so use bfloat16 rather than fp16
        low_cpu_mem_usage=True,
    ).to("cpu")
    _answer_model = pipeline(
        "text-generation",
        model=_model,       # model is already instantiated, so no extra model_kwargs are needed
        tokenizer=_tokenizer,
        device=-1,          # CPU
    )
    print("✅ Phi-2 text-generation pipeline ready.")
except Exception as e:
    print(f"⚠️ Phi-2 load failed: {e}")
    _answer_model = None
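# If the Phi-2 load fails, _answer_model stays None; any later call in
# generate_answer() then raises and is caught by its try/except, which returns
# the generic error message instead of crashing the app.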
# ==========================================================
# 4️⃣ Prompt Templates
# ==========================================================
STRICT_PROMPT = (
    "Answer based ONLY on the context below.\n"
    "If the answer isn’t in the context, say: 'I don't know based on the provided document.'\n\n"
    "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
)
REASONING_PROMPT = (
    "You are an expert assistant with strong reasoning skills.\n"
    "Think step by step and form a detailed, logical answer.\n"
    "You can combine hints from the context with your general understanding.\n"
    "If the context doesn't mention the answer, acknowledge that.\n\n"
    "Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"
)
# ==========================================================
# 5️⃣ Chunk Retrieval (Unchanged — Fast)
# ==========================================================
def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5):
    """Fast FAISS retrieval: embed the query, search the index, and return the
    top hits plus their immediate neighbours for extra context."""
    if not index or not chunks:
        return []
    try:
        q_emb = _query_model.encode(
            [f"query: {query.strip()}"],  # e5 models expect the "query: " prefix
            convert_to_numpy=True,
            normalize_embeddings=True
        )[0]
        distances, indices = index.search(np.array([q_emb]).astype("float32"), top_k * 2)
        # Expand each hit to include its neighbouring chunks so answers keep local context.
        selected = set()
        for idx in indices[0]:
            for i in range(max(0, idx - 1), min(len(chunks), idx + 2)):
                selected.add(i)
        return [chunks[i] for i in sorted(selected)]
    except Exception as e:
        print(f"⚠️ Retrieval error: {e}")
        return []
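# retrieve_chunks() assumes the FAISS index was built over normalized
# "passage: ..." embeddings, so inner-product search behaves like cosine
# similarity. A minimal sketch of such an index (mirroring what
# vectorstore.build_faiss_index is assumed to do; this helper name is
# illustrative only, not part of the app):
def _build_demo_index(chunks: list):
    import faiss  # local import: only needed if this demo helper is used
    embs = _query_model.encode(
        [f"passage: {c}" for c in chunks],
        convert_to_numpy=True,
        normalize_embeddings=True,
    ).astype("float32")
    demo_index = faiss.IndexFlatIP(embs.shape[1])  # inner product == cosine on unit vectors
    demo_index.add(embs)
    return demo_index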
# ==========================================================
# 6️⃣ Answer Generation (Enhanced — Balanced Reasoning + Speed)
# ==========================================================
def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = False):
    """
    Generate answers with Phi-2.
    - reasoning_mode=False → strict factual, fast
    - reasoning_mode=True  → analytical, richer reasoning (slower)
    """
    if not retrieved_chunks:
        return "Sorry, I couldn’t find relevant information in the document."
    context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
    # Pick the module-level template: the reasoning prompt encourages a step-by-step
    # explanation, the strict prompt pins the model to the retrieved context only.
    prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(context=context, query=query)
    try:
        if reasoning_mode:
            # 🧩 Richer but slower config: more new tokens, still deterministic.
            result = _answer_model(
                prompt,
                max_new_tokens=180,
                do_sample=False,  # greedy decoding; temperature has no effect here
                pad_token_id=_tokenizer.eos_token_id,
            )
        else:
            # ⚡ Fast factual config
            result = _answer_model(
                prompt,
                max_new_tokens=120,
                do_sample=False,
                pad_token_id=_tokenizer.eos_token_id,
            )
        raw = result[0]["generated_text"].strip()
        # The pipeline echoes the prompt, so keep only the text after the final "Answer:".
        if "Answer:" in raw:
            raw = raw.split("Answer:")[-1].strip()
        return raw
    except Exception as e:
        print(f"⚠️ Generation failed: {e}")
        return "⚠️ Error: Could not generate an answer."
# ==========================================================
# 7️⃣ Local Test
# ==========================================================
if __name__ == "__main__":
    from vectorstore import build_faiss_index
    dummy_chunks = [
        "Step 1: Open the dashboard and navigate to reports.",
        "Step 2: Click 'Export' to download a CSV summary.",
        "Step 3: Review the generated report in your downloads folder."
    ]
    embeddings = [
        _query_model.encode([f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True)[0]
        for chunk in dummy_chunks
    ]
    index = build_faiss_index(embeddings)
    query = "What are the steps to export a report?"
    retrieved = retrieve_chunks(query, index, dummy_chunks)
    print("\n--- Strict Mode ---")
    print(generate_answer(query, retrieved, reasoning_mode=False))
    print("\n--- Reasoning Mode ---")
    print(generate_answer(query, retrieved, reasoning_mode=True))