"""
qa.py — Fast, Reasoning-Enabled Phi-2 Version
----------------------------------------------
• Uses SentenceTransformer (E5-small) for embeddings
• Uses microsoft/phi-2 for generation
• Retains reasoning vs factual modes
• Optimized for speed and low memory use on CPU
"""
# ==========================================================
# Hugging Face Cache Setup
# ==========================================================
# Set the cache variables *before* importing transformers, which reads
# them at import time; setting them after the import is partly ineffective.
import os

CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ.update({
    "HF_HOME": CACHE_DIR,
    "TRANSFORMERS_CACHE": CACHE_DIR,  # deprecated alias of HF_HOME, kept for older versions
    "HF_DATASETS_CACHE": CACHE_DIR,
    "HF_MODULES_CACHE": CACHE_DIR
})

import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

print("✅ qa.py (Phi-2 optimized) loaded from:", __file__)
print(f"✅ Using Hugging Face cache at {CACHE_DIR}")
# ==========================================================
# Query Embedding Model
# ==========================================================
try:
    _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
    print("✅ Loaded embedding model: intfloat/e5-small-v2")
except Exception as e:
    print(f"⚠️ Fallback to MiniLM due to {e}")
    _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
# ==========================================================
# Phi-2 Model (Causal LM)
# ==========================================================
MODEL_NAME = "microsoft/phi-2"
print(f"✅ Loading LLM: {MODEL_NAME}")
_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    cache_dir=CACHE_DIR,
    torch_dtype="auto",
    low_cpu_mem_usage=True
)
_answer_model = pipeline(
    "text-generation",
    model=_model,
    tokenizer=_tokenizer,
    device=-1  # CPU-compatible
)
print("✅ Phi-2 generation pipeline ready.")
# ==========================================================
# Prompt Templates
# ==========================================================
REASONING_PROMPT = """
You are an intelligent enterprise assistant.
Use the CONTEXT below and your general understanding to answer the QUESTION logically and clearly.
Explain your reasoning briefly if helpful.
---
CONTEXT:
{context}
---
QUESTION:
{query}
---
ANSWER:
"""
STRICT_PROMPT = """
You are an enterprise document assistant.
Use ONLY the CONTEXT below to answer the QUESTION clearly and factually.
If the answer is not found in the context, reply exactly:
"I don't know based on the provided document."
---
CONTEXT:
{context}
---
QUESTION:
{query}
---
ANSWER:
"""
# ==========================================================
# Retrieve Chunks
# ==========================================================
def retrieve_chunks(query: str, index, chunks: list, top_k: int = 3):
    """Retrieve the top-K most relevant chunks quickly (no re-ranking, for speed)."""
    if index is None or not chunks:
        return []
    query_emb = _query_model.encode(
        [f"query: {query.strip()}"],
        convert_to_numpy=True,
        normalize_embeddings=True
    )[0]
    _scores, indices = index.search(np.array([query_emb]).astype("float32"), top_k)
    # FAISS pads indices with -1 when fewer than top_k vectors are indexed.
    return [chunks[i] for i in indices[0] if i != -1]
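# With unit-normalized embeddings and an inner-product index (e.g.
# IndexFlatIP), the scores returned by index.search() are cosine
# similarities; they are unused here but could drive a relevance cutoff.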
# ==========================================================
# Generate Answer (Phi-2)
# ==========================================================
def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = True):
    """Generate answers using Phi-2. Supports reasoning or strict factual modes."""
    if not retrieved_chunks:
        return "Sorry, I couldn’t find relevant information in the document."
    context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
    prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(
        context=context, query=query
    )
    try:
        result = _answer_model(
            prompt,
            max_new_tokens=180,  # keeps output short & fast
            do_sample=False,     # greedy decoding: deterministic, so temperature has no effect
            num_beams=1,         # no beam search for speed
        )
        # The pipeline echoes the prompt; keep only what follows the sentinel.
        return result[0]["generated_text"].split("ANSWER:")[-1].strip()
    except Exception as e:
        print(f"⚠️ Generation failed: {e}")
        return "⚠️ Error: Could not generate an answer."
# ==========================================================
# Local Test (optional)
# ==========================================================
if __name__ == "__main__":
    from vectorstore import build_faiss_index

    dummy_chunks = [
        "Step 1: Open the dashboard and navigate to reports.",
        "Step 2: Click 'Export' to download a CSV summary.",
        "Step 3: Review the generated report in your downloads folder."
    ]
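    # build_faiss_index comes from the local vectorstore module; it is
    # assumed to accept a sequence of unit-normalized vectors and return
    # a FAISS inner-product index compatible with retrieve_chunks().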
    index = build_faiss_index([
        _query_model.encode([f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True)[0]
        for chunk in dummy_chunks
    ])
    query = "What are the steps to export a report?"
    retrieved = retrieve_chunks(query, index, dummy_chunks)
    print("🔍 Retrieved:", retrieved)
    print("💬 Answer:", generate_answer(query, retrieved))