"""
qa.py — Fast, Reasoning-Enabled Phi-2 Version
----------------------------------------------
• Uses SentenceTransformer (E5-small) for embeddings
• Uses microsoft/phi-2 for generation
• Retains reasoning vs factual modes
• Optimized for speed and low memory use on CPU
"""

import os
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sklearn.metrics.pairwise import cosine_similarity

print("✅ qa.py (Phi-2 optimized) loaded from:", __file__)

# ==========================================================
# Hugging Face Cache Setup
# ==========================================================
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ.update({
    "HF_HOME": CACHE_DIR,
    "TRANSFORMERS_CACHE": CACHE_DIR,
    "HF_DATASETS_CACHE": CACHE_DIR,
    "HF_MODULES_CACHE": CACHE_DIR
})
print(f"✅ Using Hugging Face cache at {CACHE_DIR}")

# ==========================================================
# Query Embedding Model
# ==========================================================
try:
    _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
    print("✅ Loaded embedding model: intfloat/e5-small-v2")
except Exception as e:
    print(f"⚠️ Fallback to MiniLM due to {e}")
    _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
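
# NOTE: E5-family models expect a role prefix on every text they embed
# ("query: ..." for questions, "passage: ..." for document chunks);
# retrieve_chunks() and the demo in __main__ add these prefixes. A quick,
# illustrative sanity check (e5-small-v2 should produce 384-dim vectors,
# as does the MiniLM fallback):
#
#   emb = _query_model.encode(["query: hello"], convert_to_numpy=True,
#                             normalize_embeddings=True)
#   assert emb.shape == (1, 384)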

# ==========================================================
# Phi-2 Model (Causal LM)
# ==========================================================
MODEL_NAME = "microsoft/phi-2"
print(f"✅ Loading LLM: {MODEL_NAME}")

_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    cache_dir=CACHE_DIR,
    torch_dtype="auto",
    low_cpu_mem_usage=True
)
_answer_model = pipeline(
    "text-generation",
    model=_model,
    tokenizer=_tokenizer,
    device=-1  # -1 = CPU; set to 0 to use the first GPU if one is available
)
print("✅ Phi-2 generation pipeline ready.")

# ==========================================================
# Prompt Templates
# ==========================================================
REASONING_PROMPT = """
You are an intelligent enterprise assistant.
Use the CONTEXT below and your general understanding to answer the QUESTION logically and clearly.
Explain your reasoning briefly if helpful.

---
CONTEXT:
{context}
---
QUESTION:
{query}
---
ANSWER:
"""

STRICT_PROMPT = """
You are an enterprise document assistant.
Use ONLY the CONTEXT below to answer the QUESTION clearly and factually.
If the answer is not found in the context, reply exactly:
"I don't know based on the provided document."

---
CONTEXT:
{context}
---
QUESTION:
{query}
---
ANSWER:
"""

# ==========================================================
# Retrieve Chunks
# ==========================================================
def retrieve_chunks(query: str, index, chunks: list, top_k: int = 3):
    """Retrieve top-K most relevant chunks quickly (no re-ranking for speed)."""
    if not index or not chunks:
        return []
    query_emb = _query_model.encode(
        [f"query: {query.strip()}"],
        convert_to_numpy=True,
        normalize_embeddings=True
    )[0]
    distances, indices = index.search(np.array([query_emb]).astype("float32"), top_k)
    # FAISS pads results with -1 when the index holds fewer than top_k vectors, so guard the lookup.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
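
# NOTE: The FAISS index passed into retrieve_chunks() is built in vectorstore.py
# (not shown here). A minimal sketch of what this module assumes it does,
# illustrative only and not necessarily the project's actual implementation:
#
#   import faiss
#   def build_faiss_index(embeddings):              # embeddings: list of 1-D numpy vectors
#       mat = np.array(embeddings, dtype="float32")
#       index = faiss.IndexFlatIP(mat.shape[1])     # inner product == cosine on normalized vectors
#       index.add(mat)
#       return index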

# ==========================================================
# Generate Answer (Phi-2)
# ==========================================================
def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = True):
    """Generate answers using Phi-2. Supports reasoning or strict factual modes."""
    if not retrieved_chunks:
        return "Sorry, I couldn’t find relevant information in the document."

    context = "\n".join([chunk.strip() for chunk in retrieved_chunks])
    prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(context=context, query=query)

    try:
        gen_kwargs = {
            "max_new_tokens": 180,   # keeps output short & fast
            "num_beams": 1,          # no beam search, for speed
        }
        if reasoning_mode:
            gen_kwargs.update(do_sample=True, temperature=0.4)   # light sampling for freer reasoning
        else:
            gen_kwargs["do_sample"] = False                      # greedy decoding keeps strict mode deterministic
        result = _answer_model(prompt, **gen_kwargs)
        text = result[0]["generated_text"].split("ANSWER:")[-1].strip()
        return text
    except Exception as e:
        print(f"⚠️ Generation failed: {e}")
        return "⚠️ Error: Could not generate an answer."

# ==========================================================
# Local Test (optional)
# ==========================================================
if __name__ == "__main__":
    from vectorstore import build_faiss_index

    dummy_chunks = [
        "Step 1: Open the dashboard and navigate to reports.",
        "Step 2: Click 'Export' to download a CSV summary.",
        "Step 3: Review the generated report in your downloads folder."
    ]

    index = build_faiss_index([
        _query_model.encode([f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True)[0]
        for chunk in dummy_chunks
    ])

    query = "What are the steps to export a report?"
    retrieved = retrieve_chunks(query, index, dummy_chunks)
    print("🔍 Retrieved:", retrieved)
    print("💬 Answer:", generate_answer(query, retrieved))