"""
qa.py — Fast, Reasoning-Enabled Phi-2 Version
----------------------------------------------
• Uses SentenceTransformer (E5-small) for embeddings
• Uses microsoft/phi-2 for generation
• Retains reasoning vs factual modes
• Optimized for speed and low memory use on CPU
"""

import os
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sklearn.metrics.pairwise import cosine_similarity

print("✅ qa.py (Phi-2 optimized) loaded from:", __file__)

# ==========================================================
# Hugging Face Cache Setup
# ==========================================================
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ.update({
    "HF_HOME": CACHE_DIR,
    "TRANSFORMERS_CACHE": CACHE_DIR,
    "HF_DATASETS_CACHE": CACHE_DIR,
    "HF_MODULES_CACHE": CACHE_DIR
})
print(f"✅ Using Hugging Face cache at {CACHE_DIR}")

# ==========================================================
# Query Embedding Model
# ==========================================================
try:
    _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
    print("✅ Loaded embedding model: intfloat/e5-small-v2")
except Exception as e:
    print(f"⚠️ Fallback to MiniLM due to {e}")
    _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
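
# NOTE: E5-family models expect a role prefix on every text they embed
# ("query: ..." for questions, "passage: ..." for document chunks);
# retrieve_chunks() and the demo in __main__ add these prefixes. A quick,
# illustrative sanity check (e5-small-v2 should produce 384-dim vectors,
# as does the MiniLM fallback):
#
#   emb = _query_model.encode(["query: hello"], convert_to_numpy=True,
#                             normalize_embeddings=True)
#   assert emb.shape == (1, 384)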

# ==========================================================
# Phi-2 Model (Causal LM)
# ==========================================================
MODEL_NAME = "microsoft/phi-2"
print(f"✅ Loading LLM: {MODEL_NAME}")

_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    cache_dir=CACHE_DIR,
    torch_dtype="auto",
    low_cpu_mem_usage=True
)
_answer_model = pipeline(
    "text-generation",
    model=_model,
    tokenizer=_tokenizer,
    device=-1  # -1 = CPU; set to 0 to use the first GPU if one is available
)
print("✅ Phi-2 generation pipeline ready.")

# ==========================================================
# Prompt Templates
# ==========================================================
REASONING_PROMPT = """
You are an intelligent enterprise assistant.
Use the CONTEXT below and your general understanding to answer the QUESTION logically and clearly.
Explain your reasoning briefly if helpful.

---
CONTEXT:
{context}
---
QUESTION:
{query}
---
ANSWER:
"""

STRICT_PROMPT = """
You are an enterprise document assistant.
Use ONLY the CONTEXT below to answer the QUESTION clearly and factually.
If the answer is not found in the context, reply exactly:
"I don't know based on the provided document."

---
CONTEXT:
{context}
---
QUESTION:
{query}
---
ANSWER:
"""

# ==========================================================
# Retrieve Chunks
# ==========================================================
def retrieve_chunks(query: str, index, chunks: list, top_k: int = 3):
    """Retrieve top-K most relevant chunks quickly (no re-ranking for speed)."""
    if not index or not chunks:
        return []
    query_emb = _query_model.encode(
        [f"query: {query.strip()}"],
        convert_to_numpy=True,
        normalize_embeddings=True
    )[0]
    distances, indices = index.search(np.array([query_emb]).astype("float32"), top_k)
    # FAISS pads results with -1 when the index holds fewer than top_k vectors, so guard the lookup.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
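
# NOTE: The FAISS index passed into retrieve_chunks() is built in vectorstore.py
# (not shown here). A minimal sketch of what this module assumes it does,
# illustrative only and not necessarily the project's actual implementation:
#
#   import faiss
#   def build_faiss_index(embeddings):              # embeddings: list of 1-D numpy vectors
#       mat = np.array(embeddings, dtype="float32")
#       index = faiss.IndexFlatIP(mat.shape[1])     # inner product == cosine on normalized vectors
#       index.add(mat)
#       return index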

# ==========================================================
# Generate Answer (Phi-2)
# ==========================================================
def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = True):
    """Generate answers using Phi-2. Supports reasoning or strict factual modes."""
    if not retrieved_chunks:
        return "Sorry, I couldn’t find relevant information in the document."

    context = "\n".join([chunk.strip() for chunk in retrieved_chunks])
    prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(context=context, query=query)

    try:
        gen_kwargs = {
            "max_new_tokens": 180,   # keeps output short & fast
            "num_beams": 1,          # no beam search, for speed
        }
        if reasoning_mode:
            gen_kwargs.update(do_sample=True, temperature=0.4)   # light sampling for freer reasoning
        else:
            gen_kwargs["do_sample"] = False                      # greedy decoding keeps strict mode deterministic
        result = _answer_model(prompt, **gen_kwargs)
        text = result[0]["generated_text"].split("ANSWER:")[-1].strip()
        return text
    except Exception as e:
        print(f"⚠️ Generation failed: {e}")
        return "⚠️ Error: Could not generate an answer."

# ==========================================================
# Local Test (optional)
# ==========================================================
if __name__ == "__main__":
    from vectorstore import build_faiss_index

    dummy_chunks = [
        "Step 1: Open the dashboard and navigate to reports.",
        "Step 2: Click 'Export' to download a CSV summary.",
        "Step 3: Review the generated report in your downloads folder."
    ]

    index = build_faiss_index([
        _query_model.encode([f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True)[0]
        for chunk in dummy_chunks
    ])

    query = "What are the steps to export a report?"
    retrieved = retrieve_chunks(query, index, dummy_chunks)
    print("🔍 Retrieved:", retrieved)
    print("💬 Answer:", generate_answer(query, retrieved))