""" qa.py — Fast, Reasoning-Enabled Phi-2 Version ---------------------------------------------- • Uses SentenceTransformer (E5-small) for embeddings • Uses microsoft/phi-2 for generation • Retains reasoning vs factual modes • Optimized for speed and low VRAM on CPU """ import os import numpy as np from sentence_transformers import SentenceTransformer from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline from sklearn.metrics.pairwise import cosine_similarity print("✅ qa.py (Phi-2 optimized) loaded from:", __file__) # ========================================================== # Hugging Face Cache Setup # ========================================================== CACHE_DIR = "/tmp/hf_cache" os.makedirs(CACHE_DIR, exist_ok=True) os.environ.update({ "HF_HOME": CACHE_DIR, "TRANSFORMERS_CACHE": CACHE_DIR, "HF_DATASETS_CACHE": CACHE_DIR, "HF_MODULES_CACHE": CACHE_DIR }) print(f"✅ Using Hugging Face cache at {CACHE_DIR}") # ========================================================== # Query Embedding Model # ========================================================== try: _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR) print("✅ Loaded embedding model: intfloat/e5-small-v2") except Exception as e: print(f"⚠️ Fallback to MiniLM due to {e}") _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR) # ========================================================== # Phi-2 Model (Causal LM) # ========================================================== MODEL_NAME = "microsoft/phi-2" print(f"✅ Loading LLM: {MODEL_NAME}") _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR) _model = AutoModelForCausalLM.from_pretrained( MODEL_NAME, cache_dir=CACHE_DIR, torch_dtype="auto", low_cpu_mem_usage=True ) _answer_model = pipeline( "text-generation", model=_model, tokenizer=_tokenizer, device=-1 # CPU-compatible ) print("✅ Phi-2 generation pipeline ready.") # ========================================================== # Prompt Templates # ========================================================== REASONING_PROMPT = """ You are an intelligent enterprise assistant. Use the CONTEXT below and your general understanding to answer the QUESTION logically and clearly. Explain your reasoning briefly if helpful. --- CONTEXT: {context} --- QUESTION: {query} --- ANSWER: """ STRICT_PROMPT = """ You are an enterprise document assistant. Use ONLY the CONTEXT below to answer the QUESTION clearly and factually. If the answer is not found in the context, reply exactly: "I don't know based on the provided document." --- CONTEXT: {context} --- QUESTION: {query} --- ANSWER: """ # ========================================================== # Retrieve Chunks # ========================================================== def retrieve_chunks(query: str, index, chunks: list, top_k: int = 3): """Retrieve top-K most relevant chunks quickly (no re-ranking for speed).""" if not index or not chunks: return [] query_emb = _query_model.encode( [f"query: {query.strip()}"], convert_to_numpy=True, normalize_embeddings=True )[0] distances, indices = index.search(np.array([query_emb]).astype("float32"), top_k) return [chunks[i] for i in indices[0]] # ========================================================== # Generate Answer (Phi-2) # ========================================================== def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = True): """Generate answers using Phi-2. 
    Supports reasoning or strict factual modes.
    """
    if not retrieved_chunks:
        return "Sorry, I couldn't find relevant information in the document."

    context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
    prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(
        context=context, query=query
    )

    try:
        result = _answer_model(
            prompt,
            max_new_tokens=180,  # keeps output short & fast
            do_sample=False,     # greedy decoding for deterministic answers
            num_beams=1,         # no beam search, for speed
        )
        # The pipeline returns the prompt plus the completion; keep only the answer.
        text = result[0]["generated_text"].split("ANSWER:")[-1].strip()
        return text
    except Exception as e:
        print(f"⚠️ Generation failed: {e}")
        return "⚠️ Error: Could not generate an answer."

# ==========================================================
# Local Test (optional)
# ==========================================================
if __name__ == "__main__":
    from vectorstore import build_faiss_index

    dummy_chunks = [
        "Step 1: Open the dashboard and navigate to reports.",
        "Step 2: Click 'Export' to download a CSV summary.",
        "Step 3: Review the generated report in your downloads folder.",
    ]

    # E5 models expect the "passage: " prefix when embedding documents.
    index = build_faiss_index([
        _query_model.encode(
            [f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True
        )[0]
        for chunk in dummy_chunks
    ])

    query = "What are the steps to export a report?"
    retrieved = retrieve_chunks(query, index, dummy_chunks)
    print("🔍 Retrieved:", retrieved)
    print("💬 Answer:", generate_answer(query, retrieved))