"""Streamlit RAG app: upload a PDF/TXT, index it with FAISS, chat with Phi-2."""

import os

import faiss
import fitz  # PyMuPDF
import numpy as np
import streamlit as st
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer

# ================= CONFIG =================
st.set_page_config(page_title="RAG with Phi-2", layout="wide")

HF_TOKEN = os.environ.get("HF_TOKEN")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

with st.sidebar:
    st.markdown("### 🖥️ System Info")
    st.text(f"Device: {DEVICE}")


# ================= LOAD MODEL =================
@st.cache_resource
def load_llm():
    """Load the Phi-2 tokenizer and model once per process (Streamlit-cached).

    Returns:
        (tokenizer, model) — model is in eval mode on DEVICE.
    """
    tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-2",
        token=HF_TOKEN,  # FIX: token was passed to the tokenizer but not the model
        torch_dtype=torch.float32,  # REQUIRED for CPU
        low_cpu_mem_usage=True,
    )
    model.to(DEVICE)  # FIX: model was never moved to the selected device
    model.eval()
    return tokenizer, model


@st.cache_resource
def load_embedder():
    """Load the sentence-embedding model once per process (Streamlit-cached)."""
    return SentenceTransformer("all-MiniLM-L6-v2")


tokenizer, model = load_llm()
embedder = load_embedder()

# ================= UI =================
st.title("🔍 RAG App using 🤖 Phi-2")

with st.sidebar:
    st.header("📁 Upload Document")
    uploaded_file = st.file_uploader("Upload PDF or TXT", type=["pdf", "txt"])


# ================= HELPERS =================
def extract_text(file):
    """Return the full text of an uploaded PDF (via PyMuPDF) or TXT file."""
    if file.type == "application/pdf":
        # FIX: context manager closes the PyMuPDF document (was leaked before)
        with fitz.open(stream=file.read(), filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
    return file.read().decode("utf-8")


def split_into_chunks(text, chunk_size=500, overlap=0):
    """Split *text* into fixed-size character chunks.

    Args:
        text: source string.
        chunk_size: maximum characters per chunk.
        overlap: characters shared between consecutive chunks
            (default 0 preserves the original non-overlapping behavior).
    """
    step = max(1, chunk_size - overlap)
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]


def create_faiss_index(chunks):
    """Embed *chunks* and build an exact L2 index.

    Returns:
        (index, embeddings) — FAISS IndexFlatL2 and the float32 matrix.
    """
    embeddings = embedder.encode(chunks, show_progress_bar=True)
    embeddings = np.asarray(embeddings, dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings


def retrieve_chunks(query, chunks, index, k=5):
    """Return up to *k* chunks nearest to *query* in embedding space."""
    query_embedding = embedder.encode([query]).astype("float32")
    # FIX: clamp k — asking FAISS for more neighbors than stored vectors
    # pads the result with -1 indices, which crashed the list lookup.
    k = min(k, len(chunks))
    _, indices = index.search(query_embedding, k)
    return [chunks[i] for i in indices[0] if i != -1]


def generate_answer(context, question):
    """Generate an answer with Phi-2 grounded in the retrieved *context*.

    Returns only the text after the final "Answer:" marker, stripped.
    """
    prompt = f"""
Instruction: Answer ONLY using the context below. If the answer is not present, say "Information not found."

Context:
{context}

Question: {question}

Answer:
"""
    # FIX: move input tensors to the model's device (crashed on CUDA before)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.2,
            do_sample=True,
            top_p=0.9,
        )
    return tokenizer.decode(output[0], skip_special_tokens=True).split("Answer:")[-1].strip()


# ================= MAIN LOGIC =================
if uploaded_file:
    raw_text = extract_text(uploaded_file)
    chunks = split_into_chunks(raw_text)
    st.sidebar.success(f"✅ {len(chunks)} chunks created")
    with st.sidebar.expander("📄 Extracted Text"):
        st.text_area("Text", raw_text, height=300)

    # FIX: Streamlit reruns the whole script on every chat message; the
    # original re-embedded the entire document each time. Cache the index
    # in session_state, keyed on the uploaded file's identity.
    doc_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("doc_key") != doc_key:
        index, _ = create_faiss_index(chunks)
        st.session_state["doc_key"] = doc_key
        st.session_state["index"] = index
    index = st.session_state["index"]

    st.markdown("### 💬 Chat with your document")

    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay prior conversation so it survives reruns.
    for msg in st.session_state.messages:
        with st.chat_message(msg["role"]):
            st.markdown(msg["content"])

    if user_query := st.chat_input("Ask a question"):
        with st.chat_message("user"):
            st.markdown(user_query)
        st.session_state.messages.append(
            {"role": "user", "content": user_query}
        )

        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                context = "\n".join(
                    retrieve_chunks(user_query, chunks, index)
                )
                answer = generate_answer(context, user_query)
                st.markdown(answer)
        st.session_state.messages.append(
            {"role": "assistant", "content": answer}
        )
else:
    st.info("👈 Upload a document to begin chatting")