import logging
import os

import streamlit as st
import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
from peft import PeftModel

# ── Configuration ──────────────────────────────────────────────────────────
BASE_MODEL = "microsoft/phi-2"
ADAPTER_REPO = "sourize/phi2-memory-deeptalks"
CONTEXT_TURNS = 7          # number of user/assistant turn pairs kept as context
MAX_NEW_TOKENS = 128
OFFLOAD_DIR = "offload"

SYSTEM = (
    "You are a helpful assistant for DeepTalks, built on base Phi-2 and\n"
    "fine-tuned by Sourish for personal conversations.\n"
    "Answer **only** using the conversation context below.\n"
    "Do NOT output any lines beginning with 'User:' or 'Assistant:'.\n"
    "If you don't know, say \"I don't know.\"\n"
)

# set_page_config must be the first Streamlit command executed on the page,
# so it runs before the cached pipeline loader.
st.set_page_config(layout="centered")


@st.cache_resource(show_spinner=False)
def load_pipeline():
    # 1) Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL, trust_remote_code=True, padding_side="left"
    )
    if tokenizer.pad_token_id is None:
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})

    # 2) Base model: 4-bit quantized on CUDA, plain FP32 on CPU
    if torch.cuda.is_available():
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        os.makedirs(OFFLOAD_DIR, exist_ok=True)
        base = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            trust_remote_code=True,
            quantization_config=quant_config,
            device_map="auto",
            low_cpu_mem_usage=True,  # from_pretrained option, not a BitsAndBytesConfig field
            offload_folder=OFFLOAD_DIR,
            offload_state_dict=True,
        )
    else:
        # This branch only runs without CUDA, so FP32 is the only dtype that applies.
        base = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            trust_remote_code=True,
            torch_dtype=torch.float32,
            device_map="cpu",  # force CPU
        )

    # 3) Resize embeddings for the new pad token, then overlay the LoRA adapter
    base.resize_token_embeddings(len(tokenizer))
    model = PeftModel.from_pretrained(
        base,
        ADAPTER_REPO,
        device_map="auto" if torch.cuda.is_available() else None,
    )
    model.eval()

    # 4) Build the generation pipeline. The model already carries its device
    #    placement, so device_map is not passed again here.
    gen = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        use_cache=True,
        return_full_text=False,
    )
    logging.info("Pipeline loaded.")
    return gen


generator = load_pipeline()

# ── Streamlit UI ───────────────────────────────────────────────────────────
st.title("🧠 DeepTalks")
st.markdown("⏳ Responses take a while to generate since this runs on the free CPU tier.")
st.subheader("Your personal AI Companion", divider="grey")

if "history" not in st.session_state:
    st.session_state.history = []

# Replay the stored conversation on each rerun
for role, text in st.session_state.history:
    st.chat_message("user" if role == "You" else "assistant").write(text)

user_input = st.chat_input("Your message…")
if user_input:
    st.chat_message("user").write(user_input)
    st.session_state.history.append(("You", user_input))

    # Keep the last CONTEXT_TURNS user/assistant pairs, with the role labels
    # the system prompt refers to, so the model can tell who said what.
    recent = st.session_state.history[-CONTEXT_TURNS * 2:]
    context = "\n".join(
        f"{'User' if role == 'You' else 'Assistant'}: {text}" for role, text in recent
    )

    prompt = f"""{SYSTEM}
Context:
{context}

User: {user_input}
Assistant:"""

    with st.spinner("Thinking…"):
        try:
            reply = generator(prompt)[0]["generated_text"].strip()
            # Trim anything the model generates past its own turn
            for marker in ["User:", "Assistant:"]:
                if marker in reply:
                    reply = reply.split(marker)[0].strip()
            if not reply:
                reply = "I’m sorry, I didn’t catch that. Could you rephrase?"
        except Exception as e:
            reply = "I’m sorry, something went wrong."
            st.error(f"Error: {e}")

    st.chat_message("assistant").write(reply)
    st.session_state.history.append(("Bot", reply))
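
# ── Usage (assumptions) ────────────────────────────────────────────────────
# A minimal way to run this app locally, assuming the file is saved as app.py
# and dependencies are installed; the exact package set is an assumption based
# on the imports above:
#
#   pip install streamlit torch transformers peft bitsandbytes accelerate
#   streamlit run app.py
#
# bitsandbytes is only exercised on CUDA machines; accelerate is needed for
# device_map="auto" and weight offloading.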