"""Gradio chat app serving TinyLlama-1.1B-Chat with a LoRA adapter.

Intended for Hugging Face Spaces: the model is loaded at import time and a
``gr.ChatInterface`` named ``demo`` is exposed for the Space runtime (or
launched directly when run locally).
"""

import torch
import gradio as gr
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# --------------------
# Model setup
# --------------------
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
LORA_REPO = "nitya001/autotrain-4n1y9-5ekvs"

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None,
)

print("Loading LoRA adapter:", LORA_REPO)
model = PeftModel.from_pretrained(base_model, LORA_REPO)
# BUGFIX: when device_map="auto" is in effect, accelerate has already placed
# the weights; an unconditional .to(device) on top of that can move hooked /
# offloaded modules incorrectly. Only move the model when we loaded on CPU
# without a device map.
if device != "cuda":
    model.to(device)
model.eval()

SYSTEM_PROMPT = (
    "You are a helpful assistant fine-tuned for loan journeys and UTR queries. "
    "Answer clearly and concisely. If you don't know some specific account value, "
    "explain what information is needed instead of hallucinating numbers."
)

# --------------------
# Generation function
# --------------------


def _history_to_messages(history: list) -> list:
    """Normalize Gradio chat history into OpenAI-style message dicts.

    With ``type="messages"`` Gradio passes ``[{"role": ..., "content": ...}]``;
    older Gradio versions (tuple mode) pass ``[[user, bot], ...]`` pairs.
    Accept both so the app survives a Gradio version change.
    """
    messages = []
    for item in history or []:
        if isinstance(item, dict):
            role = item.get("role", "user")
            content = item.get("content", "")
            if role in ("user", "assistant") and content:
                messages.append({"role": role, "content": content})
        else:
            # Legacy (user_message, bot_message) pair; either side may be None.
            user_turn, bot_turn = item
            if user_turn:
                messages.append({"role": "user", "content": user_turn})
            if bot_turn:
                messages.append({"role": "assistant", "content": bot_turn})
    return messages


def generate_reply(message: str, history: list):
    """Generate one assistant reply for ``gr.ChatInterface``.

    Parameters
    ----------
    message : str
        The latest user message.
    history : list
        Prior turns; messages-format dicts (or legacy tuple pairs — see
        ``_history_to_messages``).

    Returns
    -------
    str
        The assistant's reply; ChatInterface handles rendering/state.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    # BUGFIX: use the tokenizer's built-in chat template instead of hand-built
    # "<|user|>..." tags. The manual format was missing the </s> turn
    # terminators TinyLlama-Chat's template emits, which degrades generation.
    conversation = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(
        conversation,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
    ).to(device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens (strip the echoed prompt).
    generated_ids = output_ids[0][inputs["input_ids"].shape[-1]:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    if not answer:
        answer = "I'm not sure how to answer that. Could you rephrase your question?"

    return answer


# --------------------
# Gradio UI
# --------------------
demo = gr.ChatInterface(
    fn=generate_reply,
    # BUGFIX: generate_reply expects messages-format history dicts; without
    # type="messages", Gradio versions defaulting to tuple pairs would hand
    # us lists and `msg.get(...)` would raise AttributeError.
    type="messages",
    title="UTR & Loan Assistant (TinyLlama LoRA)",
    description=(
        "Ask things like:\n"
        "- What is my latest UTR?\n"
        "- How is my EMI calculated?\n"
        "- Summarize my repayment schedule.\n"
    ),
    examples=[
        "What is my latest UTR?",
        "Explain my repayment schedule.",
        "How are late payment charges calculated?",
    ],
)

if __name__ == "__main__":
    # Spaces will call `app.py` directly, so this is mainly for local testing
    demo.launch()