"""
HuggingFace Space — Gemma 4 Quantitative Finance Chat
Hardware: Nvidia T4 medium (16 GB VRAM)
"""

import os
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Hub access token used for the `token=` arguments of the loaders below.
# An unset, empty, or whitespace-only variable is normalised to None rather
# than "", so a blank string is never forwarded as a credential; with None the
# Hub client is expected to fall back to its default credential lookup.
HF_TOKEN    = os.environ.get("HF_TOKEN", "").strip() or None
# Hub account that owns the LoRA adapter. A blank value falls back to the
# default so LORA_REPO can never degrade to the invalid id "/gemma4-quantfin-lora".
HF_USERNAME = os.environ.get("HF_USERNAME", "").strip() or "mo35"
# Base checkpoint the adapter is applied on top of.
BASE_MODEL  = "google/gemma-4-E4B-it"
# Repository id of the fine-tuned LoRA adapter.
LORA_REPO   = f"{HF_USERNAME}/gemma4-quantfin-lora"

# ── Load model at startup ─────────────────────────────────────────────────────
# The tokenizer is taken from the base checkpoint, not from the adapter repo.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)

print("Loading base model in 4-bit...")
# 4-bit NF4 weights with double quantisation; compute dtype is bfloat16.
# NOTE(review): the file header names a T4 as target hardware — confirm that
# bfloat16 compute is the intended choice for that GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# Device placement is delegated to the loader via device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    token=HF_TOKEN,
    device_map="auto",
    quantization_config=bnb_config,
)

print(f"Loading LoRA adapter from {LORA_REPO}...")
# Wrap the quantised base model with the fine-tuned adapter weights, then
# switch to evaluation mode because this process only serves inference.
model = PeftModel.from_pretrained(base_model, LORA_REPO, token=HF_TOKEN)
model.eval()
print("Model ready.")

# ── Inference ─────────────────────────────────────────────────────────────────
def respond(message: str, history: list) -> str:
    """Generate the assistant's reply to the latest user message.

    Args:
        message: Text the user just submitted.
        history: Earlier turns of the conversation; each item is indexed by
            its ``"role"`` and ``"content"`` keys, any other keys are ignored.

    Returns:
        The newly generated text only (the prompt tokens are sliced off),
        decoded with special tokens removed.
    """
    # Rebuild the conversation with just the two keys the chat template needs,
    # then add the new user turn at the end.
    conversation = [
        {"role": turn["role"], "content": turn["content"]} for turn in history
    ]
    conversation.append({"role": "user", "content": message})

    # return_dict=True yields a mapping holding "input_ids" and, when the
    # tokenizer provides one, "attention_mask".
    batch = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    device = model.device
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    # Sampling-based decoding; gradients are not needed for inference.
    with torch.no_grad():
        generated = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.7,
            repetition_penalty=1.1,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = input_ids.shape[-1]
    completion_ids = generated[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)

# ── Gradio UI ─────────────────────────────────────────────────────────────────
# Blurb rendered under the title. Adjacent literals are concatenated by the
# parser, so the resulting string is a single paragraph.
_DESCRIPTION = (
    "A specialized AI assistant fine-tuned on quantitative finance: derivatives pricing, "
    "stochastic calculus, risk models, and portfolio theory. "
    "Answers include LaTeX mathematical derivations."
)

# Prompts offered as one-click examples in the chat UI.
_EXAMPLES = [
    "Derive the Black-Scholes PDE from first principles.",
    "Explain the SABR model and its implied volatility approximation.",
    "What is the difference between risk-neutral and real-world measures?",
    "Derive the Heston model characteristic function.",
    "Explain Value at Risk vs Expected Shortfall.",
]

# type="messages" matches how `respond` reads history: a list of dicts with
# "role" and "content" keys. Example caching is switched off explicitly.
demo = gr.ChatInterface(
    fn=respond,
    type="messages",
    title="Gemma 4 — Quantitative Finance",
    description=_DESCRIPTION,
    examples=_EXAMPLES,
    cache_examples=False,
    theme=gr.themes.Soft(),
)

# Start the web server; this call runs at module top level.
demo.launch()