import os

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Configuration via environment variables (e.g. the Space's "Variables" settings).
BASE_MODEL = os.getenv("BASE_MODEL", "Qwen/Qwen2.5-7B")
ADAPTER_REPO = os.getenv("ADAPTER_REPO", "your-username/tt-qwen25-7b-tt-lora")
LOAD_IN_4BIT = os.getenv("LOAD_IN_4BIT", "true").lower() == "true"


def load_model():
    """Load the base model (4-bit -> 8-bit -> FP16 fallback) and attach the LoRA adapter."""
    tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token

    base = None
    if LOAD_IN_4BIT:
        try:
            bnb_cfg = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,  # float16 compute for Spaces GPUs
            )
            base = AutoModelForCausalLM.from_pretrained(
                BASE_MODEL, quantization_config=bnb_cfg, device_map="auto"
            )
            print("Loaded base in 4-bit NF4")
        except Exception as e:
            print("[warn] 4-bit failed:", e)

    if base is None:
        # Fall back to 8-bit, then to plain FP16 (which may offload layers to CPU).
        try:
            bnb8 = BitsAndBytesConfig(load_in_8bit=True)
            base = AutoModelForCausalLM.from_pretrained(
                BASE_MODEL, quantization_config=bnb8, device_map="auto"
            )
            print("Loaded base in 8-bit")
        except Exception as e:
            print("[warn] 8-bit failed:", e)
            base = AutoModelForCausalLM.from_pretrained(
                BASE_MODEL, torch_dtype=torch.float16, device_map="auto"
            )
            print("Loaded base in FP16 (may offload to CPU)")

    base.config.pad_token_id = tok.pad_token_id

    model = PeftModel.from_pretrained(
        base, ADAPTER_REPO, is_trainable=False, torch_dtype=torch.float16
    )
    # .to(dtype=...) is not supported on bitsandbytes-quantized models,
    # so only cast explicitly when the FP16 fallback path was taken.
    if not getattr(base, "is_loaded_in_4bit", False) and not getattr(base, "is_loaded_in_8bit", False):
        model = model.to(dtype=torch.float16)
    model.eval()
    return tok, model


tok, model = load_model()


def format_prompt(user, system, mode):
    """Build generation inputs using either the Qwen chat template or the SFT-style tags."""
    if mode == "Qwen chat":
        msgs = [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ]
        input_ids = tok.apply_chat_template(msgs, add_generation_prompt=True, return_tensors="pt")
        attn = torch.ones_like(input_ids)
        return {
            "input_ids": input_ids.to(model.device),
            "attention_mask": attn.to(model.device),
        }
    else:
        # Tag format used during SFT.
        prompt = f"<|system|> {system}\n<|user|> {user}\n<|assistant|>"
        enc = tok(prompt, return_tensors="pt")
        return {
            "input_ids": enc["input_ids"].to(model.device),
            "attention_mask": enc["attention_mask"].to(model.device),
        }


@torch.inference_mode()
def respond(message, history, system_prompt, mode, temperature, top_p, rep_penalty, max_new_tokens):
    """ChatInterface callback: generate a reply to the latest user message (history is unused)."""
    inputs = format_prompt(message, system_prompt, mode)
    with torch.autocast("cuda", dtype=torch.float16):  # assumes a CUDA device is available
        out = model.generate(
            **inputs,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=rep_penalty,
            max_new_tokens=max_new_tokens,
            pad_token_id=tok.pad_token_id,
            eos_token_id=tok.eos_token_id,
            no_repeat_ngram_size=4,
        )
    # Decode only the newly generated tokens, not the prompt.
    gen_only = out[0][inputs["input_ids"].shape[1]:]
    text = tok.decode(gen_only, skip_special_tokens=True)
    return text


with gr.Blocks() as demo:
    gr.Markdown("## Татарча чат-демо (Qwen2.5-7B + LoRA)")
    gr.Markdown(
        "Бета-версия. Модель обучена отвечать **по-татарски**. "
        "Если переключаться на русский/английский — это ошибка; сообщите нам примеры."
    )
    with gr.Row():
        system_prompt = gr.Textbox(
            value="Син бары тик татарча гына җавап бир. Җавапларың кыска һәм нейтраль булсын.",
            label="System prompt",
        )
        mode = gr.Radio(choices=["SFT tags", "Qwen chat"], value="SFT tags", label="Формат промпта")
    with gr.Row():
        temperature = gr.Slider(0.1, 1.2, value=0.7, step=0.05, label="temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
        rep_penalty = gr.Slider(1.0, 1.4, value=1.15, step=0.05, label="repetition_penalty")
        max_new_tokens = gr.Slider(16, 512, value=200, step=8, label="max_new_tokens")
    gr.ChatInterface(
        fn=respond,
        additional_inputs=[system_prompt, mode, temperature, top_p, rep_penalty, max_new_tokens],
        title=None,
        # These button kwargs exist in Gradio 4.x (they were removed in 5.x).
        undo_btn=None,
        retry_btn=None,
        clear_btn="Clear",
    )

# Gradio 4.x renamed queue(concurrency_count=...) to default_concurrency_limit.
demo.queue(default_concurrency_limit=1).launch()
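# ---------------------------------------------------------------------------
# Optional sketch (not part of the Space itself): calling the generation path
# directly, e.g. from a separate script or notebook that imports this file.
# The module name "app" and the sample values are assumptions, not part of the
# original; demo.launch() above blocks, so run this elsewhere or comment the
# launch out first. respond() ignores the history argument.
# ---------------------------------------------------------------------------
# from app import respond
#
# reply = respond(
#     message="Сәлам!",
#     history=[],
#     system_prompt="Син бары тик татарча гына җавап бир.",
#     mode="SFT tags",
#     temperature=0.7,
#     top_p=0.9,
#     rep_penalty=1.15,
#     max_new_tokens=128,
# )
# print(reply)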