import os
import re
import threading

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# ============================================================
# Qwen3-0.6B – Fast Chat
# ============================================================
# Model identifier passed to from_pretrained() by load_model().
MODEL_ID = "Qwen/Qwen3-0.6B"
# Lazily-initialized module-level singletons: both stay None until
# load_model() assigns them on first use.
model = None
tokenizer = None

# Serializes first-time loading. The submit handler is registered with
# concurrency_limit=3, so several requests can reach load_model() at once.
_MODEL_LOAD_LOCK = threading.Lock()


def load_model():
    """Load the tokenizer and model into the module globals exactly once.

    The first call reads the optional ``HF_TOKEN`` environment variable,
    loads ``MODEL_ID`` in float32, moves the model to the CPU and switches
    it to eval mode. Later calls return immediately.

    Concurrent first calls are serialized by ``_MODEL_LOAD_LOCK`` so the
    checkpoint is loaded only once, and the globals are assigned only after
    the model is fully prepared.
    """
    global model, tokenizer

    # Fast path: already loaded, no need to take the lock.
    if model is not None:
        return

    with _MODEL_LOAD_LOCK:
        # Re-check under the lock: another thread may have finished loading
        # while this one was waiting.
        if model is not None:
            return

        token = os.getenv("HF_TOKEN")
        print("Loading Qwen3-0.6B ...")
        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=token)
        loaded_model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            dtype=torch.float32,
            low_cpu_mem_usage=True,
            token=token,
        ).to("cpu")
        loaded_model.eval()

        # Publish the tokenizer first and the model last: `model` is the
        # "ready" flag checked by the fast path above, so it must only
        # become non-None once everything else is in place.
        tokenizer = loaded_tokenizer
        model = loaded_model
        print("Model loaded.")

# Matches an opening tag starting with "<think", everything up to the nearest
# closing tag starting with "</think", and that closing tag. DOTALL lets the
# span cross line breaks.
_THINK_BLOCK_RE = re.compile(r'<think[^>]*>.*?</think[^>]*>', re.DOTALL)


def strip_thinking(text):
    """Return *text* with every complete think block removed and outer whitespace trimmed.

    Only spans that have both an opening and a closing tag are removed; an
    opening tag with no matching close is left in place.
    """
    without_reasoning = _THINK_BLOCK_RE.sub('', text)
    return without_reasoning.strip()

def normalize_content(msg):
    """Flatten a message whose ``content`` is a list into a plain-string message.

    When ``msg["content"]`` is a list, the ``"text"`` value of each dict item
    that has one, and each bare string item, are joined with single spaces
    into a new ``{"role", "content"}`` dict; any other item is skipped.
    Messages whose content is not a list are returned unchanged (same object).
    """
    content = msg.get("content")
    if not isinstance(content, list):
        return msg

    texts = [
        piece["text"] if isinstance(piece, dict) else piece
        for piece in content
        if (isinstance(piece, dict) and "text" in piece) or isinstance(piece, str)
    ]
    return {"role": msg["role"], "content": " ".join(texts)}

def chat_response(message, history):
    """Generate the assistant's reply and return the updated conversation.

    Args:
        message: Text submitted by the user.
        history: Prior conversation as a list of ``{"role", "content"}``
            dicts. ``None`` is treated as an empty conversation.

    Returns:
        A new list holding the normalized history, the user's message and
        the assistant's reply. If ``message`` is empty or whitespace-only,
        the history is returned as-is and the model is not invoked.
    """
    history = history or []

    # A blank submission would otherwise load the model and sample up to
    # 512 tokens on CPU for an empty prompt; skip it.
    if not message or not message.strip():
        return history

    load_model()

    # Flatten list-style content so every message carries a plain string.
    messages = [normalize_content(turn) for turn in history]
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,  # disable thinking mode
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # The generated sequence starts with the prompt tokens; slice them off
    # so only the newly generated continuation is decoded.
    prompt_len = len(inputs.input_ids[0])
    response = tokenizer.decode(
        generated_ids[0][prompt_len:], skip_special_tokens=True
    )

    # Fallback: strip any <think...> blocks if they still appear.
    response = strip_thinking(response)

    return messages + [{"role": "assistant", "content": response}]

# ============================================================
# Gradio Interface
# ============================================================
with gr.Blocks(title="Qwen3-0.6B Fast Chat") as demo:
    # Page header.
    gr.Markdown("""
    # ⚡ Qwen3-0.6B – Fast Chat
    Small and fast model. Great for quick answers on CPU.
    """)

    # Widgets: conversation view, message input, reset button.
    chatbot = gr.Chatbot(label="Conversation")
    msg = gr.Textbox(
        label="Your Message",
        placeholder="Type your message and press Enter...",
    )
    clear = gr.Button("Clear Conversation")

    # Submitting the textbox runs chat_response on (message, conversation)
    # and writes the result back to the conversation view; the follow-up
    # step then blanks the textbox.
    msg.submit(
        fn=chat_response,
        inputs=[msg, chatbot],
        outputs=chatbot,
        concurrency_limit=3,
    ).then(
        fn=lambda: "",
        inputs=None,
        outputs=msg,
    )

    # The reset button replaces the conversation with an empty list.
    clear.click(fn=lambda: [], inputs=None, outputs=chatbot)

# Start the web server only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    # Configure the request queue with a default concurrency limit of 3.
    demo.queue(default_concurrency_limit=3)
    # "0.0.0.0" is the wildcard address: listen on all network interfaces
    # rather than only on localhost.
    demo.launch(server_name="0.0.0.0")