"""Gradio chat demo for a 4-bit quantized Mistral Q&A model (BitNet topics).

Loads `ogflash/merged-mistral-4bit-bitnetQnA` with bitsandbytes NF4
quantization and serves a simple instruction/response chat UI.
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import gradio as gr

MODEL_ID = "ogflash/merged-mistral-4bit-bitnetQnA"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# NF4 4-bit weights with fp16 compute — matches how the checkpoint was merged.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
)
model.eval()


def _build_prompt(message, history):
    """Render the chat history plus the new message in Alpaca-style
    ### Instruction / ### Response format, ending with an open Response
    section for the model to complete.

    `history` is a list of (user, bot) string pairs.
    """
    turns = [
        f"### Instruction:\n{user}\n\n### Response:\n{bot}\n\n"
        for user, bot in history
    ]
    turns.append(f"### Instruction:\n{message}\n\n### Response:\n")
    return "".join(turns)


def respond(message, history):
    """Generate a reply to `message` given prior (user, bot) turns.

    Returns the model's response text only (prompt stripped).
    """
    prompt = _build_prompt(message, history)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # inference_mode: no autograd state is needed for generation.
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            do_sample=True,
            temperature=0.7,
        )

    # Decode only the newly generated tokens. Slicing by input length is
    # robust even if the user's message contains the literal string
    # "### Response:", which would confuse a split()-based extraction.
    prompt_len = inputs["input_ids"].shape[1]
    response = tokenizer.decode(
        outputs[0][prompt_len:], skip_special_tokens=True
    ).strip()
    return response


with gr.Blocks() as demo:
    gr.Markdown("# BitNet Q&A Chatbot")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask about 1-bit LLMs or BitNet...")
    clear = gr.Button("Clear")
    history = gr.State([])

    def user_submit(user_message, chat_history):
        """Handle a submitted message: generate a reply, append the turn,
        and clear the input box. Returns (textbox, chatbot, state) values.
        """
        response = respond(user_message, chat_history)
        chat_history.append((user_message, response))
        return "", chat_history, chat_history

    msg.submit(user_submit, [msg, history], [msg, chatbot, history])
    clear.click(lambda: ([], []), outputs=[chatbot, history])


if __name__ == "__main__":
    # Guarded so importing this module (e.g. for testing _build_prompt)
    # does not start a web server.
    demo.launch()