import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

# Initialize model and tokenizer
MODEL_ID = "abdelac/Mistral_Test"


def load_model():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    return tokenizer, model

tokenizer, model = load_model()

def respond(message, history):
    # Format chat history
    prompt = ""
    for user_msg, assistant_msg in history:
        prompt += f"Human: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"Human: {message}\nAssistant:"
    
    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    # Generate
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
    # Decode
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response

# Create chat interface
gr.ChatInterface(
    respond,
    title="TinyLlama Chat",
    description="Chat with TinyLlama model",
).launch()