import gradio as gr
from ctransformers import AutoModelForCausalLM

# Model source: a repository id and the specific quantized GGUF weights file
# inside it (presumably resolved via the Hugging Face Hub -- confirm against
# the ctransformers `from_pretrained` documentation).
MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"

# The model is loaded once, at import time, and shared by every call to
# `respond` through this module-level `llm` object.
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO,
    model_file=MODEL_FILE,
    model_type="llama",
    # presumably 0 means no layers are offloaded to a GPU (CPU-only) -- confirm
    gpu_layers=0,
    # NOTE(review): TinyLlama's published context window appears to be 2048
    # tokens; confirm that requesting 4096 here is intentional.
    context_length=4096,
)

def _text_of(content) -> str:
    """Return the plain-text portion of one chat message's content.

    Strings pass through unchanged and ``None`` becomes ``""``.  A list or
    tuple is treated as a sequence of content blocks, and the ``"text"``
    values of its dict entries are joined with newlines.  Anything else is
    converted with ``str()``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # NOTE(review): newer Gradio releases appear to deliver content as a
        # list of {"type": "text", "text": ...} blocks -- verify against the
        # installed version.  Non-text blocks cannot go into a text prompt
        # and are skipped.
        texts = [
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        ]
        return "\n".join(texts)
    return str(content)


def _history_messages(history):
    """Normalize chat history into a list of ``(role, text)`` pairs.

    Two history shapes are accepted:

    * pair-style turns, ``[user_msg, bot_msg]`` (list or tuple);
    * message-style turns, ``{"role": ..., "content": ...}`` dicts.

    Turns of any other shape, unknown roles, and empty messages are skipped.
    """
    messages = []
    for turn in history or ():
        if isinstance(turn, dict):
            role = turn.get("role")
            text = _text_of(turn.get("content"))
            if role in ("system", "user", "assistant") and text:
                messages.append((role, text))
        elif isinstance(turn, (list, tuple)) and len(turn) >= 2:
            # zip stops after two items, so extra elements in a turn are ignored.
            for role, content in zip(("user", "assistant"), turn):
                text = _text_of(content)
                if text:
                    messages.append((role, text))
    return messages


def respond(message: str, history) -> str:
    """Generate the assistant's reply to ``message`` given the chat so far.

    Args:
        message: The new user message.
        history: Earlier turns, either as ``[user, bot]`` pairs or as
            ``{"role", "content"}`` dicts.  ``None`` or an empty list means
            there is no prior conversation.

    Returns:
        The generated reply with surrounding whitespace removed.
    """
    # TinyLlama-1.1B-Chat-v1.0 is documented to use the Zephyr chat template:
    # each message is "<|role|>\n{text}</s>\n" and the prompt ends with an
    # open "<|assistant|>\n" header for the model to complete.
    parts = [
        f"<|{role}|>\n{text}</s>\n"
        for role, text in _history_messages(history)
    ]
    parts.append(f"<|user|>\n{message}</s>\n<|assistant|>\n")
    prompt = "".join(parts)

    out = llm(
        prompt,
        max_new_tokens=64,
        temperature=0.7,
        top_p=0.9,
        # Stop at the end-of-turn marker or before the model starts writing
        # a made-up next user turn.
        stop=["</s>", "<|user|>"],
    )

    # Kept from the original: accept a dict carrying a "text" key as well as
    # a plain string result.
    if isinstance(out, dict) and "text" in out:
        reply = out["text"]
    else:
        reply = out
    return str(reply).strip()

# Chat UI built around `respond`, which takes the new message and the
# conversation history as its two arguments.
demo = gr.ChatInterface(respond)

# Launch the app only when this file is executed as a script, so importing
# the module does not start it.
if __name__ == "__main__":
    demo.launch()