"""Minimal Gradio chat UI around a local quantized Qwen2.5 GGUF model.

Runs CPU-only inference through llama-cpp-python and exposes a single
text-in / text-out interface.
"""

from llama_cpp import Llama
import gradio as gr

# Model is loaded once at startup so every request reuses the same weights.
# n_gpu_layers=0 forces CPU-only inference; n_ctx is the context window in tokens.
model = Llama(
    model_path="qwen2.5-1.5B-q4.gguf",
    n_ctx=4096,
    n_gpu_layers=0,
)


def chat(prompt: str, max_tokens: int = 256, temperature: float = 0.7) -> str:
    """Generate a completion for *prompt* and return the generated text.

    Args:
        prompt: Raw text fed directly to the model (no chat template applied).
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; higher values increase randomness.

    Returns:
        The text of the model's first completion choice.
    """
    out = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return out["choices"][0]["text"]


def main() -> None:
    """Build and launch the Gradio web interface (blocks until shut down)."""
    gr.Interface(
        fn=chat,
        inputs="text",
        outputs="text",
        title="Qwen2.5-1.5B Q4 Chatbot",
    ).launch()


# Guard the launch so importing this module does not start a web server.
if __name__ == "__main__":
    main()