"""Minimal Gradio chat UI backed by a local Qwen2.5-1.5B (Q4 GGUF) model.

The model is loaded once at import time through llama-cpp-python; each
request is answered statelessly (no conversation history is kept).
"""

from llama_cpp import Llama
import gradio as gr

# Path to the quantized GGUF weights, resolved relative to the working directory.
MODEL_PATH = "qwen2.5-1.5B-q4.gguf"

# System prompt that keeps the small model from rambling past the answer.
SYSTEM_PROMPT = (
    "You are a helpful assistant. Answer ONLY the question. "
    "Do NOT continue, do NOT ask questions, do NOT add extra text."
)

# IMPORTANT: Use chat_format="qwen" (qwen2 is NOT supported)
model = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,
    n_gpu_layers=0,  # no layers offloaded to GPU
    chat_format="qwen",
)


def chat(user_input: str) -> str:
    """Return the model's reply to a single user message.

    Args:
        user_input: The question typed by the user.

    Returns:
        The assistant message content of the first completion choice, or an
        empty string when ``user_input`` is empty or only whitespace.
    """
    # Skip inference entirely for blank input: there is no question to answer.
    if not user_input or not user_input.strip():
        return ""

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_input},
    ]
    response = model.create_chat_completion(
        messages=messages,
        max_tokens=256,
        temperature=0.7,
    )
    return response["choices"][0]["message"]["content"]


demo = gr.Interface(
    fn=chat,
    inputs="text",
    outputs="text",
    title="Qwen2.5-1.5B Q4 Chatbot",
)

# Only start the web server when executed as a script, so importing this
# module (e.g. to reuse `chat`) does not launch the UI.
if __name__ == "__main__":
    demo.launch()