"""Gradio chat UI backed by a local Qwen GGUF model served through llama-cpp-python.

On start-up the script downloads the GGUF file from the Hugging Face Hub,
loads it with llama.cpp, builds a small Blocks UI (persona selector, chat
window, text box, clear button) and launches the web server.
"""

import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

MODEL_REPO = "Qwen/Qwen3-4B-Instruct-GGUF"
MODEL_FILE = "Qwen3-4B-Instruct-Q4_K_M.gguf"

# Size of the llama.cpp context window (prompt + generated tokens).
N_CTX = 1024
# Upper bound on generated tokens per reply. It equals N_CTX, so in practice
# the reply is limited by whatever part of the window the prompt leaves free.
MAX_NEW_TOKENS = 1024
# Tokens kept free for the reply when deciding how much history to replay.
RESPONSE_RESERVE = 256
# Rough allowance for the chat-template markers wrapped around each message.
# NOTE(review): this is an estimate, not a value read from the template.
PER_MESSAGE_OVERHEAD = 8

DEFAULT_PERSONA = "Assistant"

print("Loading model...")

model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
)

llm = Llama(
    model_path=model_path,
    n_ctx=N_CTX,
    n_batch=512,
    # os.cpu_count() may return None; fall back so the subtraction cannot fail.
    n_threads=max(1, (os.cpu_count() or 2) - 1),
    verbose=False,
)

PERSONAS = {
    "Assistant": "You are a helpful AI assistant.",
    "Programmer": "You are an expert software engineer.",
    "Writer": "You are a creative writer.",
    "Teacher": "You explain concepts clearly and step-by-step.",
}


def _message_cost(text):
    """Estimate how many context tokens one chat message with *text* uses."""
    tokens = llm.tokenize(text.encode("utf-8"), add_bos=False)
    return len(tokens) + PER_MESSAGE_OVERHEAD


def _trim_history(history, budget):
    """Return the most recent (user, assistant) turns that fit in *budget* tokens.

    Turns are considered newest-first and kept whole; the first turn that
    does not fit ends the scan, so the kept turns are always contiguous.
    """
    kept = []
    used = 0
    for user, assistant in reversed(history):
        cost = sum(_message_cost(part) for part in (user, assistant) if part)
        if used + cost > budget:
            break
        kept.append((user, assistant))
        used += cost
    kept.reverse()
    return kept


def chat(message, history, persona):
    """Generate the assistant's reply to *message*.

    Args:
        message: The new user message.
        history: Earlier turns as (user, assistant) pairs, oldest first.
        persona: Key into PERSONAS selecting the system prompt; a missing
            or unknown key falls back to DEFAULT_PERSONA.

    Returns:
        The text content of the first completion choice.
    """
    system_prompt = PERSONAS.get(persona, PERSONAS[DEFAULT_PERSONA])

    # Replaying the whole history would eventually overflow the context
    # window, so only the newest turns that fit the budget are sent.
    fixed_cost = _message_cost(system_prompt) + _message_cost(message)
    budget = N_CTX - RESPONSE_RESERVE - fixed_cost

    messages = [{"role": "system", "content": system_prompt}]
    for user, assistant in _trim_history(history or [], budget):
        # Skip missing or empty halves of a turn rather than sending them.
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    output = llm.create_chat_completion(
        messages=messages,
        temperature=0.8,
        top_p=0.95,
        max_tokens=MAX_NEW_TOKENS,
        repeat_penalty=1.1,
    )
    return output["choices"][0]["message"]["content"]


with gr.Blocks() as demo:
    gr.Markdown("# Qwen 4B Chat")

    persona = gr.Dropdown(
        choices=list(PERSONAS.keys()),
        value=DEFAULT_PERSONA,
        label="Persona",
    )
    chatbot = gr.Chatbot(height=600)
    msg = gr.Textbox(
        placeholder="Type a message...",
    )
    clear = gr.Button("Clear")

    def respond(message, history, persona):
        """Handle a submit: append the new turn and clear the text box.

        Returns:
            A pair of (new text box value, updated chat history).
        """
        # Work on a copy so the caller's list is not mutated.
        history = list(history or [])

        # Ignore blank submissions instead of running the model on them.
        if not message or not message.strip():
            return "", history

        try:
            answer = chat(message, history, persona)
        except ValueError as err:
            # NOTE(review): llama-cpp-python is expected to raise ValueError
            # when the prompt alone exceeds the context window; show it in
            # the UI rather than as an opaque failure.
            raise gr.Error(str(err)) from err

        history.append((message, answer))
        return "", history

    msg.submit(
        respond,
        [msg, chatbot, persona],
        [msg, chatbot],
    )
    clear.click(
        lambda: [],
        outputs=chatbot,
    )

# Left unguarded on purpose: the original script launches at module level.
demo.launch()