| import os |
| import gradio as gr |
| from huggingface_hub import hf_hub_download |
| from llama_cpp import Llama |
|
|
# Hugging Face Hub coordinates of the quantized GGUF weights to load.
# NOTE(review): confirm this repo id and filename actually exist on the Hub.
MODEL_REPO = "Qwen/Qwen3-4B-Instruct-GGUF"
MODEL_FILE = "Qwen3-4B-Instruct-Q4_K_M.gguf"


print("Loading model...")


# Fetch the weights file; the returned path is handed straight to llama.cpp.
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
)


# Single module-level model instance shared by every chat request.
llm = Llama(
    model_path=model_path,
    n_ctx=1024,
    n_batch=512,
    # Use all cores but one, and never fewer than one thread.
    # os.cpu_count() returns None when the count cannot be determined, so
    # fall back to 1 instead of raising TypeError on `None - 1`.
    n_threads=max(1, (os.cpu_count() or 1) - 1),
    verbose=False,
)
|
|
# Persona name -> system prompt sent as the first message of a conversation.
PERSONAS = dict(
    Assistant="You are a helpful AI assistant.",
    Programmer="You are an expert software engineer.",
    Writer="You are a creative writer.",
    Teacher="You explain concepts clearly and step-by-step.",
)
|
|
# Tokens left free for the model's reply when trimming the prompt.
_REPLY_RESERVE_TOKENS = 256
# Rough per-message allowance for the chat template's role markers.
_TEMPLATE_OVERHEAD_TOKENS = 8


def _history_to_messages(history):
    """Convert chat history into a flat list of role/content dicts.

    Accepts pair-style history (``[(user, assistant), ...]``) as well as
    dict-style history (``[{"role": ..., "content": ...}, ...]``). Entries
    whose content is not a string (for example ``None``) are skipped.
    """
    converted = []
    for entry in history or []:
        if isinstance(entry, dict):
            candidates = [(entry.get("role"), entry.get("content"))]
        else:
            user_text, assistant_text = entry
            candidates = [("user", user_text), ("assistant", assistant_text)]
        for role, content in candidates:
            if role in ("user", "assistant") and isinstance(content, str):
                converted.append({"role": role, "content": content})
    return converted


def _estimated_tokens(entry):
    """Approximate how many prompt tokens a single message costs."""
    encoded = entry["content"].encode("utf-8")
    return len(llm.tokenize(encoded, add_bos=False)) + _TEMPLATE_OVERHEAD_TOKENS


def _trim_history(system_entry, past, current):
    """Return the prompt messages, keeping only the newest turns that fit.

    The system prompt and the current user message are always kept; older
    turns are added newest-first until the estimated prompt size would
    exceed the context window minus the reply reserve.
    """
    budget = llm.n_ctx() - _REPLY_RESERVE_TOKENS
    used = _estimated_tokens(system_entry) + _estimated_tokens(current)

    kept = []
    for entry in reversed(past):
        cost = _estimated_tokens(entry)
        if used + cost > budget:
            break
        used += cost
        kept.append(entry)
    kept.reverse()

    # Do not start the retained history with a reply whose question was cut.
    if kept and kept[0]["role"] == "assistant":
        kept.pop(0)

    return [system_entry, *kept, current]


def chat(message, history, persona):
    """Generate the assistant's reply to ``message``.

    Args:
        message: The new user message.
        history: Earlier turns, either as ``(user, assistant)`` pairs or as
            ``{"role", "content"}`` dicts; ``None`` is treated as empty.
        persona: A key of ``PERSONAS`` selecting the system prompt.

    Returns:
        The reply text from the first completion choice.

    Raises:
        KeyError: If ``persona`` is not a key of ``PERSONAS``.
    """
    system_entry = {"role": "system", "content": PERSONAS[persona]}
    current = {"role": "user", "content": message}

    # Sending the full history every turn eventually overflows the model's
    # context window, so only the most recent turns that fit are sent.
    messages = _trim_history(
        system_entry,
        _history_to_messages(history),
        current,
    )

    output = llm.create_chat_completion(
        messages=messages,
        temperature=0.8,
        top_p=0.95,
        max_tokens=1024,
        repeat_penalty=1.1,
    )

    return output["choices"][0]["message"]["content"]
|
|
# Build the chat UI: persona selector, transcript, input box and clear button.
with gr.Blocks() as demo:
    gr.Markdown("# Qwen 4B Chat")

    persona = gr.Dropdown(
        choices=list(PERSONAS),
        value="Assistant",
        label="Persona",
    )

    chatbot = gr.Chatbot(height=600)

    msg = gr.Textbox(
        placeholder="Type a message...",
    )

    clear = gr.Button("Clear")

    def respond(message, history, persona):
        """Handle a submitted message and return the updated UI state.

        Args:
            message: Text from the input box.
            history: Current chatbot transcript; ``None`` is treated as empty.
            persona: Selected persona name.

        Returns:
            A ``(textbox_value, history)`` tuple: an empty string to clear
            the input box, and the transcript with the new turn appended.
        """
        # Work on a copy so the caller's list is never mutated in place.
        transcript = list(history or [])

        # Skip the model call for empty or whitespace-only input.
        if not message or not message.strip():
            return "", transcript

        answer = chat(message, transcript, persona)
        # NOTE(review): appends a (user, assistant) pair, which assumes the
        # Chatbot uses pair-style history — confirm against the installed
        # Gradio version.
        transcript.append((message, answer))
        return "", transcript

    msg.submit(
        respond,
        [msg, chatbot, persona],
        [msg, chatbot],
    )

    # Reset the transcript to an empty conversation.
    clear.click(
        lambda: [],
        outputs=chatbot,
    )


demo.launch()