import os

# Pin llama.cpp's CPU work to 8 threads on adjacent cores. These must be set
# before llama_cpp is imported so the OpenMP runtime picks them up.
os.environ["OMP_NUM_THREADS"] = "8"
os.environ["KMP_AFFINITY"] = "granularity=fine,compact,1,0"
|
|
import re
import html

from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr
import spaces
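# `spaces` is the Hugging Face Spaces helper that provides the @spaces.GPU
# decorator used below to request ZeroGPU time for each call.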
|
|
MODEL_REPO = "MegaTronX/Qwen3-Deckard-Large-Almost-Human-6B-III-Final-OMEGA.i1-Q4_K_M_gguf"
MODEL_FILE = "Qwen3-Deckard-Large-Almost-Human-6B-III-Final-OMEGA.i1-Q4_K_M.gguf"
|
|
|
|
# Fetch the quantized model once at startup; hf_hub_download returns the
# resolved local file path, which we hand straight to Llama below.
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    local_dir="./models",
)
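# Reruns reuse the copy already in ./models rather than re-downloading the
# multi-gigabyte GGUF.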
|
|
llm = Llama(
    model_path=model_path,
    n_ctx=32768,       # full 32k context window (prompt + completion share it)
    n_batch=1024,      # prompt-processing batch size
    n_threads=8,       # matches OMP_NUM_THREADS above
    n_gpu_layers=99,   # offload every layer to the GPU
    flash_attn=True,
    verbose=False,
)
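# Assumption: a GPU is available (n_gpu_layers=99 offloads everything). For a
# CPU-only run, a reasonable fallback is n_gpu_layers=0 and flash_attn=False.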
|
|
# No explicit chat formatter is needed: create_chat_completion applies the
# Qwen2-style chat template embedded in the GGUF's metadata.
|
|
@spaces.GPU(duration=180)
def chat(message: str, history: list, temperature: float, top_p: float, max_tokens: int):
    # Rebuild the conversation as OpenAI-style message dicts from Gradio's
    # (user, assistant) tuple history, then append the new user turn.
    messages = []
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
|
|
    output = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>", "<|endoftext|>"],
    )
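    # Note: passing stream=True here would instead return an iterator of
    # chunks; a streaming UI would accumulate
    # chunk["choices"][0]["delta"].get("content", "") as partial output.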
|
|
    text = output["choices"][0]["message"]["content"]
|
|
    def format_response(text):
        # Fold <think>…</think> reasoning into a collapsible HTML block,
        # escaping its contents so model output cannot inject markup.
        def replacer(match):
            reasoning = html.escape(match.group(1).strip())
            return f"<details><summary>Show reasoning</summary><pre>{reasoning}</pre></details>"

        text = re.sub(r"<think>(.*?)</think>", replacer, text, flags=re.DOTALL | re.IGNORECASE)
        # Hide raw tool-call payloads from the transcript.
        text = re.sub(r"<tool_call>.*?</tool_call>", "[tool use hidden]", text, flags=re.DOTALL)
        return text.strip()
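    # For example, format_response("<think>plan steps</think>Final answer")
    # yields "<details><summary>Show reasoning</summary><pre>plan steps</pre></details>Final answer".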
|
|
    # The event handlers below write this return value into the Chatbot, so
    # return the updated tuple history rather than the bare reply string.
    return history + [(message, format_response(text))]
|
|
| |
| with gr.Blocks(title="Qwen3-Deckard 6B β Almost Human III Final Ξ©", theme=gr.themes.Soft()) as demo: |
| gr.Markdown("# Qwen3-Deckard-Large-Almost-Human-6B-III-Final-OMEGA") |
| |
|
|
    # sanitize_html=False lets the <details> reasoning block render as HTML.
    chatbot = gr.Chatbot(height=600, sanitize_html=False)
    with gr.Row():
        msg = gr.Textbox(
            label="Message",
            placeholder="Ask me anything…",
            lines=2,
            container=False,
            scale=7,
        )
        submit = gr.Button("Send", variant="primary", scale=1)
|
|
    with gr.Accordion("Parameters", open=False):
        temperature = gr.Slider(0.1, 1.5, 0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(0.01, 1.0, 0.95, step=0.01, label="Top-p")
        # Cap new tokens below the 32k context window, since prompt tokens
        # count against it too.
        max_tokens = gr.Slider(512, 32768, 4096, step=512, label="Max new tokens")
|
|
    # Both the Send button and Enter run the same pipeline, then clear the box.
    submit.click(chat, [msg, chatbot, temperature, top_p, max_tokens], chatbot).then(
        lambda: gr.update(value=""), None, msg
    )
    msg.submit(chat, [msg, chatbot, temperature, top_p, max_tokens], chatbot).then(
        lambda: gr.update(value=""), None, msg
    )
|
|
demo.queue(max_size=32).launch()