import os
os.environ["OMP_NUM_THREADS"] = "8" # fixes the harmless libgomp warning
os.environ["KMP_AFFINITY"] = "granularity=fine,compact,1,0"
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr
import spaces
import re
import html
# === Model config ===
MODEL_REPO = "MegaTronX/Qwen3-Deckard-Large-Almost-Human-6B-III-Final-OMEGA.i1-Q4_K_M_gguf"
MODEL_FILE = "Qwen3-Deckard-Large-Almost-Human-6B-III-Final-OMEGA.i1-Q4_K_M.gguf"
# Download the GGUF once at startup; hf_hub_download returns the local file path
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    local_dir="./models",
)
# Load the model globally once (much faster than reloading on every request)
llm = Llama(
    model_path=model_path,
    n_ctx=32768,       # the model card claims 256k+ context; 32k is plenty for most chats
    n_batch=1024,
    n_threads=8,
    n_gpu_layers=99,   # offload all layers to the GPU (the HF Spaces GPU has enough VRAM)
    flash_attn=True,
    verbose=False,
)
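# Optional smoke test (a hypothetical check, not part of the app flow): uncomment
# to verify the model loads and the GGUF's embedded chat template is applied.
# Kept commented out so Space startup stays fast.
# _probe = llm.create_chat_completion(
#     messages=[{"role": "user", "content": "Say hi in five words."}],
#     max_tokens=16,
# )
# print(_probe["choices"][0]["message"]["content"])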
# The exact chat template from the model card is already stored in the GGUF,
# so llama-cpp-python applies it automatically in create_chat_completion.
# (No llama-cpp-agent MessagesFormatterType is needed here.)
@spaces.GPU(duration=180)
def chat(message: str, history: list, temperature: float, top_p: float, max_tokens: int):
    # Build a proper message list for the chat template
    messages = []
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    # Let llama-cpp apply the exact template that is inside the GGUF
    output = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>", "<|endoftext|>"],
    )
    text = output["choices"][0]["message"]["content"]

    # ────── Pretty reasoning display ──────
    def format_response(text):
        # Collapse <think>...</think> blocks into a collapsible box
        def replacer(match):
            reasoning = html.escape(match.group(1).strip())
            return f"<details><summary>Show reasoning</summary><pre>{reasoning}</pre></details>"
        text = re.sub(r"<think>(.*?)</think>", replacer, text, flags=re.DOTALL | re.IGNORECASE)
        # Hide tool calls
        text = re.sub(r"<tool_call>.*?</tool_call>", "[tool use hidden]", text, flags=re.DOTALL)
        return text.strip()

    # The Chatbot output expects the full updated history, not a bare string
    return history + [(message, format_response(text))]
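# For illustration of what format_response does: a raw completion like
#     "<think>The user greeted me.</think>Hello!"
# becomes
#     "<details><summary>Show reasoning</summary><pre>The user greeted me.</pre></details>Hello!"
# which the Chatbot can then render as a collapsible reasoning box above the answer.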
# ────── Gradio UI ──────
with gr.Blocks(title="Qwen3-Deckard 6B – Almost Human III Final Ω", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Qwen3-Deckard-Large-Almost-Human-6B-III-Final-OMEGA")
    #gr.Markdown("Fully uncensored • 256k context • Tool-calling ready • Running on A100")
    chatbot = gr.Chatbot(height=600)

    with gr.Row():
        msg = gr.Textbox(
            label="Message",
            placeholder="Ask me anything…",
            lines=2,
            container=False,
            scale=7,
        )
        submit = gr.Button("Send", variant="primary", scale=1)
    with gr.Accordion("Parameters", open=False):
        temperature = gr.Slider(0.1, 1.5, 0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(0.01, 1.0, 0.95, step=0.01, label="Top-p")
        # Cap at n_ctx so the prompt plus generation always fit in the context window
        max_tokens = gr.Slider(512, 32768, 4096, step=512, label="Max new tokens")
    # Click or Enter to send; clear the textbox afterwards
    submit.click(chat, [msg, chatbot, temperature, top_p, max_tokens], chatbot).then(
        lambda: gr.update(value=""), None, msg
    )
    msg.submit(chat, [msg, chatbot, temperature, top_p, max_tokens], chatbot).then(
        lambda: gr.update(value=""), None, msg
    )
demo.queue(max_size=32).launch()
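# To run this outside Spaces (assuming a CUDA build of llama-cpp-python; the
# `spaces` package degrades to a no-op without Space hardware):
#     pip install llama-cpp-python gradio spaces huggingface_hub
#     python app.py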