import os
os.environ["OMP_NUM_THREADS"] = "8" # fixes the harmless libgomp warning
os.environ["KMP_AFFINITY"] = "granularity=fine,compact,1,0"
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr
import spaces
import re
import html
# === Model config ===
MODEL_REPO = "MegaTronX/Qwen3-Deckard-Large-Almost-Human-6B-III-Final-OMEGA.i1-Q4_K_M_gguf"
MODEL_FILE = "Qwen3-Deckard-Large-Almost-Human-6B-III-Final-OMEGA.i1-Q4_K_M.gguf"
# Download the GGUF once at startup; hf_hub_download returns the local file path
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    local_dir="./models",
)
# Load the model globally once (much faster than reloading on every request)
llm = Llama(
    model_path=model_path,
    n_ctx=32768,       # the model card claims 256k+ context; 32k is plenty for most chats
    n_batch=1024,
    n_threads=8,
    n_gpu_layers=99,   # offload all layers to the GPU (the HF Spaces GPU has enough VRAM)
    flash_attn=True,
    verbose=False,
)
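# Optional smoke test (a hypothetical check, not part of the app flow): uncomment
# to verify the model loads and the GGUF's embedded chat template is applied.
# Kept commented out so Space startup stays fast.
# _probe = llm.create_chat_completion(
#     messages=[{"role": "user", "content": "Say hi in five words."}],
#     max_tokens=16,
# )
# print(_probe["choices"][0]["message"]["content"])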
# The exact chat template from the model card is already stored in the GGUF,
# so llama-cpp-python applies it automatically in create_chat_completion.
# (No llama-cpp-agent MessagesFormatterType is needed here.)
@spaces.GPU(duration=180)
def chat(message: str, history: list, temperature: float, top_p: float, max_tokens: int):
    # Build a proper message list for the chat template
    messages = []
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    # Let llama-cpp apply the exact template that is inside the GGUF
    output = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>", "<|endoftext|>"],
    )
    text = output["choices"][0]["message"]["content"]

    # ────── Pretty reasoning display ──────
    def format_response(text):
        # Collapse <think>...</think> blocks into a collapsible box
        def replacer(match):
            reasoning = html.escape(match.group(1).strip())
            return f"<details><summary>Show reasoning</summary><pre>{reasoning}</pre></details>"
        text = re.sub(r"<think>(.*?)</think>", replacer, text, flags=re.DOTALL | re.IGNORECASE)
        # Hide tool calls
        text = re.sub(r"<tool_call>.*?</tool_call>", "[tool use hidden]", text, flags=re.DOTALL)
        return text.strip()

    # The Chatbot output expects the full updated history, not a bare string
    return history + [(message, format_response(text))]
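# For illustration of what format_response does: a raw completion like
#     "<think>The user greeted me.</think>Hello!"
# becomes
#     "<details><summary>Show reasoning</summary><pre>The user greeted me.</pre></details>Hello!"
# which the Chatbot can then render as a collapsible reasoning box above the answer.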
# ────── Gradio UI ──────
with gr.Blocks(title="Qwen3-Deckard 6B – Almost Human III Final Ω", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Qwen3-Deckard-Large-Almost-Human-6B-III-Final-OMEGA")
    #gr.Markdown("Fully uncensored • 256k context • Tool-calling ready • Running on A100")
    chatbot = gr.Chatbot(height=600)

    with gr.Row():
        msg = gr.Textbox(
            label="Message",
            placeholder="Ask me anything…",
            lines=2,
            container=False,
            scale=7,
        )
        submit = gr.Button("Send", variant="primary", scale=1)
    with gr.Accordion("Parameters", open=False):
        temperature = gr.Slider(0.1, 1.5, 0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(0.01, 1.0, 0.95, step=0.01, label="Top-p")
        # Cap at n_ctx so the prompt plus generation always fit in the context window
        max_tokens = gr.Slider(512, 32768, 4096, step=512, label="Max new tokens")
    # Click or Enter to send; clear the textbox afterwards
    submit.click(chat, [msg, chatbot, temperature, top_p, max_tokens], chatbot).then(
        lambda: gr.update(value=""), None, msg
    )
    msg.submit(chat, [msg, chatbot, temperature, top_p, max_tokens], chatbot).then(
        lambda: gr.update(value=""), None, msg
    )
demo.queue(max_size=32).launch()
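# To run this outside Spaces (assuming a CUDA build of llama-cpp-python; the
# `spaces` package degrades to a no-op without Space hardware):
#     pip install llama-cpp-python gradio spaces huggingface_hub
#     python app.py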