import json import time import uuid from typing import Optional import gradio as gr import spaces import torch from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- MODEL_ID = "Qwen/Qwen3-14B" MODEL_ALIAS = "qwen3-14b-4bit" print(f"Loading tokenizer for {MODEL_ID} …") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) print(f"Loading model {MODEL_ID} in 4-bit …") bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16, ) model = AutoModelForCausalLM.from_pretrained( MODEL_ID, quantization_config=bnb_config, device_map="auto", ) model.eval() print("Model ready.") # --------------------------------------------------------------------------- # GPU generation functions — ZeroGPU anchors # --------------------------------------------------------------------------- @spaces.GPU def gradio_chat(message: str, history: list) -> str: hf_messages = [{"role": "user" if i % 2 == 0 else "assistant", "content": m} for i, m in enumerate([msg for pair in history for msg in pair] + [message])] prompt = tokenizer.apply_chat_template( hf_messages, tokenize=False, add_generation_prompt=True # NOTE: Qwen3-Coder is non-thinking only; enable_thinking is not supported. ) inputs = tokenizer(prompt, return_tensors="pt").to(model.device) with torch.no_grad(): output_ids = model.generate( **inputs, max_new_tokens=512, do_sample=True, temperature=0.7, top_p=0.9, pad_token_id=tokenizer.eos_token_id, ) new_ids = output_ids[0][inputs["input_ids"].shape[1]:] return tokenizer.decode(new_ids, skip_special_tokens=True) @spaces.GPU def _generate_response(prompt: str, gen_kwargs: dict) -> str: inputs = tokenizer(prompt, return_tensors="pt").to(model.device) with torch.no_grad(): output_ids = model.generate(**inputs, **gen_kwargs) new_ids = output_ids[0][inputs["input_ids"].shape[1]:] return tokenizer.decode(new_ids, skip_special_tokens=True) # --------------------------------------------------------------------------- # API functions # --------------------------------------------------------------------------- def list_models() -> str: """Returns a JSON string listing available models.""" result = { "object": "list", "data": [{"id": MODEL_ALIAS, "object": "model", "created": int(time.time()), "owned_by": "qwen"}], } return json.dumps(result) def chat_completions( messages_json: str, max_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.9, ) -> str: """ Non-streaming chat completions. Returns an OpenAI-compatible JSON string. messages_json: JSON array of {role, content} objects, e.g. '[{"role":"user","content":"Hello"}]' NOTE: Qwen3-14B supports thinking mode. Set enable_thinking=True in the chat template call if you want chain-of-thought reasoning. """ try: messages = json.loads(messages_json) except json.JSONDecodeError as e: return json.dumps({"error": f"Invalid messages_json: {e}"}) try: hf_messages = [{"role": m["role"], "content": m["content"]} for m in messages] prompt = tokenizer.apply_chat_template( hf_messages, tokenize=False, add_generation_prompt=True, ) except Exception as e: return json.dumps({"error": f"Prompt build failed: {e}"}) gen_kwargs = dict( max_new_tokens=max_tokens, temperature=temperature, top_p=top_p, do_sample=True, pad_token_id=tokenizer.eos_token_id, ) try: content = _generate_response(prompt, gen_kwargs) except Exception as e: return json.dumps({"error": f"Generation failed: {e}"}) cid = f"chatcmpl-{uuid.uuid4().hex}" result = { "id": cid, "object": "chat.completion", "created": int(time.time()), "model": MODEL_ALIAS, "choices": [{"index": 0, "message": {"role": "assistant", "content": content}, "finish_reason": "stop"}], "usage": {"prompt_tokens": -1, "completion_tokens": -1, "total_tokens": -1}, } return json.dumps(result) def health() -> str: """Returns a JSON health-check string.""" return json.dumps({"status": "ok", "model": MODEL_ID}) # --------------------------------------------------------------------------- # Gradio UI + API # --------------------------------------------------------------------------- with gr.Blocks(title=f"{MODEL_ALIAS} API") as demo: gr.Markdown(f""" # {MODEL_ALIAS} — Gradio API Endpoints (via Gradio built-in API): | api_name | Description | |----------|-------------| | `list_models` | List available models → JSON string | | `chat_completions` | Chat completions → JSON string | | `health` | Health check → JSON string | Call them at `/gradio_api/call/` (POST with `{{"data": [...]}}`) or use the Gradio Python client. You can also chat directly below. """) gr.ChatInterface(fn=gradio_chat) with gr.Row(visible=False): # -- health ------------------------------------------------------ _health_btn = gr.Button("health") _health_out = gr.Textbox() _health_btn.click(fn=health, inputs=[], outputs=[_health_out], api_name="health") # -- list_models ------------------------------------------------- _models_btn = gr.Button("list_models") _models_out = gr.Textbox() _models_btn.click(fn=list_models, inputs=[], outputs=[_models_out], api_name="list_models") with gr.Row(visible=False): # -- chat_completions -------------------------------------------- _cc_messages = gr.Textbox(label="messages_json") _cc_max_tokens = gr.Number(label="max_tokens", value=512) _cc_temp = gr.Number(label="temperature", value=0.7) _cc_top_p = gr.Number(label="top_p", value=0.9) _cc_out = gr.Textbox(label="result") _cc_btn = gr.Button("chat_completions") _cc_btn.click( fn=chat_completions, inputs=[_cc_messages, _cc_max_tokens, _cc_temp, _cc_top_p], outputs=[_cc_out], api_name="chat_completions", ) # --------------------------------------------------------------------------- # Entry-point # --------------------------------------------------------------------------- if __name__ == "__main__": demo.queue() demo.launch( server_name="0.0.0.0", server_port=7860, )