"""Gradio chat front-end for the Qwen3-0.6B model running on CPU."""

import os
import re
import threading

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# ============================================================
# Qwen3-0.6B – Fast Chat
# ============================================================

MODEL_ID = "Qwen/Qwen3-0.6B"

# Populated lazily by load_model() on the first request.
model = None
tokenizer = None

# Serialises the first load: the UI accepts several concurrent requests, and
# without a guard each of them would start its own load of the weights.
_load_lock = threading.Lock()

# Matches a complete <think ...> ... </think> block, including multi-line ones.
_THINK_BLOCK_RE = re.compile(r"<think[^>]*>.*?</think[^>]*>", flags=re.DOTALL)


def load_model():
    """Load the tokenizer and model into the module globals, once.

    Subsequent calls return immediately. The Hugging Face access token is
    read from the ``HF_TOKEN`` environment variable when it is set.
    """
    global model, tokenizer

    if model is not None:
        return

    with _load_lock:
        # Another request may have finished loading while we were waiting.
        if model is not None:
            return

        token = os.getenv("HF_TOKEN", None)
        print("Loading Qwen3-0.6B ...")

        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=token)
        loaded_model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            dtype=torch.float32,
            low_cpu_mem_usage=True,
            token=token,
        ).to("cpu")
        loaded_model.eval()

        # Publish the tokenizer first: callers treat a non-None ``model`` as
        # the signal that both globals are ready to use.
        tokenizer = loaded_tokenizer
        model = loaded_model
        print("Model loaded.")


def strip_thinking(text: str) -> str:
    """Remove <think>...</think> blocks from Qwen3 output.

    Returns the remaining text with leading and trailing whitespace removed.
    Text outside the blocks (including any literal ``>`` characters) is kept.
    """
    return _THINK_BLOCK_RE.sub("", text).strip()


def normalize_content(msg: dict) -> dict:
    """Convert list content to string (Gradio may pass content as a list).

    When ``msg["content"]`` is a list, the ``"text"`` value of each dict item
    and each plain string item are joined with single spaces; other items are
    dropped. Messages whose content is not a list are returned unchanged.
    """
    content = msg.get("content")
    if not isinstance(content, list):
        return msg

    parts = []
    for item in content:
        if isinstance(item, dict) and "text" in item:
            parts.append(item["text"])
        elif isinstance(item, str):
            parts.append(item)
    return {"role": msg["role"], "content": " ".join(parts)}


def chat_response(message, history):
    """Generate the assistant reply for ``message`` and return the new history.

    Args:
        message: The text the user just submitted.
        history: Previous conversation as a list of role/content dicts.

    Returns:
        The normalized history followed by the new user message and the
        assistant reply, as a list of role/content dicts.
    """
    load_model()

    # Normalize history: convert any list content to plain strings.
    clean_history = [normalize_content(m) for m in history]
    messages = clean_history + [{"role": "user", "content": message}]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,  # disable thinking mode
    )
    inputs = tokenizer(text, return_tensors="pt").to("cpu")

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # generate() returns prompt + completion; keep only the new tokens.
    prompt_length = inputs.input_ids.shape[1]
    output_ids = generated_ids[0][prompt_length:]
    response = tokenizer.decode(output_ids, skip_special_tokens=True)

    # Fallback: strip any <think> blocks if they still appear.
    response = strip_thinking(response)

    return messages + [{"role": "assistant", "content": response}]


# ============================================================
# Gradio Interface
# ============================================================

with gr.Blocks(title="Qwen3-0.6B Fast Chat") as demo:
    gr.Markdown("""
    # ⚡ Qwen3-0.6B – Fast Chat
    Small and fast model. Great for quick answers on CPU.
    """)

    chatbot = gr.Chatbot(label="Conversation")
    msg = gr.Textbox(
        label="Your Message",
        placeholder="Type your message and press Enter...",
    )
    clear = gr.Button("Clear Conversation")

    # Run the model, then empty the input box once the reply is shown.
    msg.submit(
        chat_response,
        [msg, chatbot],
        chatbot,
        concurrency_limit=3,
    ).then(
        lambda: "", None, msg
    )
    clear.click(lambda: [], None, chatbot)


if __name__ == "__main__":
    demo.queue(default_concurrency_limit=3)
    demo.launch(server_name="0.0.0.0")