# Captured from a Hugging Face Space page (Space status at capture time: Sleeping).
import os
import re
import threading

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# ============================================================
# Qwen3-0.6B – Fast Chat
# ============================================================
# Hugging Face Hub identifier of the checkpoint this app serves.
MODEL_ID = "Qwen/Qwen3-0.6B"

# Filled in by load_model() on first use; both stay None until then.
tokenizer = None
model = None
# Guards the one-time load. The submit handler is registered with
# concurrency_limit=3, so several requests can reach load_model() at once.
_MODEL_LOAD_LOCK = threading.Lock()


def load_model():
    """Load the Qwen3 tokenizer and model into the module globals, once.

    The first caller downloads/loads the checkpoint named by ``MODEL_ID``
    onto the CPU in float32 and switches it to eval mode. Later calls
    return immediately. Callers that arrive while a load is in progress
    block on a lock instead of starting a second load.

    The optional ``HF_TOKEN`` environment variable is passed to
    ``from_pretrained`` as the Hub access token.

    Returns:
        None. The results are stored in the globals ``model`` and
        ``tokenizer``.
    """
    global model, tokenizer

    # Fast path: `model` is assigned last, so non-None means fully ready.
    if model is not None:
        return

    with _MODEL_LOAD_LOCK:
        # Another request may have finished loading while we were waiting.
        if model is not None:
            return

        token = os.getenv("HF_TOKEN")
        print("Loading Qwen3-0.6B ...")
        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=token)
        loaded_model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            dtype=torch.float32,
            low_cpu_mem_usage=True,
            token=token,
        ).to("cpu")
        loaded_model.eval()

        # Publish the tokenizer first and the model last: `model` doubles as
        # the "ready" flag checked by the fast path above.
        tokenizer = loaded_tokenizer
        model = loaded_model
        print("Model loaded.")
# A complete <think ...> ... </think ...> block; DOTALL lets it span lines.
_THINK_BLOCK_RE = re.compile(r'<think[^>]*>.*?</think[^>]*>', re.DOTALL)
# An opening <think ...> tag that is never closed, through to end of text.
_UNCLOSED_THINK_RE = re.compile(r'<think[^>]*>.*', re.DOTALL)


def strip_thinking(text):
    """Remove <think...>...</think...> blocks from Qwen3 output.

    Complete blocks are removed wherever they occur. If an opening tag is
    left without a matching closing tag (for example when generation stops
    in the middle of the reasoning), everything from that tag to the end of
    the text is removed as well, so partial reasoning is not shown.

    Args:
        text: Decoded model output.

    Returns:
        The text without thinking blocks, with leading and trailing
        whitespace stripped.
    """
    without_closed_blocks = _THINK_BLOCK_RE.sub('', text)
    return _UNCLOSED_THINK_RE.sub('', without_closed_blocks).strip()
def normalize_content(msg):
    """Flatten a chat message whose content is a list into plain text.

    Gradio may deliver ``content`` as a list of parts. String parts and
    dict parts that carry a ``"text"`` key are kept and joined with a single
    space; every other part is ignored.

    Args:
        msg: A message dict with at least a ``"role"`` key when its content
            is a list.

    Returns:
        ``msg`` itself when its content is not a list; otherwise a new dict
        holding only ``"role"`` and the joined ``"content"`` string.
    """
    content = msg.get("content")
    if not isinstance(content, list):
        return msg

    fragments = [
        piece["text"] if isinstance(piece, dict) else piece
        for piece in content
        if isinstance(piece, str) or (isinstance(piece, dict) and "text" in piece)
    ]
    return {"role": msg["role"], "content": " ".join(fragments)}
def chat_response(message, history):
    """Generate the assistant's reply and return the updated conversation.

    Args:
        message: Text the user just submitted.
        history: Earlier conversation as a list of ``{"role", "content"}``
            dicts, or ``None`` when there is none yet.

    Returns:
        The conversation as a list of message dicts. For a blank message the
        existing history is returned unchanged (``[]`` for ``None``).
        Otherwise it is the normalized history followed by the new user
        message and the generated assistant message.
    """
    # Treat a missing history as an empty conversation instead of crashing.
    history = history or []

    # Nothing to answer: skip the model entirely rather than spending a full
    # CPU generation on an empty prompt.
    if not message or not message.strip():
        return history

    load_model()

    # Normalize history: convert any list content to plain strings
    messages = [normalize_content(m) for m in history]
    messages.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,  # disable thinking mode
    )
    inputs = tokenizer(text, return_tensors="pt").to("cpu")

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # Skip the prompt tokens at the front; decode only what was generated.
    output_ids = generated_ids[0][len(inputs.input_ids[0]):]
    response = tokenizer.decode(output_ids, skip_special_tokens=True)

    # Fallback: strip any <think...> blocks if they still appear
    response = strip_thinking(response)

    return messages + [{"role": "assistant", "content": response}]
# ============================================================
# Gradio Interface
# ============================================================
# Page layout: heading, conversation view, message box and a clear button.
with gr.Blocks(title="Qwen3-0.6B Fast Chat") as demo:
    gr.Markdown("""
    # ⚡ Qwen3-0.6B – Fast Chat
    Small and fast model. Great for quick answers on CPU.
    """)

    chatbot = gr.Chatbot(label="Conversation")
    msg = gr.Textbox(
        label="Your Message",
        placeholder="Type your message and press Enter...",
    )
    clear = gr.Button("Clear Conversation")

    # Submitting the textbox generates a reply into the chatbot, then the
    # follow-up step empties the textbox.
    msg.submit(
        fn=chat_response,
        inputs=[msg, chatbot],
        outputs=chatbot,
        concurrency_limit=3,
    ).then(
        fn=lambda: "",
        inputs=None,
        outputs=msg,
    )

    # The button resets the conversation to an empty list.
    clear.click(fn=lambda: [], inputs=None, outputs=chatbot)
if __name__ == "__main__":
    # Turn on the queue with a default of 3 concurrent workers per event,
    # then serve on every network interface.
    demo.queue(default_concurrency_limit=3).launch(server_name="0.0.0.0")