import os
from collections.abc import Iterator
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.streamers import TextIteratorStreamer

# Model configuration (a Qwen3-based Iraqi-Arabic fine-tune)
model_id = "anaspro/Shako-iraqi-8B-it"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Settings
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "32_000"))


@spaces.GPU()
@torch.inference_mode()
def generate(
    message: str | dict,
    history: list[dict],
    system_prompt: str = "",
    max_new_tokens: int = 512,
    enable_thinking: bool = True,
) -> Iterator[str]:
    # Build messages for the Qwen3 chat template (text-only format)
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # Process history, converting it to a simple text format.
    # Note: don't include thinking content in history (best practice for Qwen3).
    for item in history:
        if item["role"] == "assistant":
            # Keep only the response part (without thinking content)
            content = item["content"]
            # Remove thinking-process markers if present
            if "**🤔 Thinking Process:**" in content:
                parts = content.split("**💬 Response:**")
                if len(parts) > 1:
                    content = parts[1].strip()
            messages.append({"role": "assistant", "content": content})
        else:
            # Extract text from the user message
            content = item["content"]
            if isinstance(content, str):
                messages.append({"role": "user", "content": content})
            elif isinstance(content, dict):
                # Handle dict format
                messages.append({"role": "user", "content": content.get("text", "")})

    # Add the current user message (handle both string and dict formats)
    if isinstance(message, str):
        current_message = message
    else:
        current_message = message.get("text", "")
    messages.append({"role": "user", "content": current_message})

    # Apply the chat template with the enable_thinking parameter.
    # Note: when enable_thinking=True, the model supports the /think and
    # /no_think soft switches inside user messages.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    n_tokens = model_inputs["input_ids"].shape[1]
    if n_tokens > MAX_INPUT_TOKENS:
        gr.Warning(
            f"Input too long: got {n_tokens} tokens, but the maximum is "
            f"{MAX_INPUT_TOKENS}. This limit avoids CUDA out-of-memory errors "
            "in this Space."
        )
        yield ""
        return

    # Set sampling parameters based on the mode (Qwen3 recommended values)
    if enable_thinking:
        # Thinking mode: temperature=0.6, top_p=0.95, top_k=20, min_p=0.
        # Do NOT use greedy decoding (temperature=0); it degrades performance.
        temperature = 0.6
        top_p = 0.95
        top_k = 20
    else:
        # Non-thinking mode: temperature=0.7, top_p=0.8, top_k=20, min_p=0
        temperature = 0.7
        top_p = 0.8
        top_k = 20

    # skip_special_tokens=False keeps the <think> tags in the stream so they
    # can be parsed below
    streamer = TextIteratorStreamer(
        tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=False
    )
    generate_kwargs = dict(
        **model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        min_p=0.0,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    output = ""
    thinking_content = ""
    response_content = ""

    for delta in streamer:
        output += delta

        # Parse thinking content when in thinking mode.
        # When enable_thinking=True, the model always emits a <think>...</think>
        # block (empty when the /no_think soft switch is used).
        if enable_thinking and "<think>" in output:
            if "</think>" in output:
                # Extract the thinking and response parts
                try:
                    think_start = output.index("<think>") + len("<think>")
                    think_end = output.index("</think>")
                    thinking_content = output[think_start:think_end].strip()
                    response_content = output[think_end + len("</think>"):].strip()

                    # Display formatted output
                    if thinking_content:
                        # Thinking content exists (user didn't use /no_think,
                        # or used /think)
                        formatted_output = (
                            f"**🤔 Thinking Process:**\n{thinking_content}\n\n"
                            f"**💬 Response:**\n{response_content}"
                        )
                    else:
                        # Empty thinking block (user used the /no_think soft switch)
                        formatted_output = f"**💬 Response:**\n{response_content}"
                    yield formatted_output
                except ValueError:
                    # Still parsing; yield the raw output
                    yield output
            else:
                # Still generating thinking content
                yield output
        else:
            # Non-thinking mode, or no <think> tag yet
            yield output


# Examples for the chat interface
# (with the additional inputs: system_prompt, max_new_tokens, enable_thinking)
examples = [
    ["What is the capital of France? /no_think", "You are a helpful assistant.", 700, True],
    ["Explain quantum computing in simple terms", "You are a helpful assistant.", 512, False],
    [
        "Solve this math problem: If x^2 + 5x + 6 = 0, what are the values of x? /think",
        "You are a helpful assistant.",
        2000,
        True,
    ],
]

# Default system prompt (Iraqi Arabic). In English: "You are a smart Iraqi
# model from Baghdad. Speak only in the Iraqi dialect. Answer every question
# with a full, expanded explanation, and lay out the reasons, background, and
# key information. Use realistic Iraqi or everyday examples whenever possible.
# Avoid Modern Standard Arabic entirely, and keep the reply long and engaging."
system_prompt = (
    "انت موديل عراقي ذكي من بغداد. تتحدث باللهجة العراقية فقط. "
    "جاوب على كل سؤال بشرح كامل وموسع، ووضح الأسباب والخلفية والمعلومات المهمة. "
    "استخدم أمثلة عراقية واقعية أو حياتية كلما أمكن. "
    "تجنب الفصحى نهائيًا، وخلي الرد مطول وممتع."
)

# Create the chat interface
demo = gr.ChatInterface(
    fn=generate,
    type="messages",
    textbox=gr.Textbox(
        placeholder="Type your message here...",
        autofocus=True,
    ),
    multimodal=False,  # the model is text-only
    additional_inputs=[
        gr.Textbox(label="System Prompt", value=system_prompt),
        gr.Slider(label="Max New Tokens", minimum=100, maximum=32768, step=100, value=2048),
        gr.Checkbox(
            label="Enable Thinking Mode",
            value=True,
            info="Enable for complex reasoning tasks (math, coding). Disable for faster general chat.",
        ),
    ],
    title="Shako Iraqi 8B Chatbot with Thinking Mode",
    description="""
🤔 **Thinking Mode ON**: better for math, coding, and complex reasoning
💬 **Thinking Mode OFF**: faster responses for general conversation

**💡 Pro Tip**: when Thinking Mode is enabled, you can use:
- `/think` in your message to force thinking for that turn
- `/no_think` in your message to skip thinking for that turn

Example: "Solve this equation: x^2 + 5x + 6 = 0 /think"
""",
    examples=examples,
    stop_btn=False,
    css="""
    .gradio-container, .chatbot, .chatbot * {
        direction: rtl !important;
        text-align: right !important;
        unicode-bidi: plaintext !important;
        font-family: 'Tajawal', 'Cairo', sans-serif;
    }
    """,
)

if __name__ == "__main__":
    demo.launch()
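# Optional sketch (an assumption, not part of the original app): recent Gradio
# versions queue requests by default, but streaming Spaces often size the
# queue explicitly before launching, e.g.:
#
#     demo.queue(max_size=20)
#     demo.launch()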