import os
from collections.abc import Iterator
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.streamers import TextIteratorStreamer

# Model configuration: Qwen3-14B
model_id = "Qwen/Qwen3-14B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

# Settings
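# Hard cap on prompt length in tokens; longer inputs are rejected before generation to avoid CUDA OOM errors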
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "32_000"))


@spaces.GPU()
@torch.inference_mode()
def generate(message: str, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512, enable_thinking: bool = True) -> Iterator[str]:
    # Build messages for Qwen3 (text-only format)
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    
    # Process history - convert to simple text format
    # Note: Don't include thinking content in history (best practice)
    for item in history:
        if item["role"] == "assistant":
            # Extract only the response part (without thinking content)
            content = item["content"]
            # Remove thinking process markers if present
            if "**🤔 Thinking Process:**" in content:
                # Extract only the response part
                parts = content.split("**💬 Response:**")
                if len(parts) > 1:
                    content = parts[1].strip()
            messages.append({"role": "assistant", "content": content})
        else:
            # Extract text from user message
            content = item["content"]
            if isinstance(content, str):
                messages.append({"role": "user", "content": content})
            else:
                # Non-string content (e.g. attached files) isn't supported by this text-only model; fall back to its string form
                messages.append({"role": "user", "content": str(content)})
    
    # Add the current user message (a plain string, since the interface is not multimodal)
    messages.append({"role": "user", "content": message})

    # Apply chat template with enable_thinking parameter
    # Note: When enable_thinking=True, the model supports /think and /no_think soft switches
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking
    )
    
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    n_tokens = model_inputs["input_ids"].shape[1]
    
    if n_tokens > MAX_INPUT_TOKENS:
        gr.Warning(
            f"Input too long. Max {MAX_INPUT_TOKENS} tokens. Got {n_tokens} tokens. This limit is set to avoid CUDA out-of-memory errors in this Space."
        )
        yield ""
        return

    # Set generation parameters based on mode
    if enable_thinking:
        # Thinking mode: Temperature=0.6, TopP=0.95, TopK=20, MinP=0
        # DO NOT use greedy decoding (temperature=0) to avoid performance degradation
        temperature = 0.6
        top_p = 0.95
        top_k = 20
    else:
        # Non-thinking mode: Temperature=0.7, TopP=0.8, TopK=20, MinP=0
        temperature = 0.7
        top_p = 0.8
        top_k = 20

    # skip_special_tokens=False keeps the <think>/</think> markers in the stream so they can be parsed below
    streamer = TextIteratorStreamer(tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=False)
    generate_kwargs = dict(
        **model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        min_p=0.0,
    )
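    # Run generation in a background thread so this function can stream partial text from the streamer as it arrives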
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    output = ""
    thinking_content = ""
    response_content = ""
    
    for delta in streamer:
        output += delta
        
        # Parse thinking content if in thinking mode
        # When enable_thinking=True, the model always outputs <think>...</think> block
        # (even if empty when using /no_think soft switch)
        if enable_thinking and "<think>" in output:
            if "</think>" in output:
                # Extract thinking and response parts
                try:
                    think_start = output.index("<think>") + 7
                    think_end = output.index("</think>")
                    thinking_content = output[think_start:think_end].strip()
                    response_content = output[think_end + 8:].strip()
                    
                    # Display formatted output
                    if thinking_content:
                        # Thinking content exists (user didn't use /no_think or used /think)
                        formatted_output = f"**🤔 Thinking Process:**\n{thinking_content}\n\n**💬 Response:**\n{response_content}"
                    else:
                        # Empty thinking block (user used /no_think soft switch)
                        formatted_output = f"**💬 Response:**\n{response_content}"
                    
                    yield formatted_output
                except ValueError:
                    # Still parsing, yield raw output
                    yield output
            else:
                # Still generating thinking content
                yield output
        else:
            # Non-thinking mode or no <think> tag yet
            yield output


# Examples for the chat interface (with additional inputs: system_prompt, max_new_tokens, enable_thinking)
examples = [
    ["What is the capital of France? /no_think", "You are a helpful assistant.", 700, True],
    ["Explain quantum computing in simple terms", "You are a helpful assistant.", 512, False],
    ["Solve this math problem: If x^2 + 5x + 6 = 0, what are the values of x? /think", "You are a helpful assistant.", 2000, True]
]

# Default system prompt (in Arabic): "You are a smart Iraqi model from Baghdad. Speak only in the Iraqi dialect.
# Answer every question with a full, expanded explanation, covering the reasons, background, and important information.
# Use realistic or real-life Iraqi examples whenever possible. Avoid Modern Standard Arabic entirely, and keep the reply long and enjoyable."
system_prompt = (
    "انت موديل عراقي ذكي من بغداد. تتحدث باللهجة العراقية فقط. "
    "جاوب على كل سؤال بشرح كامل وموسع، ووضح الأسباب والخلفية والمعلومات المهمة. "
    "استخدم أمثلة عراقية واقعية أو حياتية كلما أمكن. "
    "تجنب الفصحى نهائيًا، وخلي الرد مطول وممتع."
)
# Create the chat interface
demo = gr.ChatInterface(
    fn=generate,
    type="messages",
    textbox=gr.Textbox(
        placeholder="Type your message here...",
        autofocus=True,
    ),
    multimodal=False,  # Qwen3-14B is text-only
    additional_inputs=[
        gr.Textbox(label="System Prompt", value=system_prompt),
        gr.Slider(label="Max New Tokens", minimum=100, maximum=32768, step=100, value=2048),
        gr.Checkbox(label="Enable Thinking Mode", value=True, info="Enable for complex reasoning tasks (math, coding). Disable for faster general chat."),
    ],
    title="Qwen3-14B Iraqi Chatbot with Thinking Mode",
    description="""
🤔 **Thinking Mode ON**: Better for math, coding, and complex reasoning  
💬 **Thinking Mode OFF**: Faster responses for general conversation

**💡 Pro Tip**: When Thinking Mode is enabled, you can use:
- `/think` in your message to force thinking for that turn
- `/no_think` in your message to skip thinking for that turn

Example: "Solve this equation: x^2 + 5x + 6 = 0 /think"
""",
    examples=examples,
    stop_btn=False,
    css="""
    .gradio-container, .chatbot, .chatbot * {
        direction: rtl !important;
        text-align: right !important;
        unicode-bidi: plaintext !important;
        font-family: 'Tajawal', 'Cairo', sans-serif;
    }
    """
)


if __name__ == "__main__":
    demo.launch()