# chatbox2 / app.py
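# Gradio Space: streaming chat UI for an Iraqi Arabic model with an optional
# Qwen3-style "thinking" mode (<think>...</think> reasoning traces).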
import os
from collections.abc import Iterator
from threading import Thread
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.streamers import TextIteratorStreamer
# Model configuration (Qwen3-style chat template with thinking support)
model_id = "anaspro/Shako-iraqi-8B-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
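# device_map="auto" places the weights on the available GPU(s); bfloat16 halves
# memory use relative to float32.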
# Settings
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "32_000"))
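# @spaces.GPU() requests a GPU slot on Hugging Face Spaces (ZeroGPU);
# torch.inference_mode() disables autograd overhead during generation.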
@spaces.GPU()
@torch.inference_mode()
def generate(
    message: str | dict,
    history: list[dict],
    system_prompt: str = "",
    max_new_tokens: int = 512,
    enable_thinking: bool = True,
) -> Iterator[str]:
    # Build the message list in Qwen3 chat format (text-only)
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    # Process history. Thinking content is stripped from past assistant turns:
    # Qwen3 best practice is not to feed <think> blocks back into the context.
    for item in history:
        if item["role"] == "assistant":
            content = item["content"]
            # Drop the formatted thinking block if present, keeping only the response
            if "**🤔 Thinking Process:**" in content:
                parts = content.split("**💬 Response:**")
                if len(parts) > 1:
                    content = parts[1].strip()
            messages.append({"role": "assistant", "content": content})
        else:
            # User turns may be plain strings or Gradio multimodal dicts
            content = item["content"]
            if isinstance(content, str):
                messages.append({"role": "user", "content": content})
            elif isinstance(content, dict):
                messages.append({"role": "user", "content": content.get("text", "")})
    # Add the current user message (string or dict format)
    if isinstance(message, str):
        current_message = message
    else:
        current_message = message.get("text", "")
    messages.append({"role": "user", "content": current_message})
    # Apply the chat template with the enable_thinking parameter.
    # When enable_thinking=True, the model supports /think and /no_think soft switches.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    n_tokens = model_inputs["input_ids"].shape[1]
    if n_tokens > MAX_INPUT_TOKENS:
        gr.Warning(
            f"Input too long: {n_tokens} tokens (max {MAX_INPUT_TOKENS}). "
            "This limit avoids CUDA out-of-memory errors in this Space."
        )
        yield ""
        return
    # Set sampling parameters based on mode (Qwen3 recommended settings)
    if enable_thinking:
        # Thinking mode: temperature=0.6, top_p=0.95, top_k=20, min_p=0.
        # Do NOT use greedy decoding (temperature=0); it degrades thinking-mode output.
        temperature = 0.6
        top_p = 0.95
        top_k = 20
    else:
        # Non-thinking mode: temperature=0.7, top_p=0.8, top_k=20, min_p=0
        temperature = 0.7
        top_p = 0.8
        top_k = 20
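    # Stream tokens from a background thread so the UI updates incrementally.
    # skip_special_tokens=False keeps the <think>/</think> markers in the decoded
    # stream so the parsing loop below can split thinking from the response.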
    streamer = TextIteratorStreamer(tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=False)
    generate_kwargs = dict(
        **model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        min_p=0.0,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
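    # Accumulate the full decoded text and re-parse the <think> block on every
    # delta; simple, at the cost of re-scanning the string as it grows.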
output = ""
thinking_content = ""
response_content = ""
for delta in streamer:
output += delta
# Parse thinking content if in thinking mode
# When enable_thinking=True, the model always outputs <think>...</think> block
# (even if empty when using /no_think soft switch)
if enable_thinking and "<think>" in output:
if "</think>" in output:
# Extract thinking and response parts
try:
think_start = output.index("<think>") + 7
think_end = output.index("</think>")
thinking_content = output[think_start:think_end].strip()
response_content = output[think_end + 8:].strip()
# Display formatted output
if thinking_content:
# Thinking content exists (user didn't use /no_think or used /think)
formatted_output = f"**🤔 Thinking Process:**\n{thinking_content}\n\n**💬 Response:**\n{response_content}"
else:
# Empty thinking block (user used /no_think soft switch)
formatted_output = f"**💬 Response:**\n{response_content}"
yield formatted_output
except ValueError:
# Still parsing, yield raw output
yield output
else:
# Still generating thinking content
yield output
else:
# Non-thinking mode or no <think> tag yet
yield output
# Examples for the chat interface (message, system_prompt, max_new_tokens, enable_thinking)
examples = [
    ["What is the capital of France? /no_think", "You are a helpful assistant.", 700, True],
    ["Explain quantum computing in simple terms", "You are a helpful assistant.", 512, False],
    ["Solve this math problem: If x^2 + 5x + 6 = 0, what are the values of x? /think", "You are a helpful assistant.", 2000, True],
]
# Default system prompt (Iraqi Arabic). English translation: "You are a smart
# Iraqi model from Baghdad. You speak only in the Iraqi dialect. Answer every
# question with a full, expanded explanation, covering the reasons, background,
# and key information. Use real Iraqi or everyday examples whenever possible.
# Avoid Modern Standard Arabic entirely, and keep the reply long and engaging."
system_prompt = (
    "انت موديل عراقي ذكي من بغداد. تتحدث باللهجة العراقية فقط. "
    "جاوب على كل سؤال بشرح كامل وموسع، ووضح الأسباب والخلفية والمعلومات المهمة. "
    "استخدم أمثلة عراقية واقعية أو حياتية كلما أمكن. "
    "تجنب الفصحى نهائيًا، وخلي الرد مطول وممتع."
)
# Create the chat interface
demo = gr.ChatInterface(
    fn=generate,
    type="messages",
    textbox=gr.Textbox(
        placeholder="Type your message here...",
        autofocus=True,
    ),
    multimodal=False,  # the model is text-only
    additional_inputs=[
        gr.Textbox(label="System Prompt", value=system_prompt),
        gr.Slider(label="Max New Tokens", minimum=100, maximum=32768, step=100, value=2048),
        gr.Checkbox(
            label="Enable Thinking Mode",
            value=True,
            info="Enable for complex reasoning tasks (math, coding). Disable for faster general chat.",
        ),
    ],
    title="Shako Iraqi 8B Chatbot with Thinking Mode",
    description="""
🤔 **Thinking Mode ON**: Better for math, coding, and complex reasoning

💬 **Thinking Mode OFF**: Faster responses for general conversation

**💡 Pro Tip**: When Thinking Mode is enabled, you can use:
- `/think` in your message to force thinking for that turn
- `/no_think` in your message to skip thinking for that turn

Example: "Solve this equation: x^2 + 5x + 6 = 0 /think"
""",
    examples=examples,
    stop_btn=False,
    css="""
.gradio-container, .chatbot, .chatbot * {
    direction: rtl !important;
    text-align: right !important;
    unicode-bidi: plaintext !important;
    font-family: 'Tajawal', 'Cairo', sans-serif;
}
""",
)
if __name__ == "__main__":
    demo.launch()