import os
from collections.abc import Iterator
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.streamers import TextIteratorStreamer
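
# Load the tokenizer and model once at startup; device_map="auto" places the
# weights on the available GPU(s), and bfloat16 halves memory use versus fp32.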
model_id = "anaspro/Shako-iraqi-8B-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
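
# Cap prompt length to avoid CUDA out-of-memory errors on the Space's GPU.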
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "32_000"))


@spaces.GPU()
@torch.inference_mode()
def generate(
    message: str | dict,
    history: list[dict],
    system_prompt: str = "",
    max_new_tokens: int = 512,
    enable_thinking: bool = True,
) -> Iterator[str]:
    """Stream a chat reply, optionally splitting out the model's thinking trace."""
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
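
    # Rebuild the conversation from Gradio's message history. Assistant turns
    # may still carry the formatted "Thinking Process" block from an earlier
    # reply; strip it so the model only re-reads its final answer.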
    for item in history:
        if item["role"] == "assistant":
            content = item["content"]
            if "**🤔 Thinking Process:**" in content:
                parts = content.split("**💬 Response:**")
                if len(parts) > 1:
                    content = parts[1].strip()
            messages.append({"role": "assistant", "content": content})
        else:
            content = item["content"]
            if isinstance(content, str):
                messages.append({"role": "user", "content": content})
            elif isinstance(content, dict):
                messages.append({"role": "user", "content": content.get("text", "")})
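
    # Append the incoming user turn; a multimodal textbox would deliver a dict
    # with the text under the "text" key.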
    if isinstance(message, str):
        current_message = message
    else:
        current_message = message.get("text", "")
    messages.append({"role": "user", "content": current_message})
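
    # Render the prompt with the model's chat template; enable_thinking toggles
    # the <think>...</think> reasoning block (a Qwen3-style template feature).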
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
    )

    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    n_tokens = model_inputs["input_ids"].shape[1]

    if n_tokens > MAX_INPUT_TOKENS:
        gr.Warning(
            f"Input too long. Max {MAX_INPUT_TOKENS} tokens. Got {n_tokens} tokens. "
            "This limit is set to avoid CUDA out-of-memory errors in this Space."
        )
        yield ""
        return
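
    # Sampling presets: these match the values the Qwen3 model card suggests
    # for thinking mode (0.6/0.95) versus plain chat (0.7/0.8).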
    if enable_thinking:
        temperature = 0.6
        top_p = 0.95
        top_k = 20
    else:
        temperature = 0.7
        top_p = 0.8
        top_k = 20
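
    # skip_special_tokens stays False so the <think> markers reach the stream
    # and can be split out below.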
    streamer = TextIteratorStreamer(tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=False)
    generate_kwargs = dict(
        **model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        min_p=0.0,
    )
    # Run generation on a background thread so this generator can consume the stream.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    output = ""
    thinking_content = ""
    response_content = ""

    for delta in streamer:
        output += delta
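
        # Once a complete <think>...</think> block has arrived, split it from
        # the answer and re-emit both parts under labelled headings.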
        if enable_thinking and "<think>" in output:
            if "</think>" in output:
                try:
                    think_start = output.index("<think>") + len("<think>")
                    think_end = output.index("</think>")
                    thinking_content = output[think_start:think_end].strip()
                    response_content = output[think_end + len("</think>"):].strip()
                    if thinking_content:
                        formatted_output = f"**🤔 Thinking Process:**\n{thinking_content}\n\n**💬 Response:**\n{response_content}"
                    else:
                        formatted_output = f"**💬 Response:**\n{response_content}"
                    yield formatted_output
                except ValueError:
                    # Defensive fallback; both markers were checked above,
                    # so this should not normally trigger.
                    yield output
            else:
                # Thinking block still streaming; show the raw text for now.
                yield output
        else:
            yield output


examples = [
    ["What is the capital of France? /no_think", "You are a helpful assistant.", 700, True],
    ["Explain quantum computing in simple terms", "You are a helpful assistant.", 512, False],
    ["Solve this math problem: If x^2 + 5x + 6 = 0, what are the values of x? /think", "You are a helpful assistant.", 2000, True],
]

# Iraqi-Arabic system prompt. In English: "You are a smart Iraqi model from
# Baghdad. You speak only in the Iraqi dialect. Answer every question with a
# full, expanded explanation, laying out the reasons, background, and key
# information. Use realistic Iraqi or everyday examples whenever possible.
# Avoid Modern Standard Arabic entirely, and keep the reply long and engaging."
system_prompt = (
    "انت موديل عراقي ذكي من بغداد. تتحدث باللهجة العراقية فقط. "
    "جاوب على كل سؤال بشرح كامل وموسع، ووضح الأسباب والخلفية والمعلومات المهمة. "
    "استخدم أمثلة عراقية واقعية أو حياتية كلما أمكن. "
    "تجنب الفصحى نهائيًا، وخلي الرد مطول وممتع."
)

demo = gr.ChatInterface(
    fn=generate,
    type="messages",
    textbox=gr.Textbox(
        placeholder="Type your message here...",
        autofocus=True,
    ),
    multimodal=False,
    additional_inputs=[
        gr.Textbox(label="System Prompt", value=system_prompt),
        gr.Slider(label="Max New Tokens", minimum=100, maximum=32768, step=100, value=2048),
        gr.Checkbox(label="Enable Thinking Mode", value=True, info="Enable for complex reasoning tasks (math, coding). Disable for faster general chat."),
    ],
    title="Shako Iraqi 8B Chatbot with Thinking Mode",
    description="""
🤔 **Thinking Mode ON**: Better for math, coding, and complex reasoning
💬 **Thinking Mode OFF**: Faster responses for general conversation

**💡 Pro Tip**: When Thinking Mode is enabled, you can use:
- `/think` in your message to force thinking for that turn
- `/no_think` in your message to skip thinking for that turn

Example: "Solve this equation: x^2 + 5x + 6 = 0 /think"
""",
    examples=examples,
    stop_btn=False,
    # Right-to-left layout and Arabic-friendly fonts for dialect output.
    css="""
    .gradio-container, .chatbot, .chatbot * {
        direction: rtl !important;
        text-align: right !important;
        unicode-bidi: plaintext !important;
        font-family: 'Tajawal', 'Cairo', sans-serif;
    }
    """,
)


if __name__ == "__main__":
    demo.launch()