Spaces:
Running
on
Zero
Running
on
Zero
simplify UI
Browse files- .gitignore +1 -0
- app.py +112 -158
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
app.py
CHANGED
|
@@ -63,13 +63,13 @@ MODELS = {
|
|
| 63 |
"repo_id": "Qwen/Qwen3-4B-Instruct-2507",
|
| 64 |
"description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks.",
|
| 65 |
"params_b": 4.0
|
| 66 |
-
},
|
| 67 |
"Apriel-1.5-15b-Thinker": {
|
| 68 |
"repo_id": "ServiceNow-AI/Apriel-1.5-15b-Thinker",
|
| 69 |
"description": "Multimodal reasoning model with 15B parameters, trained via extensive mid-training on text and image data, and fine-tuned only on text (no image SFT). Achieves competitive performance on reasoning benchmarks like Artificial Analysis (score: 52), Tau2 Bench Telecom (68), and IFBench (62). Supports both text and image understanding, fits on a single GPU, and includes structured reasoning output with tool and function calling capabilities.",
|
| 70 |
"params_b": 15.0
|
| 71 |
},
|
| 72 |
-
|
| 73 |
# 14.8B total parameters
|
| 74 |
"Qwen3-14B": {
|
| 75 |
"repo_id": "Qwen/Qwen3-14B",
|
|
@@ -176,6 +176,14 @@ MODELS = {
|
|
| 176 |
"params_b": 1.7
|
| 177 |
},
|
| 178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
# ~2B (effective)
|
| 180 |
"Gemma-3n-E2B": {
|
| 181 |
"repo_id": "google/gemma-3n-E2B",
|
|
@@ -438,10 +446,10 @@ def retrieve_context(query, max_results=6, max_chars=50):
|
|
| 438 |
except Exception:
|
| 439 |
return []
|
| 440 |
|
| 441 |
-
def format_conversation(history, system_prompt, tokenizer):
|
| 442 |
if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
|
| 443 |
messages = [{"role": "system", "content": system_prompt.strip()}] + history
|
| 444 |
-
return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=
|
| 445 |
else:
|
| 446 |
# Fallback for base LMs without chat template
|
| 447 |
prompt = system_prompt.strip() + "\n"
|
|
@@ -454,7 +462,7 @@ def format_conversation(history, system_prompt, tokenizer):
|
|
| 454 |
prompt += "Assistant: "
|
| 455 |
return prompt
|
| 456 |
|
| 457 |
-
def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout, enable_tts):
|
| 458 |
# Get model size from the MODELS dict (more reliable than string parsing)
|
| 459 |
model_size = MODELS[model_name].get("params_b", 4.0) # Default to 4B if not found
|
| 460 |
|
|
@@ -474,14 +482,14 @@ def get_duration(user_msg, chat_history, system_prompt, enable_search, max_resul
|
|
| 474 |
def chat_response(user_msg, chat_history, system_prompt,
|
| 475 |
enable_search, max_results, max_chars,
|
| 476 |
model_name, max_tokens, temperature,
|
| 477 |
-
top_k, top_p, repeat_penalty, search_timeout, enable_tts):
|
| 478 |
"""
|
| 479 |
Generates streaming chat responses, optionally with background web search.
|
| 480 |
This version includes cancellation support.
|
| 481 |
"""
|
| 482 |
# Clear the cancellation event at the start of a new generation
|
| 483 |
cancel_event.clear()
|
| 484 |
-
|
| 485 |
history = list(chat_history or [])
|
| 486 |
history.append({'role': 'user', 'content': user_msg})
|
| 487 |
|
|
@@ -504,7 +512,7 @@ def chat_response(user_msg, chat_history, system_prompt,
|
|
| 504 |
cur_date = datetime.now().strftime('%Y-%m-%d')
|
| 505 |
# merge any fetched search results into the system prompt
|
| 506 |
if search_results:
|
| 507 |
-
|
| 508 |
enriched = system_prompt.strip() + \
|
| 509 |
f'''\n# The following contents are the search results related to the user's message:
|
| 510 |
{search_results}
|
|
@@ -557,7 +565,7 @@ def chat_response(user_msg, chat_history, system_prompt,
|
|
| 557 |
|
| 558 |
pipe = load_pipeline(model_name)
|
| 559 |
|
| 560 |
-
prompt = format_conversation(history, enriched, pipe.tokenizer)
|
| 561 |
prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"
|
| 562 |
streamer = TextIteratorStreamer(pipe.tokenizer,
|
| 563 |
skip_prompt=True,
|
|
@@ -594,7 +602,7 @@ def chat_response(user_msg, chat_history, system_prompt,
|
|
| 594 |
history[-1]['content'] += " [Generation Canceled]"
|
| 595 |
yield history, debug, None
|
| 596 |
break
|
| 597 |
-
|
| 598 |
text = chunk
|
| 599 |
|
| 600 |
# Detect start of thinking
|
|
@@ -658,20 +666,21 @@ def chat_response(user_msg, chat_history, system_prompt,
|
|
| 658 |
|
| 659 |
|
| 660 |
def update_default_prompt(enable_search):
|
| 661 |
-
return f"You are a helpful assistant."
|
| 662 |
|
| 663 |
-
def update_duration_estimate(model_name, enable_search, max_results, max_chars, max_tokens, search_timeout, enable_tts):
|
| 664 |
"""Calculate and format the estimated GPU duration for current settings."""
|
| 665 |
try:
|
| 666 |
dummy_msg, dummy_history, dummy_system_prompt = "", [], ""
|
| 667 |
duration = get_duration(dummy_msg, dummy_history, dummy_system_prompt,
|
| 668 |
enable_search, max_results, max_chars, model_name,
|
| 669 |
-
max_tokens, 0.7, 40, 0.9, 1.2, search_timeout, enable_tts)
|
| 670 |
model_size = MODELS[model_name].get("params_b", 4.0)
|
| 671 |
return (f"⏱️ **Estimated GPU Time: {duration:.1f} seconds**\n\n"
|
| 672 |
f"๐ **Model Size:** {model_size:.1f}B parameters\n"
|
| 673 |
f"๐ **Web Search:** {'Enabled' if enable_search else 'Disabled'}\n"
|
| 674 |
-
f"๐ **TTS:** {'Enabled' if enable_tts else 'Disabled'}"
|
|
|
|
| 675 |
except Exception as e:
|
| 676 |
return f"⚠️ Error calculating estimate: {e}"
|
| 677 |
|
|
@@ -695,161 +704,106 @@ CUSTOM_CSS = """
|
|
| 695 |
|
| 696 |
with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
|
| 697 |
# Header
|
| 698 |
-
gr.Markdown(""
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 703 |
with gr.Row():
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 709 |
model_dd = gr.Dropdown(
|
| 710 |
label="🤖 Model",
|
| 711 |
choices=list(MODELS.keys()),
|
| 712 |
-
value="Qwen3-
|
| 713 |
info="Select the language model to use"
|
| 714 |
)
|
| 715 |
-
search_chk = gr.Checkbox(
|
| 716 |
-
label="๐ Enable Web Search",
|
| 717 |
-
value=False,
|
| 718 |
-
info="Augment responses with real-time web data"
|
| 719 |
-
)
|
| 720 |
-
tts_chk = gr.Checkbox(
|
| 721 |
-
label="๐ Enable Text-to-Speech",
|
| 722 |
-
value=False,
|
| 723 |
-
info="Convert responses to speech using voice cloning"
|
| 724 |
-
)
|
| 725 |
-
sys_prompt = gr.Textbox(
|
| 726 |
-
label="๐ System Prompt",
|
| 727 |
-
lines=3,
|
| 728 |
-
value=update_default_prompt(search_chk.value),
|
| 729 |
-
placeholder="Define the assistant's behavior and personality..."
|
| 730 |
-
)
|
| 731 |
-
|
| 732 |
-
# Duration Estimate
|
| 733 |
-
duration_display = gr.Markdown(
|
| 734 |
-
value=update_duration_estimate("Qwen3-1.7B", False, 4, 50, 1024, 5.0, False),
|
| 735 |
-
elem_classes="duration-estimate"
|
| 736 |
-
)
|
| 737 |
-
|
| 738 |
-
# Advanced Settings (Collapsible)
|
| 739 |
-
with gr.Accordion("๐๏ธ Advanced Generation Parameters", open=False):
|
| 740 |
-
max_tok = gr.Slider(
|
| 741 |
-
64, 16384, value=1024, step=32,
|
| 742 |
-
label="Max Tokens",
|
| 743 |
-
info="Maximum length of generated response"
|
| 744 |
-
)
|
| 745 |
-
temp = gr.Slider(
|
| 746 |
-
0.1, 2.0, value=0.7, step=0.1,
|
| 747 |
-
label="Temperature",
|
| 748 |
-
info="Higher = more creative, Lower = more focused"
|
| 749 |
-
)
|
| 750 |
with gr.Row():
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
info="
|
| 755 |
)
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
| 759 |
-
info="
|
| 760 |
)
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
value=
|
| 771 |
-
|
| 772 |
-
info="Number of search results to retrieve"
|
| 773 |
-
)
|
| 774 |
-
mc = gr.Number(
|
| 775 |
-
value=50, precision=0,
|
| 776 |
-
label="Max Chars/Result",
|
| 777 |
-
info="Character limit per search result"
|
| 778 |
-
)
|
| 779 |
-
st = gr.Slider(
|
| 780 |
-
minimum=0.0, maximum=30.0, step=0.5, value=5.0,
|
| 781 |
-
label="Search Timeout (s)",
|
| 782 |
-
info="Maximum time to wait for search results"
|
| 783 |
)
|
| 784 |
-
|
| 785 |
-
# Actions
|
| 786 |
-
with gr.Row():
|
| 787 |
-
clr = gr.Button("๐๏ธ Clear Chat", variant="secondary", scale=1)
|
| 788 |
-
|
| 789 |
-
# Right Panel - Chat Interface
|
| 790 |
-
with gr.Column(scale=7):
|
| 791 |
-
chat = gr.Chatbot(
|
| 792 |
-
height=600,
|
| 793 |
-
label="💬 Conversation",
|
| 794 |
-
buttons=["copy"],
|
| 795 |
-
avatar_images=(None, "🤖"),
|
| 796 |
-
layout="bubble"
|
| 797 |
-
)
|
| 798 |
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
visible=False,
|
| 805 |
-
elem_id="tts-audio"
|
| 806 |
-
)
|
| 807 |
|
| 808 |
-
|
|
|
|
|
|
|
|
|
|
| 809 |
with gr.Row():
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
["Write a Python function to calculate fibonacci numbers"],
|
| 827 |
-
["What are the latest developments in AI? (Enable web search)"],
|
| 828 |
-
["Tell me a creative story about a time traveler"],
|
| 829 |
-
["Help me debug this code: def add(a,b): return a+b+1"]
|
| 830 |
-
],
|
| 831 |
-
inputs=txt,
|
| 832 |
-
label="💡 Example Prompts"
|
| 833 |
-
)
|
| 834 |
-
|
| 835 |
-
# Debug/Status Info (Collapsible)
|
| 836 |
-
with gr.Accordion("๐ Debug Info", open=False):
|
| 837 |
-
dbg = gr.Markdown()
|
| 838 |
-
|
| 839 |
-
# Footer
|
| 840 |
-
gr.Markdown("""
|
| 841 |
-
---
|
| 842 |
-
💡 **Tips:**
|
| 843 |
-
- Use **Advanced Parameters** to fine-tune creativity and response length
|
| 844 |
-
- Enable **Web Search** for real-time, up-to-date information
|
| 845 |
-
- Try different **models** for various tasks (reasoning, coding, general chat)
|
| 846 |
-
- Click the **Copy** button on responses to save them to your clipboard
|
| 847 |
-
""", elem_classes="footer")
|
| 848 |
|
| 849 |
# --- Event Listeners ---
|
| 850 |
|
| 851 |
# Group all inputs for cleaner event handling
|
| 852 |
-
chat_inputs = [txt, chat, sys_prompt, search_chk, mr, mc, model_dd, max_tok, temp, k, p, rp, st, tts_chk]
|
| 853 |
# Group all UI components that can be updated.
|
| 854 |
ui_components = [chat, dbg, txt, submit_btn, cancel_btn, tts_audio_output]
|
| 855 |
|
|
@@ -927,7 +881,7 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
|
|
| 927 |
"""Called by the cancel button, sets the global event."""
|
| 928 |
cancel_event.set()
|
| 929 |
print("Cancellation signal sent.")
|
| 930 |
-
|
| 931 |
def reset_ui_after_cancel():
|
| 932 |
"""Reset UI components after cancellation."""
|
| 933 |
cancel_event.clear() # Clear the flag for next generation
|
|
@@ -962,21 +916,21 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
|
|
| 962 |
)
|
| 963 |
|
| 964 |
# Listeners for updating the duration estimate
|
| 965 |
-
duration_inputs = [model_dd, search_chk, mr, mc, max_tok, st, tts_chk]
|
| 966 |
for component in duration_inputs:
|
| 967 |
component.change(fn=update_duration_estimate, inputs=duration_inputs, outputs=duration_display)
|
| 968 |
|
| 969 |
# Toggle web search settings visibility
|
| 970 |
def toggle_search_settings(enabled):
|
| 971 |
return gr.update(visible=enabled)
|
| 972 |
-
|
| 973 |
search_chk.change(
|
| 974 |
fn=lambda enabled: (update_default_prompt(enabled), gr.update(visible=enabled)),
|
| 975 |
inputs=search_chk,
|
| 976 |
outputs=[sys_prompt, search_settings]
|
| 977 |
)
|
| 978 |
-
|
| 979 |
# Clear chat action
|
| 980 |
clr.click(fn=lambda: ([], "", "", gr.update(visible=False, value=None)), outputs=[chat, txt, dbg, tts_audio_output])
|
| 981 |
-
|
| 982 |
-
demo.launch(theme=CUSTOM_THEME, css=CUSTOM_CSS)
|
|
|
|
| 63 |
"repo_id": "Qwen/Qwen3-4B-Instruct-2507",
|
| 64 |
"description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks.",
|
| 65 |
"params_b": 4.0
|
| 66 |
+
},
|
| 67 |
"Apriel-1.5-15b-Thinker": {
|
| 68 |
"repo_id": "ServiceNow-AI/Apriel-1.5-15b-Thinker",
|
| 69 |
"description": "Multimodal reasoning model with 15B parameters, trained via extensive mid-training on text and image data, and fine-tuned only on text (no image SFT). Achieves competitive performance on reasoning benchmarks like Artificial Analysis (score: 52), Tau2 Bench Telecom (68), and IFBench (62). Supports both text and image understanding, fits on a single GPU, and includes structured reasoning output with tool and function calling capabilities.",
|
| 70 |
"params_b": 15.0
|
| 71 |
},
|
| 72 |
+
|
| 73 |
# 14.8B total parameters
|
| 74 |
"Qwen3-14B": {
|
| 75 |
"repo_id": "Qwen/Qwen3-14B",
|
|
|
|
| 176 |
"params_b": 1.7
|
| 177 |
},
|
| 178 |
|
| 179 |
+
# 0.6B
|
| 180 |
+
"Qwen3-0.6B": {
|
| 181 |
+
"repo_id": "Qwen/Qwen3-0.6B",
|
| 182 |
+
"description": "Causal Language Model, Training Stage: Pretraining & Post-training. Number of Parameters: 0.6B, Number of Paramaters (Non-Embedding): 0.44B, Number of Layers: 28, Number of Attention Heads (GQA): 16 for Q and 8 for KV, Context Length: 32,768",
|
| 183 |
+
"params_b": 0.6
|
| 184 |
+
},
|
| 185 |
+
|
| 186 |
+
|
| 187 |
# ~2B (effective)
|
| 188 |
"Gemma-3n-E2B": {
|
| 189 |
"repo_id": "google/gemma-3n-E2B",
|
|
|
|
| 446 |
except Exception:
|
| 447 |
return []
|
| 448 |
|
| 449 |
+
def format_conversation(history, system_prompt, tokenizer, enable_thinking=False):
|
| 450 |
if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
|
| 451 |
messages = [{"role": "system", "content": system_prompt.strip()}] + history
|
| 452 |
+
return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=enable_thinking)
|
| 453 |
else:
|
| 454 |
# Fallback for base LMs without chat template
|
| 455 |
prompt = system_prompt.strip() + "\n"
|
|
|
|
| 462 |
prompt += "Assistant: "
|
| 463 |
return prompt
|
| 464 |
|
| 465 |
+
def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout, enable_tts, enable_thinking):
|
| 466 |
# Get model size from the MODELS dict (more reliable than string parsing)
|
| 467 |
model_size = MODELS[model_name].get("params_b", 4.0) # Default to 4B if not found
|
| 468 |
|
|
|
|
| 482 |
def chat_response(user_msg, chat_history, system_prompt,
|
| 483 |
enable_search, max_results, max_chars,
|
| 484 |
model_name, max_tokens, temperature,
|
| 485 |
+
top_k, top_p, repeat_penalty, search_timeout, enable_tts, enable_thinking):
|
| 486 |
"""
|
| 487 |
Generates streaming chat responses, optionally with background web search.
|
| 488 |
This version includes cancellation support.
|
| 489 |
"""
|
| 490 |
# Clear the cancellation event at the start of a new generation
|
| 491 |
cancel_event.clear()
|
| 492 |
+
|
| 493 |
history = list(chat_history or [])
|
| 494 |
history.append({'role': 'user', 'content': user_msg})
|
| 495 |
|
|
|
|
| 512 |
cur_date = datetime.now().strftime('%Y-%m-%d')
|
| 513 |
# merge any fetched search results into the system prompt
|
| 514 |
if search_results:
|
| 515 |
+
|
| 516 |
enriched = system_prompt.strip() + \
|
| 517 |
f'''\n# The following contents are the search results related to the user's message:
|
| 518 |
{search_results}
|
|
|
|
| 565 |
|
| 566 |
pipe = load_pipeline(model_name)
|
| 567 |
|
| 568 |
+
prompt = format_conversation(history, enriched, pipe.tokenizer, enable_thinking)
|
| 569 |
prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"
|
| 570 |
streamer = TextIteratorStreamer(pipe.tokenizer,
|
| 571 |
skip_prompt=True,
|
|
|
|
| 602 |
history[-1]['content'] += " [Generation Canceled]"
|
| 603 |
yield history, debug, None
|
| 604 |
break
|
| 605 |
+
|
| 606 |
text = chunk
|
| 607 |
|
| 608 |
# Detect start of thinking
|
|
|
|
| 666 |
|
| 667 |
|
| 668 |
def update_default_prompt(enable_search):
|
| 669 |
+
return f"You are a helpful assistant. Don't use emojis in your response. Keep replies short to a maximum of three sentences."
|
| 670 |
|
| 671 |
+
def update_duration_estimate(model_name, enable_search, max_results, max_chars, max_tokens, search_timeout, enable_tts, enable_thinking):
|
| 672 |
"""Calculate and format the estimated GPU duration for current settings."""
|
| 673 |
try:
|
| 674 |
dummy_msg, dummy_history, dummy_system_prompt = "", [], ""
|
| 675 |
duration = get_duration(dummy_msg, dummy_history, dummy_system_prompt,
|
| 676 |
enable_search, max_results, max_chars, model_name,
|
| 677 |
+
max_tokens, 0.7, 40, 0.9, 1.2, search_timeout, enable_tts, enable_thinking)
|
| 678 |
model_size = MODELS[model_name].get("params_b", 4.0)
|
| 679 |
return (f"⏱️ **Estimated GPU Time: {duration:.1f} seconds**\n\n"
|
| 680 |
f"๐ **Model Size:** {model_size:.1f}B parameters\n"
|
| 681 |
f"๐ **Web Search:** {'Enabled' if enable_search else 'Disabled'}\n"
|
| 682 |
+
f"๐ **TTS:** {'Enabled' if enable_tts else 'Disabled'}\n"
|
| 683 |
+
f"💭 **Thinking:** {'Enabled' if enable_thinking else 'Disabled'}")
|
| 684 |
except Exception as e:
|
| 685 |
return f"⚠️ Error calculating estimate: {e}"
|
| 686 |
|
|
|
|
| 704 |
|
| 705 |
with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
|
| 706 |
# Header
|
| 707 |
+
gr.Markdown("# 🧠 ZeroGPU LLM Inference")
|
| 708 |
+
|
| 709 |
+
# Main Chat Interface
|
| 710 |
+
chat = gr.Chatbot(
|
| 711 |
+
height=500,
|
| 712 |
+
label="💬 Conversation",
|
| 713 |
+
buttons=["copy"],
|
| 714 |
+
avatar_images=(None, "🤖"),
|
| 715 |
+
layout="bubble"
|
| 716 |
+
)
|
| 717 |
+
|
| 718 |
+
# TTS Audio Output (visible by default since TTS is on)
|
| 719 |
+
tts_audio_output = gr.Audio(
|
| 720 |
+
label="๐ Generated Speech",
|
| 721 |
+
type="numpy",
|
| 722 |
+
autoplay=True,
|
| 723 |
+
visible=True,
|
| 724 |
+
elem_id="tts-audio"
|
| 725 |
+
)
|
| 726 |
+
|
| 727 |
+
# Input Area
|
| 728 |
with gr.Row():
|
| 729 |
+
txt = gr.Textbox(
|
| 730 |
+
placeholder="💭 Type your message here... (Press Enter to send)",
|
| 731 |
+
scale=9,
|
| 732 |
+
container=False,
|
| 733 |
+
show_label=False,
|
| 734 |
+
lines=1,
|
| 735 |
+
max_lines=5
|
| 736 |
+
)
|
| 737 |
+
with gr.Column(scale=1, min_width=120):
|
| 738 |
+
submit_btn = gr.Button("📤 Send", variant="primary", size="lg")
|
| 739 |
+
cancel_btn = gr.Button("⏹️ Stop", variant="stop", visible=False, size="lg")
|
| 740 |
+
|
| 741 |
+
# Collapsed Settings Section at Bottom
|
| 742 |
+
with gr.Accordion("⚙️ Settings", open=False):
|
| 743 |
+
with gr.Row():
|
| 744 |
+
with gr.Column(scale=1):
|
| 745 |
model_dd = gr.Dropdown(
|
| 746 |
label="🤖 Model",
|
| 747 |
choices=list(MODELS.keys()),
|
| 748 |
+
value="Qwen3-0.6B",
|
| 749 |
info="Select the language model to use"
|
| 750 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 751 |
with gr.Row():
|
| 752 |
+
tts_chk = gr.Checkbox(
|
| 753 |
+
label="๐ Text-to-Speech",
|
| 754 |
+
value=True,
|
| 755 |
+
info="Convert responses to speech"
|
| 756 |
)
|
| 757 |
+
thinking_chk = gr.Checkbox(
|
| 758 |
+
label="💭 Thinking",
|
| 759 |
+
value=False,
|
| 760 |
+
info="Show model reasoning"
|
| 761 |
)
|
| 762 |
+
search_chk = gr.Checkbox(
|
| 763 |
+
label="๐ Web Search",
|
| 764 |
+
value=False,
|
| 765 |
+
info="Augment with web data"
|
| 766 |
+
)
|
| 767 |
+
with gr.Column(scale=1):
|
| 768 |
+
sys_prompt = gr.Textbox(
|
| 769 |
+
label="๐ System Prompt",
|
| 770 |
+
lines=3,
|
| 771 |
+
value=update_default_prompt(False),
|
| 772 |
+
placeholder="Define the assistant's behavior..."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 773 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 774 |
|
| 775 |
+
# Duration Estimate
|
| 776 |
+
duration_display = gr.Markdown(
|
| 777 |
+
value=update_duration_estimate("Qwen3-0.6B", False, 4, 50, 1024, 5.0, True, False),
|
| 778 |
+
elem_classes="duration-estimate"
|
| 779 |
+
)
|
|
|
|
|
|
|
|
|
|
| 780 |
|
| 781 |
+
# Advanced Settings
|
| 782 |
+
with gr.Accordion("๐๏ธ Advanced Parameters", open=False):
|
| 783 |
+
max_tok = gr.Slider(64, 16384, value=512, step=32, label="Max Tokens")
|
| 784 |
+
temp = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
|
| 785 |
with gr.Row():
|
| 786 |
+
k = gr.Slider(1, 100, value=40, step=1, label="Top-K")
|
| 787 |
+
p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
|
| 788 |
+
rp = gr.Slider(1.0, 2.0, value=1.2, step=0.1, label="Repetition Penalty")
|
| 789 |
+
|
| 790 |
+
# Web Search Settings
|
| 791 |
+
with gr.Accordion("๐ Web Search Settings", open=False, visible=False) as search_settings:
|
| 792 |
+
mr = gr.Number(value=4, precision=0, label="Max Results")
|
| 793 |
+
mc = gr.Number(value=50, precision=0, label="Max Chars/Result")
|
| 794 |
+
st = gr.Slider(minimum=0.0, maximum=30.0, step=0.5, value=5.0, label="Search Timeout (s)")
|
| 795 |
+
|
| 796 |
+
# Debug Info
|
| 797 |
+
with gr.Accordion("๐ Debug Info", open=False):
|
| 798 |
+
dbg = gr.Markdown()
|
| 799 |
+
|
| 800 |
+
# Clear Button
|
| 801 |
+
clr = gr.Button("๐๏ธ Clear Chat", variant="secondary")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 802 |
|
| 803 |
# --- Event Listeners ---
|
| 804 |
|
| 805 |
# Group all inputs for cleaner event handling
|
| 806 |
+
chat_inputs = [txt, chat, sys_prompt, search_chk, mr, mc, model_dd, max_tok, temp, k, p, rp, st, tts_chk, thinking_chk]
|
| 807 |
# Group all UI components that can be updated.
|
| 808 |
ui_components = [chat, dbg, txt, submit_btn, cancel_btn, tts_audio_output]
|
| 809 |
|
|
|
|
| 881 |
"""Called by the cancel button, sets the global event."""
|
| 882 |
cancel_event.set()
|
| 883 |
print("Cancellation signal sent.")
|
| 884 |
+
|
| 885 |
def reset_ui_after_cancel():
|
| 886 |
"""Reset UI components after cancellation."""
|
| 887 |
cancel_event.clear() # Clear the flag for next generation
|
|
|
|
| 916 |
)
|
| 917 |
|
| 918 |
# Listeners for updating the duration estimate
|
| 919 |
+
duration_inputs = [model_dd, search_chk, mr, mc, max_tok, st, tts_chk, thinking_chk]
|
| 920 |
for component in duration_inputs:
|
| 921 |
component.change(fn=update_duration_estimate, inputs=duration_inputs, outputs=duration_display)
|
| 922 |
|
| 923 |
# Toggle web search settings visibility
|
| 924 |
def toggle_search_settings(enabled):
|
| 925 |
return gr.update(visible=enabled)
|
| 926 |
+
|
| 927 |
search_chk.change(
|
| 928 |
fn=lambda enabled: (update_default_prompt(enabled), gr.update(visible=enabled)),
|
| 929 |
inputs=search_chk,
|
| 930 |
outputs=[sys_prompt, search_settings]
|
| 931 |
)
|
| 932 |
+
|
| 933 |
# Clear chat action
|
| 934 |
clr.click(fn=lambda: ([], "", "", gr.update(visible=False, value=None)), outputs=[chat, txt, dbg, tts_audio_output])
|
| 935 |
+
|
| 936 |
+
demo.launch(theme=CUSTOM_THEME, css=CUSTOM_CSS)
|