polats committed on
Commit
8b6384b
·
1 Parent(s): 26998d4

simplify UI

Browse files
Files changed (2) hide show
  1. .gitignore +1 -0
  2. app.py +112 -158
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
app.py CHANGED
@@ -63,13 +63,13 @@ MODELS = {
63
  "repo_id": "Qwen/Qwen3-4B-Instruct-2507",
64
  "description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks.",
65
  "params_b": 4.0
66
- },
67
  "Apriel-1.5-15b-Thinker": {
68
  "repo_id": "ServiceNow-AI/Apriel-1.5-15b-Thinker",
69
  "description": "Multimodal reasoning model with 15B parameters, trained via extensive mid-training on text and image data, and fine-tuned only on text (no image SFT). Achieves competitive performance on reasoning benchmarks like Artificial Analysis (score: 52), Tau2 Bench Telecom (68), and IFBench (62). Supports both text and image understanding, fits on a single GPU, and includes structured reasoning output with tool and function calling capabilities.",
70
  "params_b": 15.0
71
  },
72
-
73
  # 14.8B total parameters
74
  "Qwen3-14B": {
75
  "repo_id": "Qwen/Qwen3-14B",
@@ -176,6 +176,14 @@ MODELS = {
176
  "params_b": 1.7
177
  },
178
 
 
 
 
 
 
 
 
 
179
  # ~2B (effective)
180
  "Gemma-3n-E2B": {
181
  "repo_id": "google/gemma-3n-E2B",
@@ -438,10 +446,10 @@ def retrieve_context(query, max_results=6, max_chars=50):
438
  except Exception:
439
  return []
440
 
441
- def format_conversation(history, system_prompt, tokenizer):
442
  if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
443
  messages = [{"role": "system", "content": system_prompt.strip()}] + history
444
- return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
445
  else:
446
  # Fallback for base LMs without chat template
447
  prompt = system_prompt.strip() + "\n"
@@ -454,7 +462,7 @@ def format_conversation(history, system_prompt, tokenizer):
454
  prompt += "Assistant: "
455
  return prompt
456
 
457
- def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout, enable_tts):
458
  # Get model size from the MODELS dict (more reliable than string parsing)
459
  model_size = MODELS[model_name].get("params_b", 4.0) # Default to 4B if not found
460
 
@@ -474,14 +482,14 @@ def get_duration(user_msg, chat_history, system_prompt, enable_search, max_resul
474
  def chat_response(user_msg, chat_history, system_prompt,
475
  enable_search, max_results, max_chars,
476
  model_name, max_tokens, temperature,
477
- top_k, top_p, repeat_penalty, search_timeout, enable_tts):
478
  """
479
  Generates streaming chat responses, optionally with background web search.
480
  This version includes cancellation support.
481
  """
482
  # Clear the cancellation event at the start of a new generation
483
  cancel_event.clear()
484
-
485
  history = list(chat_history or [])
486
  history.append({'role': 'user', 'content': user_msg})
487
 
@@ -504,7 +512,7 @@ def chat_response(user_msg, chat_history, system_prompt,
504
  cur_date = datetime.now().strftime('%Y-%m-%d')
505
  # merge any fetched search results into the system prompt
506
  if search_results:
507
-
508
  enriched = system_prompt.strip() + \
509
  f'''\n# The following contents are the search results related to the user's message:
510
  {search_results}
@@ -557,7 +565,7 @@ def chat_response(user_msg, chat_history, system_prompt,
557
 
558
  pipe = load_pipeline(model_name)
559
 
560
- prompt = format_conversation(history, enriched, pipe.tokenizer)
561
  prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"
562
  streamer = TextIteratorStreamer(pipe.tokenizer,
563
  skip_prompt=True,
@@ -594,7 +602,7 @@ def chat_response(user_msg, chat_history, system_prompt,
594
  history[-1]['content'] += " [Generation Canceled]"
595
  yield history, debug, None
596
  break
597
-
598
  text = chunk
599
 
600
  # Detect start of thinking
@@ -658,20 +666,21 @@ def chat_response(user_msg, chat_history, system_prompt,
658
 
659
 
660
  def update_default_prompt(enable_search):
661
- return f"You are a helpful assistant."
662
 
663
- def update_duration_estimate(model_name, enable_search, max_results, max_chars, max_tokens, search_timeout, enable_tts):
664
  """Calculate and format the estimated GPU duration for current settings."""
665
  try:
666
  dummy_msg, dummy_history, dummy_system_prompt = "", [], ""
667
  duration = get_duration(dummy_msg, dummy_history, dummy_system_prompt,
668
  enable_search, max_results, max_chars, model_name,
669
- max_tokens, 0.7, 40, 0.9, 1.2, search_timeout, enable_tts)
670
  model_size = MODELS[model_name].get("params_b", 4.0)
671
  return (f"โฑ๏ธ **Estimated GPU Time: {duration:.1f} seconds**\n\n"
672
  f"๐Ÿ“Š **Model Size:** {model_size:.1f}B parameters\n"
673
  f"๐Ÿ” **Web Search:** {'Enabled' if enable_search else 'Disabled'}\n"
674
- f"๐Ÿ”Š **TTS:** {'Enabled' if enable_tts else 'Disabled'}")
 
675
  except Exception as e:
676
  return f"โš ๏ธ Error calculating estimate: {e}"
677
 
@@ -695,161 +704,106 @@ CUSTOM_CSS = """
695
 
696
  with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
697
  # Header
698
- gr.Markdown("""
699
- # ๐Ÿง  ZeroGPU LLM Inference
700
- ### Powered by Hugging Face ZeroGPU with Web Search Integration
701
- """)
702
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
703
  with gr.Row():
704
- # Left Panel - Configuration
705
- with gr.Column(scale=3):
706
- # Core Settings (Always Visible)
707
- with gr.Group():
708
- gr.Markdown("### โš™๏ธ Core Settings")
 
 
 
 
 
 
 
 
 
 
 
709
  model_dd = gr.Dropdown(
710
  label="๐Ÿค– Model",
711
  choices=list(MODELS.keys()),
712
- value="Qwen3-1.7B",
713
  info="Select the language model to use"
714
  )
715
- search_chk = gr.Checkbox(
716
- label="๐Ÿ” Enable Web Search",
717
- value=False,
718
- info="Augment responses with real-time web data"
719
- )
720
- tts_chk = gr.Checkbox(
721
- label="๐Ÿ”Š Enable Text-to-Speech",
722
- value=False,
723
- info="Convert responses to speech using voice cloning"
724
- )
725
- sys_prompt = gr.Textbox(
726
- label="๐Ÿ“ System Prompt",
727
- lines=3,
728
- value=update_default_prompt(search_chk.value),
729
- placeholder="Define the assistant's behavior and personality..."
730
- )
731
-
732
- # Duration Estimate
733
- duration_display = gr.Markdown(
734
- value=update_duration_estimate("Qwen3-1.7B", False, 4, 50, 1024, 5.0, False),
735
- elem_classes="duration-estimate"
736
- )
737
-
738
- # Advanced Settings (Collapsible)
739
- with gr.Accordion("๐ŸŽ›๏ธ Advanced Generation Parameters", open=False):
740
- max_tok = gr.Slider(
741
- 64, 16384, value=1024, step=32,
742
- label="Max Tokens",
743
- info="Maximum length of generated response"
744
- )
745
- temp = gr.Slider(
746
- 0.1, 2.0, value=0.7, step=0.1,
747
- label="Temperature",
748
- info="Higher = more creative, Lower = more focused"
749
- )
750
  with gr.Row():
751
- k = gr.Slider(
752
- 1, 100, value=40, step=1,
753
- label="Top-K",
754
- info="Number of top tokens to consider"
755
  )
756
- p = gr.Slider(
757
- 0.1, 1.0, value=0.9, step=0.05,
758
- label="Top-P",
759
- info="Nucleus sampling threshold"
760
  )
761
- rp = gr.Slider(
762
- 1.0, 2.0, value=1.2, step=0.1,
763
- label="Repetition Penalty",
764
- info="Penalize repeated tokens"
765
- )
766
-
767
- # Web Search Settings (Collapsible)
768
- with gr.Accordion("๐ŸŒ Web Search Settings", open=False, visible=False) as search_settings:
769
- mr = gr.Number(
770
- value=4, precision=0,
771
- label="Max Results",
772
- info="Number of search results to retrieve"
773
- )
774
- mc = gr.Number(
775
- value=50, precision=0,
776
- label="Max Chars/Result",
777
- info="Character limit per search result"
778
- )
779
- st = gr.Slider(
780
- minimum=0.0, maximum=30.0, step=0.5, value=5.0,
781
- label="Search Timeout (s)",
782
- info="Maximum time to wait for search results"
783
  )
784
-
785
- # Actions
786
- with gr.Row():
787
- clr = gr.Button("๐Ÿ—‘๏ธ Clear Chat", variant="secondary", scale=1)
788
-
789
- # Right Panel - Chat Interface
790
- with gr.Column(scale=7):
791
- chat = gr.Chatbot(
792
- height=600,
793
- label="๐Ÿ’ฌ Conversation",
794
- buttons=["copy"],
795
- avatar_images=(None, "๐Ÿค–"),
796
- layout="bubble"
797
- )
798
 
799
- # TTS Audio Output
800
- tts_audio_output = gr.Audio(
801
- label="๐Ÿ”Š Generated Speech",
802
- type="numpy",
803
- autoplay=True,
804
- visible=False,
805
- elem_id="tts-audio"
806
- )
807
 
808
- # Input Area
 
 
 
809
  with gr.Row():
810
- txt = gr.Textbox(
811
- placeholder="๐Ÿ’ญ Type your message here... (Press Enter to send)",
812
- scale=9,
813
- container=False,
814
- show_label=False,
815
- lines=1,
816
- max_lines=5
817
- )
818
- with gr.Column(scale=1, min_width=120):
819
- submit_btn = gr.Button("๐Ÿ“ค Send", variant="primary", size="lg")
820
- cancel_btn = gr.Button("โน๏ธ Stop", variant="stop", visible=False, size="lg")
821
-
822
- # Example Prompts
823
- gr.Examples(
824
- examples=[
825
- ["Explain quantum computing in simple terms"],
826
- ["Write a Python function to calculate fibonacci numbers"],
827
- ["What are the latest developments in AI? (Enable web search)"],
828
- ["Tell me a creative story about a time traveler"],
829
- ["Help me debug this code: def add(a,b): return a+b+1"]
830
- ],
831
- inputs=txt,
832
- label="๐Ÿ’ก Example Prompts"
833
- )
834
-
835
- # Debug/Status Info (Collapsible)
836
- with gr.Accordion("๐Ÿ” Debug Info", open=False):
837
- dbg = gr.Markdown()
838
-
839
- # Footer
840
- gr.Markdown("""
841
- ---
842
- ๐Ÿ’ก **Tips:**
843
- - Use **Advanced Parameters** to fine-tune creativity and response length
844
- - Enable **Web Search** for real-time, up-to-date information
845
- - Try different **models** for various tasks (reasoning, coding, general chat)
846
- - Click the **Copy** button on responses to save them to your clipboard
847
- """, elem_classes="footer")
848
 
849
  # --- Event Listeners ---
850
 
851
  # Group all inputs for cleaner event handling
852
- chat_inputs = [txt, chat, sys_prompt, search_chk, mr, mc, model_dd, max_tok, temp, k, p, rp, st, tts_chk]
853
  # Group all UI components that can be updated.
854
  ui_components = [chat, dbg, txt, submit_btn, cancel_btn, tts_audio_output]
855
 
@@ -927,7 +881,7 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
927
  """Called by the cancel button, sets the global event."""
928
  cancel_event.set()
929
  print("Cancellation signal sent.")
930
-
931
  def reset_ui_after_cancel():
932
  """Reset UI components after cancellation."""
933
  cancel_event.clear() # Clear the flag for next generation
@@ -962,21 +916,21 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
962
  )
963
 
964
  # Listeners for updating the duration estimate
965
- duration_inputs = [model_dd, search_chk, mr, mc, max_tok, st, tts_chk]
966
  for component in duration_inputs:
967
  component.change(fn=update_duration_estimate, inputs=duration_inputs, outputs=duration_display)
968
 
969
  # Toggle web search settings visibility
970
  def toggle_search_settings(enabled):
971
  return gr.update(visible=enabled)
972
-
973
  search_chk.change(
974
  fn=lambda enabled: (update_default_prompt(enabled), gr.update(visible=enabled)),
975
  inputs=search_chk,
976
  outputs=[sys_prompt, search_settings]
977
  )
978
-
979
  # Clear chat action
980
  clr.click(fn=lambda: ([], "", "", gr.update(visible=False, value=None)), outputs=[chat, txt, dbg, tts_audio_output])
981
-
982
- demo.launch(theme=CUSTOM_THEME, css=CUSTOM_CSS)
 
63
  "repo_id": "Qwen/Qwen3-4B-Instruct-2507",
64
  "description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks.",
65
  "params_b": 4.0
66
+ },
67
  "Apriel-1.5-15b-Thinker": {
68
  "repo_id": "ServiceNow-AI/Apriel-1.5-15b-Thinker",
69
  "description": "Multimodal reasoning model with 15B parameters, trained via extensive mid-training on text and image data, and fine-tuned only on text (no image SFT). Achieves competitive performance on reasoning benchmarks like Artificial Analysis (score: 52), Tau2 Bench Telecom (68), and IFBench (62). Supports both text and image understanding, fits on a single GPU, and includes structured reasoning output with tool and function calling capabilities.",
70
  "params_b": 15.0
71
  },
72
+
73
  # 14.8B total parameters
74
  "Qwen3-14B": {
75
  "repo_id": "Qwen/Qwen3-14B",
 
176
  "params_b": 1.7
177
  },
178
 
179
+ # 0.6B
180
+ "Qwen3-0.6B": {
181
+ "repo_id": "Qwen/Qwen3-0.6B",
182
+ "description": "Causal Language Model, Training Stage: Pretraining & Post-training. Number of Parameters: 0.6B, Number of Paramaters (Non-Embedding): 0.44B, Number of Layers: 28, Number of Attention Heads (GQA): 16 for Q and 8 for KV, Context Length: 32,768",
183
+ "params_b": 0.6
184
+ },
185
+
186
+
187
  # ~2B (effective)
188
  "Gemma-3n-E2B": {
189
  "repo_id": "google/gemma-3n-E2B",
 
446
  except Exception:
447
  return []
448
 
449
+ def format_conversation(history, system_prompt, tokenizer, enable_thinking=False):
450
  if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
451
  messages = [{"role": "system", "content": system_prompt.strip()}] + history
452
+ return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=enable_thinking)
453
  else:
454
  # Fallback for base LMs without chat template
455
  prompt = system_prompt.strip() + "\n"
 
462
  prompt += "Assistant: "
463
  return prompt
464
 
465
+ def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout, enable_tts, enable_thinking):
466
  # Get model size from the MODELS dict (more reliable than string parsing)
467
  model_size = MODELS[model_name].get("params_b", 4.0) # Default to 4B if not found
468
 
 
482
  def chat_response(user_msg, chat_history, system_prompt,
483
  enable_search, max_results, max_chars,
484
  model_name, max_tokens, temperature,
485
+ top_k, top_p, repeat_penalty, search_timeout, enable_tts, enable_thinking):
486
  """
487
  Generates streaming chat responses, optionally with background web search.
488
  This version includes cancellation support.
489
  """
490
  # Clear the cancellation event at the start of a new generation
491
  cancel_event.clear()
492
+
493
  history = list(chat_history or [])
494
  history.append({'role': 'user', 'content': user_msg})
495
 
 
512
  cur_date = datetime.now().strftime('%Y-%m-%d')
513
  # merge any fetched search results into the system prompt
514
  if search_results:
515
+
516
  enriched = system_prompt.strip() + \
517
  f'''\n# The following contents are the search results related to the user's message:
518
  {search_results}
 
565
 
566
  pipe = load_pipeline(model_name)
567
 
568
+ prompt = format_conversation(history, enriched, pipe.tokenizer, enable_thinking)
569
  prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"
570
  streamer = TextIteratorStreamer(pipe.tokenizer,
571
  skip_prompt=True,
 
602
  history[-1]['content'] += " [Generation Canceled]"
603
  yield history, debug, None
604
  break
605
+
606
  text = chunk
607
 
608
  # Detect start of thinking
 
666
 
667
 
668
  def update_default_prompt(enable_search):
669
+ return f"You are a helpful assistant. Don't use emojis in your response. Keep replies short to a maximum of three sentences."
670
 
671
+ def update_duration_estimate(model_name, enable_search, max_results, max_chars, max_tokens, search_timeout, enable_tts, enable_thinking):
672
  """Calculate and format the estimated GPU duration for current settings."""
673
  try:
674
  dummy_msg, dummy_history, dummy_system_prompt = "", [], ""
675
  duration = get_duration(dummy_msg, dummy_history, dummy_system_prompt,
676
  enable_search, max_results, max_chars, model_name,
677
+ max_tokens, 0.7, 40, 0.9, 1.2, search_timeout, enable_tts, enable_thinking)
678
  model_size = MODELS[model_name].get("params_b", 4.0)
679
  return (f"โฑ๏ธ **Estimated GPU Time: {duration:.1f} seconds**\n\n"
680
  f"๐Ÿ“Š **Model Size:** {model_size:.1f}B parameters\n"
681
  f"๐Ÿ” **Web Search:** {'Enabled' if enable_search else 'Disabled'}\n"
682
+ f"๐Ÿ”Š **TTS:** {'Enabled' if enable_tts else 'Disabled'}\n"
683
+ f"๐Ÿ’ญ **Thinking:** {'Enabled' if enable_thinking else 'Disabled'}")
684
  except Exception as e:
685
  return f"โš ๏ธ Error calculating estimate: {e}"
686
 
 
704
 
705
  with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
706
  # Header
707
+ gr.Markdown("# ๐Ÿง  ZeroGPU LLM Inference")
708
+
709
+ # Main Chat Interface
710
+ chat = gr.Chatbot(
711
+ height=500,
712
+ label="๐Ÿ’ฌ Conversation",
713
+ buttons=["copy"],
714
+ avatar_images=(None, "๐Ÿค–"),
715
+ layout="bubble"
716
+ )
717
+
718
+ # TTS Audio Output (visible by default since TTS is on)
719
+ tts_audio_output = gr.Audio(
720
+ label="๐Ÿ”Š Generated Speech",
721
+ type="numpy",
722
+ autoplay=True,
723
+ visible=True,
724
+ elem_id="tts-audio"
725
+ )
726
+
727
+ # Input Area
728
  with gr.Row():
729
+ txt = gr.Textbox(
730
+ placeholder="๐Ÿ’ญ Type your message here... (Press Enter to send)",
731
+ scale=9,
732
+ container=False,
733
+ show_label=False,
734
+ lines=1,
735
+ max_lines=5
736
+ )
737
+ with gr.Column(scale=1, min_width=120):
738
+ submit_btn = gr.Button("๐Ÿ“ค Send", variant="primary", size="lg")
739
+ cancel_btn = gr.Button("โน๏ธ Stop", variant="stop", visible=False, size="lg")
740
+
741
+ # Collapsed Settings Section at Bottom
742
+ with gr.Accordion("โš™๏ธ Settings", open=False):
743
+ with gr.Row():
744
+ with gr.Column(scale=1):
745
  model_dd = gr.Dropdown(
746
  label="๐Ÿค– Model",
747
  choices=list(MODELS.keys()),
748
+ value="Qwen3-0.6B",
749
  info="Select the language model to use"
750
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
751
  with gr.Row():
752
+ tts_chk = gr.Checkbox(
753
+ label="๐Ÿ”Š Text-to-Speech",
754
+ value=True,
755
+ info="Convert responses to speech"
756
  )
757
+ thinking_chk = gr.Checkbox(
758
+ label="๐Ÿ’ญ Thinking",
759
+ value=False,
760
+ info="Show model reasoning"
761
  )
762
+ search_chk = gr.Checkbox(
763
+ label="๐Ÿ” Web Search",
764
+ value=False,
765
+ info="Augment with web data"
766
+ )
767
+ with gr.Column(scale=1):
768
+ sys_prompt = gr.Textbox(
769
+ label="๐Ÿ“ System Prompt",
770
+ lines=3,
771
+ value=update_default_prompt(False),
772
+ placeholder="Define the assistant's behavior..."
 
 
 
 
 
 
 
 
 
 
 
773
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
774
 
775
+ # Duration Estimate
776
+ duration_display = gr.Markdown(
777
+ value=update_duration_estimate("Qwen3-0.6B", False, 4, 50, 1024, 5.0, True, False),
778
+ elem_classes="duration-estimate"
779
+ )
 
 
 
780
 
781
+ # Advanced Settings
782
+ with gr.Accordion("๐ŸŽ›๏ธ Advanced Parameters", open=False):
783
+ max_tok = gr.Slider(64, 16384, value=512, step=32, label="Max Tokens")
784
+ temp = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
785
  with gr.Row():
786
+ k = gr.Slider(1, 100, value=40, step=1, label="Top-K")
787
+ p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
788
+ rp = gr.Slider(1.0, 2.0, value=1.2, step=0.1, label="Repetition Penalty")
789
+
790
+ # Web Search Settings
791
+ with gr.Accordion("๐ŸŒ Web Search Settings", open=False, visible=False) as search_settings:
792
+ mr = gr.Number(value=4, precision=0, label="Max Results")
793
+ mc = gr.Number(value=50, precision=0, label="Max Chars/Result")
794
+ st = gr.Slider(minimum=0.0, maximum=30.0, step=0.5, value=5.0, label="Search Timeout (s)")
795
+
796
+ # Debug Info
797
+ with gr.Accordion("๐Ÿ” Debug Info", open=False):
798
+ dbg = gr.Markdown()
799
+
800
+ # Clear Button
801
+ clr = gr.Button("๐Ÿ—‘๏ธ Clear Chat", variant="secondary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
802
 
803
  # --- Event Listeners ---
804
 
805
  # Group all inputs for cleaner event handling
806
+ chat_inputs = [txt, chat, sys_prompt, search_chk, mr, mc, model_dd, max_tok, temp, k, p, rp, st, tts_chk, thinking_chk]
807
  # Group all UI components that can be updated.
808
  ui_components = [chat, dbg, txt, submit_btn, cancel_btn, tts_audio_output]
809
 
 
881
  """Called by the cancel button, sets the global event."""
882
  cancel_event.set()
883
  print("Cancellation signal sent.")
884
+
885
  def reset_ui_after_cancel():
886
  """Reset UI components after cancellation."""
887
  cancel_event.clear() # Clear the flag for next generation
 
916
  )
917
 
918
  # Listeners for updating the duration estimate
919
+ duration_inputs = [model_dd, search_chk, mr, mc, max_tok, st, tts_chk, thinking_chk]
920
  for component in duration_inputs:
921
  component.change(fn=update_duration_estimate, inputs=duration_inputs, outputs=duration_display)
922
 
923
  # Toggle web search settings visibility
924
  def toggle_search_settings(enabled):
925
  return gr.update(visible=enabled)
926
+
927
  search_chk.change(
928
  fn=lambda enabled: (update_default_prompt(enabled), gr.update(visible=enabled)),
929
  inputs=search_chk,
930
  outputs=[sys_prompt, search_settings]
931
  )
932
+
933
  # Clear chat action
934
  clr.click(fn=lambda: ([], "", "", gr.update(visible=False, value=None)), outputs=[chat, txt, dbg, tts_audio_output])
935
+
936
+ demo.launch(theme=CUSTOM_THEME, css=CUSTOM_CSS)