ZeroGPU-LLM-Inference

Paused

App Files Files Community

polats committed on Jan 20

Commit

8b6384b

1 Parent(s): 26998d4

simplify UI

Browse files

Files changed (2) hide show

.gitignore +1 -0
app.py +112 -158

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

app.py CHANGED Viewed

@@ -63,13 +63,13 @@ MODELS = {
         "repo_id": "Qwen/Qwen3-4B-Instruct-2507",
         "description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks.",
         "params_b": 4.0
-    },
     "Apriel-1.5-15b-Thinker": {
         "repo_id": "ServiceNow-AI/Apriel-1.5-15b-Thinker",
         "description": "Multimodal reasoning model with 15B parameters, trained via extensive mid-training on text and image data, and fine-tuned only on text (no image SFT). Achieves competitive performance on reasoning benchmarks like Artificial Analysis (score: 52), Tau2 Bench Telecom (68), and IFBench (62). Supports both text and image understanding, fits on a single GPU, and includes structured reasoning output with tool and function calling capabilities.",
         "params_b": 15.0
     },
     # 14.8B total parameters
     "Qwen3-14B": {
         "repo_id": "Qwen/Qwen3-14B",
@@ -176,6 +176,14 @@ MODELS = {
         "params_b": 1.7
     },
     # ~2B (effective)
     "Gemma-3n-E2B": {
         "repo_id": "google/gemma-3n-E2B",
@@ -438,10 +446,10 @@ def retrieve_context(query, max_results=6, max_chars=50):
     except Exception:
         return []
-def format_conversation(history, system_prompt, tokenizer):
     if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
         messages = [{"role": "system", "content": system_prompt.strip()}] + history
-        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
     else:
         # Fallback for base LMs without chat template
         prompt = system_prompt.strip() + "\n"
@@ -454,7 +462,7 @@ def format_conversation(history, system_prompt, tokenizer):
             prompt += "Assistant: "
         return prompt
-def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout, enable_tts):
     # Get model size from the MODELS dict (more reliable than string parsing)
     model_size = MODELS[model_name].get("params_b", 4.0)  # Default to 4B if not found
@@ -474,14 +482,14 @@ def get_duration(user_msg, chat_history, system_prompt, enable_search, max_resul
 def chat_response(user_msg, chat_history, system_prompt,
                   enable_search, max_results, max_chars,
                   model_name, max_tokens, temperature,
-                  top_k, top_p, repeat_penalty, search_timeout, enable_tts):
     """
     Generates streaming chat responses, optionally with background web search.
     This version includes cancellation support.
     """
     # Clear the cancellation event at the start of a new generation
     cancel_event.clear()
     history = list(chat_history or [])
     history.append({'role': 'user', 'content': user_msg})
@@ -504,7 +512,7 @@ def chat_response(user_msg, chat_history, system_prompt,
         cur_date = datetime.now().strftime('%Y-%m-%d')
         # merge any fetched search results into the system prompt
         if search_results:
             enriched = system_prompt.strip() + \
             f'''\n# The following contents are the search results related to the user's message:
             {search_results}
@@ -557,7 +565,7 @@ def chat_response(user_msg, chat_history, system_prompt,
         pipe = load_pipeline(model_name)
-        prompt = format_conversation(history, enriched, pipe.tokenizer)
         prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"
         streamer = TextIteratorStreamer(pipe.tokenizer,
                                         skip_prompt=True,
@@ -594,7 +602,7 @@ def chat_response(user_msg, chat_history, system_prompt,
                     history[-1]['content'] += " [Generation Canceled]"
                 yield history, debug, None
                 break
             text = chunk
             # Detect start of thinking
@@ -658,20 +666,21 @@ def chat_response(user_msg, chat_history, system_prompt,
 def update_default_prompt(enable_search):
-    return f"You are a helpful assistant."
-def update_duration_estimate(model_name, enable_search, max_results, max_chars, max_tokens, search_timeout, enable_tts):
     """Calculate and format the estimated GPU duration for current settings."""
     try:
         dummy_msg, dummy_history, dummy_system_prompt = "", [], ""
         duration = get_duration(dummy_msg, dummy_history, dummy_system_prompt,
                               enable_search, max_results, max_chars, model_name,
-                              max_tokens, 0.7, 40, 0.9, 1.2, search_timeout, enable_tts)
         model_size = MODELS[model_name].get("params_b", 4.0)
         return (f"⏱️ **Estimated GPU Time: {duration:.1f} seconds**\n\n"
                 f"📊 **Model Size:** {model_size:.1f}B parameters\n"
                 f"🔍 **Web Search:** {'Enabled' if enable_search else 'Disabled'}\n"
-                f"🔊 **TTS:** {'Enabled' if enable_tts else 'Disabled'}")
     except Exception as e:
         return f"⚠️ Error calculating estimate: {e}"
@@ -695,161 +704,106 @@ CUSTOM_CSS = """
 with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
     # Header
-    gr.Markdown("""
-    # 🧠 ZeroGPU LLM Inference
-    ### Powered by Hugging Face ZeroGPU with Web Search Integration
-    """)
     with gr.Row():
-        # Left Panel - Configuration
-        with gr.Column(scale=3):
-            # Core Settings (Always Visible)
-            with gr.Group():
-                gr.Markdown("### ⚙️ Core Settings")
                 model_dd = gr.Dropdown(
                     label="🤖 Model",
                     choices=list(MODELS.keys()),
-                    value="Qwen3-1.7B",
                     info="Select the language model to use"
                 )
-                search_chk = gr.Checkbox(
-                    label="🔍 Enable Web Search",
-                    value=False,
-                    info="Augment responses with real-time web data"
-                )
-                tts_chk = gr.Checkbox(
-                    label="🔊 Enable Text-to-Speech",
-                    value=False,
-                    info="Convert responses to speech using voice cloning"
-                )
-                sys_prompt = gr.Textbox(
-                    label="📝 System Prompt",
-                    lines=3,
-                    value=update_default_prompt(search_chk.value),
-                    placeholder="Define the assistant's behavior and personality..."
-                )
-            # Duration Estimate
-            duration_display = gr.Markdown(
-                value=update_duration_estimate("Qwen3-1.7B", False, 4, 50, 1024, 5.0, False),
-                elem_classes="duration-estimate"
-            )
-            # Advanced Settings (Collapsible)
-            with gr.Accordion("🎛️ Advanced Generation Parameters", open=False):
-                max_tok = gr.Slider(
-                    64, 16384, value=1024, step=32,
-                    label="Max Tokens",
-                    info="Maximum length of generated response"
-                )
-                temp = gr.Slider(
-                    0.1, 2.0, value=0.7, step=0.1,
-                    label="Temperature",
-                    info="Higher = more creative, Lower = more focused"
-                )
                 with gr.Row():
-                    k = gr.Slider(
-                        1, 100, value=40, step=1,
-                        label="Top-K",
-                        info="Number of top tokens to consider"
                     )
-                    p = gr.Slider(
-                        0.1, 1.0, value=0.9, step=0.05,
-                        label="Top-P",
-                        info="Nucleus sampling threshold"
                     )
-                rp = gr.Slider(
-                    1.0, 2.0, value=1.2, step=0.1,
-                    label="Repetition Penalty",
-                    info="Penalize repeated tokens"
-                )
-            # Web Search Settings (Collapsible)
-            with gr.Accordion("🌐 Web Search Settings", open=False, visible=False) as search_settings:
-                mr = gr.Number(
-                    value=4, precision=0,
-                    label="Max Results",
-                    info="Number of search results to retrieve"
-                )
-                mc = gr.Number(
-                    value=50, precision=0,
-                    label="Max Chars/Result",
-                    info="Character limit per search result"
-                )
-                st = gr.Slider(
-                    minimum=0.0, maximum=30.0, step=0.5, value=5.0,
-                    label="Search Timeout (s)",
-                    info="Maximum time to wait for search results"
                 )
-            # Actions
-            with gr.Row():
-                clr = gr.Button("🗑️ Clear Chat", variant="secondary", scale=1)
-        # Right Panel - Chat Interface
-        with gr.Column(scale=7):
-            chat = gr.Chatbot(
-                height=600,
-                label="💬 Conversation",
-                buttons=["copy"],
-                avatar_images=(None, "🤖"),
-                layout="bubble"
-            )
-            # TTS Audio Output
-            tts_audio_output = gr.Audio(
-                label="🔊 Generated Speech",
-                type="numpy",
-                autoplay=True,
-                visible=False,
-                elem_id="tts-audio"
-            )
-            # Input Area
             with gr.Row():
-                txt = gr.Textbox(
-                    placeholder="💭 Type your message here... (Press Enter to send)",
-                    scale=9,
-                    container=False,
-                    show_label=False,
-                    lines=1,
-                    max_lines=5
-                )
-                with gr.Column(scale=1, min_width=120):
-                    submit_btn = gr.Button("📤 Send", variant="primary", size="lg")
-                    cancel_btn = gr.Button("⏹️ Stop", variant="stop", visible=False, size="lg")
-            # Example Prompts
-            gr.Examples(
-                examples=[
-                    ["Explain quantum computing in simple terms"],
-                    ["Write a Python function to calculate fibonacci numbers"],
-                    ["What are the latest developments in AI? (Enable web search)"],
-                    ["Tell me a creative story about a time traveler"],
-                    ["Help me debug this code: def add(a,b): return a+b+1"]
-                ],
-                inputs=txt,
-                label="💡 Example Prompts"
-            )
-            # Debug/Status Info (Collapsible)
-            with gr.Accordion("🔍 Debug Info", open=False):
-                dbg = gr.Markdown()
-    # Footer
-    gr.Markdown("""
-    ---
-    💡 **Tips:**
-    - Use **Advanced Parameters** to fine-tune creativity and response length
-    - Enable **Web Search** for real-time, up-to-date information
-    - Try different **models** for various tasks (reasoning, coding, general chat)
-    - Click the **Copy** button on responses to save them to your clipboard
-    """, elem_classes="footer")
     # --- Event Listeners ---
     # Group all inputs for cleaner event handling
-    chat_inputs = [txt, chat, sys_prompt, search_chk, mr, mc, model_dd, max_tok, temp, k, p, rp, st, tts_chk]
     # Group all UI components that can be updated.
     ui_components = [chat, dbg, txt, submit_btn, cancel_btn, tts_audio_output]
@@ -927,7 +881,7 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
         """Called by the cancel button, sets the global event."""
         cancel_event.set()
         print("Cancellation signal sent.")
     def reset_ui_after_cancel():
         """Reset UI components after cancellation."""
         cancel_event.clear()  # Clear the flag for next generation
@@ -962,21 +916,21 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
     )
     # Listeners for updating the duration estimate
-    duration_inputs = [model_dd, search_chk, mr, mc, max_tok, st, tts_chk]
     for component in duration_inputs:
         component.change(fn=update_duration_estimate, inputs=duration_inputs, outputs=duration_display)
     # Toggle web search settings visibility
     def toggle_search_settings(enabled):
         return gr.update(visible=enabled)
     search_chk.change(
         fn=lambda enabled: (update_default_prompt(enabled), gr.update(visible=enabled)),
         inputs=search_chk,
         outputs=[sys_prompt, search_settings]
     )
     # Clear chat action
     clr.click(fn=lambda: ([], "", "", gr.update(visible=False, value=None)), outputs=[chat, txt, dbg, tts_audio_output])
-    demo.launch(theme=CUSTOM_THEME, css=CUSTOM_CSS)

         "repo_id": "Qwen/Qwen3-4B-Instruct-2507",
         "description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks.",
         "params_b": 4.0
+    },
     "Apriel-1.5-15b-Thinker": {
         "repo_id": "ServiceNow-AI/Apriel-1.5-15b-Thinker",
         "description": "Multimodal reasoning model with 15B parameters, trained via extensive mid-training on text and image data, and fine-tuned only on text (no image SFT). Achieves competitive performance on reasoning benchmarks like Artificial Analysis (score: 52), Tau2 Bench Telecom (68), and IFBench (62). Supports both text and image understanding, fits on a single GPU, and includes structured reasoning output with tool and function calling capabilities.",
         "params_b": 15.0
     },
     # 14.8B total parameters
     "Qwen3-14B": {
         "repo_id": "Qwen/Qwen3-14B",
         "params_b": 1.7
     },
+    # 0.6B
+    "Qwen3-0.6B": {
+        "repo_id": "Qwen/Qwen3-0.6B",
+        "description": "Causal Language Model, Training Stage: Pretraining & Post-training. Number of Parameters: 0.6B, Number of Paramaters (Non-Embedding): 0.44B, Number of Layers: 28, Number of Attention Heads (GQA): 16 for Q and 8 for KV, Context Length: 32,768",
+        "params_b": 0.6
+    },
     # ~2B (effective)
     "Gemma-3n-E2B": {
         "repo_id": "google/gemma-3n-E2B",
     except Exception:
         return []
+def format_conversation(history, system_prompt, tokenizer, enable_thinking=False):
     if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
         messages = [{"role": "system", "content": system_prompt.strip()}] + history
+        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=enable_thinking)
     else:
         # Fallback for base LMs without chat template
         prompt = system_prompt.strip() + "\n"
             prompt += "Assistant: "
         return prompt
+def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout, enable_tts, enable_thinking):
     # Get model size from the MODELS dict (more reliable than string parsing)
     model_size = MODELS[model_name].get("params_b", 4.0)  # Default to 4B if not found
 def chat_response(user_msg, chat_history, system_prompt,
                   enable_search, max_results, max_chars,
                   model_name, max_tokens, temperature,
+                  top_k, top_p, repeat_penalty, search_timeout, enable_tts, enable_thinking):
     """
     Generates streaming chat responses, optionally with background web search.
     This version includes cancellation support.
     """
     # Clear the cancellation event at the start of a new generation
     cancel_event.clear()
     history = list(chat_history or [])
     history.append({'role': 'user', 'content': user_msg})
         cur_date = datetime.now().strftime('%Y-%m-%d')
         # merge any fetched search results into the system prompt
         if search_results:
             enriched = system_prompt.strip() + \
             f'''\n# The following contents are the search results related to the user's message:
             {search_results}
         pipe = load_pipeline(model_name)
+        prompt = format_conversation(history, enriched, pipe.tokenizer, enable_thinking)
         prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"
         streamer = TextIteratorStreamer(pipe.tokenizer,
                                         skip_prompt=True,
                     history[-1]['content'] += " [Generation Canceled]"
                 yield history, debug, None
                 break
             text = chunk
             # Detect start of thinking
 def update_default_prompt(enable_search):
+    return f"You are a helpful assistant. Don't use emojis in your response. Keep replies short to a maximum of three sentences."
+def update_duration_estimate(model_name, enable_search, max_results, max_chars, max_tokens, search_timeout, enable_tts, enable_thinking):
     """Calculate and format the estimated GPU duration for current settings."""
     try:
         dummy_msg, dummy_history, dummy_system_prompt = "", [], ""
         duration = get_duration(dummy_msg, dummy_history, dummy_system_prompt,
                               enable_search, max_results, max_chars, model_name,
+                              max_tokens, 0.7, 40, 0.9, 1.2, search_timeout, enable_tts, enable_thinking)
         model_size = MODELS[model_name].get("params_b", 4.0)
         return (f"⏱️ **Estimated GPU Time: {duration:.1f} seconds**\n\n"
                 f"📊 **Model Size:** {model_size:.1f}B parameters\n"
                 f"🔍 **Web Search:** {'Enabled' if enable_search else 'Disabled'}\n"
+                f"🔊 **TTS:** {'Enabled' if enable_tts else 'Disabled'}\n"
+                f"💭 **Thinking:** {'Enabled' if enable_thinking else 'Disabled'}")
     except Exception as e:
         return f"⚠️ Error calculating estimate: {e}"
 with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
     # Header
+    gr.Markdown("# 🧠 ZeroGPU LLM Inference")
+    # Main Chat Interface
+    chat = gr.Chatbot(
+        height=500,
+        label="💬 Conversation",
+        buttons=["copy"],
+        avatar_images=(None, "🤖"),
+        layout="bubble"
+    )
+    # TTS Audio Output (visible by default since TTS is on)
+    tts_audio_output = gr.Audio(
+        label="🔊 Generated Speech",
+        type="numpy",
+        autoplay=True,
+        visible=True,
+        elem_id="tts-audio"
+    )
+    # Input Area
     with gr.Row():
+        txt = gr.Textbox(
+            placeholder="💭 Type your message here... (Press Enter to send)",
+            scale=9,
+            container=False,
+            show_label=False,
+            lines=1,
+            max_lines=5
+        )
+        with gr.Column(scale=1, min_width=120):
+            submit_btn = gr.Button("📤 Send", variant="primary", size="lg")
+            cancel_btn = gr.Button("⏹️ Stop", variant="stop", visible=False, size="lg")
+    # Collapsed Settings Section at Bottom
+    with gr.Accordion("⚙️ Settings", open=False):
+        with gr.Row():
+            with gr.Column(scale=1):
                 model_dd = gr.Dropdown(
                     label="🤖 Model",
                     choices=list(MODELS.keys()),
+                    value="Qwen3-0.6B",
                     info="Select the language model to use"
                 )
                 with gr.Row():
+                    tts_chk = gr.Checkbox(
+                        label="🔊 Text-to-Speech",
+                        value=True,
+                        info="Convert responses to speech"
                     )
+                    thinking_chk = gr.Checkbox(
+                        label="💭 Thinking",
+                        value=False,
+                        info="Show model reasoning"
                     )
+                    search_chk = gr.Checkbox(
+                        label="🔍 Web Search",
+                        value=False,
+                        info="Augment with web data"
+                    )
+            with gr.Column(scale=1):
+                sys_prompt = gr.Textbox(
+                    label="📝 System Prompt",
+                    lines=3,
+                    value=update_default_prompt(False),
+                    placeholder="Define the assistant's behavior..."
                 )
+        # Duration Estimate
+        duration_display = gr.Markdown(
+            value=update_duration_estimate("Qwen3-0.6B", False, 4, 50, 1024, 5.0, True, False),
+            elem_classes="duration-estimate"
+        )
+        # Advanced Settings
+        with gr.Accordion("🎛️ Advanced Parameters", open=False):
+            max_tok = gr.Slider(64, 16384, value=512, step=32, label="Max Tokens")
+            temp = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
             with gr.Row():
+                k = gr.Slider(1, 100, value=40, step=1, label="Top-K")
+                p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
+            rp = gr.Slider(1.0, 2.0, value=1.2, step=0.1, label="Repetition Penalty")
+        # Web Search Settings
+        with gr.Accordion("🌐 Web Search Settings", open=False, visible=False) as search_settings:
+            mr = gr.Number(value=4, precision=0, label="Max Results")
+            mc = gr.Number(value=50, precision=0, label="Max Chars/Result")
+            st = gr.Slider(minimum=0.0, maximum=30.0, step=0.5, value=5.0, label="Search Timeout (s)")
+        # Debug Info
+        with gr.Accordion("🔍 Debug Info", open=False):
+            dbg = gr.Markdown()
+        # Clear Button
+        clr = gr.Button("🗑️ Clear Chat", variant="secondary")
     # --- Event Listeners ---
     # Group all inputs for cleaner event handling
+    chat_inputs = [txt, chat, sys_prompt, search_chk, mr, mc, model_dd, max_tok, temp, k, p, rp, st, tts_chk, thinking_chk]
     # Group all UI components that can be updated.
     ui_components = [chat, dbg, txt, submit_btn, cancel_btn, tts_audio_output]
         """Called by the cancel button, sets the global event."""
         cancel_event.set()
         print("Cancellation signal sent.")
     def reset_ui_after_cancel():
         """Reset UI components after cancellation."""
         cancel_event.clear()  # Clear the flag for next generation
     )
     # Listeners for updating the duration estimate
+    duration_inputs = [model_dd, search_chk, mr, mc, max_tok, st, tts_chk, thinking_chk]
     for component in duration_inputs:
         component.change(fn=update_duration_estimate, inputs=duration_inputs, outputs=duration_display)
     # Toggle web search settings visibility
     def toggle_search_settings(enabled):
         return gr.update(visible=enabled)
     search_chk.change(
         fn=lambda enabled: (update_default_prompt(enabled), gr.update(visible=enabled)),
         inputs=search_chk,
         outputs=[sys_prompt, search_settings]
     )
     # Clear chat action
     clr.click(fn=lambda: ([], "", "", gr.update(visible=False, value=None)), outputs=[chat, txt, dbg, tts_audio_output])
+    demo.launch(theme=CUSTOM_THEME, css=CUSTOM_CSS)