polats committed on
Commit
51a9e4a
·
1 Parent(s): c6a3e9f

update ui for mobile

Browse files
Files changed (1) hide show
  1. app.py +159 -97
app.py CHANGED
@@ -462,7 +462,7 @@ def format_conversation(history, system_prompt, tokenizer, enable_thinking=False
462
  prompt += "Assistant: "
463
  return prompt
464
 
465
- def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout, enable_tts, enable_thinking):
466
  # Get model size from the MODELS dict (more reliable than string parsing)
467
  model_size = MODELS[model_name].get("params_b", 4.0) # Default to 4B if not found
468
 
@@ -474,18 +474,17 @@ def get_duration(user_msg, chat_history, system_prompt, enable_search, max_resul
474
  token_duration = max_tokens * 0.005 # ~200 tokens/second average on H200
475
  search_duration = 10 if enable_search else 0 # Reduced search time
476
  aot_compilation_buffer = 20 if use_aot else 0 # Faster compilation on H200
477
- tts_duration = 15 if enable_tts else 0 # TTS generation time
478
 
479
- return base_duration + token_duration + search_duration + aot_compilation_buffer + tts_duration
480
 
481
  @spaces.GPU(duration=get_duration)
482
  def chat_response(user_msg, chat_history, system_prompt,
483
  enable_search, max_results, max_chars,
484
  model_name, max_tokens, temperature,
485
- top_k, top_p, repeat_penalty, search_timeout, enable_tts, enable_thinking):
486
  """
487
  Generates streaming chat responses, optionally with background web search.
488
- This version includes cancellation support.
489
  """
490
  # Clear the cancellation event at the start of a new generation
491
  cancel_event.clear()
@@ -592,7 +591,7 @@ def chat_response(user_msg, chat_history, system_prompt,
592
  assistant_message_started = False
593
 
594
  # First yield contains the user message
595
- yield history, debug, None
596
 
597
  # Stream tokens
598
  for chunk in streamer:
@@ -600,7 +599,7 @@ def chat_response(user_msg, chat_history, system_prompt,
600
  if cancel_event.is_set():
601
  if assistant_message_started and history and history[-1]['role'] == 'assistant':
602
  history[-1]['content'] += " [Generation Canceled]"
603
- yield history, debug, None
604
  break
605
 
606
  text = chunk
@@ -620,7 +619,7 @@ def chat_response(user_msg, chat_history, system_prompt,
620
  history.append({'role': 'assistant', 'content': answer_buf})
621
  else:
622
  history[-1]['content'] = thought_buf
623
- yield history, debug, None
624
  continue
625
 
626
  if in_thought:
@@ -633,7 +632,7 @@ def chat_response(user_msg, chat_history, system_prompt,
633
  history.append({'role': 'assistant', 'content': answer_buf})
634
  else:
635
  history[-1]['content'] = thought_buf
636
- yield history, debug, None
637
  continue
638
 
639
  # Stream answer
@@ -643,16 +642,11 @@ def chat_response(user_msg, chat_history, system_prompt,
643
 
644
  answer_buf += text
645
  history[-1]['content'] = answer_buf.strip()
646
- yield history, debug, None
647
 
648
  gen_thread.join()
649
 
650
- # Generate TTS audio if enabled
651
- tts_audio = None
652
- if enable_tts and answer_buf.strip():
653
- tts_audio = generate_tts_audio(answer_buf)
654
-
655
- yield history, debug + prompt_debug, tts_audio
656
  except GeneratorExit:
657
  # Handle cancellation gracefully
658
  print("Chat response cancelled.")
@@ -660,7 +654,7 @@ def chat_response(user_msg, chat_history, system_prompt,
660
  return
661
  except Exception as e:
662
  history.append({'role': 'assistant', 'content': f"Error: {e}"})
663
- yield history, debug, None
664
  finally:
665
  gc.collect()
666
 
@@ -668,22 +662,63 @@ def chat_response(user_msg, chat_history, system_prompt,
668
  def update_default_prompt(enable_search):
669
  return f"You are a helpful assistant. Don't use emojis in your response. Keep replies short to a maximum of three sentences."
670
 
671
- def update_duration_estimate(model_name, enable_search, max_results, max_chars, max_tokens, search_timeout, enable_tts, enable_thinking):
672
  """Calculate and format the estimated GPU duration for current settings."""
673
  try:
674
  dummy_msg, dummy_history, dummy_system_prompt = "", [], ""
675
  duration = get_duration(dummy_msg, dummy_history, dummy_system_prompt,
676
  enable_search, max_results, max_chars, model_name,
677
- max_tokens, 0.7, 40, 0.9, 1.2, search_timeout, enable_tts, enable_thinking)
678
  model_size = MODELS[model_name].get("params_b", 4.0)
679
  return (f"⏱️ **Estimated GPU Time: {duration:.1f} seconds**\n\n"
680
  f"📊 **Model Size:** {model_size:.1f}B parameters\n"
681
  f"🔍 **Web Search:** {'Enabled' if enable_search else 'Disabled'}\n"
682
- f"🔊 **TTS:** {'Enabled' if enable_tts else 'Disabled'}\n"
683
  f"💭 **Thinking:** {'Enabled' if enable_thinking else 'Disabled'}")
684
  except Exception as e:
685
  return f"⚠️ Error calculating estimate: {e}"
686
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
687
  # ------------------------------
688
  # Gradio UI
689
  # ------------------------------
@@ -700,6 +735,32 @@ CUSTOM_CSS = """
700
  .chatbot { border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1); }
701
  button.primary { font-weight: 600; }
702
  .gradio-accordion { margin-bottom: 12px; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
703
  """
704
 
705
  with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
@@ -711,7 +772,7 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
711
  height=500,
712
  label="💬 Conversation",
713
  buttons=["copy"],
714
- avatar_images=(None, "🤖"),
715
  layout="bubble"
716
  )
717
 
@@ -725,7 +786,7 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
725
  )
726
 
727
  # Input Area
728
- with gr.Row():
729
  txt = gr.Textbox(
730
  placeholder="💭 Type your message here... (Press Enter to send)",
731
  scale=9,
@@ -774,7 +835,7 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
774
 
775
  # Duration Estimate
776
  duration_display = gr.Markdown(
777
- value=update_duration_estimate("Qwen3-0.6B", False, 4, 50, 1024, 5.0, True, False),
778
  elem_classes="duration-estimate"
779
  )
780
 
@@ -802,81 +863,65 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
802
 
803
  # --- Event Listeners ---
804
 
805
- # Group all inputs for cleaner event handling
806
- chat_inputs = [txt, chat, sys_prompt, search_chk, mr, mc, model_dd, max_tok, temp, k, p, rp, st, tts_chk, thinking_chk]
807
- # Group all UI components that can be updated.
808
- ui_components = [chat, dbg, txt, submit_btn, cancel_btn, tts_audio_output]
809
 
810
- def submit_and_manage_ui(user_msg, chat_history, *args):
811
  """
812
- Orchestrator function that manages UI state and calls the backend chat function.
813
- It uses a try...finally block to ensure the UI is always reset.
814
  """
815
  if not user_msg.strip():
816
- # If the message is empty, do nothing.
817
- yield {
818
- chat: gr.update(),
819
- dbg: gr.update(),
820
- txt: gr.update(),
821
- submit_btn: gr.update(),
822
- cancel_btn: gr.update(),
823
- tts_audio_output: gr.update(),
824
- }
825
  return
826
 
827
- # Check if TTS is enabled (last argument)
828
- tts_enabled = args[-1] if args else False
829
-
830
- # 1. Update UI to "generating" state.
831
- # Crucially, we do NOT update the `chat` component here, as the backend
832
- # will provide the correctly formatted history in the first response chunk.
833
- # Keep audio visible but clear it - Gradio will show loading state
834
- yield {
835
- txt: gr.update(value="", interactive=False),
836
- submit_btn: gr.update(interactive=False),
837
- cancel_btn: gr.update(visible=True),
838
- tts_audio_output: gr.update(value=None), # Clear audio but keep visible
839
- }
840
 
841
- cancelled = False
842
  try:
843
- # 2. Call the backend and stream updates
844
- backend_args = [user_msg, chat_history] + list(args)
845
- for response_chunk in chat_response(*backend_args):
846
- history, debug, audio = response_chunk[0], response_chunk[1], response_chunk[2] if len(response_chunk) > 2 else None
847
-
848
- update_dict = {
849
- chat: history,
850
- dbg: debug,
851
- }
852
-
853
- # Update audio output when audio is generated (final yield with TTS)
854
- if audio is not None:
855
- update_dict[tts_audio_output] = gr.update(value=audio)
856
-
857
- yield update_dict
858
  except GeneratorExit:
859
- # Mark as cancelled and re-raise to prevent "generator ignored GeneratorExit"
860
- cancelled = True
861
  print("Generation cancelled by user.")
862
  raise
863
  except Exception as e:
864
  print(f"An error occurred during generation: {e}")
865
- # If an error happens, add it to the chat history to inform the user.
866
  error_history = (chat_history or []) + [
867
  {'role': 'user', 'content': user_msg},
868
  {'role': 'assistant', 'content': f"**An error occurred:** {str(e)}"}
869
  ]
870
- yield {chat: error_history}
871
- finally:
872
- # Only reset UI if not cancelled (to avoid "generator ignored GeneratorExit")
873
- if not cancelled:
874
- print("Resetting UI state.")
875
- yield {
876
- txt: gr.update(interactive=True),
877
- submit_btn: gr.update(interactive=True),
878
- cancel_btn: gr.update(visible=False),
879
- }
 
 
 
 
 
 
880
 
881
  def set_cancel_flag():
882
  """Called by the cancel button, sets the global event."""
@@ -887,37 +932,54 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
887
  """Reset UI components after cancellation."""
888
  cancel_event.clear() # Clear the flag for next generation
889
  print("UI reset after cancellation.")
890
- return {
891
- txt: gr.update(interactive=True),
892
- submit_btn: gr.update(interactive=True),
893
- cancel_btn: gr.update(visible=False),
894
- tts_audio_output: gr.update(value=None), # Clear audio but keep visible
895
- }
896
 
897
- # Event for submitting text via Enter key or Submit button
 
898
  submit_event = txt.submit(
899
- fn=submit_and_manage_ui,
900
  inputs=chat_inputs,
901
- outputs=ui_components,
 
 
 
 
 
 
 
902
  )
903
- submit_btn.click(
904
- fn=submit_and_manage_ui,
 
 
905
  inputs=chat_inputs,
906
- outputs=ui_components,
 
 
 
 
 
 
 
907
  )
908
 
909
  # Event for the "Cancel" button.
910
- # It sets the cancel flag, cancels the submit event, then resets the UI.
911
  cancel_btn.click(
912
  fn=set_cancel_flag,
913
- cancels=[submit_event]
914
  ).then(
915
  fn=reset_ui_after_cancel,
916
- outputs=ui_components
917
  )
918
 
919
  # Listeners for updating the duration estimate
920
- duration_inputs = [model_dd, search_chk, mr, mc, max_tok, st, tts_chk, thinking_chk]
921
  for component in duration_inputs:
922
  component.change(fn=update_duration_estimate, inputs=duration_inputs, outputs=duration_display)
923
 
 
462
  prompt += "Assistant: "
463
  return prompt
464
 
465
+ def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout, enable_thinking):
466
  # Get model size from the MODELS dict (more reliable than string parsing)
467
  model_size = MODELS[model_name].get("params_b", 4.0) # Default to 4B if not found
468
 
 
474
  token_duration = max_tokens * 0.005 # ~200 tokens/second average on H200
475
  search_duration = 10 if enable_search else 0 # Reduced search time
476
  aot_compilation_buffer = 20 if use_aot else 0 # Faster compilation on H200
 
477
 
478
+ return base_duration + token_duration + search_duration + aot_compilation_buffer
479
 
480
  @spaces.GPU(duration=get_duration)
481
  def chat_response(user_msg, chat_history, system_prompt,
482
  enable_search, max_results, max_chars,
483
  model_name, max_tokens, temperature,
484
+ top_k, top_p, repeat_penalty, search_timeout, enable_thinking):
485
  """
486
  Generates streaming chat responses, optionally with background web search.
487
+ TTS is handled separately after this completes.
488
  """
489
  # Clear the cancellation event at the start of a new generation
490
  cancel_event.clear()
 
591
  assistant_message_started = False
592
 
593
  # First yield contains the user message
594
+ yield history, debug
595
 
596
  # Stream tokens
597
  for chunk in streamer:
 
599
  if cancel_event.is_set():
600
  if assistant_message_started and history and history[-1]['role'] == 'assistant':
601
  history[-1]['content'] += " [Generation Canceled]"
602
+ yield history, debug
603
  break
604
 
605
  text = chunk
 
619
  history.append({'role': 'assistant', 'content': answer_buf})
620
  else:
621
  history[-1]['content'] = thought_buf
622
+ yield history, debug
623
  continue
624
 
625
  if in_thought:
 
632
  history.append({'role': 'assistant', 'content': answer_buf})
633
  else:
634
  history[-1]['content'] = thought_buf
635
+ yield history, debug
636
  continue
637
 
638
  # Stream answer
 
642
 
643
  answer_buf += text
644
  history[-1]['content'] = answer_buf.strip()
645
+ yield history, debug
646
 
647
  gen_thread.join()
648
 
649
+ yield history, debug + prompt_debug
 
 
 
 
 
650
  except GeneratorExit:
651
  # Handle cancellation gracefully
652
  print("Chat response cancelled.")
 
654
  return
655
  except Exception as e:
656
  history.append({'role': 'assistant', 'content': f"Error: {e}"})
657
+ yield history, debug
658
  finally:
659
  gc.collect()
660
 
 
662
  def update_default_prompt(enable_search):
663
  return f"You are a helpful assistant. Don't use emojis in your response. Keep replies short to a maximum of three sentences."
664
 
665
+ def update_duration_estimate(model_name, enable_search, max_results, max_chars, max_tokens, search_timeout, enable_thinking):
666
  """Calculate and format the estimated GPU duration for current settings."""
667
  try:
668
  dummy_msg, dummy_history, dummy_system_prompt = "", [], ""
669
  duration = get_duration(dummy_msg, dummy_history, dummy_system_prompt,
670
  enable_search, max_results, max_chars, model_name,
671
+ max_tokens, 0.7, 40, 0.9, 1.2, search_timeout, enable_thinking)
672
  model_size = MODELS[model_name].get("params_b", 4.0)
673
  return (f"⏱️ **Estimated GPU Time: {duration:.1f} seconds**\n\n"
674
  f"📊 **Model Size:** {model_size:.1f}B parameters\n"
675
  f"🔍 **Web Search:** {'Enabled' if enable_search else 'Disabled'}\n"
 
676
  f"💭 **Thinking:** {'Enabled' if enable_thinking else 'Disabled'}")
677
  except Exception as e:
678
  return f"⚠️ Error calculating estimate: {e}"
679
 
680
+
681
+ def generate_speech_from_chat(chat_history, enable_tts):
682
+ """
683
+ Generate TTS audio from the last assistant message in chat history.
684
+ This runs as a separate step after text generation, allowing the audio
685
+ component to show its loading state.
686
+ """
687
+ if not enable_tts:
688
+ return None
689
+
690
+ if not chat_history:
691
+ return None
692
+
693
+ # Find the last assistant message (skip thought bubbles)
694
+ last_message = None
695
+ for msg in reversed(chat_history):
696
+ if msg.get('role') == 'assistant':
697
+ # Skip thought bubbles (they have metadata with title starting with 💭)
698
+ metadata = msg.get('metadata') or {}
699
+ if metadata.get('title', '').startswith('💭'):
700
+ continue
701
+ content = msg.get('content', '')
702
+ # Handle both string and list content (Gradio multi-modal format)
703
+ if isinstance(content, list):
704
+ # Extract text from list items
705
+ text_parts = []
706
+ for item in content:
707
+ if isinstance(item, str):
708
+ text_parts.append(item)
709
+ elif isinstance(item, dict) and 'text' in item:
710
+ text_parts.append(item['text'])
711
+ last_message = ' '.join(text_parts)
712
+ else:
713
+ last_message = content
714
+ break
715
+
716
+ if not last_message or not last_message.strip():
717
+ return None
718
+
719
+ return generate_tts_audio(last_message)
720
+
721
+
722
  # ------------------------------
723
  # Gradio UI
724
  # ------------------------------
 
735
  .chatbot { border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1); }
736
  button.primary { font-weight: 600; }
737
  .gradio-accordion { margin-bottom: 12px; }
738
+
739
+ /* Mobile: sticky input at bottom */
740
+ @media (max-width: 768px) {
741
+ #input-row {
742
+ position: fixed;
743
+ bottom: 0;
744
+ left: 0;
745
+ right: 0;
746
+ background: var(--background-fill-primary);
747
+ padding: 12px;
748
+ box-shadow: 0 -2px 10px rgba(0, 0, 0, 0.1);
749
+ z-index: 1000;
750
+ margin: 0 !important;
751
+ }
752
+
753
+ /* Add padding at bottom of main content to prevent overlap */
754
+ .main {
755
+ padding-bottom: 80px !important;
756
+ }
757
+
758
+ /* Adjust chatbot height on mobile */
759
+ .chatbot {
760
+ height: calc(100vh - 200px) !important;
761
+ max-height: none !important;
762
+ }
763
+ }
764
  """
765
 
766
  with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
 
772
  height=500,
773
  label="💬 Conversation",
774
  buttons=["copy"],
775
+ avatar_images=(None, "pfp.png"),
776
  layout="bubble"
777
  )
778
 
 
786
  )
787
 
788
  # Input Area
789
+ with gr.Row(elem_id="input-row"):
790
  txt = gr.Textbox(
791
  placeholder="💭 Type your message here... (Press Enter to send)",
792
  scale=9,
 
835
 
836
  # Duration Estimate
837
  duration_display = gr.Markdown(
838
+ value=update_duration_estimate("Qwen3-0.6B", False, 4, 50, 1024, 5.0, False),
839
  elem_classes="duration-estimate"
840
  )
841
 
 
863
 
864
  # --- Event Listeners ---
865
 
866
+ # Group inputs for chat generation (no TTS - handled separately via .then())
867
+ chat_inputs = [txt, chat, sys_prompt, search_chk, mr, mc, model_dd, max_tok, temp, k, p, rp, st, thinking_chk]
868
+ # UI components for streaming phase
869
+ stream_outputs = [chat, dbg, txt, submit_btn, cancel_btn]
870
 
871
+ def stream_chat_and_update_ui(user_msg, chat_history, *args):
872
  """
873
+ Stream chat responses and manage UI state during generation.
874
+ TTS is handled separately via .then() chaining.
875
  """
876
  if not user_msg.strip():
877
+ # If the message is empty, do nothing - return current state
878
+ yield chat_history, "", gr.update(), gr.update(), gr.update()
 
 
 
 
 
 
 
879
  return
880
 
881
+ # 1. Update UI to "generating" state
882
+ yield (
883
+ chat_history, # Keep current chat
884
+ "", # Clear debug
885
+ gr.update(value="", interactive=False), # Clear and disable input
886
+ gr.update(interactive=False), # Disable submit
887
+ gr.update(visible=True), # Show cancel
888
+ )
 
 
 
 
 
889
 
 
890
  try:
891
+ # 2. Stream chat responses
892
+ for history, debug in chat_response(user_msg, chat_history, *args):
893
+ yield (
894
+ history,
895
+ debug,
896
+ gr.update(), # Keep input state
897
+ gr.update(), # Keep submit state
898
+ gr.update(), # Keep cancel state
899
+ )
 
 
 
 
 
 
900
  except GeneratorExit:
 
 
901
  print("Generation cancelled by user.")
902
  raise
903
  except Exception as e:
904
  print(f"An error occurred during generation: {e}")
 
905
  error_history = (chat_history or []) + [
906
  {'role': 'user', 'content': user_msg},
907
  {'role': 'assistant', 'content': f"**An error occurred:** {str(e)}"}
908
  ]
909
+ yield (
910
+ error_history,
911
+ f"Error: {e}",
912
+ gr.update(),
913
+ gr.update(),
914
+ gr.update(),
915
+ )
916
+
917
+ def reset_ui_after_generation():
918
+ """Reset UI to idle state after generation completes."""
919
+ print("Resetting UI state after generation.")
920
+ return (
921
+ gr.update(interactive=True), # Re-enable input
922
+ gr.update(interactive=True), # Re-enable submit
923
+ gr.update(visible=False), # Hide cancel
924
+ )
925
 
926
  def set_cancel_flag():
927
  """Called by the cancel button, sets the global event."""
 
932
  """Reset UI components after cancellation."""
933
  cancel_event.clear() # Clear the flag for next generation
934
  print("UI reset after cancellation.")
935
+ return (
936
+ gr.update(interactive=True),
937
+ gr.update(interactive=True),
938
+ gr.update(visible=False),
939
+ None, # Clear audio
940
+ )
941
 
942
+ # Event for submitting text via Enter key
943
+ # Uses .then() chaining: stream text -> generate TTS -> reset UI
944
  submit_event = txt.submit(
945
+ fn=stream_chat_and_update_ui,
946
  inputs=chat_inputs,
947
+ outputs=stream_outputs,
948
+ ).then(
949
+ fn=generate_speech_from_chat,
950
+ inputs=[chat, tts_chk],
951
+ outputs=[tts_audio_output],
952
+ ).then(
953
+ fn=reset_ui_after_generation,
954
+ outputs=[txt, submit_btn, cancel_btn],
955
  )
956
+
957
+ # Event for clicking Submit button
958
+ submit_btn_event = submit_btn.click(
959
+ fn=stream_chat_and_update_ui,
960
  inputs=chat_inputs,
961
+ outputs=stream_outputs,
962
+ ).then(
963
+ fn=generate_speech_from_chat,
964
+ inputs=[chat, tts_chk],
965
+ outputs=[tts_audio_output],
966
+ ).then(
967
+ fn=reset_ui_after_generation,
968
+ outputs=[txt, submit_btn, cancel_btn],
969
  )
970
 
971
  # Event for the "Cancel" button.
972
+ # It sets the cancel flag, cancels the submit events, then resets the UI.
973
  cancel_btn.click(
974
  fn=set_cancel_flag,
975
+ cancels=[submit_event, submit_btn_event]
976
  ).then(
977
  fn=reset_ui_after_cancel,
978
+ outputs=[txt, submit_btn, cancel_btn, tts_audio_output]
979
  )
980
 
981
  # Listeners for updating the duration estimate
982
+ duration_inputs = [model_dd, search_chk, mr, mc, max_tok, st, thinking_chk]
983
  for component in duration_inputs:
984
  component.change(fn=update_duration_estimate, inputs=duration_inputs, outputs=duration_display)
985