ZeroGPU-LLM-Inference

Paused

App Files Files Community

polats committed on Jan 20

Commit

65be932

1 Parent(s): 31c47f7

add pocket-tts voice

Browse files

Files changed (4) hide show

.gitattributes +1 -0
app.py +163 -50
requirements.txt +10 -1
voice.wav +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -13,11 +13,16 @@ from transformers import AutoTokenizer
 from ddgs import DDGS
 import spaces  # Import spaces early to enable ZeroGPU support
 from torch.utils._pytree import tree_map
 # Global event to signal cancellation from the UI thread to the generation thread
 cancel_event = threading.Event()
-access_token=os.environ['HF_TOKEN']
 # Optional: Disable GPU visibility if you wish to force CPU usage
 # os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -317,6 +322,74 @@ MODELS = {
 # Global cache for pipelines to avoid re-loading.
 PIPELINES = {}
 def load_pipeline(model_name):
     """
     Load and cache a transformers pipeline for text generation.
@@ -384,26 +457,27 @@ def format_conversation(history, system_prompt, tokenizer):
             prompt += "Assistant: "
         return prompt
-def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout):
     # Get model size from the MODELS dict (more reliable than string parsing)
     model_size = MODELS[model_name].get("params_b", 4.0)  # Default to 4B if not found
     # Only use AOT for models >= 2B parameters
     use_aot = model_size >= 2
     # Adjusted for H200 performance: faster inference, quicker compilation
     base_duration = 20 if not use_aot else 40  # Reduced base times
     token_duration = max_tokens * 0.005  # ~200 tokens/second average on H200
     search_duration = 10 if enable_search else 0  # Reduced search time
     aot_compilation_buffer = 20 if use_aot else 0  # Faster compilation on H200
-    return base_duration + token_duration + search_duration + aot_compilation_buffer
 @spaces.GPU(duration=get_duration)
 def chat_response(user_msg, chat_history, system_prompt,
                   enable_search, max_results, max_chars,
                   model_name, max_tokens, temperature,
-                  top_k, top_p, repeat_penalty, search_timeout):
     """
     Generates streaming chat responses, optionally with background web search.
     This version includes cancellation support.
@@ -513,7 +587,7 @@ def chat_response(user_msg, chat_history, system_prompt,
         assistant_message_started = False
         # First yield contains the user message
-        yield history, debug
         # Stream tokens
         for chunk in streamer:
@@ -521,7 +595,7 @@ def chat_response(user_msg, chat_history, system_prompt,
             if cancel_event.is_set():
                 if assistant_message_started and history and history[-1]['role'] == 'assistant':
                     history[-1]['content'] += " [Generation Canceled]"
-                yield history, debug
                 break
             text = chunk
@@ -541,7 +615,7 @@ def chat_response(user_msg, chat_history, system_prompt,
                     history.append({'role': 'assistant', 'content': answer_buf})
                 else:
                     history[-1]['content'] = thought_buf
-                yield history, debug
                 continue
             if in_thought:
@@ -554,7 +628,7 @@ def chat_response(user_msg, chat_history, system_prompt,
                     history.append({'role': 'assistant', 'content': answer_buf})
                 else:
                     history[-1]['content'] = thought_buf
-                yield history, debug
                 continue
             # Stream answer
@@ -564,10 +638,16 @@ def chat_response(user_msg, chat_history, system_prompt,
             answer_buf += text
             history[-1]['content'] = answer_buf.strip()
-            yield history, debug
         gen_thread.join()
-        yield history, debug + prompt_debug
     except GeneratorExit:
         # Handle cancellation gracefully
         print("Chat response cancelled.")
@@ -575,7 +655,7 @@ def chat_response(user_msg, chat_history, system_prompt,
         return
     except Exception as e:
         history.append({'role': 'assistant', 'content': f"Error: {e}"})
-        yield history, debug
     finally:
         gc.collect()
@@ -583,39 +663,40 @@ def chat_response(user_msg, chat_history, system_prompt,
 def update_default_prompt(enable_search):
     return f"You are a helpful assistant."
-def update_duration_estimate(model_name, enable_search, max_results, max_chars, max_tokens, search_timeout):
     """Calculate and format the estimated GPU duration for current settings."""
     try:
         dummy_msg, dummy_history, dummy_system_prompt = "", [], ""
-        duration = get_duration(dummy_msg, dummy_history, dummy_system_prompt,
-                              enable_search, max_results, max_chars, model_name,
-                              max_tokens, 0.7, 40, 0.9, 1.2, search_timeout)
         model_size = MODELS[model_name].get("params_b", 4.0)
         return (f"⏱️ **Estimated GPU Time: {duration:.1f} seconds**\n\n"
                 f"📊 **Model Size:** {model_size:.1f}B parameters\n"
-                f"🔍 **Web Search:** {'Enabled' if enable_search else 'Disabled'}")
     except Exception as e:
         return f"⚠️ Error calculating estimate: {e}"
 # ------------------------------
 # Gradio UI
 # ------------------------------
-with gr.Blocks(
-    title="LLM Inference with ZeroGPU",
-    theme=gr.themes.Soft(
-        primary_hue="indigo",
-        secondary_hue="purple",
-        neutral_hue="slate",
-        radius_size="lg",
-        font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"]
-    ),
-    css="""
-        .duration-estimate { background: linear-gradient(135deg, #667eea15 0%, #764ba215 100%); border-left: 4px solid #667eea; padding: 12px; border-radius: 8px; margin: 16px 0; }
-        .chatbot { border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1); }
-        button.primary { font-weight: 600; }
-        .gradio-accordion { margin-bottom: 12px; }
-    """
-) as demo:
     # Header
     gr.Markdown("""
     # 🧠 ZeroGPU LLM Inference
@@ -639,6 +720,11 @@ with gr.Blocks(
                     value=False,
                     info="Augment responses with real-time web data"
                 )
                 sys_prompt = gr.Textbox(
                     label="📝 System Prompt",
                     lines=3,
@@ -648,7 +734,7 @@ with gr.Blocks(
             # Duration Estimate
             duration_display = gr.Markdown(
-                value=update_duration_estimate("Qwen3-1.7B", False, 4, 50, 1024, 5.0),
                 elem_classes="duration-estimate"
             )
@@ -706,14 +792,22 @@ with gr.Blocks(
         # Right Panel - Chat Interface
         with gr.Column(scale=7):
             chat = gr.Chatbot(
-                type="messages",
                 height=600,
                 label="💬 Conversation",
-                show_copy_button=True,
                 avatar_images=(None, "🤖"),
-                bubble_full_width=False
             )
             # Input Area
             with gr.Row():
                 txt = gr.Textbox(
@@ -758,9 +852,9 @@ with gr.Blocks(
     # --- Event Listeners ---
     # Group all inputs for cleaner event handling
-    chat_inputs = [txt, chat, sys_prompt, search_chk, mr, mc, model_dd, max_tok, temp, k, p, rp, st]
     # Group all UI components that can be updated.
-    ui_components = [chat, dbg, txt, submit_btn, cancel_btn]
     def submit_and_manage_ui(user_msg, chat_history, *args):
         """
@@ -769,10 +863,19 @@ with gr.Blocks(
         """
         if not user_msg.strip():
             # If the message is empty, do nothing.
-            # We yield an empty dict to avoid any state changes.
-            yield {}
             return
         # 1. Update UI to "generating" state.
         #    Crucially, we do NOT update the `chat` component here, as the backend
         #    will provide the correctly formatted history in the first response chunk.
@@ -780,6 +883,7 @@ with gr.Blocks(
             txt: gr.update(value="", interactive=False),
             submit_btn: gr.update(interactive=False),
             cancel_btn: gr.update(visible=True),
         }
         cancelled = False
@@ -787,10 +891,18 @@ with gr.Blocks(
             # 2. Call the backend and stream updates
             backend_args = [user_msg, chat_history] + list(args)
             for response_chunk in chat_response(*backend_args):
-                yield {
-                    chat: response_chunk[0],
-                    dbg: response_chunk[1],
                 }
         except GeneratorExit:
             # Mark as cancelled and re-raise to prevent "generator ignored GeneratorExit"
             cancelled = True
@@ -827,6 +939,7 @@ with gr.Blocks(
             txt: gr.update(interactive=True),
             submit_btn: gr.update(interactive=True),
             cancel_btn: gr.update(visible=False),
         }
     # Event for submitting text via Enter key or Submit button
@@ -852,7 +965,7 @@ with gr.Blocks(
     )
     # Listeners for updating the duration estimate
-    duration_inputs = [model_dd, search_chk, mr, mc, max_tok, st]
     for component in duration_inputs:
         component.change(fn=update_duration_estimate, inputs=duration_inputs, outputs=duration_display)
@@ -867,6 +980,6 @@ with gr.Blocks(
     )
     # Clear chat action
-    clr.click(fn=lambda: ([], "", ""), outputs=[chat, txt, dbg])
-    demo.launch()

 from ddgs import DDGS
 import spaces  # Import spaces early to enable ZeroGPU support
 from torch.utils._pytree import tree_map
+import numpy as np
+# Add pocket-tts to path for TTS functionality
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'pocket-tts'))
+from pocket_tts import TTSModel
 # Global event to signal cancellation from the UI thread to the generation thread
 cancel_event = threading.Event()
+access_token = os.environ.get('HF_TOKEN')
 # Optional: Disable GPU visibility if you wish to force CPU usage
 # os.environ["CUDA_VISIBLE_DEVICES"] = ""
 # Global cache for pipelines to avoid re-loading.
 PIPELINES = {}
+# ------------------------------
+# TTS Configuration
+# ------------------------------
+TTS_VOICE_FILE = "./voice.wav"  # Path to custom voice file for voice cloning
+# Global TTS model cache
+TTS_MODEL = None
+TTS_VOICE_STATE = None
def load_tts_model():
    """Return the shared TTS model, creating it on first use.

    The instance lives in the module-level ``TTS_MODEL`` variable;
    ``TTSModel.load_model()`` is only invoked while that cache is still
    ``None``.
    """
    global TTS_MODEL
    if TTS_MODEL is not None:
        # Already loaded earlier in this process: reuse it.
        return TTS_MODEL
    TTS_MODEL = TTSModel.load_model()
    return TTS_MODEL
def get_voice_state():
    """Return the voice state built from ``TTS_VOICE_FILE``, computing it once.

    The result of ``get_state_for_audio_prompt`` is kept in the module-level
    ``TTS_VOICE_STATE`` variable and reused while it is not ``None``.
    """
    global TTS_VOICE_STATE
    if TTS_VOICE_STATE is not None:
        return TTS_VOICE_STATE
    # First request: derive the state from the reference audio file.
    model = load_tts_model()
    TTS_VOICE_STATE = model.get_state_for_audio_prompt(TTS_VOICE_FILE)
    return TTS_VOICE_STATE
# Ordered (pattern, replacement) rules applied by clean_text_for_tts.
# Order matters: whole blocks are dropped before inline markup is unwrapped,
# and whitespace is collapsed last (the header rule needs line boundaries).
_TTS_CLEANUP_RULES = (
    # Thinking blocks (Qwen3 models). An unclosed block, e.g. from a truncated
    # generation, is removed through the end of the text.
    (re.compile(r'<think>[\s\S]*?(?:</think>|\Z)'), ''),
    # A closing tag left without an opening partner should not be spoken.
    (re.compile(r'</think>'), ''),
    # Fenced code blocks; an unclosed fence is removed through the end.
    (re.compile(r'```[\s\S]*?(?:```|\Z)'), ''),
    # Inline code spans.
    (re.compile(r'`[^`]+`'), ''),
    # Citation markers such as "[citation:3]".
    (re.compile(r'\[citation:\d+\]'), ''),
    # Markdown header markers, only at the start of a line so that text like
    # "C# is" or "issue # 5" is left untouched.
    (re.compile(r'^[ \t]*#{1,6}\s+', re.MULTILINE), ''),
    # Bold / italic markers: keep the emphasized text.
    (re.compile(r'\*{1,2}([^*]+)\*{1,2}'), r'\1'),
    # Markdown links: keep the link text, drop the URL.
    (re.compile(r'\[([^\]]+)\]\([^)]+\)'), r'\1'),
    # Collapse runs of spaces/newlines into a single space.
    (re.compile(r'\s+'), ' '),
)


def clean_text_for_tts(text: str) -> str:
    """Strip markup that should not be read aloud before text is sent to TTS.

    Removes thinking blocks, fenced and inline code, citation markers and
    markdown header markers; unwraps bold/italic text and markdown links;
    then collapses whitespace.

    Args:
        text: Raw assistant output. ``None`` or an empty string is accepted.

    Returns:
        The cleaned, stripped text; ``""`` when there is nothing to speak.
    """
    if not text:
        return ""
    for pattern, replacement in _TTS_CLEANUP_RULES:
        text = pattern.sub(replacement, text)
    return text.strip()
def generate_tts_audio(text: str) -> tuple[int, np.ndarray] | None:
    """
    Synthesize speech for ``text`` with the custom voice.

    The text is first passed through ``clean_text_for_tts``; if nothing is
    left, no audio is produced.

    Args:
        text: The text to convert to speech
    Returns:
        ``(sample_rate, audio_array)`` taken from the model's ``sample_rate``
        and the generated audio's ``.numpy()``, or ``None`` when the cleaned
        text is empty or any step raises.
    """
    try:
        speakable = clean_text_for_tts(text)
        if speakable:
            model = load_tts_model()
            state = get_voice_state()
            waveform = model.generate_audio(state, speakable)
            return (model.sample_rate, waveform.numpy())
        # Nothing speakable remained after cleaning.
        return None
    except Exception as e:
        # Best effort: a TTS failure is reported and swallowed.
        print(f"TTS generation error: {e}")
        return None
 def load_pipeline(model_name):
     """
     Load and cache a transformers pipeline for text generation.
             prompt += "Assistant: "
         return prompt
def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout, enable_tts):
    """Estimate the GPU seconds to request for one chat request.

    Only ``model_name``, ``max_tokens``, ``enable_search`` and ``enable_tts``
    are read; the remaining parameters are accepted and ignored.

    Returns:
        The sum of a base time, a per-token time, and optional search,
        AOT-compilation and TTS allowances, in seconds.
    """
    # Model size comes from the MODELS dict; entries without "params_b" count as 4B.
    params_b = MODELS[model_name].get("params_b", 4.0)
    # AOT is only used for models of 2B parameters and up.
    if params_b >= 2:
        startup_seconds, compile_seconds = 40, 20
    else:
        startup_seconds, compile_seconds = 20, 0
    # 0.005 s per token, i.e. the ~200 tokens/second H200 figure used by the author.
    decode_seconds = max_tokens * 0.005
    search_seconds = 10 if enable_search else 0
    speech_seconds = 15 if enable_tts else 0
    return startup_seconds + decode_seconds + search_seconds + compile_seconds + speech_seconds
 @spaces.GPU(duration=get_duration)
 def chat_response(user_msg, chat_history, system_prompt,
                   enable_search, max_results, max_chars,
                   model_name, max_tokens, temperature,
+                  top_k, top_p, repeat_penalty, search_timeout, enable_tts):
     """
     Generates streaming chat responses, optionally with background web search.
     This version includes cancellation support.
         assistant_message_started = False
         # First yield contains the user message
+        yield history, debug, None
         # Stream tokens
         for chunk in streamer:
             if cancel_event.is_set():
                 if assistant_message_started and history and history[-1]['role'] == 'assistant':
                     history[-1]['content'] += " [Generation Canceled]"
+                yield history, debug, None
                 break
             text = chunk
                     history.append({'role': 'assistant', 'content': answer_buf})
                 else:
                     history[-1]['content'] = thought_buf
+                yield history, debug, None
                 continue
             if in_thought:
                     history.append({'role': 'assistant', 'content': answer_buf})
                 else:
                     history[-1]['content'] = thought_buf
+                yield history, debug, None
                 continue
             # Stream answer
             answer_buf += text
             history[-1]['content'] = answer_buf.strip()
+            yield history, debug, None
         gen_thread.join()
+        # Generate TTS audio if enabled
+        tts_audio = None
+        if enable_tts and answer_buf.strip():
+            tts_audio = generate_tts_audio(answer_buf)
+        yield history, debug + prompt_debug, tts_audio
     except GeneratorExit:
         # Handle cancellation gracefully
         print("Chat response cancelled.")
         return
     except Exception as e:
         history.append({'role': 'assistant', 'content': f"Error: {e}"})
+        yield history, debug, None
     finally:
         gc.collect()
 def update_default_prompt(enable_search):
     return f"You are a helpful assistant."
def update_duration_estimate(model_name, enable_search, max_results, max_chars, max_tokens, search_timeout, enable_tts):
    """Build the markdown summary of the estimated GPU time for the current settings.

    Calls ``get_duration`` with an empty message, empty history, empty system
    prompt and fixed sampling values (0.7, 40, 0.9, 1.2). Any exception is
    turned into a warning string instead of propagating.
    """
    try:
        duration = get_duration(
            "", [], "",
            enable_search, max_results, max_chars, model_name,
            max_tokens, 0.7, 40, 0.9, 1.2, search_timeout, enable_tts,
        )
        model_size = MODELS[model_name].get("params_b", 4.0)
        report_parts = [
            f"⏱️ **Estimated GPU Time: {duration:.1f} seconds**\n\n",
            f"📊 **Model Size:** {model_size:.1f}B parameters\n",
            f"🔍 **Web Search:** {'Enabled' if enable_search else 'Disabled'}\n",
            f"🔊 **TTS:** {'Enabled' if enable_tts else 'Disabled'}",
        ]
        return "".join(report_parts)
    except Exception as e:
        return f"⚠️ Error calculating estimate: {e}"
 # ------------------------------
 # Gradio UI
 # ------------------------------
+CUSTOM_THEME = gr.themes.Soft(
+    primary_hue="indigo",
+    secondary_hue="purple",
+    neutral_hue="slate",
+    radius_size="lg",
+    font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"]
+)
+CUSTOM_CSS = """
+    .duration-estimate { background: linear-gradient(135deg, #667eea15 0%, #764ba215 100%); border-left: 4px solid #667eea; padding: 12px; border-radius: 8px; margin: 16px 0; }
+    .chatbot { border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1); }
+    button.primary { font-weight: 600; }
+    .gradio-accordion { margin-bottom: 12px; }
+"""
+with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
     # Header
     gr.Markdown("""
     # 🧠 ZeroGPU LLM Inference
                     value=False,
                     info="Augment responses with real-time web data"
                 )
+                tts_chk = gr.Checkbox(
+                    label="🔊 Enable Text-to-Speech",
+                    value=False,
+                    info="Convert responses to speech using voice cloning"
+                )
                 sys_prompt = gr.Textbox(
                     label="📝 System Prompt",
                     lines=3,
             # Duration Estimate
             duration_display = gr.Markdown(
+                value=update_duration_estimate("Qwen3-1.7B", False, 4, 50, 1024, 5.0, False),
                 elem_classes="duration-estimate"
             )
         # Right Panel - Chat Interface
         with gr.Column(scale=7):
             chat = gr.Chatbot(
                 height=600,
                 label="💬 Conversation",
+                buttons=["copy"],
                 avatar_images=(None, "🤖"),
+                layout="bubble"
             )
+            # TTS Audio Output
+            tts_audio_output = gr.Audio(
+                label="🔊 Generated Speech",
+                type="numpy",
+                autoplay=True,
+                visible=False,
+                elem_id="tts-audio"
+            )
             # Input Area
             with gr.Row():
                 txt = gr.Textbox(
     # --- Event Listeners ---
     # Group all inputs for cleaner event handling
+    chat_inputs = [txt, chat, sys_prompt, search_chk, mr, mc, model_dd, max_tok, temp, k, p, rp, st, tts_chk]
     # Group all UI components that can be updated.
+    ui_components = [chat, dbg, txt, submit_btn, cancel_btn, tts_audio_output]
     def submit_and_manage_ui(user_msg, chat_history, *args):
         """
         """
         if not user_msg.strip():
             # If the message is empty, do nothing.
+            yield {
+                chat: gr.update(),
+                dbg: gr.update(),
+                txt: gr.update(),
+                submit_btn: gr.update(),
+                cancel_btn: gr.update(),
+                tts_audio_output: gr.update(),
+            }
             return
+        # Check if TTS is enabled (last argument)
+        tts_enabled = args[-1] if args else False
         # 1. Update UI to "generating" state.
         #    Crucially, we do NOT update the `chat` component here, as the backend
         #    will provide the correctly formatted history in the first response chunk.
             txt: gr.update(value="", interactive=False),
             submit_btn: gr.update(interactive=False),
             cancel_btn: gr.update(visible=True),
+            tts_audio_output: gr.update(visible=False, value=None),  # Hide audio during generation
         }
         cancelled = False
             # 2. Call the backend and stream updates
             backend_args = [user_msg, chat_history] + list(args)
             for response_chunk in chat_response(*backend_args):
+                history, debug, audio = response_chunk[0], response_chunk[1], response_chunk[2] if len(response_chunk) > 2 else None
+                update_dict = {
+                    chat: history,
+                    dbg: debug,
                 }
+                # Show audio output when audio is generated (final yield with TTS)
+                if audio is not None:
+                    update_dict[tts_audio_output] = gr.update(visible=True, value=audio)
+                yield update_dict
         except GeneratorExit:
             # Mark as cancelled and re-raise to prevent "generator ignored GeneratorExit"
             cancelled = True
             txt: gr.update(interactive=True),
             submit_btn: gr.update(interactive=True),
             cancel_btn: gr.update(visible=False),
+            tts_audio_output: gr.update(visible=False, value=None),
         }
     # Event for submitting text via Enter key or Submit button
     )
     # Listeners for updating the duration estimate
+    duration_inputs = [model_dd, search_chk, mr, mc, max_tok, st, tts_chk]
     for component in duration_inputs:
         component.change(fn=update_duration_estimate, inputs=duration_inputs, outputs=duration_display)
     )
     # Clear chat action
+    clr.click(fn=lambda: ([], "", "", gr.update(visible=False, value=None)), outputs=[chat, txt, dbg, tts_audio_output])
+    demo.launch(theme=CUSTOM_THEME, css=CUSTOM_CSS)

requirements.txt CHANGED Viewed

@@ -9,4 +9,13 @@ sentencepiece
 accelerate
 autoawq
 timm
-compressed-tensors

 accelerate
 autoawq
 timm
+compressed-tensors
+# pocket-tts dependencies
+numpy>=2
+pydantic>=2
+beartype>=0.22.5
+safetensors>=0.4.0
+scipy>=1.5.0
+einops>=0.4.0
+huggingface_hub>=0.10

voice.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ba0e2f61e1e03c63791bd946c935b4dbc3b1a0e2b38f960b52ba746f2ca7e30
+size 337028