Spaces:

toshuu
/

speak

Runtime error

App Files Files Community

toshuu committed on Dec 1, 2025

Commit

de64ba8

verified ·

1 Parent(s): 9bed061

Update app.py

Browse files

Files changed (1) hide show

app.py +164 -118

app.py CHANGED Viewed

@@ -1,138 +1,184 @@
-# app.py  (replace your current file with this)
 import os
-import threading
 import tempfile
-import inspect
-import traceback
-import numpy as np
-import soundfile as sf
-import gradio as gr
-# torch import is required; HF Spaces requirements will install CPU wheels
 import torch
 MODEL_PATH = "v4_indic.pt"
-SAMPLE_RATE = 48000
-lock = threading.Lock()
-_model = None
-_apply_tts_callable = None
-_apply_tts_sig = None
-def load_model():
-    global _model, _apply_tts_callable, _apply_tts_sig
-    if _model is not None:
-        return _model
-    if not os.path.exists(MODEL_PATH):
-        raise FileNotFoundError(f"Model file not found in repo root: {MODEL_PATH}")
-    print("Loading model from", MODEL_PATH)
-    pkg = torch.package.PackageImporter(MODEL_PATH)
-    _model = pkg.load_pickle("tts_models", "model")
-    print("Model object loaded:", type(_model).__name__)
-    # discover apply_tts
-    if hasattr(_model, "apply_tts"):
-        _apply_tts_callable = getattr(_model, "apply_tts")
-        try:
-            _apply_tts_sig = inspect.signature(_apply_tts_callable)
-            print("apply_tts signature:", _apply_tts_sig)
-        except Exception as e:
-            print("Could not introspect apply_tts signature:", e)
-            _apply_tts_sig = None
-    else:
-        raise RuntimeError("Loaded model does not expose 'apply_tts'")
-    return _model
-def _call_apply_tts(text):
     """
-    Try a sequence of possible call signatures for apply_tts.
-    Return numpy array (float32) audio and sample rate.
     """
-    # ensure model loaded
-    m = load_model()
-    # Build candidate calls (ordered by likelihood)
-    # Each entry is (kwargs dict, args tuple)
-    candidates = [
-        ({"text": text}, ()),                          # apply_tts(text=text)
-        ({}, (text,)),                                 # apply_tts(text)
-        ({"text": text, "sample_rate": SAMPLE_RATE}, ()),  # apply_tts(text=..., sample_rate=...)
-        ({}, (text, SAMPLE_RATE)),                     # apply_tts(text, sample_rate)
-        ({"text": text, "speaker": 0, "sample_rate": SAMPLE_RATE}, ()),  # apply_tts(text=..., speaker=0,...)
-        ({"text": text, "lang": "hi", "speaker": 0, "sample_rate": SAMPLE_RATE}, ()), # apply_tts(text=..., lang=..., speaker=...)
-        ({"text": text, "lang_id": 0, "speaker_id": 0, "sample_rate": SAMPLE_RATE}, ()), # older variants
-    ]
     last_exc = None
-    for kw, args in candidates:
         try:
-            # attempt call
-            if args:
-                res = m.apply_tts(*args, **kw)
             else:
-                res = m.apply_tts(**kw)
-            # success: convert to numpy if torch tensor
-            if isinstance(res, torch.Tensor):
-                res = res.detach().cpu().numpy()
-            res = np.asarray(res, dtype=np.float32)
-            return res
-        except TypeError as te:
-            last_exc = te
-            # signature mismatch, try next
             continue
         except Exception as e:
-            # If a runtime error occurred within model (e.g. tokenizer / input length), raise it
-            print("Runtime error while calling apply_tts with", kw, args)
-            traceback.print_exc()
             last_exc = e
-            break
-    # If we exit loop without returning, raise a helpful error
     raise RuntimeError(f"apply_tts call failed for all known signatures. last error: {last_exc}")
-def synthesize_text_to_wavfile(text):
-    if not text or not isinstance(text, str) or len(text.strip()) == 0:
-        raise ValueError("Empty input text")
-    audio = _call_apply_tts(text)
-    # normalize audio to [-1,1] float32
-    if audio.dtype != np.float32:
-        audio = audio.astype(np.float32)
-    max_abs = np.max(np.abs(audio)) if audio.size > 0 else 1.0
-    if max_abs > 1.0:
-        audio = audio / max_abs
-    # write to temp WAV
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    sf.write(tmp.name, audio, SAMPLE_RATE)
-    tmp.close()
-    return tmp.name
-# Gradio function
-def tts_gradio_fn(text: str):
-    with lock:
-        path = synthesize_text_to_wavfile(text)
-        return path
-def build_demo():
-    with gr.Blocks() as demo:
-        gr.Markdown("# 🔊 Silero v4 Indic — Robust HF Space")
-        txt = gr.Textbox(label="Text to speak", lines=4, value="नमस्ते, यह टेस्‍ट है।")
-        btn = gr.Button("Generate")
-        out = gr.Audio(label="Output audio")
-        btn.click(fn=tts_gradio_fn, inputs=[txt], outputs=[out])
-    return demo
 if __name__ == "__main__":
-    # preload model on startup to avoid cold-call overhead
-    try:
-        load_model()
-    except Exception as e:
-        print("Model load failed at startup:", e)
-        traceback.print_exc()
-    demo = build_demo()
-    demo.launch()

 import os
+import sys
 import tempfile
 import torch
+import gradio as gr
+from datetime import datetime
+# Configuration
+# MODEL_PATH is a relative path, so it is resolved against the process's
+# current working directory.
 MODEL_PATH = "v4_indic.pt"
+DEFAULT_SPEAKER = "hindi_female"  # Changed from 'xenia' to valid speaker
+DEFAULT_SAMPLE_RATE = 48000
+# Startup banner; datetime.now() is called without a timezone, so the
+# timestamp is naive local time.
+print(f"===== Application Startup at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====")
+# Load the model
+# This runs at import time: any failure here (missing file, incompatible
+# package) aborts the module before the UI is built.
+# NOTE(review): load_pickle unpickles the packaged object — only load model
+# files from a trusted source.
+print(f"Loading model from {MODEL_PATH}")
+m = torch.package.PackageImporter(MODEL_PATH).load_pickle("tts_models", "model")
+print(f"Model object loaded: {type(m).__name__}")
+# Inspect apply_tts signature
+# The signature is only printed for diagnostics; `sig` is not read anywhere
+# else in the visible code.
+import inspect
+sig = inspect.signature(m.apply_tts)
+print(f"apply_tts signature: {sig}")
+# Available speakers
+# Used both to validate the `speaker` argument in _call_apply_tts and as the
+# choices of the speaker dropdown in the UI.
+# NOTE(review): presumably mirrors the speaker ids the v4_indic model
+# accepts — confirm against the loaded model rather than this literal.
+AVAILABLE_SPEAKERS = [
+    "bengali_female", "bengali_male",
+    "gujarati_female", "gujarati_male",
+    "hindi_female", "hindi_male",
+    "kannada_female", "kannada_male",
+    "malayalam_female", "malayalam_male",
+    "manipuri_female",
+    "rajasthani_female", "rajasthani_male",
+    "tamil_female", "tamil_male",
+    "telugu_female", "telugu_male"
+]
+def _call_apply_tts(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
     """
+    Call the model's apply_tts and return the synthesized audio.
+    Args:
+        text: Text to synthesize
+        speaker: Speaker voice; a name not in AVAILABLE_SPEAKERS is replaced
+            by DEFAULT_SPEAKER after printing a warning
+        sample_rate: Requested audio sample rate
+    Returns:
+        The value returned by apply_tts, or its first element when
+        apply_tts returns a tuple
+    Raises:
+        RuntimeError: if every keyword combination was rejected with TypeError
+        Exception: any other error raised by apply_tts is re-raised unchanged
     """
+    # Validate speaker
+    if speaker not in AVAILABLE_SPEAKERS:
+        print(f"Warning: Invalid speaker '{speaker}', using default '{DEFAULT_SPEAKER}'")
+        speaker = DEFAULT_SPEAKER
+    # Ordered from most to least specific; the second form is only tried when
+    # the first one is rejected with TypeError.
+    # NOTE(review): the fallback omits sample_rate, so the model chooses its
+    # own rate while the caller still writes the WAV header with the
+    # requested one — confirm the two agree.
+    attempts = (
+        {'text': text, 'speaker': speaker, 'sample_rate': sample_rate},
+        {'text': text, 'speaker': speaker},
+    )
     last_exc = None
+    for attempt_kw in attempts:
         try:
+            res = m.apply_tts(**attempt_kw)
+        except TypeError as e:
+            last_exc = e
+            print(f"Attempt failed with {attempt_kw}: {e}")
             continue
         except Exception as e:
+            # Report a runtime error only when one actually occurred; the
+            # message used to be printed before every call, including
+            # successful ones.
+            print(f"Runtime error while calling apply_tts with {attempt_kw}: {e}")
+            raise
+        # Handle different return types
+        return res[0] if isinstance(res, tuple) else res
+    # Chain the last TypeError so its traceback is kept alongside the summary.
+    raise RuntimeError(
+        f"apply_tts call failed for all known signatures. last error: {last_exc}"
+    ) from last_exc
+def synthesize_text_to_wavfile(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
+    """
+    Synthesize text to audio and save to temporary WAV file.
+    Args:
+        text: Text to synthesize
+        speaker: Speaker voice to use
+        sample_rate: Audio sample rate
+    Returns:
+        Path to generated WAV file; the file is not deleted by this function
+    Raises:
+        Exception: errors from _call_apply_tts or from writing the WAV file
+            propagate; the temporary file is removed if writing fails
+    """
+    # Function-scope imports, in keeping with this function's local scipy import
+    import numpy as np
+    import scipy.io.wavfile as wavfile
+    audio = _call_apply_tts(text, speaker, sample_rate)
+    # scipy.io.wavfile.write expects a NumPy array, so a torch.Tensor result
+    # has to be detached and converted first.
+    if isinstance(audio, torch.Tensor):
+        audio = audio.detach().cpu().numpy()
+    audio = np.asarray(audio)
+    # Create temporary file
+    fd, path = tempfile.mkstemp(suffix=".wav")
+    os.close(fd)
+    # Save audio
+    try:
+        wavfile.write(path, int(sample_rate), audio)
+    except Exception:
+        # Do not leave an empty temp file behind when the write fails
+        os.remove(path)
+        raise
+    return path
+def tts_gradio_fn(text, speaker, sample_rate):
+    """
+    Callback wired to the Gradio button and examples.
+    Args:
+        text: Text typed by the user
+        speaker: Speaker voice chosen in the dropdown
+        sample_rate: Sample rate chosen in the dropdown
+    Returns:
+        Path to the WAV file produced by synthesize_text_to_wavfile
+    Raises:
+        ValueError: if text is empty or contains only whitespace
+    """
+    # Guard clause: short-circuits on falsy text before calling .strip()
+    has_content = bool(text) and bool(text.strip())
+    if not has_content:
+        raise ValueError("Please enter some text to synthesize")
+    return synthesize_text_to_wavfile(text, speaker, sample_rate)
+# Create Gradio interface
+# The Blocks context sits at module level, so the UI is constructed whenever
+# this module is imported, not only when it is run as a script.
+with gr.Blocks(title="Silero v4 Indic TTS") as demo:
+    gr.Markdown("# Silero v4 Indic Text-to-Speech")
+    gr.Markdown("Convert text to speech in multiple Indian languages")
+    with gr.Row():
+        # Left column: the three inputs and the submit button
+        with gr.Column():
+            text_input = gr.Textbox(
+                label="Enter Text",
+                placeholder="नमस्ते, यह टेस्ट है। (Enter text in Hindi, Bengali, Tamil, Telugu, etc.)",
+                lines=5
+            )
+            speaker_dropdown = gr.Dropdown(
+                choices=AVAILABLE_SPEAKERS,
+                value=DEFAULT_SPEAKER,
+                label="Select Speaker Voice"
+            )
+            sample_rate_dropdown = gr.Dropdown(
+                choices=[8000, 16000, 24000, 48000],
+                value=DEFAULT_SAMPLE_RATE,
+                label="Sample Rate (Hz)"
+            )
+            submit_btn = gr.Button("Generate Speech", variant="primary")
+        # Right column: the audio output, fed with the file path returned by
+        # tts_gradio_fn
+        with gr.Column():
+            audio_output = gr.Audio(
+                label="Generated Audio",
+                type="filepath"
+            )
+    # Examples
+    # Each row is [text, speaker, sample_rate], in the same order as `inputs`.
+    # NOTE(review): cache_examples=False presumably means example outputs are
+    # not precomputed at startup — confirm for the pinned Gradio version.
+    gr.Examples(
+        examples=[
+            ["नमस्ते, यह टेस्ट है।", "hindi_female", 48000],
+            ["হ্যালো, এটি একটি পরীক্ষা।", "bengali_female", 48000],
+            ["வணக்கம், இது ஒரு சோதனை.", "tamil_female", 48000],
+            ["హలో, ఇది ఒక పరీక్ష.", "telugu_female", 48000],
+        ],
+        inputs=[text_input, speaker_dropdown, sample_rate_dropdown],
+        outputs=audio_output,
+        fn=tts_gradio_fn,
+        cache_examples=False
+    )
+    # The button uses the same callback and the same input order as the examples.
+    submit_btn.click(
+        fn=tts_gradio_fn,
+        inputs=[text_input, speaker_dropdown, sample_rate_dropdown],
+        outputs=audio_output
+    )
+# Launch the app
+# Only the launch is guarded; the model load and UI construction earlier in
+# the module run on import regardless.
 if __name__ == "__main__":
+    # NOTE(review): ssr_mode is only accepted by Gradio versions that support
+    # server-side rendering — confirm the pinned version, since an unknown
+    # keyword would make launch() fail at startup.
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        ssr_mode=True
+    )