Spaces:

toshuu
/

speak

Runtime error

App Files Files Community

toshuu committed on Dec 1, 2025

Commit

e902520

verified ·

1 Parent(s): c06b429

Upload 4 files

Browse files

Files changed (4) hide show

README.md +198 -14
app.py +82 -0
requirements.txt +5 -0
v4_indic.pt +3 -0

README.md CHANGED Viewed

@@ -1,14 +1,198 @@
----
-title: Speak
-emoji: 🏢
-colorFrom: red
-colorTo: yellow
-sdk: gradio
-sdk_version: 6.0.1
-app_file: app.py
-pinned: false
-license: other
-short_description: speakhehe
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Silero v4 Indic — HuggingFace Space (Gradio) Backend
+This document contains a ready-to-run HuggingFace Space project that hosts **Silero v4_indic** as a free CPU Gradio app. Drop `v4_indic.pt` (downloaded from `https://models.silero.ai/models/tts/indic/v4_indic.pt`) into the project root, push to your Space, and it will provide a public `/` UI and a simple REST-style `/api/predict` endpoint.
+---
+## Project structure
+```
+space-repo/
+├── app.py                # Gradio app + server endpoints (main)
+├── requirements.txt      # Python dependencies
+├── README.md             # This file (short)
+└── v4_indic.pt           # Put the downloaded model here (NOT included)
+```
+---
+## requirements.txt
+```
+gradio==3.39.1
+torch==2.1.0
+soundfile==0.13.1
+onnxruntime==1.23.2
+numpy
+```
+> Use CPU-only torch (no GPU) in HF Spaces. The `torch` version must be compatible with the Space runtime; if HF Spaces provides `torch` preinstalled, you can remove it from requirements to speed up deployment.
+---
+## app.py
+```python
+import os
+import threading
+import tempfile
+import numpy as np
+import soundfile as sf
+import gradio as gr
+# Attempt to import torch; HF Spaces usually has CPU torch available.
+import torch
+MODEL_PATH = "v4_indic.pt"  # packaged model file; load_model() checks for it relative to the working directory
+SAMPLE_RATE = 48000  # default sample_rate for synthesize_text(); also the rate the WAV is written with
+lock = threading.Lock()  # taken by tts_gradio() so only one synthesis runs at a time
+model = None  # module-level cache, populated by load_model() on first use
+def load_model():
+    global model
+    if model is not None:
+        return model
+    if not os.path.exists(MODEL_PATH):
+        raise FileNotFoundError(f"Model file not found: {MODEL_PATH}. Put v4_indic.pt in repo root.")
+    # Silero packaged model loader
+    print(f"Loading Silero model from {MODEL_PATH}...")
+    pkg = torch.package.PackageImporter(MODEL_PATH)
+    # The original package uses "tts_models" and object name "model"
+    model = pkg.load_pickle("tts_models", "model")
+    print("Model loaded into memory")
+    return model
+# Try to call model.apply_tts with flexible signature
+def synthesize_text(text: str, lang: str = "hi", speaker: int = 0, sample_rate: int = SAMPLE_RATE):
+    m = load_model()
+    # Normalize inputs
+    if not isinstance(text, str) or len(text.strip()) == 0:
+        raise ValueError("Empty text")
+    # Some Silero wrappers accept (text=..., lang_id=..., speaker_id=...),
+    # others accept (text=..., lang=..., speaker=...). Use try/except to support both.
+    try:
+        # Common high-level API
+        audio = m.apply_tts(text=text, speaker=speaker, lang_id=int(lang) if isinstance(lang, (int, np.integer)) else lang, sample_rate=sample_rate)
+    except TypeError:
+        try:
+            audio = m.apply_tts(text=text, speaker_id=int(speaker), lang_id=int(lang) if isinstance(lang, (int, np.integer)) else lang, sample_rate=sample_rate)
+        except Exception:
+            # Fallback: some versions accept (text, speaker, lang)
+            audio = m.apply_tts(text, speaker, lang, sample_rate)
+    # The returned audio can be numpy array or torch tensor
+    if isinstance(audio, torch.Tensor):
+        audio = audio.detach().cpu().numpy()
+    audio = np.asarray(audio)
+    # Ensure float32 in [-1,1]
+    if audio.dtype == np.int16:
+        audio = audio.astype('float32') / 32768.0
+    audio = audio.astype('float32')
+    max_abs = np.max(np.abs(audio))
+    if max_abs > 1.0:
+        audio = audio / max_abs
+    return audio, sample_rate
+# Gradio wrapper: returns file-like audio buffer
+def tts_gradio(text, lang_dropdown, speaker_slider):
+    # Map dropdown label to lang id or code expected by model
+    # You might need to adjust mapping depending on model internal language ids
+    lang_map = {
+        "Hindi (hi)": 0,
+        "Marathi (mr)": 1,
+        "Bengali (bn)": 2,
+        "Tamil (ta)": 3,
+        "Telugu (te)": 4,
+        "Kannada (kn)": 5,
+        "Malayalam (ml)": 6,
+        "Gujarati (gu)": 7,
+    }
+    lang_id = lang_map.get(lang_dropdown, 0)
+    # Prevent concurrent synth calls
+    with lock:
+        audio, sr = synthesize_text(text, lang=lang_id, speaker=int(speaker_slider))
+        # Write to temporary wav file and return its path (gradio will serve it)
+        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+        sf.write(tmp.name, audio, sr)
+        tmp.flush()
+        tmp.close()
+        return tmp.name
+# Build Gradio UI
+def build_ui():
+    with gr.Blocks() as demo:
+        gr.Markdown("# Silero v4 Indic — TTS (HuggingFace Space)\nDrop `v4_indic.pt` in the repo root and reload the Space.")
+        with gr.Row():
+            with gr.Column(scale=3):
+                txt = gr.Textbox(lines=4, label="Text to synthesize", value="नमस्ते, यह एक परीक्षण है।")
+                lang = gr.Dropdown(list=["Hindi (hi)", "Marathi (mr)", "Bengali (bn)", "Tamil (ta)", "Telugu (te)", "Kannada (kn)", "Malayalam (ml)", "Gujarati (gu)"], label="Language")
+                speaker = gr.Slider(minimum=0, maximum=3, step=1, value=0, label="Speaker ID (if model supports multiple speakers)")
+                btn = gr.Button("Synthesize")
+            with gr.Column(scale=2):
+                out = gr.Audio(label="Generated audio")
+        btn.click(fn=tts_gradio, inputs=[txt, lang, speaker], outputs=[out])
+    return demo
+if __name__ == "__main__":
+    # Preload model at startup (keeps first request fast)
+    try:
+        load_model()
+    except Exception as e:
+        print("Model failed to load at startup:", e)
+    demo = build_ui()
+    demo.launch(server_name="0.0.0.0", server_port=7860)
+```
+---
+## Notes & Deployment steps
+1. **Download model**: `wget https://models.silero.ai/models/tts/indic/v4_indic.pt -O v4_indic.pt` and place in repo root.
+2. **Create a new Space**: [https://huggingface.co/new-space](https://huggingface.co/new-space) → choose `Gradio` runtime and public/private as you wish.
+3. **Push repo**: Upload `app.py`, `requirements.txt`, `README.md`, and `v4_indic.pt` to the Space (via web UI drag & drop or via git).
+4. **Wait** until the Space builds; the model will be loaded on first startup.
+5. **API**: The Space exposes a Gradio UI and a `/api/predict` endpoint automatically (Gradio inference API). You can call it programmatically.
+---
+## Tips & Troubleshooting
+* If the Space build fails due to `torch` version mismatch, remove `torch` from `requirements.txt` and let the Space use its preinstalled torch.
+* If you see a `TypeError` or `AttributeError` when calling `apply_tts`, the packaged model version likely uses different parameter names. The wrapper `synthesize_text` attempts several common signatures; adapt it if necessary.
+* The model file is ~34MB — fits in Space disk quota.
+* If multiple users will call TTS concurrently, consider a small rate limiter or queue: Silero v4 is CPU-bound but reasonably fast for short utterances.
+---
+## Security
+* Avoid uploading private keys in the repo.
+* If you need to restrict usage, make the Space private and issue access tokens.
+---
+If you want, I can now:
+* Generate a git-ready ZIP of this project (app.py + requirements + README) so you can upload directly.
+* Or produce a minimal `Dockerfile` if you prefer deploying elsewhere.
+Which would you like?

app.py ADDED Viewed

	@@ -0,0 +1,82 @@

+import os
+audio = audio.astype('float32')
+max_abs = np.max(np.abs(audio))
+if max_abs > 1.0:
+audio = audio / max_abs
+return audio, sample_rate
+# Gradio wrapper: returns file-like audio buffer
+def tts_gradio(text, lang_dropdown, speaker_slider):
+# Map dropdown label to lang id or code expected by model
+# You might need to adjust mapping depending on model internal language ids
+lang_map = {
+"Hindi (hi)": 0,
+"Marathi (mr)": 1,
+"Bengali (bn)": 2,
+"Tamil (ta)": 3,
+"Telugu (te)": 4,
+"Kannada (kn)": 5,
+"Malayalam (ml)": 6,
+"Gujarati (gu)": 7,
+}
+lang_id = lang_map.get(lang_dropdown, 0)
+# Prevent concurrent synth calls
+with lock:
+audio, sr = synthesize_text(text, lang=lang_id, speaker=int(speaker_slider))
+# Write to temporary wav file and return its path (gradio will serve it)
+tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+sf.write(tmp.name, audio, sr)
+tmp.flush()
+tmp.close()
+return tmp.name
+# Build Gradio UI
+def build_ui():
+with gr.Blocks() as demo:
+gr.Markdown("# Silero v4 Indic — TTS (HuggingFace Space)\nDrop `v4_indic.pt` in the repo root and reload the Space.")
+with gr.Row():
+with gr.Column(scale=3):
+txt = gr.Textbox(lines=4, label="Text to synthesize", value="नमस्ते, यह एक परीक्षण है।")
+lang = gr.Dropdown(list=["Hindi (hi)", "Marathi (mr)", "Bengali (bn)", "Tamil (ta)", "Telugu (te)", "Kannada (kn)", "Malayalam (ml)", "Gujarati (gu)"], label="Language")
+speaker = gr.Slider(minimum=0, maximum=3, step=1, value=0, label="Speaker ID (if model supports multiple speakers)")
+btn = gr.Button("Synthesize")
+with gr.Column(scale=2):
+out = gr.Audio(label="Generated audio")
+btn.click(fn=tts_gradio, inputs=[txt, lang, speaker], outputs=[out])
+return demo
+if __name__ == "__main__":
+# Preload model at startup (keeps first request fast)
+try:
+load_model()
+except Exception as e:
+print("Model failed to load at startup:", e)
+demo = build_ui()
+demo.launch(server_name="0.0.0.0", server_port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio==3.39.1
+torch==2.1.0
+soundfile==0.13.1
+onnxruntime==1.23.2
+numpy

v4_indic.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c0d0055340a9789a7ff8e5f7610bbc8d82355e577e483acb8a1fe4f2df0caa6
+size 35379600