toshuu committed on
Commit
e902520
·
verified ·
1 Parent(s): c06b429

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +198 -14
  2. app.py +82 -0
  3. requirements.txt +5 -0
  4. v4_indic.pt +3 -0
README.md CHANGED
@@ -1,14 +1,198 @@
1
- ---
2
- title: Speak
3
- emoji: 🏢
4
- colorFrom: red
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 6.0.1
8
- app_file: app.py
9
- pinned: false
10
- license: other
11
- short_description: speakhehe
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Silero v4 Indic — HuggingFace Space (Gradio) Backend
2
+
3
+ This document contains a ready-to-run HuggingFace Space project that hosts **Silero v4_indic** as a free CPU Gradio app. Download `v4_indic.pt` from `https://models.silero.ai/models/tts/indic/v4_indic.pt`, place it in the project root, and push to your Space; the Space then serves a public UI at `/` and a simple REST-style `/api/predict` endpoint.
4
+
5
+ ---
6
+
7
+ ## Project structure
8
+
9
+ ```
10
+ space-repo/
11
+ ├── app.py # Gradio app + server endpoints (main)
12
+ ├── requirements.txt # Python dependencies
13
+ ├── README.md # This file (short)
14
+ └── v4_indic.pt # Put the downloaded model here (NOT included)
15
+ ```
16
+
17
+ ---
18
+
19
+ ## requirements.txt
20
+
21
+ ```
22
+ gradio==3.39.1
23
+ torch==2.1.0
24
+ soundfile==0.13.1
25
+ onnxruntime==1.23.2
26
+ numpy
27
+ ```
28
+
29
+ > Use CPU-only torch (no GPU) in HF Spaces. The `torch` version should be compatible with the Space runtime; if HF Spaces provides `torch` preinstalled, you can remove it from `requirements.txt` to speed up deployment.
30
+
31
+ ---
32
+
33
+ ## app.py
34
+
35
+ ```python
36
+ import os
37
+ import threading
38
+ import tempfile
39
+ import numpy as np
40
+ import soundfile as sf
41
+ import gradio as gr
42
+
43
+ # Attempt to import torch; HF Spaces usually has CPU torch available.
44
+ import torch
45
+
46
+ MODEL_PATH = "v4_indic.pt"
47
+ SAMPLE_RATE = 48000
48
+
49
+ lock = threading.Lock()
50
+ model = None
51
+
52
+
53
+ def load_model():
54
+ global model
55
+ if model is not None:
56
+ return model
57
+ if not os.path.exists(MODEL_PATH):
58
+ raise FileNotFoundError(f"Model file not found: {MODEL_PATH}. Put v4_indic.pt in repo root.")
59
+
60
+ # Silero packaged model loader
61
+ print(f"Loading Silero model from {MODEL_PATH}...")
62
+ pkg = torch.package.PackageImporter(MODEL_PATH)
63
+ # The original package uses "tts_models" and object name "model"
64
+ model = pkg.load_pickle("tts_models", "model")
65
+ print("Model loaded into memory")
66
+ return model
67
+
68
+
69
+ # Try to call model.apply_tts with flexible signature
70
+ def synthesize_text(text: str, lang: str = "hi", speaker: int = 0, sample_rate: int = SAMPLE_RATE):
71
+ m = load_model()
72
+
73
+ # Normalize inputs
74
+ if not isinstance(text, str) or len(text.strip()) == 0:
75
+ raise ValueError("Empty text")
76
+
77
+ # Some Silero wrappers accept (text=..., lang_id=..., speaker_id=...),
78
+ # others accept (text=..., lang=..., speaker=...). Use try/except to support both.
79
+ try:
80
+ # Common high-level API
81
+ audio = m.apply_tts(text=text, speaker=speaker, lang_id=int(lang) if isinstance(lang, (int, np.integer)) else lang, sample_rate=sample_rate)
82
+ except TypeError:
83
+ try:
84
+ audio = m.apply_tts(text=text, speaker_id=int(speaker), lang_id=int(lang) if isinstance(lang, (int, np.integer)) else lang, sample_rate=sample_rate)
85
+ except Exception:
86
+ # Fallback: some versions accept (text, speaker, lang)
87
+ audio = m.apply_tts(text, speaker, lang, sample_rate)
88
+
89
+ # The returned audio can be numpy array or torch tensor
90
+ if isinstance(audio, torch.Tensor):
91
+ audio = audio.detach().cpu().numpy()
92
+ audio = np.asarray(audio)
93
+
94
+ # Ensure float32 in [-1,1]
95
+ if audio.dtype == np.int16:
96
+ audio = audio.astype('float32') / 32768.0
97
+ audio = audio.astype('float32')
98
+ max_abs = np.max(np.abs(audio))
99
+ if max_abs > 1.0:
100
+ audio = audio / max_abs
101
+
102
+ return audio, sample_rate
103
+
104
+
105
+ # Gradio wrapper: returns file-like audio buffer
106
+ def tts_gradio(text, lang_dropdown, speaker_slider):
107
+ # Map dropdown label to lang id or code expected by model
108
+ # You might need to adjust mapping depending on model internal language ids
109
+ lang_map = {
110
+ "Hindi (hi)": 0,
111
+ "Marathi (mr)": 1,
112
+ "Bengali (bn)": 2,
113
+ "Tamil (ta)": 3,
114
+ "Telugu (te)": 4,
115
+ "Kannada (kn)": 5,
116
+ "Malayalam (ml)": 6,
117
+ "Gujarati (gu)": 7,
118
+ }
119
+
120
+ lang_id = lang_map.get(lang_dropdown, 0)
121
+
122
+ # Prevent concurrent synth calls
123
+ with lock:
124
+ audio, sr = synthesize_text(text, lang=lang_id, speaker=int(speaker_slider))
125
+
126
+ # Write to temporary wav file and return its path (gradio will serve it)
127
+ tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
128
+ sf.write(tmp.name, audio, sr)
129
+ tmp.flush()
130
+ tmp.close()
131
+ return tmp.name
132
+
133
+
134
+ # Build Gradio UI
135
+ def build_ui():
136
+ with gr.Blocks() as demo:
137
+ gr.Markdown("# Silero v4 Indic — TTS (HuggingFace Space)\nDrop `v4_indic.pt` in the repo root and reload the Space.")
138
+
139
+ with gr.Row():
140
+ with gr.Column(scale=3):
141
+ txt = gr.Textbox(lines=4, label="Text to synthesize", value="नमस्ते, यह एक परीक्षण है।")
142
+ lang = gr.Dropdown(list=["Hindi (hi)", "Marathi (mr)", "Bengali (bn)", "Tamil (ta)", "Telugu (te)", "Kannada (kn)", "Malayalam (ml)", "Gujarati (gu)"], label="Language")
143
+ speaker = gr.Slider(minimum=0, maximum=3, step=1, value=0, label="Speaker ID (if model supports multiple speakers)")
144
+ btn = gr.Button("Synthesize")
145
+
146
+ with gr.Column(scale=2):
147
+ out = gr.Audio(label="Generated audio")
148
+
149
+ btn.click(fn=tts_gradio, inputs=[txt, lang, speaker], outputs=[out])
150
+
151
+ return demo
152
+
153
+
154
+ if __name__ == "__main__":
155
+ # Preload model at startup (keeps first request fast)
156
+ try:
157
+ load_model()
158
+ except Exception as e:
159
+ print("Model failed to load at startup:", e)
160
+
161
+ demo = build_ui()
162
+ demo.launch(server_name="0.0.0.0", server_port=7860)
163
+ ```
164
+
165
+ ---
166
+
167
+ ## Notes & Deployment steps
168
+
169
+ 1. **Download model**: `wget https://models.silero.ai/models/tts/indic/v4_indic.pt -O v4_indic.pt` and place in repo root.
170
+ 2. **Create a new Space**: [https://huggingface.co/new-space](https://huggingface.co/new-space) → choose `Gradio` runtime and public/private as you wish.
171
+ 3. **Push repo**: Upload `app.py`, `requirements.txt`, `README.md`, and `v4_indic.pt` to the Space (via web UI drag & drop or via git).
172
+ 4. **Wait** until the Space finishes building; `app.py` preloads the model when the app starts, so the first request does not pay the loading cost.
173
+ 5. **API**: The Space exposes a Gradio UI and a `/api/predict` endpoint automatically (Gradio inference API). You can call it programmatically.
174
+
175
+ ---
176
+
177
+ ## Tips & Troubleshooting
178
+
179
+ * If the Space build fails due to `torch` version mismatch, remove `torch` from `requirements.txt` and let the Space use its preinstalled torch.
180
+ * If calling `apply_tts` raises `TypeError` or `AttributeError`, the packaged model version probably uses slightly different argument or method names. The `synthesize_text` wrapper tries several common signatures (falling back when a call raises `TypeError`); adapt it if necessary.
181
+ * The model file is ~34MB — fits in Space disk quota.
182
+ * If multiple users call TTS concurrently, synthesis calls are serialized by the lock in `tts_gradio`, so requests wait for one another; consider adding a small rate limiter or queue. Silero v4 is CPU-bound but reasonably fast for short utterances.
183
+
184
+ ---
185
+
186
+ ## Security
187
+
188
+ * Do not commit private keys or other secrets to the repo.
189
+ * If you need to restrict usage, make the Space private and issue access tokens.
190
+
191
+ ---
192
+
193
+ If you want, I can now:
194
+
195
+ * Generate a git-ready ZIP of this project (app.py + requirements + README) so you can upload directly.
196
+ * Or produce a minimal `Dockerfile` if you prefer deploying elsewhere.
197
+
198
+ Which would you like?
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ audio = audio.astype('float32')
3
+ max_abs = np.max(np.abs(audio))
4
+ if max_abs > 1.0:
5
+ audio = audio / max_abs
6
+
7
+
8
+ return audio, sample_rate
9
+
10
+
11
+
12
+
13
+ # Gradio wrapper: returns file-like audio buffer
14
+ def tts_gradio(text, lang_dropdown, speaker_slider):
15
+ # Map dropdown label to lang id or code expected by model
16
+ # You might need to adjust mapping depending on model internal language ids
17
+ lang_map = {
18
+ "Hindi (hi)": 0,
19
+ "Marathi (mr)": 1,
20
+ "Bengali (bn)": 2,
21
+ "Tamil (ta)": 3,
22
+ "Telugu (te)": 4,
23
+ "Kannada (kn)": 5,
24
+ "Malayalam (ml)": 6,
25
+ "Gujarati (gu)": 7,
26
+ }
27
+
28
+
29
+ lang_id = lang_map.get(lang_dropdown, 0)
30
+
31
+
32
+ # Prevent concurrent synth calls
33
+ with lock:
34
+ audio, sr = synthesize_text(text, lang=lang_id, speaker=int(speaker_slider))
35
+
36
+
37
+ # Write to temporary wav file and return its path (gradio will serve it)
38
+ tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
39
+ sf.write(tmp.name, audio, sr)
40
+ tmp.flush()
41
+ tmp.close()
42
+ return tmp.name
43
+
44
+
45
+
46
+
47
+ # Build Gradio UI
48
+ def build_ui():
49
+ with gr.Blocks() as demo:
50
+ gr.Markdown("# Silero v4 Indic — TTS (HuggingFace Space)\nDrop `v4_indic.pt` in the repo root and reload the Space.")
51
+
52
+
53
+ with gr.Row():
54
+ with gr.Column(scale=3):
55
+ txt = gr.Textbox(lines=4, label="Text to synthesize", value="नमस्ते, यह एक परीक्षण है।")
56
+ lang = gr.Dropdown(list=["Hindi (hi)", "Marathi (mr)", "Bengali (bn)", "Tamil (ta)", "Telugu (te)", "Kannada (kn)", "Malayalam (ml)", "Gujarati (gu)"], label="Language")
57
+ speaker = gr.Slider(minimum=0, maximum=3, step=1, value=0, label="Speaker ID (if model supports multiple speakers)")
58
+ btn = gr.Button("Synthesize")
59
+
60
+
61
+ with gr.Column(scale=2):
62
+ out = gr.Audio(label="Generated audio")
63
+
64
+
65
+ btn.click(fn=tts_gradio, inputs=[txt, lang, speaker], outputs=[out])
66
+
67
+
68
+ return demo
69
+
70
+
71
+
72
+
73
+ if __name__ == "__main__":
74
+ # Preload model at startup (keeps first request fast)
75
+ try:
76
+ load_model()
77
+ except Exception as e:
78
+ print("Model failed to load at startup:", e)
79
+
80
+
81
+ demo = build_ui()
82
+ demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio==3.39.1
2
+ torch==2.1.0
3
+ soundfile==0.13.1
4
+ onnxruntime==1.23.2
5
+ numpy
v4_indic.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c0d0055340a9789a7ff8e5f7610bbc8d82355e577e483acb8a1fe4f2df0caa6
3
+ size 35379600