toshuu committed on
Commit
bad894e
·
verified ·
1 Parent(s): 22a5251

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -82
app.py CHANGED
@@ -1,82 +1,122 @@
1
- import os
2
- audio = audio.astype('float32')
3
- max_abs = np.max(np.abs(audio))
4
- if max_abs > 1.0:
5
- audio = audio / max_abs
6
-
7
-
8
- return audio, sample_rate
9
-
10
-
11
-
12
-
13
- # Gradio wrapper: returns file-like audio buffer
14
- def tts_gradio(text, lang_dropdown, speaker_slider):
15
- # Map dropdown label to lang id or code expected by model
16
- # You might need to adjust mapping depending on model internal language ids
17
- lang_map = {
18
- "Hindi (hi)": 0,
19
- "Marathi (mr)": 1,
20
- "Bengali (bn)": 2,
21
- "Tamil (ta)": 3,
22
- "Telugu (te)": 4,
23
- "Kannada (kn)": 5,
24
- "Malayalam (ml)": 6,
25
- "Gujarati (gu)": 7,
26
- }
27
-
28
-
29
- lang_id = lang_map.get(lang_dropdown, 0)
30
-
31
-
32
- # Prevent concurrent synth calls
33
- with lock:
34
- audio, sr = synthesize_text(text, lang=lang_id, speaker=int(speaker_slider))
35
-
36
-
37
- # Write to temporary wav file and return its path (gradio will serve it)
38
- tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
39
- sf.write(tmp.name, audio, sr)
40
- tmp.flush()
41
- tmp.close()
42
- return tmp.name
43
-
44
-
45
-
46
-
47
- # Build Gradio UI
48
- def build_ui():
49
- with gr.Blocks() as demo:
50
- gr.Markdown("# Silero v4 Indic — TTS (HuggingFace Space)\nDrop `v4_indic.pt` in the repo root and reload the Space.")
51
-
52
-
53
- with gr.Row():
54
- with gr.Column(scale=3):
55
- txt = gr.Textbox(lines=4, label="Text to synthesize", value="नमस्ते, यह एक परीक्षण है।")
56
- lang = gr.Dropdown(list=["Hindi (hi)", "Marathi (mr)", "Bengali (bn)", "Tamil (ta)", "Telugu (te)", "Kannada (kn)", "Malayalam (ml)", "Gujarati (gu)"], label="Language")
57
- speaker = gr.Slider(minimum=0, maximum=3, step=1, value=0, label="Speaker ID (if model supports multiple speakers)")
58
- btn = gr.Button("Synthesize")
59
-
60
-
61
- with gr.Column(scale=2):
62
- out = gr.Audio(label="Generated audio")
63
-
64
-
65
- btn.click(fn=tts_gradio, inputs=[txt, lang, speaker], outputs=[out])
66
-
67
-
68
- return demo
69
-
70
-
71
-
72
-
73
- if __name__ == "__main__":
74
- # Preload model at startup (keeps first request fast)
75
- try:
76
- load_model()
77
- except Exception as e:
78
- print("Model failed to load at startup:", e)
79
-
80
-
81
- demo = build_ui()
82
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import threading
3
+ import tempfile
4
+ import numpy as np
5
+ import soundfile as sf
6
+ import gradio as gr
7
+ import torch
8
+
9
# Location of the Silero package file; checked relative to the working
# directory (the error raised by load_model() calls this the Space root).
MODEL_PATH = "v4_indic.pt"

# Sample rate passed to the model's apply_tts() and used when writing the WAV.
SAMPLE_RATE = 48000

# Cached model instance; stays None until load_model() succeeds.
model = None

# Held by tts_fn() around synthesize() so only one synthesis runs at a time.
lock = threading.Lock()
14
+
15
+
16
def load_model():
    """Return the shared TTS model, loading it from ``MODEL_PATH`` on first use.

    The loaded object is stored in the module-level ``model`` global, so later
    calls return the cached instance without reading the file again.

    Raises:
        FileNotFoundError: if ``MODEL_PATH`` does not exist.
    """
    global model

    if model is None:
        if not os.path.exists(MODEL_PATH):
            raise FileNotFoundError(
                f"Model file not found: {MODEL_PATH}. Upload v4_indic.pt to the Space root."
            )

        print("Loading Silero v4 model...")
        importer = torch.package.PackageImporter(MODEL_PATH)
        # The global is only assigned once unpickling succeeds, so a failed
        # load leaves it as None and the next call retries.
        model = importer.load_pickle("tts_models", "model")
        print("Model loaded.")

    return model
31
+
32
+
33
def synthesize(text, lang_id, speaker_id):
    """Synthesize ``text`` with the loaded model and return float32 samples.

    Args:
        text: Text to speak; must be a non-blank string.
        lang_id: Language identifier forwarded to the model's ``apply_tts``.
        speaker_id: Speaker identifier forwarded to ``apply_tts`` (first as
            ``speaker=``, then as ``speaker_id=`` if that call fails).

    Returns:
        A float32 ``numpy.ndarray``. If the peak magnitude exceeds 1.0 the
        whole signal is divided by that peak; otherwise it is returned
        unscaled. An empty result is returned as-is.

    Raises:
        ValueError: if ``text`` is not a string or contains only whitespace.
    """
    # Validate before touching the model so bad input fails fast and is not
    # hidden behind a model-loading error.
    if not isinstance(text, str) or not text.strip():
        raise ValueError("Empty text")

    m = load_model()

    try:
        audio = m.apply_tts(
            text=text,
            speaker=speaker_id,
            lang_id=lang_id,
            sample_rate=SAMPLE_RATE,
        )
    except Exception:
        # Best-effort fallback for a model whose keyword is ``speaker_id``.
        # NOTE(review): this also retries after genuine synthesis errors; the
        # first failure stays visible as the chained exception context.
        # Confirm the real apply_tts signature and narrow this if possible.
        audio = m.apply_tts(
            text=text,
            speaker_id=speaker_id,
            lang_id=lang_id,
            sample_rate=SAMPLE_RATE,
        )

    # Convert torch -> numpy
    if isinstance(audio, torch.Tensor):
        audio = audio.detach().cpu().numpy()

    audio = np.asarray(audio).astype(np.float32)

    # np.max raises ValueError on a zero-size array, so skip peak
    # normalization when the model produced no samples.
    if audio.size == 0:
        return audio

    peak = np.max(np.abs(audio))
    if peak > 1.0:
        audio = audio / peak

    return audio
65
+
66
+
67
def tts_fn(text, language, speaker):
    """Gradio callback: synthesize ``text`` and return the path of a WAV file.

    Args:
        text: Text to synthesize.
        language: Dropdown label (e.g. ``"Hindi"``); labels not present in the
            map fall back to language id 0.
        speaker: Slider value; converted with ``int()`` before use.

    Returns:
        Path of a newly created ``.wav`` temporary file. The file is created
        with ``delete=False`` and is not removed by this function.
    """
    lang_map = {
        "Hindi": 0,
        "Marathi": 1,
        "Bengali": 2,
        "Tamil": 3,
        "Telugu": 4,
        "Kannada": 5,
        "Malayalam": 6,
        "Gujarati": 7,
    }

    lang_id = lang_map.get(language, 0)

    # Only one synthesis call runs at a time.
    with lock:
        audio = synthesize(text, lang_id, int(speaker))

    # Reserve a unique path and close the handle *before* soundfile reopens
    # the file by name; reopening a still-open NamedTemporaryFile is not
    # portable (it fails on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        wav_path = tmp.name

    sf.write(wav_path, audio, SAMPLE_RATE)

    return wav_path
90
+
91
+
92
def build_ui():
    """Assemble the Gradio Blocks interface and return it without launching.

    Layout: a heading, then one row with an input column (text box, language
    dropdown, speaker slider, button) and an output column (audio player).
    Clicking the button calls ``tts_fn`` with the three inputs and sends its
    result to the audio component.
    """
    language_choices = [
        "Hindi",
        "Marathi",
        "Bengali",
        "Tamil",
        "Telugu",
        "Kannada",
        "Malayalam",
        "Gujarati",
    ]

    with gr.Blocks() as demo:
        gr.Markdown("# 🔊 Silero v4 Indic TTS<br>Text → Speech for 8 Indian languages")

        with gr.Row():
            # Input column
            with gr.Column():
                text_box = gr.Textbox(
                    label="Enter text",
                    value="नमस्ते, यह एक परीक्षण है।",
                    lines=3,
                )
                language_box = gr.Dropdown(
                    choices=language_choices,
                    label="Language",
                    value="Hindi",
                )
                speaker_slider = gr.Slider(
                    minimum=0,
                    maximum=3,
                    value=0,
                    step=1,
                    label="Speaker ID (if supported)",
                )
                generate_btn = gr.Button("🎤 Generate Speech")

            # Output column
            with gr.Column():
                audio_out = gr.Audio(label="Output Audio")

        generate_btn.click(
            tts_fn,
            inputs=[text_box, language_box, speaker_slider],
            outputs=[audio_out],
        )

    return demo
117
+
118
+
119
if __name__ == "__main__":
    # Load the model before starting the server; any load error (for example
    # a missing model file) propagates and stops startup here.
    load_model()
    build_ui().launch()