PatnaikAshish commited on
Commit
12f7f5e
·
verified ·
1 Parent(s): c5a414c
Files changed (1) hide show
  1. webui.py +273 -255
webui.py CHANGED
@@ -1,256 +1,274 @@
1
- import gradio as gr
2
- import os
3
- import re
4
- import torch
5
- import numpy as np
6
- from scipy.io.wavfile import write
7
- from phonemizer.backend.espeak.wrapper import EspeakWrapper
8
- from safetensors.torch import load_file
9
- from tts import commons
10
- from tts import utils
11
- from tts.models import SynthesizerTrn
12
- from text.symbols import symbols
13
- from text import text_to_sequence
14
-
15
- _ESPEAK_LIBRARY = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
16
- if os.path.exists(_ESPEAK_LIBRARY):
17
- EspeakWrapper.set_library(_ESPEAK_LIBRARY)
18
- print(f"βœ… Found eSpeak-ng: {_ESPEAK_LIBRARY}")
19
-
20
- MODEL_PATH = "checkpoints/sonya-tts.safetensors"
21
- CONFIG_PATH = "checkpoints/config.json"
22
- device = "cuda" if torch.cuda.is_available() else "cpu"
23
-
24
-
25
- def clean_text_for_vits(text):
26
- text = text.strip()
27
- text = text.replace("'", "'")
28
- text = text.replace(""", '"').replace(""", '"')
29
- text = text.replace("–", "-").replace("β€”", "-")
30
- text = re.sub(r"[()\[\]{}<>]", "", text)
31
- text = re.sub(r"[^a-zA-Z0-9\s.,!?'\-]", "", text)
32
- text = re.sub(r"\s+", " ", text)
33
- return text
34
-
35
- def get_text(text, hps):
36
- text = clean_text_for_vits(text)
37
- text_norm = text_to_sequence(text, hps.data.text_cleaners)
38
- if hps.data.add_blank:
39
- text_norm = commons.intersperse(text_norm, 0)
40
- return torch.LongTensor(text_norm)
41
-
42
- def split_sentences(text):
43
- text = clean_text_for_vits(text)
44
- if not text:
45
- return []
46
- return re.split(r'(?<=[.!?])\s+', text)
47
-
48
-
49
- print("πŸ”„ Loading Sonya TTS Model...")
50
- hps = utils.get_hparams_from_file(CONFIG_PATH)
51
-
52
- net_g = SynthesizerTrn(
53
- len(symbols),
54
- hps.data.filter_length // 2 + 1,
55
- hps.train.segment_size // hps.data.hop_length,
56
- **hps.model
57
- ).to(device)
58
-
59
- net_g.eval()
60
-
61
- if os.path.exists(MODEL_PATH):
62
- state_dict = load_file(MODEL_PATH, device=device)
63
- net_g.load_state_dict(state_dict)
64
- print(f"βœ… Loaded weights from {MODEL_PATH}")
65
- else:
66
- raise FileNotFoundError(f"Could not find model at {MODEL_PATH}")
67
-
68
- def infer_short(text, noise_scale, noise_scale_w, length_scale):
69
- if not text.strip():
70
- return None
71
-
72
- stn_tst = get_text(text, hps)
73
-
74
- with torch.no_grad():
75
- x_tst = stn_tst.to(device).unsqueeze(0)
76
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
77
-
78
- audio = net_g.infer(
79
- x_tst,
80
- x_tst_lengths,
81
- noise_scale=noise_scale,
82
- noise_scale_w=noise_scale_w,
83
- length_scale=length_scale
84
- )[0][0,0].data.cpu().float().numpy()
85
-
86
- return (hps.data.sampling_rate, audio)
87
-
88
- def infer_long(text, length_scale, noise_scale):
89
- if not text.strip():
90
- return None
91
-
92
- sentences = split_sentences(text)
93
- audio_chunks = []
94
-
95
- fixed_noise_w = 0.6
96
- base_pause = 0.3
97
-
98
- for sent in sentences:
99
- if len(sent.strip()) < 2: continue
100
-
101
- stn_tst = get_text(sent, hps)
102
- with torch.no_grad():
103
- x_tst = stn_tst.to(device).unsqueeze(0)
104
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
105
-
106
- audio = net_g.infer(
107
- x_tst,
108
- x_tst_lengths,
109
- noise_scale=noise_scale,
110
- noise_scale_w=fixed_noise_w,
111
- length_scale=length_scale
112
- )[0][0,0].data.cpu().float().numpy()
113
-
114
- if sent.endswith("?"):
115
- pause_dur = base_pause + 0.2
116
- elif sent.endswith("!"):
117
- pause_dur = base_pause + 0.1
118
- else:
119
- pause_dur = base_pause
120
-
121
- silence = np.zeros(int(hps.data.sampling_rate * pause_dur))
122
-
123
- audio_chunks.append(audio)
124
- audio_chunks.append(silence)
125
-
126
- final_audio = np.concatenate(audio_chunks)
127
- return (hps.data.sampling_rate, final_audio)
128
-
129
-
130
- theme = gr.themes.Soft(
131
- primary_hue="pink",
132
- secondary_hue="rose",
133
- neutral_hue="slate"
134
- ).set(
135
- button_primary_background_fill="linear-gradient(90deg, #ff69b4, #ff1493)",
136
- button_primary_background_fill_hover="linear-gradient(90deg, #ff1493, #c71585)",
137
- button_primary_text_color="white",
138
- )
139
-
140
- custom_css = """
141
- .banner-container {
142
- width: 100%;
143
- max-width: 100%;
144
- margin: 0 auto 20px auto;
145
- display: flex;
146
- justify-content: center;
147
- align-items: center;
148
- }
149
-
150
- .banner-container img {
151
- width: 100%;
152
- max-width: 1800px;
153
- max-height: 120px;
154
- height: auto;
155
- object-fit: scale-down;
156
- object-position: center;
157
- border-radius: 8px;
158
- }
159
-
160
- .main-title {
161
- text-align: center;
162
- color: #ff1493;
163
- font-size: 2em;
164
- font-weight: 700;
165
- margin: 15px 0 8px 0;
166
- }
167
-
168
- .subtitle {
169
- text-align: center;
170
- color: white;
171
- font-size: 1.1em;
172
- margin-bottom: 25px;
173
- font-weight: 400;
174
- }
175
-
176
- footer {
177
- display: none !important;
178
- }
179
- """
180
-
181
- with gr.Blocks(theme=theme, css=custom_css, title="Sonya TTS") as app:
182
-
183
-
184
- with gr.Row(elem_classes="banner-container"):
185
- if os.path.exists("logo.png"):
186
- gr.Image("logo.png", show_label=False, container=False, elem_classes="banner-img")
187
-
188
-
189
- gr.HTML("""
190
- <h1 class="main-title">✨ Sonya TTS β€” A Beautiful, Expressive Neural Voice Engine</h1>
191
- <p class="subtitle">High-fidelity AI speech with emotion, rhythm, and audiobook mode</p>
192
- """)
193
-
194
- with gr.Tabs():
195
-
196
-
197
- with gr.TabItem("πŸŽ›οΈ Studio Mode"):
198
- with gr.Row():
199
- with gr.Column(scale=2):
200
- inp_short = gr.Textbox(
201
- label="πŸ’¬ Input Text",
202
- placeholder="Type something for Sonya to say...",
203
- lines=4,
204
- value="Hello! I am Sonya, your AI voice."
205
- )
206
-
207
- with gr.Accordion("βš™οΈ Voice Controls", open=True):
208
- slider_ns = gr.Slider(0.1, 1.0, value=0.4, label="🎭 Emotion", info="Higher = more expressive")
209
- slider_nsw = gr.Slider(0.1, 1.0, value=0.5, label="🎡 Rhythm", info="Higher = looser timing")
210
- slider_ls = gr.Slider(0.5, 1.5, value=0.97, label="⏱ Speed", info="Lower = faster, Higher = slower")
211
-
212
- btn_short = gr.Button("✨ Generate Voice", variant="primary", size="lg")
213
-
214
- with gr.Column(scale=1):
215
- out_short = gr.Audio(label="πŸ”Š Sonya's Voice", type="numpy")
216
-
217
- btn_short.click(
218
- infer_short,
219
- inputs=[inp_short, slider_ns, slider_nsw, slider_ls],
220
- outputs=[out_short]
221
- )
222
-
223
-
224
- with gr.TabItem("πŸ“– Audiobook Mode"):
225
- gr.Markdown(
226
- """<p style='text-align: center; color: #666; font-size: 1.05em;'>
227
- Paste long text. Sonya will read it beautifully with natural pauses.
228
- </p>""",
229
- elem_classes="audiobook-description"
230
- )
231
-
232
- with gr.Row():
233
- with gr.Column(scale=2):
234
- inp_long = gr.Textbox(
235
- label="πŸ“œ Long Text Input",
236
- placeholder="Paste your story or article here...",
237
- lines=10
238
- )
239
-
240
- with gr.Accordion("βš™οΈ Narration Settings", open=False):
241
- long_ls = gr.Slider(0.5, 1.5, value=1.0, label="⏱ Reading Speed")
242
- long_ns = gr.Slider(0.1, 1.0, value=0.5, label="🎭 Tone Variation")
243
-
244
- btn_long = gr.Button("🎧 Read Aloud", variant="primary", size="lg")
245
-
246
- with gr.Column(scale=1):
247
- out_long = gr.Audio(label="πŸ“’ Full Narration", type="numpy")
248
-
249
- btn_long.click(
250
- infer_long,
251
- inputs=[inp_long, long_ls, long_ns],
252
- outputs=[out_long]
253
- )
254
-
255
- if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  app.launch()
 
1
+ import gradio as gr
2
+ import os
3
+ import re
4
+ import torch
5
+ import numpy as np
6
+ from scipy.io.wavfile import write
7
+ from phonemizer.backend.espeak.wrapper import EspeakWrapper
8
+ from safetensors.torch import load_file
9
+ from huggingface_hub import hf_hub_download
10
+
11
+ from tts import commons
12
+ from tts import utils
13
+ from tts.models import SynthesizerTrn
14
+ from text.symbols import symbols
15
+ from text import text_to_sequence
16
+
17
# Windows-only setup: point phonemizer at the eSpeak NG DLL when it is
# installed at its default location.  On other platforms (or when the DLL
# is absent) this is a silent no-op and phonemizer uses its own discovery.
_ESPEAK_LIBRARY = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
if os.path.exists(_ESPEAK_LIBRARY):
    EspeakWrapper.set_library(_ESPEAK_LIBRARY)
    print(f"βœ… Found eSpeak-ng: {_ESPEAK_LIBRARY}")


# Hugging Face Hub repo used as a fallback weight source when the local
# checkpoint files below are missing.
REPO_ID = "PatnaikAshish/Sonya-TTS"

# Filenames inside the Hub repo.
MODEL_FILENAME = "sonya-tts.safetensors"
CONFIG_FILENAME = "config.json"

# Preferred local copies (checked first, before any network access).
LOCAL_MODEL_PATH = "checkpoints/sonya-tts.safetensors"
LOCAL_CONFIG_PATH = "checkpoints/config.json"

# Run on GPU when available; all tensors below follow this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
32
+
33
+
34
def clean_text_for_vits(text):
    """Normalize raw user text for the VITS cleaner pipeline.

    Maps smart punctuation to ASCII, drops brackets and any character the
    phonemizer is not expected to handle, and collapses whitespace.

    NOTE: the smart-quote/dash literals were previously written as raw
    Unicode characters and were corrupted by an encoding round-trip
    (rendering the file syntactically invalid); they are restored here as
    explicit escapes so the source stays ASCII-safe.
    """
    text = text.strip()
    text = text.replace("\u2019", "'")                          # ’ -> '
    text = text.replace("\u201c", '"').replace("\u201d", '"')   # " " -> "
    text = text.replace("\u2013", "-").replace("\u2014", "-")   # – — -> -
    text = re.sub(r"[()\[\]{}<>]", "", text)            # strip brackets
    text = re.sub(r"[^a-zA-Z0-9\s.,!?'\-]", "", text)   # whitelist chars
    text = re.sub(r"\s+", " ", text)                    # collapse spaces
    return text
43
+
44
+
45
def get_text(text, hps):
    """Convert raw text into a 1-D LongTensor of symbol ids.

    Runs the shared cleaner, applies the model's configured text cleaners,
    and optionally intersperses blank (0) tokens when the config asks for it.
    """
    sequence = text_to_sequence(clean_text_for_vits(text), hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
51
+
52
+
53
def split_sentences(text):
    """Split cleaned text into sentences at terminal punctuation (., !, ?)."""
    cleaned = clean_text_for_vits(text)
    return re.split(r'(?<=[.!?])\s+', cleaned) if cleaned else []
58
+
59
+
60
print("πŸ”„ Loading Sonya TTS Model...")

# Resolve weight/config paths: prefer the local checkpoints directory,
# otherwise download the published artifacts from the Hugging Face Hub
# (hf_hub_download caches and returns a local file path).
if os.path.exists(LOCAL_MODEL_PATH) and os.path.exists(LOCAL_CONFIG_PATH):
    print("βœ… Loading Sonya TTS from local checkpoints...")
    model_path = LOCAL_MODEL_PATH
    config_path = LOCAL_CONFIG_PATH
else:
    print("🌍 Downloading Sonya TTS from Hugging Face...")
    model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
    config_path = hf_hub_download(repo_id=REPO_ID, filename=CONFIG_FILENAME)

# Hyperparameters drive both network construction and audio settings
# (sampling rate, hop length, cleaners) used throughout this module.
hps = utils.get_hparams_from_file(config_path)

net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model
).to(device)

# Inference only: freeze batch-norm/dropout behaviour.
net_g.eval()

# NOTE(review): load_file() is called without device=, so tensors load on
# CPU and load_state_dict copies them into the already-on-device module —
# confirm this is intentional vs. the earlier load_file(..., device=device).
state_dict = load_file(model_path)
net_g.load_state_dict(state_dict)
print("πŸŽ‰ Sonya TTS loaded successfully!")
85
+
86
+
87
def infer_short(text, noise_scale, noise_scale_w, length_scale):
    """Synthesize one utterance.

    Returns (sampling_rate, waveform ndarray) suitable for a numpy-typed
    Gradio Audio output, or None when the input is blank.
    """
    if not text.strip():
        return None

    tokens = get_text(text, hps)

    with torch.no_grad():
        batch = tokens.to(device).unsqueeze(0)
        lengths = torch.LongTensor([tokens.size(0)]).to(device)

        # infer() returns (audio, attn, ...); take the first waveform and
        # move it to host memory as float32.
        waveform = net_g.infer(
            batch,
            lengths,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )[0][0, 0].data.cpu().float().numpy()

    return (hps.data.sampling_rate, waveform)
106
+
107
+
108
def infer_long(text, length_scale, noise_scale):
    """Audiobook mode: synthesize text sentence-by-sentence.

    Each sentence is generated independently and separated by a short
    silence whose length depends on the terminal punctuation (questions
    pause longest).  Returns (sampling_rate, waveform) or None when there
    is nothing to synthesize.
    """
    if not text.strip():
        return None

    sentences = split_sentences(text)
    audio_chunks = []

    fixed_noise_w = 0.6   # keep rhythm stable across sentences
    base_pause = 0.3      # baseline inter-sentence silence, seconds

    for sent in sentences:
        # Skip fragments too short to phonemize meaningfully.
        if len(sent.strip()) < 2:
            continue

        stn_tst = get_text(sent, hps)
        with torch.no_grad():
            x_tst = stn_tst.to(device).unsqueeze(0)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)

            audio = net_g.infer(
                x_tst,
                x_tst_lengths,
                noise_scale=noise_scale,
                noise_scale_w=fixed_noise_w,
                length_scale=length_scale
            )[0][0, 0].data.cpu().float().numpy()

        # Punctuation-aware pacing: longer pauses after ? and !.
        if sent.endswith("?"):
            pause_dur = base_pause + 0.2
        elif sent.endswith("!"):
            pause_dur = base_pause + 0.1
        else:
            pause_dur = base_pause

        # Match the audio dtype (float32) so the concatenated waveform is
        # not silently upcast to float64 by the default np.zeros dtype.
        silence = np.zeros(int(hps.data.sampling_rate * pause_dur), dtype=audio.dtype)

        audio_chunks.append(audio)
        audio_chunks.append(silence)

    # Bug fix: if every sentence was skipped (e.g. "a. b. c."),
    # np.concatenate([]) would raise ValueError — return None instead.
    if not audio_chunks:
        return None

    final_audio = np.concatenate(audio_chunks)
    return (hps.data.sampling_rate, final_audio)
149
+
150
+
151
# Pink/rose Gradio Soft theme with gradient primary buttons.
theme = gr.themes.Soft(
    primary_hue="pink",
    secondary_hue="rose",
    neutral_hue="slate"
).set(
    button_primary_background_fill="linear-gradient(90deg, #ff69b4, #ff1493)",
    button_primary_background_fill_hover="linear-gradient(90deg, #ff1493, #c71585)",
    button_primary_text_color="white",
)
160
+
161
# Custom CSS injected into gr.Blocks: responsive banner image, centered
# title/subtitle styling, and hides the default Gradio footer.
custom_css = """
.banner-container {
    width: 100%;
    max-width: 100%;
    margin: 0 auto 20px auto;
    display: flex;
    justify-content: center;
    align-items: center;
}

.banner-container img {
    width: 100%;
    max-width: 1800px;
    max-height: 120px;
    height: auto;
    object-fit: scale-down;
    object-position: center;
    border-radius: 8px;
}

.main-title {
    text-align: center;
    color: #ff1493;
    font-size: 2em;
    font-weight: 700;
    margin: 15px 0 8px 0;
}

.subtitle {
    text-align: center;
    color: white;
    font-size: 1.1em;
    margin-bottom: 25px;
    font-weight: 400;
}

footer {
    display: none !important;
}
"""
201
+
202
+
203
# UI layout: banner + title, then two tabs — Studio Mode (single utterance
# with full slider control) and Audiobook Mode (long-form narration).
with gr.Blocks(theme=theme, css=custom_css, title="Sonya TTS") as app:

    # Optional banner, rendered only when logo.png ships alongside the app.
    with gr.Row(elem_classes="banner-container"):
        if os.path.exists("logo.png"):
            gr.Image("logo.png", show_label=False, container=False, elem_classes="banner-img")

    gr.HTML("""
    <h1 class="main-title">✨ Sonya TTS β€” A Beautiful, Expressive Neural Voice Engine</h1>
    <p class="subtitle">High-fidelity AI speech with emotion, rhythm, and audiobook mode</p>
    """)

    with gr.Tabs():

        # --- Tab 1: single-utterance synthesis ---
        with gr.TabItem("πŸŽ›οΈ Studio Mode"):
            with gr.Row():
                with gr.Column(scale=2):
                    inp_short = gr.Textbox(
                        label="πŸ’¬ Input Text",
                        placeholder="Type something for Sonya to say...",
                        lines=4,
                        value="Hello! I am Sonya, your AI voice."
                    )

                    # Sliders map 1:1 onto infer_short's sampling knobs.
                    with gr.Accordion("βš™οΈ Voice Controls", open=True):
                        slider_ns = gr.Slider(0.1, 1.0, value=0.4, label="🎭 Emotion", info="Higher = more expressive")
                        slider_nsw = gr.Slider(0.1, 1.0, value=0.5, label="🎡 Rhythm", info="Higher = looser timing")
                        slider_ls = gr.Slider(0.5, 1.5, value=0.97, label="⏱ Speed", info="Lower = faster, Higher = slower")

                    btn_short = gr.Button("✨ Generate Voice", variant="primary", size="lg")

                with gr.Column(scale=1):
                    out_short = gr.Audio(label="πŸ”Š Sonya's Voice", type="numpy")

            btn_short.click(
                infer_short,
                inputs=[inp_short, slider_ns, slider_nsw, slider_ls],
                outputs=[out_short]
            )

        # --- Tab 2: long-form narration with sentence-level pauses ---
        with gr.TabItem("πŸ“– Audiobook Mode"):
            gr.Markdown(
                """<p style='text-align: center; color: #666; font-size: 1.05em;'>
                Paste long text. Sonya will read it beautifully with natural pauses.
                </p>""",
                elem_classes="audiobook-description"
            )

            with gr.Row():
                with gr.Column(scale=2):
                    inp_long = gr.Textbox(
                        label="πŸ“œ Long Text Input",
                        placeholder="Paste your story or article here...",
                        lines=10
                    )

                    # Rhythm is fixed inside infer_long; only speed and
                    # tone variation are exposed for narration.
                    with gr.Accordion("βš™οΈ Narration Settings", open=False):
                        long_ls = gr.Slider(0.5, 1.5, value=1.0, label="⏱ Reading Speed")
                        long_ns = gr.Slider(0.1, 1.0, value=0.5, label="🎭 Tone Variation")

                    btn_long = gr.Button("🎧 Read Aloud", variant="primary", size="lg")

                with gr.Column(scale=1):
                    out_long = gr.Audio(label="πŸ“’ Full Narration", type="numpy")

            btn_long.click(
                infer_long,
                inputs=[inp_long, long_ls, long_ns],
                outputs=[out_long]
            )

if __name__ == "__main__":
    app.launch()