sbapan41 committed · verified
Commit 1ca37ed · 1 Parent(s): 204f085

Delete app.py

Files changed (1)
  1. app.py +0 -561
app.py DELETED
@@ -1,561 +0,0 @@
- import os
- import shlex
- import subprocess
-
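- # Install GPU-only dependencies at startup (a common pattern on Hugging Face Spaces).
- # FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE tells flash-attn's installer to skip compiling
- # CUDA kernels locally; the mamba-ssm and causal-conv1d wheels below are prebuilt for
- # cu12 / torch 2.4 / cp310, matching the Space's runtime.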
- subprocess.run(
-     shlex.split("pip install flash-attn --no-build-isolation"),
-     env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-     check=True,
- )
- subprocess.run(
-     shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
-     check=True,
- )
- subprocess.run(
-     shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
-     check=True,
- )
-
- import spaces
- import torch
- import torchaudio
- import gradio as gr
- from os import getenv
-
- from qhash.model import Zonos
- from qhash.conditioning import make_cond_dict, supported_language_codes
-
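- # Eagerly load both checkpoints so switching models in the UI is instant, and
- # freeze them (no grad, eval mode) since the app only runs inference.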
- device = "cuda"
- MODEL_NAMES = ["Quantumhash/Qhash-v0.1-transformer", "Quantumhash/Qhash-v0.1-hybrid"]
- MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
- for model in MODELS.values():
-     model.requires_grad_(False).eval()
-
-
- def update_ui(model_choice):
-     """
-     Dynamically show/hide UI elements based on the model's conditioners.
-     We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
-     """
-     model = MODELS[model_choice]
-     cond_names = [c.name for c in model.prefix_conditioner.conditioners]
-     print("Conditioners in this model:", cond_names)
-
-     text_update = gr.update(visible=("espeak" in cond_names))
-     language_update = gr.update(visible=("espeak" in cond_names))
-     speaker_audio_update = gr.update(visible=("speaker" in cond_names))
-     prefix_audio_update = gr.update(visible=True)
-     emotion1_update = gr.update(visible=("emotion" in cond_names))
-     emotion2_update = gr.update(visible=("emotion" in cond_names))
-     emotion3_update = gr.update(visible=("emotion" in cond_names))
-     emotion4_update = gr.update(visible=("emotion" in cond_names))
-     emotion5_update = gr.update(visible=("emotion" in cond_names))
-     emotion6_update = gr.update(visible=("emotion" in cond_names))
-     emotion7_update = gr.update(visible=("emotion" in cond_names))
-     emotion8_update = gr.update(visible=("emotion" in cond_names))
-     vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
-     fmax_slider_update = gr.update(visible=("fmax" in cond_names))
-     pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
-     speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
-     dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
-     speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
-     unconditional_keys_update = gr.update(
-         choices=[name for name in cond_names if name not in ("espeak", "language_id")]
-     )
-
-     return (
-         text_update,
-         language_update,
-         speaker_audio_update,
-         prefix_audio_update,
-         emotion1_update,
-         emotion2_update,
-         emotion3_update,
-         emotion4_update,
-         emotion5_update,
-         emotion6_update,
-         emotion7_update,
-         emotion8_update,
-         vq_single_slider_update,
-         fmax_slider_update,
-         pitch_std_slider_update,
-         speaking_rate_slider_update,
-         dnsmos_slider_update,
-         speaker_noised_checkbox_update,
-         unconditional_keys_update,
-     )
-
-
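- # Request a ZeroGPU allocation of up to 120 seconds per call (Hugging Face Spaces).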
- @spaces.GPU(duration=120)
- def generate_audio(
-     model_choice,
-     text,
-     language,
-     speaker_audio,
-     prefix_audio,
-     e1,
-     e2,
-     e3,
-     e4,
-     e5,
-     e6,
-     e7,
-     e8,
-     vq_single,
-     fmax,
-     pitch_std,
-     speaking_rate,
-     dnsmos_ovrl,
-     speaker_noised,
-     cfg_scale,
-     min_p,
-     seed,
-     randomize_seed,
-     unconditional_keys,
-     progress=gr.Progress(),
- ):
-     """
-     Generates audio based on the provided UI parameters.
-     We do NOT use language_id or ctc_loss even if the model has them.
-     """
-     selected_model = MODELS[model_choice]
-
-     speaker_noised_bool = bool(speaker_noised)
-     fmax = float(fmax)
-     pitch_std = float(pitch_std)
-     speaking_rate = float(speaking_rate)
-     dnsmos_ovrl = float(dnsmos_ovrl)
-     cfg_scale = float(cfg_scale)
-     min_p = float(min_p)
-     seed = int(seed)
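-     # Length cap: the token rate appears to be ~86 frames per second, so 86 * 30
-     # limits generation to roughly 30 seconds of audio.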
-     max_new_tokens = 86 * 30
-
-     if randomize_seed:
-         seed = torch.randint(0, 2**32 - 1, (1,)).item()
-     torch.manual_seed(seed)
-
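-     # Optional voice cloning: embed the reference audio unless "speaker" is marked unconditional.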
-     speaker_embedding = None
-     if speaker_audio is not None and "speaker" not in unconditional_keys:
-         wav, sr = torchaudio.load(speaker_audio)
-         speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
-         speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)
-
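-     # Optional continuation: downmix the prefix to mono, resample to the codec's
-     # sampling rate, and encode it into prefix codes for the generator.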
-     audio_prefix_codes = None
-     if prefix_audio is not None:
-         wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
-         wav_prefix = wav_prefix.mean(0, keepdim=True)
-         wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
-         wav_prefix = wav_prefix.to(device, dtype=torch.float32)
-         with torch.autocast(device, dtype=torch.float32):
-             audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))
-
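-     # Eight emotion weights, in the same order as the UI sliders:
-     # happiness, sadness, disgust, fear, surprise, anger, other, neutral.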
-     emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)
-
-     vq_val = float(vq_single)
-     vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)
-
-     cond_dict = make_cond_dict(
-         text=text,
-         language=language,
-         speaker=speaker_embedding,
-         emotion=emotion_tensor,
-         vqscore_8=vq_tensor,
-         fmax=fmax,
-         pitch_std=pitch_std,
-         speaking_rate=speaking_rate,
-         dnsmos_ovrl=dnsmos_ovrl,
-         speaker_noised=speaker_noised_bool,
-         device=device,
-         unconditional_keys=unconditional_keys,
-     )
-     conditioning = selected_model.prepare_conditioning(cond_dict)
-
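-     # Rough progress estimate: assumes ~400 characters fill 30 seconds of speech,
-     # generated at ~86 steps per second.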
-     estimated_generation_duration = 30 * len(text) / 400
-     estimated_total_steps = int(estimated_generation_duration * 86)
-
-     def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
-         progress((step, estimated_total_steps))
-         return True
-
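-     # Autoregressive generation with classifier-free guidance; the callback reports
-     # each step to the Gradio progress bar.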
-     codes = selected_model.generate(
-         prefix_conditioning=conditioning,
-         audio_prefix_codes=audio_prefix_codes,
-         max_new_tokens=max_new_tokens,
-         cfg_scale=cfg_scale,
-         batch_size=1,
-         sampling_params=dict(min_p=min_p),
-         callback=update_progress,
-     )
-
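-     # Decode the generated codes back to a waveform and keep a single channel for
-     # Gradio's numpy audio output.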
-     wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
-     sr_out = selected_model.autoencoder.sampling_rate
-     if wav_out.dim() == 2 and wav_out.size(0) > 1:
-         wav_out = wav_out[0:1, :]
-     return (sr_out, wav_out.squeeze().numpy()), seed
-
-
- # Custom CSS for an animated dark background and enhanced UI
- custom_css = """
- .gradio-container {
-     background: #0C101B;
-     background-size: 400% 400%;
-     animation: gradient 15s ease infinite;
- }
-
- @keyframes gradient {
-     0% {
-         background-position: 0% 50%;
-     }
-     50% {
-         background-position: 100% 50%;
-     }
-     100% {
-         background-position: 0% 50%;
-     }
- }
-
- .container {
-     max-width: 1200px;
-     margin: 0 auto;
-     padding: 20px;
- }
-
- .panel {
-     background-color: rgba(159, 153, 96, 0.9);
-     border-radius: 16px;
-     padding: 20px;
-     box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
-     margin-bottom: 16px;
-     backdrop-filter: blur(5px);
-     transition: all 0.3s ease;
- }
-
- .panel p {
-     font-size: 1.1em;
-     color: black;
- }
- .panel:hover {
-     box-shadow: 0 6px 16px rgba(0, 0, 0, 0.12);
-     transform: translateY(-2px);
- }
-
- .title {
-     font-size: 1.2em;
-     font-weight: 600;
-     margin-bottom: 12px;
-     color: #6a3ea1;
-     border-bottom: 2px solid #f0e6ff;
-     padding-bottom: 8px;
- }
-
- .slider-container {
-     background-color: rgba(255, 255, 255, 0.5);
-     border-radius: 10px;
-     padding: 10px;
-     margin: 5px 0;
- }
-
- /* Make sliders more appealing */
- input[type=range] {
-     height: 5px;
-     appearance: none;
-     width: 100%;
-     border-radius: 3px;
-     background: linear-gradient(90deg, #9c83e0, #83b1e0);
- }
-
- .generate-button {
-     background: linear-gradient(90deg, #a673ff, #7c4dff);
-     color: white;
-     border: none;
-     border-radius: 8px;
-     padding: 12px 24px;
-     font-size: 16px;
-     font-weight: 500;
-     cursor: pointer;
-     transition: all 0.3s ease;
-     box-shadow: 0 4px 10px rgba(124, 77, 255, 0.2);
-     display: block;
-     width: 100%;
-     margin: 20px 0;
- }
-
- .generate-button:hover {
-     background: linear-gradient(90deg, #9c5eff, #6a3aff);
-     box-shadow: 0 6px 15px rgba(124, 77, 255, 0.3);
-     transform: translateY(-2px);
- }
-
- /* Tabs styling */
- .tabs {
-     display: flex;
-     border-bottom: 1px solid #e0e0e0;
-     margin-bottom: 20px;
- }
-
- .tab {
-     padding: 10px 20px;
-     cursor: pointer;
-     transition: all 0.3s ease;
-     background-color: transparent;
-     border: none;
-     color: #666;
- }
-
- .tab.active {
-     color: #7c4dff;
-     border-bottom: 3px solid #7c4dff;
-     font-weight: 600;
- }
-
- /* Emotion sliders container */
- .emotion-grid {
-     display: grid;
-     grid-template-columns: repeat(4, 1fr);
-     gap: 12px;
- }
-
- /* Header styling */
- .app-header {
-     text-align: center;
-     margin-bottom: 25px;
- }
-
- .app-header h1 {
-     font-size: 2.5em;
-     color: #6a3ea1;
-     margin-bottom: 8px;
-     font-weight: 700;
- }
-
- .app-header p {
-     font-size: 1.1em;
-     color: #6a3ea1;
-     margin-bottom: 20px;
- }
-
- /* Audio player styling */
- .audio-output {
-     margin-top: 20px;
- }
-
- /* Make output area more prominent */
- .output-container {
-     background-color: rgba(24, 82, 79, 0.85);
-     border-radius: 16px;
-     padding: 24px;
-     box-shadow: 0 8px 18px rgba(0, 0, 0, 0.1);
-     margin-top: 20px;
- }
- """
-
-
- def build_interface():
-     # Build interface with enhanced visual elements and layout
-     with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
-         # Header section
-         with gr.Column(elem_classes="app-header"):
-             gr.Markdown("# ✨ Qhash Text-to-Speech Clone ✨")
-             gr.Markdown("Create natural-sounding speech with customizable voice characteristics")
-
-         # Main content container
-         with gr.Column(elem_classes="container"):
-             # First panel - Text & Model Selection
-             with gr.Column(elem_classes="panel"):
-                 gr.Markdown("💬 Text & Model Configuration")
-                 with gr.Row():
-                     with gr.Column(scale=2):
-                         model_choice = gr.Dropdown(
-                             choices=MODEL_NAMES,
-                             value="Quantumhash/Qhash-v0.1-transformer",
-                             label="Qhash Model Type",
-                             info="Select the model variant to use.",
-                         )
-                         text = gr.Textbox(
-                             label="Text to Synthesize",
-                             value="Qhash uses eSpeak for text to phoneme conversion!",
-                             lines=4,
-                             max_length=500,
-                         )
-                         language = gr.Dropdown(
-                             choices=supported_language_codes,
-                             value="en-us",
-                             label="Language Code",
-                             info="Select a language code.",
-                         )
-                     with gr.Column(scale=1):
-                         prefix_audio = gr.Audio(
-                             value="assets/silence_100ms.wav",
-                             label="Optional Prefix Audio (continue from this audio)",
-                             type="filepath",
-                         )
-
-             # Second panel - Voice Characteristics
-             with gr.Column(elem_classes="panel"):
-                 gr.Markdown("🎤 Voice Characteristics")
-                 with gr.Row():
-                     with gr.Column(scale=1):
-                         speaker_audio = gr.Audio(
-                             label="Optional Speaker Audio (for voice cloning)",
-                             type="filepath",
-                         )
-                         speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)
-
-                     with gr.Column(scale=2):
-                         with gr.Row():
-                             with gr.Column():
-                                 dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="Voice Quality", elem_classes="slider-container")
-                                 fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Frequency Max (Hz)", elem_classes="slider-container")
-                                 vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="Voice Clarity", elem_classes="slider-container")
-                             with gr.Column():
-                                 pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Variation", elem_classes="slider-container")
-                                 speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", elem_classes="slider-container")
-
-             # Third panel - Generation Parameters
-             with gr.Column(elem_classes="panel"):
-                 gr.Markdown("⚙️ Generation Parameters")
-                 with gr.Row():
-                     with gr.Column():
-                         cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="Guidance Scale", elem_classes="slider-container")
-                         min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P (Randomness)", elem_classes="slider-container")
-                     with gr.Column():
-                         seed_number = gr.Number(label="Seed", value=420, precision=0)
-                         randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)
-
-             # Emotion Panel with Tabbed Interface
-             with gr.Accordion("🎭 Emotion Settings", open=False, elem_classes="panel"):
-                 gr.Markdown(
-                     "Adjust these sliders to control the emotional tone of the generated speech.\n"
-                     "For a neutral voice, keep 'Neutral' high and other emotions low."
-                 )
-                 with gr.Row(elem_classes="emotion-grid"):
-                     emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness", elem_classes="slider-container")
-                     emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness", elem_classes="slider-container")
-                     emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust", elem_classes="slider-container")
-                     emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear", elem_classes="slider-container")
-                 with gr.Row(elem_classes="emotion-grid"):
-                     emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise", elem_classes="slider-container")
-                     emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger", elem_classes="slider-container")
-                     emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other", elem_classes="slider-container")
-                     emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral", elem_classes="slider-container")
-
-             # Advanced Settings Panel
-             with gr.Accordion("⚡ Advanced Settings", open=False, elem_classes="panel"):
-                 gr.Markdown(
-                     "### Unconditional Toggles\n"
-                     "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
-                     'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
-                 )
-                 unconditional_keys = gr.CheckboxGroup(
-                     [
-                         "speaker",
-                         "emotion",
-                         "vqscore_8",
-                         "fmax",
-                         "pitch_std",
-                         "speaking_rate",
-                         "dnsmos_ovrl",
-                         "speaker_noised",
-                     ],
-                     value=["emotion"],
-                     label="Unconditional Keys",
-                 )
-
-             # Generate Button and Output Area
-             with gr.Column(elem_classes="panel output-container"):
-                 gr.Markdown("🔊 Generate & Output")
-                 generate_button = gr.Button("Generate Audio", elem_classes="generate-button")
-                 output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True, elem_classes="audio-output")
-
-         model_choice.change(
-             fn=update_ui,
-             inputs=[model_choice],
-             outputs=[
-                 text,
-                 language,
-                 speaker_audio,
-                 prefix_audio,
-                 emotion1,
-                 emotion2,
-                 emotion3,
-                 emotion4,
-                 emotion5,
-                 emotion6,
-                 emotion7,
-                 emotion8,
-                 vq_single_slider,
-                 fmax_slider,
-                 pitch_std_slider,
-                 speaking_rate_slider,
-                 dnsmos_slider,
-                 speaker_noised_checkbox,
-                 unconditional_keys,
-             ],
-         )
-
-         # On page load, trigger the same UI refresh
-         demo.load(
-             fn=update_ui,
-             inputs=[model_choice],
-             outputs=[
-                 text,
-                 language,
-                 speaker_audio,
-                 prefix_audio,
-                 emotion1,
-                 emotion2,
-                 emotion3,
-                 emotion4,
-                 emotion5,
-                 emotion6,
-                 emotion7,
-                 emotion8,
-                 vq_single_slider,
-                 fmax_slider,
-                 pitch_std_slider,
-                 speaking_rate_slider,
-                 dnsmos_slider,
-                 speaker_noised_checkbox,
-                 unconditional_keys,
-             ],
-         )
-
-         # Generate audio on button click
-         generate_button.click(
-             fn=generate_audio,
-             inputs=[
-                 model_choice,
-                 text,
-                 language,
-                 speaker_audio,
-                 prefix_audio,
-                 emotion1,
-                 emotion2,
-                 emotion3,
-                 emotion4,
-                 emotion5,
-                 emotion6,
-                 emotion7,
-                 emotion8,
-                 vq_single_slider,
-                 fmax_slider,
-                 pitch_std_slider,
-                 speaking_rate_slider,
-                 dnsmos_slider,
-                 speaker_noised_checkbox,
-                 cfg_scale_slider,
-                 min_p_slider,
-                 seed_number,
-                 randomize_seed_toggle,
-                 unconditional_keys,
-             ],
-             outputs=[output_audio, seed_number],
-         )
-
-     return demo
-
-
- if __name__ == "__main__":
-     demo = build_interface()
-     share = getenv("GRADIO_SHARE", "False").lower() in ("true", "1", "t")
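-     # Bind to all interfaces on port 7860; mcp_server=True additionally exposes the
-     # app's functions via Gradio's MCP server support (assumes Gradio 5.x).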
-     demo.launch(server_name="0.0.0.0", server_port=7860, share=share, mcp_server=True)