jkorstad committed on
Commit
620fd78
·
1 Parent(s): 0eaa943

Add ZeroGPU spaces.GPU decorators and local fallback for GPU functions

Browse files
__pycache__/app.cpython-311.pyc ADDED
Binary file (27.7 kB). View file
 
__pycache__/backend.cpython-311.pyc ADDED
Binary file (26.5 kB). View file
 
app.py CHANGED
@@ -4,14 +4,29 @@ High-fidelity audiobook generator with character voice mapping.
4
  """
5
 
6
  import os
7
- import json
8
  from pathlib import Path
9
  from typing import Dict, List, Optional
10
 
11
  import gradio as gr
12
  import numpy as np
13
- import soundfile as sf
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  from backend import (
16
  AudiobookPipeline,
17
  VoiceConfig,
@@ -144,27 +159,12 @@ def extract_chars(text: str, use_ai: bool) -> tuple:
144
  return chars, status
145
 
146
 
147
- def _build_char_dict(
148
- names, descs, modes, presets, audios, ref_texts, designs, instructs, langs
149
- ) -> List[Dict]:
150
- chars = []
151
- for i in range(8):
152
- if names[i]:
153
- chars.append({
154
- "name": names[i],
155
- "description": descs[i] or "",
156
- "voice_mode": modes[i],
157
- "voice_preset": presets[i] if modes[i] == "preset" else None,
158
- "voice_ref_audio": audios[i] if modes[i] == "clone" else None,
159
- "voice_ref_text": ref_texts[i] if modes[i] == "clone" else None,
160
- "voice_design_desc": designs[i] if modes[i] == "design" else None,
161
- "voice_instruct": instructs[i] or "",
162
- "language": langs[i],
163
- })
164
- return chars
165
-
166
-
167
- def generate_audiobook(
168
  text,
169
  nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang,
170
  gen_temp, gen_seed,
@@ -225,7 +225,8 @@ def generate_audiobook(
225
  return None, f"Error: {str(e)}"
226
 
227
 
228
- def preview_narrator(mode, preset, audio, ref_text, design, instruct, lang):
 
229
  pipe = get_pipeline()
230
  vc = VoiceConfig(
231
  name="Narrator",
@@ -307,9 +308,6 @@ def build_app():
307
 
308
  extract_status = gr.Textbox(label="Status", interactive=False)
309
 
310
- # Hidden states to hold character data
311
- char_state = gr.State(value=[])
312
-
313
  # ==================== TAB 2 ====================
314
  with gr.TabItem("🎭 Voice Cast"):
315
  with gr.Row():
@@ -361,7 +359,7 @@ def build_app():
361
  outputs=[nar_preset, nar_audio, nar_ref_text, nar_design],
362
  )
363
  nar_preview_btn.click(
364
- preview_narrator,
365
  inputs=[nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang],
366
  outputs=[nar_preview_audio, nar_preview_status],
367
  )
@@ -370,7 +368,6 @@ def build_app():
370
  gr.Markdown("## Character Voices")
371
  gr.Markdown("Configure up to 8 characters. Use **preset** for built-in speakers, **clone** to upload a voice sample, or **design** to describe a voice from text.")
372
 
373
- # Dynamic character rows — we'll create 8 static rows and toggle visibility
374
  char_names = []
375
  char_descs = []
376
  char_modes = []
@@ -434,24 +431,19 @@ def build_app():
434
  gr.Markdown("""
435
  ## AudioBook Forge
436
 
437
- **Model-agnostic, high-fidelity audiobook generation** using state-of-the-art open TTS.
438
-
439
- ### Current Backend: Qwen3-TTS
440
- - **1.7B CustomVoice** — 9 premium preset speakers with style control
441
- - **1.7B Base** — High-quality voice cloning from 3–10 second samples
442
- - **1.7B VoiceDesign** — Create voices from text descriptions
443
- - **10 languages** supported
444
- - **Apache 2.0** license — commercially usable
445
 
446
- ### Workflow
447
- 1. **Paste your story** in the Story Setup tab.
448
- 2. **Extract characters** automatically or define them manually.
449
- 3. **Assign voices** — choose presets, upload samples for cloning, or describe voices.
450
- 4. **Generate** — the engine detects narration vs dialogue and routes each segment to the right voice.
451
- 5. **Download** your finished audiobook as MP3.
452
 
453
- ### Architecture
454
- The TTS engine is fully model-agnostic. Swapping to a future SOTA model only requires updating the backend adapter.
 
 
 
 
 
 
 
455
 
456
  ### Tips for Best Quality
457
  - Use clean, noise-free voice samples for cloning (3–10 seconds).
@@ -463,19 +455,18 @@ def build_app():
463
  # ---------- Extract wiring ----------
464
  def do_extract(text, use_ai):
465
  chars, status = extract_chars(text, use_ai)
466
- # Build visibility updates
467
  updates = []
468
  for i in range(8):
469
  if i < len(chars):
470
  updates.extend([
471
- gr.update(visible=True), # row
472
  gr.update(value=chars[i].get("name", ""), visible=True),
473
  gr.update(value=chars[i].get("description", ""), visible=True),
474
  gr.update(value=chars[i].get("voice_mode", "preset"), visible=True),
475
  gr.update(value=chars[i].get("voice_preset", "Ryan"), visible=True),
476
- gr.update(visible=False), # audio
477
- gr.update(visible=False), # ref text
478
- gr.update(visible=False), # design
479
  gr.update(value=chars[i].get("voice_instruct", ""), visible=True),
480
  gr.update(value=chars[i].get("language", "English"), visible=True),
481
  ])
@@ -513,7 +504,7 @@ def build_app():
513
  )
514
 
515
  gen_btn.click(
516
- generate_audiobook,
517
  inputs=[
518
  story_input,
519
  nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang,
 
4
  """
5
 
6
  import os
 
7
  from pathlib import Path
8
  from typing import Dict, List, Optional
9
 
10
  import gradio as gr
11
  import numpy as np
 
12
 
13
+ # ---------------------------------------------------------------------------
14
+ # spaces / ZeroGPU compatibility
15
+ # ---------------------------------------------------------------------------
16
+ try:
17
+ import spaces
18
+ except ImportError:
19
+ class _SpacesGPU:
20
+ def __init__(self, duration=60):
21
+ self.duration = duration
22
+ def __call__(self, fn):
23
+ return fn
24
+ class spaces:
25
+ GPU = _SpacesGPU
26
+
27
+ # ---------------------------------------------------------------------------
28
+ # Backend imports
29
+ # ---------------------------------------------------------------------------
30
  from backend import (
31
  AudiobookPipeline,
32
  VoiceConfig,
 
159
  return chars, status
160
 
161
 
162
+ # ---------------------------------------------------------------------------
163
+ # GPU-wrapped functions (ZeroGPU)
164
+ # ---------------------------------------------------------------------------
165
+
166
+ @spaces.GPU(duration=180)
167
+ def generate_audiobook_gpu(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  text,
169
  nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang,
170
  gen_temp, gen_seed,
 
225
  return None, f"Error: {str(e)}"
226
 
227
 
228
+ @spaces.GPU(duration=60)
229
+ def preview_narrator_gpu(mode, preset, audio, ref_text, design, instruct, lang):
230
  pipe = get_pipeline()
231
  vc = VoiceConfig(
232
  name="Narrator",
 
308
 
309
  extract_status = gr.Textbox(label="Status", interactive=False)
310
 
 
 
 
311
  # ==================== TAB 2 ====================
312
  with gr.TabItem("🎭 Voice Cast"):
313
  with gr.Row():
 
359
  outputs=[nar_preset, nar_audio, nar_ref_text, nar_design],
360
  )
361
  nar_preview_btn.click(
362
+ preview_narrator_gpu,
363
  inputs=[nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang],
364
  outputs=[nar_preview_audio, nar_preview_status],
365
  )
 
368
  gr.Markdown("## Character Voices")
369
  gr.Markdown("Configure up to 8 characters. Use **preset** for built-in speakers, **clone** to upload a voice sample, or **design** to describe a voice from text.")
370
 
 
371
  char_names = []
372
  char_descs = []
373
  char_modes = []
 
431
  gr.Markdown("""
432
  ## AudioBook Forge
433
 
434
+ **Model-agnostic, high-fidelity audiobook generator** powered by [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS). Create audiobooks where every character speaks with their own unique voice.
 
 
 
 
 
 
 
435
 
436
+ ## Features
 
 
 
 
 
437
 
438
+ - 🎙️ **Character Voice Mapping** — Automatically detect characters from your story and assign unique voices to each one
439
+ - 🎭 **Three Voice Modes**
440
+ - **Preset** — 9 premium built-in speakers (English, Chinese, Japanese, Korean, dialects)
441
+ - **Clone** — Upload a 3–10 second voice sample to clone any voice
442
+ - **Design** — Describe a voice in text and the AI creates it
443
+ - 📖 **Smart Text Processing** — Automatically distinguishes narration from dialogue and routes each segment to the correct voice
444
+ - 🌐 **Multilingual** β€” Supports 10 languages via Qwen3-TTS
445
+ - ⚑ **ZeroGPU** β€” Runs on Hugging Face ZeroGPU (free A100 compute)
446
+ - 🔧 **Model Agnostic** — Backend is swappable; upgrade to future SOTA TTS models without changing the UI
447
 
448
  ### Tips for Best Quality
449
  - Use clean, noise-free voice samples for cloning (3–10 seconds).
 
455
  # ---------- Extract wiring ----------
456
  def do_extract(text, use_ai):
457
  chars, status = extract_chars(text, use_ai)
 
458
  updates = []
459
  for i in range(8):
460
  if i < len(chars):
461
  updates.extend([
462
+ gr.update(visible=True),
463
  gr.update(value=chars[i].get("name", ""), visible=True),
464
  gr.update(value=chars[i].get("description", ""), visible=True),
465
  gr.update(value=chars[i].get("voice_mode", "preset"), visible=True),
466
  gr.update(value=chars[i].get("voice_preset", "Ryan"), visible=True),
467
+ gr.update(visible=False),
468
+ gr.update(visible=False),
469
+ gr.update(visible=False),
470
  gr.update(value=chars[i].get("voice_instruct", ""), visible=True),
471
  gr.update(value=chars[i].get("language", "English"), visible=True),
472
  ])
 
504
  )
505
 
506
  gen_btn.click(
507
+ generate_audiobook_gpu,
508
  inputs=[
509
  story_input,
510
  nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang,
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 
1
  gradio>=6.13.0,<7.0
2
  qwen-tts>=0.1.0
3
  torch>=2.2.0
 
1
+ spaces>=0.30.0
2
  gradio>=6.13.0,<7.0
3
  qwen-tts>=0.1.0
4
  torch>=2.2.0