Spaces:

jkorstad
/

AudioBook

Running on Zero

App Files Community

jkorstad committed on Apr 23

Commit

620fd78

1 Parent(s): 0eaa943

Add ZeroGPU spaces.GPU decorators and local fallback for GPU functions

Browse files

Files changed (4) hide show

__pycache__/app.cpython-311.pyc +0 -0
__pycache__/backend.cpython-311.pyc +0 -0
app.py +42 -51
requirements.txt +1 -0

__pycache__/app.cpython-311.pyc ADDED Viewed

Binary file (27.7 kB). View file

__pycache__/backend.cpython-311.pyc ADDED Viewed

Binary file (26.5 kB). View file

app.py CHANGED Viewed

@@ -4,14 +4,29 @@ High-fidelity audiobook generator with character voice mapping.
 """
 import os
-import json
 from pathlib import Path
 from typing import Dict, List, Optional
 import gradio as gr
 import numpy as np
-import soundfile as sf
 from backend import (
     AudiobookPipeline,
     VoiceConfig,
@@ -144,27 +159,12 @@ def extract_chars(text: str, use_ai: bool) -> tuple:
     return chars, status
-def _build_char_dict(
-    names, descs, modes, presets, audios, ref_texts, designs, instructs, langs
-) -> List[Dict]:
-    chars = []
-    for i in range(8):
-        if names[i]:
-            chars.append({
-                "name": names[i],
-                "description": descs[i] or "",
-                "voice_mode": modes[i],
-                "voice_preset": presets[i] if modes[i] == "preset" else None,
-                "voice_ref_audio": audios[i] if modes[i] == "clone" else None,
-                "voice_ref_text": ref_texts[i] if modes[i] == "clone" else None,
-                "voice_design_desc": designs[i] if modes[i] == "design" else None,
-                "voice_instruct": instructs[i] or "",
-                "language": langs[i],
-            })
-    return chars
-def generate_audiobook(
     text,
     nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang,
     gen_temp, gen_seed,
@@ -225,7 +225,8 @@ def generate_audiobook(
         return None, f"Error: {str(e)}"
-def preview_narrator(mode, preset, audio, ref_text, design, instruct, lang):
     pipe = get_pipeline()
     vc = VoiceConfig(
         name="Narrator",
@@ -307,9 +308,6 @@ def build_app():
                 extract_status = gr.Textbox(label="Status", interactive=False)
-                # Hidden states to hold character data
-                char_state = gr.State(value=[])
             # ==================== TAB 2 ====================
             with gr.TabItem("🎭 Voice Cast"):
                 with gr.Row():
@@ -361,7 +359,7 @@ def build_app():
                                 outputs=[nar_preset, nar_audio, nar_ref_text, nar_design],
                             )
                             nar_preview_btn.click(
-                                preview_narrator,
                                 inputs=[nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang],
                                 outputs=[nar_preview_audio, nar_preview_status],
                             )
@@ -370,7 +368,6 @@ def build_app():
                         gr.Markdown("## Character Voices")
                         gr.Markdown("Configure up to 8 characters. Use **preset** for built-in speakers, **clone** to upload a voice sample, or **design** to describe a voice from text.")
-                        # Dynamic character rows — we'll create 8 static rows and toggle visibility
                         char_names = []
                         char_descs = []
                         char_modes = []
@@ -434,24 +431,19 @@ def build_app():
                 gr.Markdown("""
                 ## AudioBook Forge
-                **Model-agnostic, high-fidelity audiobook generation** using state-of-the-art open TTS.
-                ### Current Backend: Qwen3-TTS
-                - **1.7B CustomVoice** — 9 premium preset speakers with style control
-                - **1.7B Base** — High-quality voice cloning from 3–10 second samples
-                - **1.7B VoiceDesign** — Create voices from text descriptions
-                - **10 languages** supported
-                - **Apache 2.0** license — commercially usable
-                ### Workflow
-                1. **Paste your story** in the Story Setup tab.
-                2. **Extract characters** automatically or define them manually.
-                3. **Assign voices** — choose presets, upload samples for cloning, or describe voices.
-                4. **Generate** — the engine detects narration vs dialogue and routes each segment to the right voice.
-                5. **Download** your finished audiobook as MP3.
-                ### Architecture
-                The TTS engine is fully model-agnostic. Swapping to a future SOTA model only requires updating the backend adapter.
                 ### Tips for Best Quality
                 - Use clean, noise-free voice samples for cloning (3–10 seconds).
@@ -463,19 +455,18 @@ def build_app():
         # ---------- Extract wiring ----------
         def do_extract(text, use_ai):
             chars, status = extract_chars(text, use_ai)
-            # Build visibility updates
             updates = []
             for i in range(8):
                 if i < len(chars):
                     updates.extend([
-                        gr.update(visible=True),   # row
                         gr.update(value=chars[i].get("name", ""), visible=True),
                         gr.update(value=chars[i].get("description", ""), visible=True),
                         gr.update(value=chars[i].get("voice_mode", "preset"), visible=True),
                         gr.update(value=chars[i].get("voice_preset", "Ryan"), visible=True),
-                        gr.update(visible=False),   # audio
-                        gr.update(visible=False),   # ref text
-                        gr.update(visible=False),   # design
                         gr.update(value=chars[i].get("voice_instruct", ""), visible=True),
                         gr.update(value=chars[i].get("language", "English"), visible=True),
                     ])
@@ -513,7 +504,7 @@ def build_app():
         )
         gen_btn.click(
-            generate_audiobook,
             inputs=[
                 story_input,
                 nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang,

 """
 import os
 from pathlib import Path
 from typing import Dict, List, Optional
 import gradio as gr
 import numpy as np
+# ---------------------------------------------------------------------------
+# spaces / ZeroGPU compatibility
+# ---------------------------------------------------------------------------
+try:
+    import spaces  # use the installed `spaces` package when it is available
+except ImportError:  # package missing: define a local no-op stand-in so the decorators below still resolve
+    class _SpacesGPU:  # stand-in for the parameterized decorator form, e.g. spaces.GPU(duration=180)
+        def __init__(self, duration=60):
+            self.duration = duration  # stored only; nothing in this fallback reads it
+        def __call__(self, fn):
+            return fn  # hands the decorated function back unchanged (no wrapping)
+    class spaces:  # namespace shim so the `spaces.GPU` attribute lookup works without the package
+        GPU = _SpacesGPU  # NOTE(review): a bare `@spaces.GPU` (no parentheses) would pass fn in as `duration` here
+# ---------------------------------------------------------------------------
+# Backend imports
+# ---------------------------------------------------------------------------
 from backend import (
     AudiobookPipeline,
     VoiceConfig,
     return chars, status
+# ---------------------------------------------------------------------------
+# GPU-wrapped functions (ZeroGPU)
+# ---------------------------------------------------------------------------
+@spaces.GPU(duration=180)
+def generate_audiobook_gpu(
     text,
     nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang,
     gen_temp, gen_seed,
         return None, f"Error: {str(e)}"
+@spaces.GPU(duration=60)
+def preview_narrator_gpu(mode, preset, audio, ref_text, design, instruct, lang):
     pipe = get_pipeline()
     vc = VoiceConfig(
         name="Narrator",
                 extract_status = gr.Textbox(label="Status", interactive=False)
             # ==================== TAB 2 ====================
             with gr.TabItem("🎭 Voice Cast"):
                 with gr.Row():
                                 outputs=[nar_preset, nar_audio, nar_ref_text, nar_design],
                             )
                             nar_preview_btn.click(
+                                preview_narrator_gpu,
                                 inputs=[nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang],
                                 outputs=[nar_preview_audio, nar_preview_status],
                             )
                         gr.Markdown("## Character Voices")
                         gr.Markdown("Configure up to 8 characters. Use **preset** for built-in speakers, **clone** to upload a voice sample, or **design** to describe a voice from text.")
                         char_names = []
                         char_descs = []
                         char_modes = []
                 gr.Markdown("""
                 ## AudioBook Forge
+                **Model-agnostic, high-fidelity audiobook generator** powered by [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS). Create audiobooks where every character speaks with their own unique voice.
+                ## Features
+                - 🎙️ **Character Voice Mapping** — Automatically detect characters from your story and assign unique voices to each one
+                - 🎭 **Three Voice Modes**
+                  - **Preset** — 9 premium built-in speakers (English, Chinese, Japanese, Korean, dialects)
+                  - **Clone** — Upload a 3–10 second voice sample to clone any voice
+                  - **Design** — Describe a voice in text and the AI creates it
+                - 📖 **Smart Text Processing** — Automatically distinguishes narration from dialogue and routes each segment to the correct voice
+                - 🌐 **Multilingual** — Supports 10 languages via Qwen3-TTS
+                - ⚡ **ZeroGPU** — Runs on Hugging Face ZeroGPU (free A100 compute)
+                - 🔧 **Model Agnostic** — Backend is swappable; upgrade to future SOTA TTS models without changing the UI
                 ### Tips for Best Quality
                 - Use clean, noise-free voice samples for cloning (3–10 seconds).
         # ---------- Extract wiring ----------
         def do_extract(text, use_ai):
             chars, status = extract_chars(text, use_ai)
             updates = []
             for i in range(8):
                 if i < len(chars):
                     updates.extend([
+                        gr.update(visible=True),
                         gr.update(value=chars[i].get("name", ""), visible=True),
                         gr.update(value=chars[i].get("description", ""), visible=True),
                         gr.update(value=chars[i].get("voice_mode", "preset"), visible=True),
                         gr.update(value=chars[i].get("voice_preset", "Ryan"), visible=True),
+                        gr.update(visible=False),
+                        gr.update(visible=False),
+                        gr.update(visible=False),
                         gr.update(value=chars[i].get("voice_instruct", ""), visible=True),
                         gr.update(value=chars[i].get("language", "English"), visible=True),
                     ])
         )
         gen_btn.click(
+            generate_audiobook_gpu,
             inputs=[
                 story_input,
                 nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang,

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 gradio>=6.13.0,<7.0
 qwen-tts>=0.1.0
 torch>=2.2.0

+spaces>=0.30.0
 gradio>=6.13.0,<7.0
 qwen-tts>=0.1.0
 torch>=2.2.0