# AudioBook / app.py — Hugging Face Space (jkorstad)
# Commit 2f4164d: Remove "Use AI enhancement" toggle — always use AI for
# character extraction since Qwen3-TTS is the default engine.
"""
AudioBook Forge - Enhanced Gradio Frontend
High-fidelity audiobook generator with character voice mapping,
file upload, chapter selection, segment previews, and project save/load.
"""
import os
import json
from pathlib import Path
from typing import Dict, List, Optional
import gradio as gr
import numpy as np
# ---------------------------------------------------------------------------
# spaces / ZeroGPU compatibility
# ---------------------------------------------------------------------------
try:
    import spaces
except ImportError:
    # Not running on a Hugging Face Space: install a no-op stand-in so the
    # @spaces.GPU decorators below become pass-throughs. The real spaces.GPU
    # supports both `@spaces.GPU` and `@spaces.GPU(duration=...)`; the
    # original shim only handled the parameterized form, so bare usage would
    # have replaced the function with a _SpacesGPU instance. Both forms are
    # supported here.
    class _SpacesGPU:
        """Pass-through decorator object mimicking spaces.GPU(duration=...)."""

        def __init__(self, duration=60):
            # Kept for API parity with the real decorator; unused locally.
            self.duration = duration

        def __call__(self, fn):
            return fn

    class spaces:
        @staticmethod
        def GPU(fn=None, duration=60, **kwargs):
            """Accept both `@spaces.GPU` and `@spaces.GPU(duration=...)`."""
            if callable(fn):
                # Bare decorator usage: first positional arg is the function.
                return fn
            return _SpacesGPU(duration=duration)
# ---------------------------------------------------------------------------
# Backend imports
# ---------------------------------------------------------------------------
from backend import (
AudiobookPipeline,
VoiceConfig,
PRESET_SPEAKERS,
SAMPLE_STORIES,
save_project,
load_project,
estimate_duration,
)
# ---------------------------------------------------------------------------
# CSS & Theme
# ---------------------------------------------------------------------------
# Dark slate/indigo theme injected via gr.Blocks(css=CUSTOM_CSS). Classes
# prefixed "ab-" are referenced from the gr.HTML header markup; ".seg-item"
# and ".seg-type" style the generated-segments HTML list. The string is
# runtime data — do not reformat it.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
body, .gradio-container {
font-family: 'Inter', sans-serif !important;
background: #0f172a !important;
color: #f8fafc !important;
}
.gradio-container {
max-width: 1200px !important;
}
.ab-header {
text-align: center;
padding: 2.2rem 1rem 1.8rem;
background: linear-gradient(135deg, rgba(99,102,241,0.12) 0%, rgba(34,211,238,0.06) 100%);
border-radius: 18px;
margin-bottom: 1.5rem;
border: 1px solid rgba(99,102,241,0.18);
}
.ab-header h1 {
font-size: 2.6rem;
font-weight: 700;
margin: 0;
background: linear-gradient(90deg, #a5b4fc, #22d3ee);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}
.ab-header p {
color: #94a3b8;
margin-top: 0.6rem;
font-size: 1.05rem;
}
.ab-card {
background: #1e293b !important;
border: 1px solid #334155 !important;
border-radius: 14px !important;
padding: 1.25rem !important;
}
.ab-stat {
background: #0f172a;
border: 1px solid #334155;
border-radius: 10px;
padding: 0.75rem 1rem;
text-align: center;
}
.ab-stat .value {
font-size: 1.4rem;
font-weight: 700;
color: #22d3ee;
}
.ab-stat .label {
font-size: 0.75rem;
color: #94a3b8;
text-transform: uppercase;
letter-spacing: 0.05em;
}
button.primary {
background: linear-gradient(135deg, #6366f1, #4f46e5) !important;
border: none !important;
border-radius: 10px !important;
font-weight: 600 !important;
transition: all 0.2s ease !important;
}
button.primary:hover {
transform: translateY(-1px);
box-shadow: 0 4px 14px rgba(99,102,241,0.4) !important;
}
button.secondary {
background: #334155 !important;
border: 1px solid #475569 !important;
border-radius: 10px !important;
color: #f8fafc !important;
}
input, textarea, select {
background: #0f172a !important;
border: 1px solid #334155 !important;
border-radius: 8px !important;
color: #f8fafc !important;
}
input:focus, textarea:focus, select:focus {
border-color: #6366f1 !important;
box-shadow: 0 0 0 3px rgba(99,102,241,0.15) !important;
}
.gr-box, .gr-form {
background: #1e293b !important;
border-color: #334155 !important;
}
.gr-panel {
background: #1e293b !important;
}
.tabitem {
background: #1e293b !important;
border-color: #334155 !important;
}
input[type="checkbox"] + label,
.checkbox-label,
.gr-checkbox label {
color: #f8fafc !important;
}
/* Gradio 5+ checkbox checked state - make it clearly visible in dark theme */
.gr-checkbox input[type="checkbox"]:checked + label,
.gr-checkbox-checked label,
.gr-checkbox-input:checked + .gr-checkbox-border,
.gr-checkbox-input:checked + label .gr-checkbox-border,
input[type="checkbox"]:checked + label span {
background: #6366f1 !important;
border-color: #818cf8 !important;
box-shadow: 0 0 0 3px rgba(99,102,241,0.35) !important;
}
.gr-checkbox input[type="checkbox"]:checked + label::after,
.gr-checkbox-input:checked + label::after {
border-color: #ffffff !important;
}
.gr-checkbox {
color: #f8fafc !important;
}
.gr-checkbox-input:checked + * {
background: #6366f1 !important;
border-color: #818cf8 !important;
}
li, .prose li, .gr-prose li {
color: #cbd5e1 !important;
}
strong, b {
color: #f8fafc !important;
}
code {
background: #334155 !important;
color: #22d3ee !important;
padding: 0.1rem 0.3rem !important;
border-radius: 4px !important;
}
progress {
width: 100%;
height: 8px;
border-radius: 4px;
background: #334155;
}
progress::-webkit-progress-bar {
background: #334155;
border-radius: 4px;
}
progress::-webkit-progress-value {
background: linear-gradient(90deg, #6366f1, #22d3ee);
border-radius: 4px;
}
.seg-item {
background: #0f172a;
border: 1px solid #334155;
border-radius: 8px;
padding: 0.5rem 0.75rem;
margin-bottom: 0.4rem;
font-size: 0.85rem;
}
.seg-item .seg-type {
display: inline-block;
padding: 0.1rem 0.4rem;
border-radius: 4px;
font-size: 0.7rem;
font-weight: 600;
text-transform: uppercase;
}
.seg-type.narration { background: #4f46e5; color: #fff; }
.seg-type.dialogue { background: #22d3ee; color: #0f172a; }
"""
# ---------------------------------------------------------------------------
# Global State
# ---------------------------------------------------------------------------
# Process-wide pipeline singleton, created on first use.
_pipeline: Optional[AudiobookPipeline] = None


def get_pipeline() -> AudiobookPipeline:
    """Return the shared AudiobookPipeline, instantiating it lazily."""
    global _pipeline
    if _pipeline is not None:
        return _pipeline
    _pipeline = AudiobookPipeline()
    return _pipeline
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def on_mode_change(mode: str) -> tuple:
    """Toggle the four mode-specific inputs (preset, audio, ref-text, design).

    Returns four gr.update objects in that fixed order; any mode other
    than "preset" or "clone" is treated as "design".
    """
    visibility = {
        "preset": (True, False, False, False),
        "clone": (False, True, True, False),
    }.get(mode, (False, False, False, True))
    return tuple(gr.update(visible=flag) for flag in visibility)
def update_stats(text: str) -> tuple:
    """Recompute the word count and estimated duration for the stats panel."""
    word_count = len(text.split()) if text else 0
    return str(word_count), estimate_duration(word_count)
def handle_upload(file_obj) -> tuple:
    """Parse an uploaded document into cleaned story text plus a status line."""
    if file_obj is None:
        return "", "No file uploaded."
    try:
        pipe = get_pipeline()
        raw_text, fname = pipe.parse_upload(file_obj)
        text = pipe.processor.clean_text(raw_text)
        chapters = pipe.detect_chapters(text)
        # Summarize at most the first five detected chapters.
        chapter_summary = " | ".join(
            f"Ch{ch['idx']+1}: {ch['word_count']}w" for ch in chapters[:5]
        )
        if len(chapters) > 5:
            chapter_summary += f" (+{len(chapters)-5} more)"
        words = len(text.split())
        est = estimate_duration(words)
        detail = chapter_summary if chapters else '1 section'
        return text, f"Loaded {fname} — {words} words (~{est}) | {detail}"
    except Exception as e:
        return "", f"Error: {e}"
def extract_chars(text: str) -> tuple:
    """Run AI character extraction and build a status message.

    Returns (characters, status) where characters is a list of dicts.
    """
    if not text or len(text.strip()) < 20:
        return [], "Text too short. Please paste at least a paragraph."
    characters = get_pipeline().extract_characters(text, use_ai=True)
    if characters:
        names = ', '.join(c['name'] for c in characters)
        status = f"Found {len(characters)} characters: {names}"
    else:
        status = "No characters auto-detected. Add them manually below."
    return characters, status
def get_chapter_text(text: str, chapter_sel: str) -> str:
    """Return the selected chapter's text; fall back to the full text.

    `chapter_sel` looks like "Ch3: Title"; "All", empty, or unparseable
    selections yield the whole text.
    """
    if not text or not chapter_sel or chapter_sel == "All":
        return text
    try:
        chapter_idx = int(chapter_sel.split(":")[0].replace("Ch", "")) - 1
        return get_pipeline().get_chapter_text(text, chapter_idx)
    except Exception:
        # Any parsing/pipeline failure degrades gracefully to the full text.
        return text
# ---------------------------------------------------------------------------
# GPU-wrapped functions (ZeroGPU)
# ---------------------------------------------------------------------------
@spaces.GPU(duration=180)
def generate_audiobook_gpu(
    text,
    nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
    gen_temp, gen_seed, output_fmt, *args
):
    """Generate the full multi-voice audiobook on the GPU worker.

    Returns a 5-tuple matching the Generate tab outputs:
    (audio_path, download_path, segments_html, status, progress_text).
    `args` carries the 8 character rows flattened field-by-field
    (80 values; see the slicing below for the field order).
    """
    if not text or len(text.strip()) < 50:
        return None, None, "", "Error: Please provide at least 50 characters of story text.", ""
    wc = len(text.split())
    if wc > 5000:
        # Warn only; generation still proceeds for long texts.
        print(f"[WARN] Long text: {wc} words. Generation may take a while or hit timeouts.")
    # Unpack character args (80 values = 8 chars x 10 fields)
    names = list(args[0:8])
    descs = list(args[8:16])  # collected for project saves; unused for synthesis here
    modes = list(args[16:24])
    presets = list(args[24:32])
    audios = list(args[32:40])
    ref_texts = list(args[40:48])
    designs = list(args[48:56])
    instructs = list(args[56:64])
    langs = list(args[64:72])
    speeds = list(args[72:80])
    pipe = get_pipeline()
    # Narrator config: only the fields relevant to the selected mode are set.
    nar_cfg = VoiceConfig(
        name="Narrator",
        mode=nar_mode,
        preset=nar_preset if nar_mode == "preset" else None,
        ref_audio=nar_audio if nar_mode == "clone" and nar_audio else None,
        ref_text=nar_ref_text if nar_mode == "clone" else None,
        design_desc=nar_design if nar_mode == "design" else None,
        instruct=nar_instruct,
        language=nar_lang,
        speed=float(nar_speed) if nar_speed else 1.0,
    )
    char_configs = {}
    for i in range(8):
        if not names[i]:
            # An empty name means the character slot is unused.
            continue
        vc = VoiceConfig(
            name=names[i],
            mode=modes[i],
            preset=presets[i] if modes[i] == "preset" else None,
            ref_audio=audios[i] if modes[i] == "clone" and audios[i] else None,
            ref_text=ref_texts[i] if modes[i] == "clone" else None,
            design_desc=designs[i] if modes[i] == "design" else None,
            instruct=instructs[i] or "",
            language=langs[i],
            speed=float(speeds[i]) if speeds[i] else 1.0,
        )
        char_configs[names[i]] = vc
    progress_text = ""
    def prog_cb(ratio: float, msg: str):
        # Progress goes to server logs; the last message is also returned to the UI.
        nonlocal progress_text
        progress_text = f"[{ratio*100:.0f}%] {msg}"
        print(progress_text)
    try:
        output_path, seg_paths, seg_meta = pipe.generate(
            text=text,
            narrator_config=nar_cfg,
            character_configs=char_configs,
            progress_callback=prog_cb,
            temperature=gen_temp,
            seed=int(gen_seed),
        )
        # Render at most 50 segments into the scrollable HTML segment list.
        seg_html = "<div style='max-height: 300px; overflow-y: auto;'>"
        for s in seg_meta[:50]:
            tclass = "narration" if s['type'] == 'narration' else "dialogue"
            seg_html += f"<div class='seg-item'><span class='seg-type {tclass}'>{s['type']}</span> <strong>{s['speaker']}</strong>: {s['text']}</div>"
        if len(seg_meta) > 50:
            seg_html += f"<div style='text-align:center;color:#94a3b8;padding:0.5rem;'>... and {len(seg_meta)-50} more segments</div>"
        seg_html += "</div>"
        # Optional re-export: WAV alongside the default MP3, or a ZIP of segments.
        extra_path = None
        if output_fmt == "wav":
            extra_path = output_path.replace(".mp3", ".wav")
            from backend import save_audiobook
            save_audiobook(seg_paths, extra_path, fmt="wav")
        elif output_fmt == "zip":
            extra_path = pipe.export_segments_zip(seg_paths)
        final_path = extra_path if extra_path else output_path
        return final_path, final_path, seg_html, f"Done! {len(seg_meta)} segments generated.", progress_text
    except Exception as e:
        import traceback
        traceback.print_exc()
        # Keep the 5-tuple arity on failure so Gradio outputs stay consistent.
        return None, None, "", f"Error: {str(e)}", progress_text
@spaces.GPU(duration=60)
def preview_narrator_gpu(mode, preset, audio, ref_text, design, instruct, lang, speed):
    """Synthesize a short narrator sample.

    Returns ((sample_rate, waveform), status) on success, or
    (None, error message) on failure.
    """
    pipe = get_pipeline()
    cfg = VoiceConfig(
        name="Narrator",
        mode=mode,
        preset=preset if mode == "preset" else None,
        ref_audio=audio if (mode == "clone" and audio) else None,
        ref_text=ref_text if mode == "clone" else None,
        design_desc=design if mode == "design" else None,
        instruct=instruct,
        language=lang,
        speed=float(speed) if speed else 1.0,
    )
    try:
        wav, sr = pipe.preview_voice(cfg)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Preview failed: {e}"
    return (sr, wav), "Preview ready!"
@spaces.GPU(duration=60)
def preview_char_voice_gpu(name, mode, preset, audio, ref_text, design, instruct, lang, speed):
    """Render a short sample line in the configured character voice.

    Returns ((sample_rate, waveform), status) on success, or
    (None, error message) on failure.
    """
    pipe = get_pipeline()
    cfg = VoiceConfig(
        name=name or "Character",
        mode=mode,
        preset=preset if mode == "preset" else None,
        ref_audio=audio if (mode == "clone" and audio) else None,
        ref_text=ref_text if mode == "clone" else None,
        design_desc=design if mode == "design" else None,
        instruct=instruct,
        language=lang,
        speed=float(speed) if speed else 1.0,
    )
    sample = f"Hello, I am {name or 'your character'}. This is how I sound in the story."
    try:
        wav, sr = pipe.preview_voice(cfg, sample_text=sample)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Preview failed: {e}"
    return (sr, wav), f"{name or 'Character'} preview ready!"
# ---------------------------------------------------------------------------
# Quick Generate
# ---------------------------------------------------------------------------
@spaces.GPU(duration=180)
def quick_generate_gpu(text, mode, preset, audio, ref_text, design, instruct, lang, speed, gen_temp, output_fmt, gen_seed=42):
    """One-click generation using a single narrator voice for the whole text.

    Returns (audio_path, download_path, status) for the Quick Generate
    outputs; on failure the paths are None.
    """
    if not text or len(text.strip()) < 50:
        return None, None, "Error: Text too short."
    wc = len(text.split())
    if wc > 5000:
        # Warn only; generation still proceeds for long texts.
        print(f"[WARN] Long text: {wc} words. Quick Generate may take a while or hit timeouts.")
    pipe = get_pipeline()
    # Single narrator voice; no per-character configs in quick mode.
    nar_cfg = VoiceConfig(
        name="Narrator",
        mode=mode,
        preset=preset if mode == "preset" else None,
        ref_audio=audio if mode == "clone" and audio else None,
        ref_text=ref_text if mode == "clone" else None,
        design_desc=design if mode == "design" else None,
        instruct=instruct or "Narrate clearly and expressively.",
        language=lang,
        speed=float(speed) if speed else 1.0,
    )
    def prog_cb(ratio: float, msg: str):
        # Progress is only surfaced in the server logs for quick mode.
        print(f"[{ratio*100:.0f}%] {msg}")
    try:
        output_path, seg_paths, seg_meta = pipe.generate(
            text=text,
            narrator_config=nar_cfg,
            character_configs={},
            progress_callback=prog_cb,
            temperature=gen_temp,
            seed=int(gen_seed),
        )
        # Optional re-export: WAV alongside the default MP3, or a ZIP of segments.
        extra_path = None
        if output_fmt == "wav":
            extra_path = output_path.replace(".mp3", ".wav")
            from backend import save_audiobook
            save_audiobook(seg_paths, extra_path, fmt="wav")
        elif output_fmt == "zip":
            extra_path = pipe.export_segments_zip(seg_paths)
        final_path = extra_path if extra_path else output_path
        return final_path, final_path, f"Quick audiobook ready! {len(seg_meta)} segments."
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, f"Error: {str(e)}"
# ---------------------------------------------------------------------------
# Project Save/Load
# ---------------------------------------------------------------------------
def do_save_project(text, nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed, *args):
    """Serialize text + narrator + character voice configs to project JSON.

    `args` holds the 80 flattened character fields followed by gen_temp
    and gen_seed (defaults apply if those are missing). Returns the JSON
    string produced by backend.save_project.
    """
    # Unpack character args (80 values) + gen_temp + gen_seed
    names = list(args[0:8])
    descs = list(args[8:16])
    modes = list(args[16:24])
    presets = list(args[24:32])
    audios = list(args[32:40])
    ref_texts = list(args[40:48])
    designs = list(args[48:56])
    instructs = list(args[56:64])
    langs = list(args[64:72])
    speeds = list(args[72:80])
    # Defaults if the generation settings were not wired in.
    gen_temp = args[80] if len(args) > 80 else 0.7
    gen_seed = args[81] if len(args) > 81 else 42
    nar_cfg = VoiceConfig(
        name="Narrator", mode=nar_mode, preset=nar_preset if nar_mode == "preset" else None,
        ref_audio=nar_audio if nar_mode == "clone" and nar_audio else None,
        ref_text=nar_ref_text if nar_mode == "clone" else None,
        design_desc=nar_design if nar_mode == "design" else None,
        instruct=nar_instruct, language=nar_lang,
        speed=float(nar_speed) if nar_speed else 1.0,
    )
    char_configs = {}
    for i in range(8):
        if not names[i]:
            # An empty name means the character slot is unused.
            continue
        char_configs[names[i]] = VoiceConfig(
            name=names[i], mode=modes[i], description=descs[i] or "",
            preset=presets[i] if modes[i] == "preset" else None,
            ref_audio=audios[i] if modes[i] == "clone" and audios[i] else None,
            ref_text=ref_texts[i] if modes[i] == "clone" else None,
            design_desc=designs[i] if modes[i] == "design" else None,
            instruct=instructs[i] or "", language=langs[i],
            speed=float(speeds[i]) if speeds[i] else 1.0,
        )
    settings = {"temperature": gen_temp, "seed": int(gen_seed)}
    json_str = save_project(text, nar_cfg, char_configs, settings)
    return json_str
def do_load_project(json_str):
    """Restore narrator and character configuration from pasted project JSON.

    Returns a flat list of 122 values: [story_text] + 8 narrator component
    updates + 112 character component updates (8 slots x 14 components)
    + [status message]. The order must match the `load_outputs` wiring.
    """
    try:
        data = load_project(json_str)
        nar = data["narrator"]
        chars = data.get("characters", {})
        # Narrator updates: mode-specific inputs are shown only for the
        # loaded mode (preset / clone / design).
        nar_updates = [
            gr.update(value=nar.mode),
            gr.update(value=nar.preset if nar.preset else "Ryan", visible=nar.mode=="preset"),
            gr.update(value=nar.ref_audio, visible=nar.mode=="clone"),
            gr.update(value=nar.ref_text, visible=nar.mode=="clone"),
            gr.update(value=nar.design_desc, visible=nar.mode=="design"),
            gr.update(value=nar.instruct),
            gr.update(value=nar.language),
            gr.update(value=nar.speed),
        ]
        char_updates = []
        char_items = list(chars.items())[:8]  # the UI supports at most 8 characters
        for i in range(8):
            if i < len(char_items):
                _, c = char_items[i]
                # 14 updates per slot: row group, name, description, mode,
                # preset, ref audio, ref transcript, design, instruct,
                # language, speed, preview button, preview audio, preview status.
                char_updates.extend([
                    gr.update(visible=True),
                    gr.update(value=c.name, visible=True),
                    gr.update(value=c.description, visible=True),
                    gr.update(value=c.mode, visible=True),
                    gr.update(value=c.preset if c.preset else "Ryan", visible=c.mode=="preset"),
                    gr.update(value=c.ref_audio, visible=c.mode=="clone"),
                    gr.update(value=c.ref_text, visible=c.mode=="clone"),
                    gr.update(value=c.design_desc, visible=c.mode=="design"),
                    gr.update(value=c.instruct, visible=True),
                    gr.update(value=c.language, visible=True),
                    gr.update(value=c.speed, visible=True),
                    gr.update(visible=True),
                    gr.update(visible=True),
                    gr.update(visible=True),
                ])
            else:
                # Hide every component of an unused slot (same 14-item shape).
                char_updates.extend([
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                ])
        text_sample = data.get("text_sample", "")
        return [text_sample] + nar_updates + char_updates + [f"Project loaded! {len(chars)} characters configured."]
    except Exception as e:
        import traceback
        traceback.print_exc()
        # Error path keeps the same 122-item arity: 1 text + 8 narrator
        # updates + 112 hidden character updates + 1 status.
        return [""] + [gr.update()] * 8 + [gr.update(visible=False)] * 112 + [f"Error loading project: {e}"]
# ---------------------------------------------------------------------------
# Build UI
# ---------------------------------------------------------------------------
def build_app():
    """Construct and return the full Gradio Blocks application.

    Layout: five tabs (Story, Voice Cast, Generate, Project, About) plus
    cross-tab event wiring registered inside the Blocks context.
    """
    # Dark indigo/cyan theme matching CUSTOM_CSS.
    theme = gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="cyan",
        neutral_hue="slate",
    ).set(
        body_background_fill="#0f172a",
        body_background_fill_dark="#0f172a",
        body_text_color="#f8fafc",
        body_text_color_subdued="#94a3b8",
        background_fill_primary="#1e293b",
        background_fill_secondary="#0f172a",
        border_color_accent="#334155",
        color_accent_soft="#22d3ee",
        button_primary_background_fill="linear-gradient(135deg, #6366f1, #4f46e5)",
        button_primary_background_fill_hover="linear-gradient(135deg, #4f46e5, #4338ca)",
        button_primary_text_color="#ffffff",
        input_background_fill="#0f172a",
        input_border_color="#334155",
        block_title_text_color="#f8fafc",
        block_label_text_color="#94a3b8",
    )
    with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="AudioBook Forge") as demo:
        gr.HTML("""
<div class="ab-header">
<h1>AudioBook Forge</h1>
<p>High-fidelity audiobooks with AI character voices. Model-agnostic TTS powered by Qwen3-TTS.</p>
</div>
""")
        with gr.Tabs():
            # ==================== TAB 1: Story ====================
            with gr.TabItem("📖 Story"):
                with gr.Row():
                    with gr.Column(scale=2):
                        gr.Markdown("### Upload or Paste")
                        file_upload = gr.File(
                            label="Upload EPUB, PDF, TXT, or HTML",
                            file_types=[".txt", ".epub", ".pdf", ".html", ".htm"],
                        )
                        story_input = gr.TextArea(
                            label="Story Text",
                            placeholder="Paste your book chapter, short story, or script here...",
                            lines=18,
                            max_lines=40,
                        )
                        sample_dropdown = gr.Dropdown(
                            label="Or try a sample story",
                            choices=list(SAMPLE_STORIES.keys()),
                            value=None,
                        )
                    with gr.Column(scale=1):
                        gr.Markdown("### Stats")
                        with gr.Row():
                            stat_words = gr.Textbox(label="Words", value="0", interactive=False)
                            stat_dur = gr.Textbox(label="Est. Duration", value="0 sec", interactive=False)
                        gr.Markdown("---")
                        # Single-narrator fast path; inputs mirror the Voice Cast narrator.
                        gr.Markdown("### Quick Generate")
                        quick_mode = gr.Dropdown(choices=["preset", "clone", "design"], value="design", label="Narrator Mode")
                        quick_preset = gr.Dropdown(choices=list(PRESET_SPEAKERS.keys()), value="Ryan", label="Preset Voice", visible=False)
                        quick_audio = gr.Audio(label="Upload Voice Sample (3–10s)", type="filepath", visible=False)
                        quick_ref_text = gr.Textbox(label="Reference Transcript", placeholder="What does the sample say?", visible=False)
                        quick_design = gr.TextArea(label="Voice Description", placeholder="e.g. A warm, raspy baritone with measured pacing...", visible=True, lines=2, value="A clear, warm, expressive audiobook narrator voice with professional pacing and rich tone.")
                        quick_instruct = gr.Textbox(label="Style Instruction", placeholder="e.g. Calm, measured storytelling.", value="")
                        quick_lang = gr.Dropdown(choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", label="Language")
                        quick_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                        quick_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
                        quick_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3", label="Output Format")
                        quick_btn = gr.Button("⚡ Quick Generate", variant="primary")
                        quick_output_audio = gr.Audio(label="Quick Audiobook", type="filepath", interactive=False)
                        quick_output_file = gr.File(label="Download", interactive=False)
                        quick_status = gr.Textbox(show_label=False, interactive=False)
                        gr.Markdown("---")
                        gr.Markdown("**Quick Generate** uses a single narrator voice for the entire text. Supports preset, clone, or AI-designed voices.")
                with gr.Row():
                    chapter_selector = gr.Dropdown(
                        label="Chapter / Section",
                        choices=["All"],
                        value="All",
                        interactive=True,
                    )
                    refresh_chapters_btn = gr.Button("🔄 Detect Chapters")
                    clear_story_btn = gr.Button("🗑️ Clear", variant="secondary")
                def clear_story():
                    # Reset text, chapter list, and both stat boxes.
                    return "", gr.update(choices=["All"], value="All"), "0", "0 sec"
                clear_story_btn.click(
                    clear_story,
                    inputs=[],
                    outputs=[story_input, chapter_selector, stat_words, stat_dur],
                )
                with gr.Row():
                    gr.Markdown("### Character Detection")
                    extract_btn = gr.Button("🔍 Extract Characters", variant="primary")
                    extract_status = gr.Textbox(label="Status", interactive=False)
                # Wiring
                file_upload.change(handle_upload, inputs=[file_upload], outputs=[story_input, extract_status])
                def load_sample_and_update(name):
                    # Load a bundled sample story and refresh stats/chapters.
                    text = SAMPLE_STORIES.get(name, "")
                    wc = len(text.split()) if text else 0
                    dur = estimate_duration(wc)
                    return text, str(wc), dur, gr.update(choices=["All"], value="All"), ""
                sample_dropdown.change(
                    load_sample_and_update,
                    inputs=[sample_dropdown],
                    outputs=[story_input, stat_words, stat_dur, chapter_selector, extract_status],
                )
                story_input.change(update_stats, inputs=[story_input], outputs=[stat_words, stat_dur])
                quick_btn.click(
                    quick_generate_gpu,
                    inputs=[story_input, quick_mode, quick_preset, quick_audio, quick_ref_text, quick_design, quick_instruct, quick_lang, quick_speed, quick_temp, quick_fmt],
                    outputs=[quick_output_audio, quick_output_file, quick_status],
                )
                quick_mode.change(on_mode_change, inputs=quick_mode, outputs=[quick_preset, quick_audio, quick_ref_text, quick_design])
                def refresh_chapters(text):
                    # Re-run chapter detection and repopulate the dropdown.
                    if not text:
                        return gr.update(choices=["All"], value="All")
                    pipe = get_pipeline()
                    chs = pipe.detect_chapters(text)
                    choices = ["All"] + [f"Ch{c['idx']+1}: {c['title'][:60]}" for c in chs]
                    return gr.update(choices=choices, value="All")
                refresh_chapters_btn.click(refresh_chapters, inputs=[story_input], outputs=[chapter_selector])
            # ==================== TAB 2: Voice Cast ====================
            with gr.TabItem("🎭 Voice Cast"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("## Narrator")
                        with gr.Column(elem_classes="ab-card"):
                            nar_mode = gr.Dropdown(choices=["preset", "clone", "design"], value="design", label="Mode")
                            nar_preset = gr.Dropdown(choices=list(PRESET_SPEAKERS.keys()), value="Ryan", label="Preset Voice", visible=False)
                            nar_audio = gr.Audio(label="Upload Voice Sample (3–10s)", type="filepath", visible=False)
                            nar_ref_text = gr.Textbox(label="Reference Transcript", placeholder="What does the sample say?", visible=False)
                            nar_design = gr.TextArea(label="Voice Description", placeholder="e.g. A warm, raspy baritone with measured pacing...", visible=True, lines=2, value="A clear, warm, expressive audiobook narrator voice with professional pacing and rich tone.")
                            nar_instruct = gr.Textbox(label="Style Instruction", placeholder="e.g. Calm, measured storytelling.")
                            nar_lang = gr.Dropdown(choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", label="Language")
                            nar_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                            nar_preview_btn = gr.Button("🔊 Preview Narrator", variant="secondary")
                            nar_preview_audio = gr.Audio(label="Preview", interactive=False)
                            nar_preview_status = gr.Textbox(show_label=False, interactive=False)
                        nar_mode.change(on_mode_change, inputs=nar_mode, outputs=[nar_preset, nar_audio, nar_ref_text, nar_design])
                        nar_preview_btn.click(
                            preview_narrator_gpu,
                            inputs=[nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed],
                            outputs=[nar_preview_audio, nar_preview_status],
                        )
                    with gr.Column(scale=2):
                        gr.Markdown("## Character Voices")
                        gr.Markdown("""
Configure up to 8 characters. Each character can use one of three voice modes:
- **Preset** — Choose from 9 built-in speakers (Ryan, Aiden, Serena, etc.)
- **Clone** — Upload a 3–10 second voice sample to clone any real voice
- **Design** — Describe a voice in text (e.g. *"A raspy old man with a warm chuckle"*) and the AI will create it
""")
                        # Component lists, indexed per character slot; the
                        # field order here drives the 80-value flattening
                        # used by generate/save and the 14-per-slot update
                        # lists used by extract/load.
                        char_names, char_descs, char_modes, char_presets = [], [], [], []
                        char_audios, char_ref_texts, char_designs, char_instructs, char_langs, char_speeds = [], [], [], [], [], []
                        char_rows, char_preview_btns, char_preview_audios, char_preview_statuses = [], [], [], []
                        for i in range(8):
                            # Only slot 0 is visible initially; the rest are
                            # revealed by character extraction / project load.
                            visible_default = (i == 0)
                            with gr.Group(visible=visible_default) as row:
                                with gr.Row():
                                    cn = gr.Textbox(label="Name", placeholder="e.g. Alice", visible=visible_default)
                                    cd = gr.Textbox(label="Description", placeholder="Personality note", visible=visible_default)
                                    cm = gr.Dropdown(label="Mode", choices=["preset", "clone", "design"], value="design", visible=visible_default)
                                    cp = gr.Dropdown(label="Preset", choices=list(PRESET_SPEAKERS.keys()), value="Ryan", visible=False)
                                with gr.Row():
                                    ca = gr.Audio(label="Voice Sample", type="filepath", visible=False)
                                    crt = gr.Textbox(label="Ref Transcript", placeholder="What the sample says", visible=False)
                                    cdes = gr.TextArea(label="Voice Description", placeholder="e.g. A shrill, nervous teenager.", visible=visible_default, lines=2)
                                cinstr = gr.Textbox(label="Style Instruction", placeholder="e.g. Angry and loud.", visible=visible_default)
                                cl = gr.Dropdown(label="Language", choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", visible=visible_default)
                                cspd = gr.Slider(label="Speed", minimum=0.5, maximum=2.0, value=1.0, step=0.1, visible=visible_default)
                                with gr.Row():
                                    cpv_btn = gr.Button("🔊 Preview", variant="secondary", visible=visible_default)
                                    cpv_audio = gr.Audio(label="Preview", interactive=False, visible=visible_default)
                                    cpv_status = gr.Textbox(show_label=False, interactive=False, visible=visible_default)
                            cm.change(on_mode_change, inputs=cm, outputs=[cp, ca, crt, cdes])
                            cpv_btn.click(
                                preview_char_voice_gpu,
                                inputs=[cn, cm, cp, ca, crt, cdes, cinstr, cl, cspd],
                                outputs=[cpv_audio, cpv_status],
                            )
                            char_rows.append(row)
                            char_names.append(cn)
                            char_descs.append(cd)
                            char_modes.append(cm)
                            char_presets.append(cp)
                            char_audios.append(ca)
                            char_ref_texts.append(crt)
                            char_designs.append(cdes)
                            char_instructs.append(cinstr)
                            char_langs.append(cl)
                            char_speeds.append(cspd)
                            char_preview_btns.append(cpv_btn)
                            char_preview_audios.append(cpv_audio)
                            char_preview_statuses.append(cpv_status)
            # ==================== TAB 3: Generate ====================
            with gr.TabItem("⚡ Generate"):
                gr.Markdown("_Note: The first generation downloads Qwen3-TTS 1.7B models (~5 GB) and may take 2–5 minutes. Subsequent runs are much faster._")
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Settings")
                        gen_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
                        gen_seed = gr.Number(value=42, precision=0, label="Seed (fix for consistency)")
                        output_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3", label="Output Format")
                        gen_btn = gr.Button("▶️ Generate Full Audiobook", variant="primary", size="lg")
                        gen_progress = gr.Textbox(label="Progress", interactive=False, value="Ready.")
                    with gr.Column(scale=2):
                        gr.Markdown("### Output")
                        output_audio = gr.Audio(label="Generated Audiobook", type="filepath", interactive=False)
                        output_file = gr.File(label="Download", interactive=False)
                        output_status = gr.Textbox(label="Status", interactive=False)
                        segment_list = gr.HTML(label="Segments")
            # ==================== TAB 4: Project ====================
            with gr.TabItem("💾 Project"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Save Project")
                        save_btn = gr.Button("💾 Save Configuration", variant="primary")
                        project_json = gr.TextArea(label="Project JSON (copy this to save)", lines=10, interactive=True)
                    with gr.Column():
                        gr.Markdown("### Load Project")
                        load_json = gr.TextArea(label="Paste Project JSON here", lines=10, interactive=True)
                        load_btn = gr.Button("📂 Load Configuration", variant="secondary")
                        load_status = gr.Textbox(label="Status", interactive=False)
            # ==================== TAB 5: About ====================
            with gr.TabItem("ℹ️ About"):
                gr.Markdown("""
## AudioBook Forge
**Model-agnostic, high-fidelity audiobook generator** powered by [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS).
### Features
- 📁 **File Upload** — Import EPUB, PDF, TXT, or HTML directly
- 📖 **Chapter Detection** — Auto-detects chapters/sections for selective generation
- 🎙️ **Character Voice Mapping** — Auto-extract characters and assign unique voices
- 🎭 **Three Voice Modes** — Preset (9 speakers), Clone (upload sample), Design (text description)
- ⚡ **Quick Generate** — One-click audiobook with a single narrator voice
- 🎚️ **Speed Control** — Adjust playback speed per voice (0.5x–2.0x)
- 📦 **Multi-format Export** — MP3, WAV, or ZIP of individual segments
- 💾 **Save/Load Projects** — Export and restore your voice configurations
- 🌐 **10 Languages** — English, Chinese, Japanese, Korean, German, French, Spanish, Italian, Portuguese, Russian
- ⚡ **ZeroGPU** — Runs on Hugging Face ZeroGPU (free compute)
### Workflow
1. **Upload or paste** your story text
2. **Detect chapters** (optional) and select a range
3. **Extract characters** or use Quick Generate for simple narration
4. **Assign voices** to narrator and each character
5. **Generate** and download your audiobook
### Tips for Best Quality
- Use clean, noise-free voice samples for cloning (3–10 seconds)
- Keep reference transcripts accurate
- Lower temperature (0.5–0.6) for stable narration; higher (0.8–0.9) for expressive dialogue
- Use a fixed seed to prevent voice drift across segments
- Use speed adjustment to fine-tune pacing per character
### Note on First Run
The first time you generate audio, the Space downloads the Qwen3-TTS 1.7B models (~5 GB total). This can take **2–5 minutes** depending on network speed. Subsequent runs are much faster because models are cached. Please be patient — the progress is printed in the server logs.
""")
        # ---------- Extract wiring ----------
        def do_extract(text):
            # Map extracted characters onto the 8 UI slots; each slot gets
            # 14 updates (row + 13 components), hidden when unused.
            chars, status = extract_chars(text)
            updates = []
            for i in range(8):
                if i < len(chars):
                    mode = chars[i].get("voice_mode", "design")
                    is_preset = mode == "preset"
                    is_clone = mode == "clone"
                    is_design = mode == "design"
                    updates.extend([
                        gr.update(visible=True),
                        gr.update(value=chars[i].get("name", ""), visible=True),
                        gr.update(value=chars[i].get("description", ""), visible=True),
                        gr.update(value=mode, visible=True),
                        gr.update(value=chars[i].get("voice_preset", "Ryan"), visible=is_preset),
                        gr.update(visible=is_clone),
                        gr.update(visible=is_clone),
                        gr.update(value=chars[i].get("voice_description", ""), visible=is_design),
                        gr.update(value=chars[i].get("voice_instruct", ""), visible=True),
                        gr.update(value=chars[i].get("language", "English"), visible=True),
                        gr.update(value=chars[i].get("speed", 1.0), visible=True),
                        gr.update(visible=True),
                        gr.update(visible=True),
                        gr.update(visible=True),
                    ])
                else:
                    updates.extend([
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                    ])
            return [status] + updates
        # Flattened output list: status + 14 components per character slot.
        extract_outputs = [extract_status] + [
            item for sublist in [
                [char_rows[i], char_names[i], char_descs[i], char_modes[i], char_presets[i],
                 char_audios[i], char_ref_texts[i], char_designs[i], char_instructs[i], char_langs[i],
                 char_speeds[i], char_preview_btns[i], char_preview_audios[i], char_preview_statuses[i]]
                for i in range(8)
            ] for item in sublist
        ]
        extract_btn.click(do_extract, inputs=[story_input], outputs=extract_outputs)
        # ---------- Generate wiring ----------
        # Field-major order: all names, then all descriptions, etc. — must
        # match the args[0:8], args[8:16], ... slicing in the GPU functions.
        all_char_inputs = (
            char_names + char_descs + char_modes + char_presets +
            char_audios + char_ref_texts + char_designs + char_instructs + char_langs + char_speeds
        )
        gen_inputs = [
            story_input, chapter_selector,
            nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
            gen_temp, gen_seed, output_fmt,
        ] + all_char_inputs
        def wrapped_generate(story_text, chapter_sel, *args):
            # Resolve the chapter selection before handing off to the GPU fn.
            text = get_chapter_text(story_text, chapter_sel)
            return generate_audiobook_gpu(text, *args)
        gen_btn.click(
            wrapped_generate,
            inputs=gen_inputs,
            outputs=[output_audio, output_file, segment_list, output_status, gen_progress],
        )
        # ---------- Project wiring ----------
        save_inputs = [
            story_input,
            nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
        ] + all_char_inputs + [gen_temp, gen_seed]
        save_btn.click(do_save_project, inputs=save_inputs, outputs=[project_json])
        # 122 outputs: story text + 9 narrator components... minus the mode
        # handled via value update; see do_load_project's return shape.
        load_outputs = [story_input, nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed] + extract_outputs[1:] + [load_status]
        load_btn.click(do_load_project, inputs=[load_json], outputs=load_outputs)
    return demo
# Build the UI at import time so Hugging Face Spaces can pick up `demo`.
demo = build_app()

if __name__ == "__main__":
    # Listen on all interfaces on the standard Spaces port.
    demo.launch(server_name="0.0.0.0", server_port=7860)