# Spaces: jkorstad / AudioBook (Running on Zero)
#
# File size: 43,794 Bytes

"""
AudioBook Forge - Enhanced Gradio Frontend
High-fidelity audiobook generator with character voice mapping,
file upload, chapter selection, segment previews, and project save/load.
"""

import os
import json
from pathlib import Path
from typing import Dict, List, Optional

import gradio as gr
import numpy as np

# ---------------------------------------------------------------------------
# spaces / ZeroGPU compatibility
# ---------------------------------------------------------------------------
try:
    import spaces
except ImportError:
    class _SpacesGPU:
        """No-op stand-in for ``spaces.GPU`` when the ``spaces`` package is absent.

        Supports both decorator spellings:

        * ``@spaces.GPU(duration=180)`` - builds an instance whose call
          returns the decorated function unchanged.
        * ``@spaces.GPU`` (bare) - returns the decorated function directly.
        """

        def __new__(cls, *args, **kwargs):
            # A single positional callable and no keywords means the class is
            # being used as a bare decorator: hand the function straight back.
            # (A callable passed as ``duration=`` keyword is NOT treated as the
            # decorated function.)
            if len(args) == 1 and not kwargs and callable(args[0]):
                return args[0]
            return super().__new__(cls)

        def __init__(self, duration=60, **_options):
            # Extra keyword options are accepted and ignored so that decorator
            # arguments other than ``duration`` do not raise in the fallback.
            self.duration = duration

        def __call__(self, fn):
            # Nothing to schedule locally; the function runs as-is.
            return fn

    class spaces:
        """Minimal namespace exposing ``GPU`` like the real ``spaces`` module."""

        GPU = _SpacesGPU

# ---------------------------------------------------------------------------
# Backend imports
# ---------------------------------------------------------------------------
from backend import (
    AudiobookPipeline,
    VoiceConfig,
    PRESET_SPEAKERS,
    SAMPLE_STORIES,
    save_project,
    load_project,
    estimate_duration,
)

# ---------------------------------------------------------------------------
# CSS & Theme
# ---------------------------------------------------------------------------

# Stylesheet handed to ``gr.Blocks(css=...)`` in ``build_app``.
# It pulls the Inter font from Google Fonts via ``@import`` and applies a dark
# slate palette to Gradio's stock widgets. It also defines the app's own
# classes: ``ab-header`` (page banner), ``ab-card``, ``ab-stat``, and
# ``seg-item`` / ``seg-type`` (rows of the generated segment preview).
# This is a runtime string: edit the CSS itself with care, as every character
# is sent to the browser as-is.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

body, .gradio-container {
    font-family: 'Inter', sans-serif !important;
    background: #0f172a !important;
    color: #f8fafc !important;
}

.gradio-container {
    max-width: 1200px !important;
}

.ab-header {
    text-align: center;
    padding: 2.2rem 1rem 1.8rem;
    background: linear-gradient(135deg, rgba(99,102,241,0.12) 0%, rgba(34,211,238,0.06) 100%);
    border-radius: 18px;
    margin-bottom: 1.5rem;
    border: 1px solid rgba(99,102,241,0.18);
}
.ab-header h1 {
    font-size: 2.6rem;
    font-weight: 700;
    margin: 0;
    background: linear-gradient(90deg, #a5b4fc, #22d3ee);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
}
.ab-header p {
    color: #94a3b8;
    margin-top: 0.6rem;
    font-size: 1.05rem;
}

.ab-card {
    background: #1e293b !important;
    border: 1px solid #334155 !important;
    border-radius: 14px !important;
    padding: 1.25rem !important;
}

.ab-stat {
    background: #0f172a;
    border: 1px solid #334155;
    border-radius: 10px;
    padding: 0.75rem 1rem;
    text-align: center;
}
.ab-stat .value {
    font-size: 1.4rem;
    font-weight: 700;
    color: #22d3ee;
}
.ab-stat .label {
    font-size: 0.75rem;
    color: #94a3b8;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}

button.primary {
    background: linear-gradient(135deg, #6366f1, #4f46e5) !important;
    border: none !important;
    border-radius: 10px !important;
    font-weight: 600 !important;
    transition: all 0.2s ease !important;
}
button.primary:hover {
    transform: translateY(-1px);
    box-shadow: 0 4px 14px rgba(99,102,241,0.4) !important;
}
button.secondary {
    background: #334155 !important;
    border: 1px solid #475569 !important;
    border-radius: 10px !important;
    color: #f8fafc !important;
}

input, textarea, select {
    background: #0f172a !important;
    border: 1px solid #334155 !important;
    border-radius: 8px !important;
    color: #f8fafc !important;
}
input:focus, textarea:focus, select:focus {
    border-color: #6366f1 !important;
    box-shadow: 0 0 0 3px rgba(99,102,241,0.15) !important;
}

.gr-box, .gr-form {
    background: #1e293b !important;
    border-color: #334155 !important;
}
.gr-panel {
    background: #1e293b !important;
}

.tabitem {
    background: #1e293b !important;
    border-color: #334155 !important;
}

input[type="checkbox"] + label,
.checkbox-label,
.gr-checkbox label {
    color: #f8fafc !important;
}

/* Gradio 5+ checkbox checked state - make it clearly visible in dark theme */
.gr-checkbox input[type="checkbox"]:checked + label,
.gr-checkbox-checked label,
.gr-checkbox-input:checked + .gr-checkbox-border,
.gr-checkbox-input:checked + label .gr-checkbox-border,
input[type="checkbox"]:checked + label span {
    background: #6366f1 !important;
    border-color: #818cf8 !important;
    box-shadow: 0 0 0 3px rgba(99,102,241,0.35) !important;
}
.gr-checkbox input[type="checkbox"]:checked + label::after,
.gr-checkbox-input:checked + label::after {
    border-color: #ffffff !important;
}
.gr-checkbox {
    color: #f8fafc !important;
}
.gr-checkbox-input:checked + * {
    background: #6366f1 !important;
    border-color: #818cf8 !important;
}

li, .prose li, .gr-prose li {
    color: #cbd5e1 !important;
}

strong, b {
    color: #f8fafc !important;
}

code {
    background: #334155 !important;
    color: #22d3ee !important;
    padding: 0.1rem 0.3rem !important;
    border-radius: 4px !important;
}

progress {
    width: 100%;
    height: 8px;
    border-radius: 4px;
    background: #334155;
}
progress::-webkit-progress-bar {
    background: #334155;
    border-radius: 4px;
}
progress::-webkit-progress-value {
    background: linear-gradient(90deg, #6366f1, #22d3ee);
    border-radius: 4px;
}

.seg-item {
    background: #0f172a;
    border: 1px solid #334155;
    border-radius: 8px;
    padding: 0.5rem 0.75rem;
    margin-bottom: 0.4rem;
    font-size: 0.85rem;
}
.seg-item .seg-type {
    display: inline-block;
    padding: 0.1rem 0.4rem;
    border-radius: 4px;
    font-size: 0.7rem;
    font-weight: 600;
    text-transform: uppercase;
}
.seg-type.narration { background: #4f46e5; color: #fff; }
.seg-type.dialogue { background: #22d3ee; color: #0f172a; }
"""

# ---------------------------------------------------------------------------
# Global State
# ---------------------------------------------------------------------------

# Module-wide pipeline instance; stays ``None`` until first requested.
_pipeline: Optional[AudiobookPipeline] = None


def get_pipeline() -> AudiobookPipeline:
    """Return the shared ``AudiobookPipeline``, constructing it on first use."""
    global _pipeline
    if _pipeline is not None:
        return _pipeline
    _pipeline = AudiobookPipeline()
    return _pipeline


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def on_mode_change(mode: str) -> tuple:
    """Compute visibility updates for the four mode-dependent voice widgets.

    The returned tuple is ordered (preset, audio sample, reference transcript,
    design description). ``"preset"`` shows only the first, ``"clone"`` shows
    the two middle ones, and any other mode shows only the last.
    """
    if mode == "preset":
        shown = (True, False, False, False)
    elif mode == "clone":
        shown = (False, True, True, False)
    else:
        shown = (False, False, False, True)
    return tuple(gr.update(visible=flag) for flag in shown)


def update_stats(text: str) -> tuple:
    """Return ``(word_count_as_str, estimated_duration)`` for *text*.

    Words are whitespace-separated tokens; empty or missing text counts as 0.
    """
    word_count = 0
    if text:
        word_count = len(text.split())
    duration = estimate_duration(word_count)
    return str(word_count), duration


def handle_upload(file_obj) -> tuple:
    """Parse an uploaded file and return ``(cleaned_text, status_message)``.

    With no file the text is empty and the status says so. Any exception
    raised while parsing is reported in the status and the text comes back
    empty. On success the status lists the word count, estimated duration
    and up to five detected chapters.
    """
    if file_obj is None:
        return "", "No file uploaded."
    try:
        pipeline = get_pipeline()
        raw_text, filename = pipeline.parse_upload(file_obj)
        cleaned = pipeline.processor.clean_text(raw_text)
        chapters = pipeline.detect_chapters(cleaned)

        # Summarise only the first five chapters; note how many were left out.
        summary = " | ".join(
            f"Ch{ch['idx']+1}: {ch['word_count']}w" for ch in chapters[:5]
        )
        if len(chapters) > 5:
            summary += f" (+{len(chapters)-5} more)"

        word_count = len(cleaned.split())
        duration = estimate_duration(word_count)
        layout = summary if chapters else "1 section"
        return cleaned, f"Loaded {filename} — {word_count} words (~{duration}) | {layout}"
    except Exception as e:
        return "", f"Error: {e}"


def extract_chars(text: str) -> tuple:
    """Detect characters in *text* and return ``(characters, status_message)``.

    Text with fewer than 20 non-padding characters is rejected up front with
    an empty list. Otherwise the pipeline's ``extract_characters`` is called
    with ``use_ai=True`` and its result is returned with a summary.
    """
    long_enough = bool(text) and len(text.strip()) >= 20
    if not long_enough:
        return [], "Text too short. Please paste at least a paragraph."

    chars = get_pipeline().extract_characters(text, use_ai=True)
    if chars:
        status = f"Found {len(chars)} characters: {', '.join(c['name'] for c in chars)}"
    else:
        status = "No characters auto-detected. Add them manually below."
    return chars, status


def get_chapter_text(text: str, chapter_sel: str) -> str:
    """Return the text of the selected chapter, or *text* unchanged.

    *chapter_sel* is a dropdown label such as ``"Ch3: Title"``; the number
    before the colon is converted to a zero-based index for the pipeline.
    ``"All"``, an empty selection, empty text, or any failure while resolving
    the chapter all fall back to returning *text* as given.
    """
    if not text:
        return text
    if not chapter_sel or chapter_sel == "All":
        return text
    try:
        label = chapter_sel.split(":")[0]
        chapter_idx = int(label.replace("Ch", "")) - 1
        return get_pipeline().get_chapter_text(text, chapter_idx)
    except Exception:
        # Best effort: an unparseable label or pipeline error yields full text.
        return text


# ---------------------------------------------------------------------------
# GPU-wrapped functions (ZeroGPU)
# ---------------------------------------------------------------------------

@spaces.GPU(duration=180)
def generate_audiobook_gpu(
    text,
    nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
    gen_temp, gen_seed, output_fmt, *args
):
    """Generate a full audiobook with a narrator and up to 8 character voices.

    Args:
        text: Story text; must contain at least 50 non-padding characters.
        nar_mode .. nar_speed: Narrator voice settings. Only the field that
            matches ``nar_mode`` ("preset", "clone" or "design") is forwarded;
            the others are passed as ``None``.
        gen_temp: Sampling temperature forwarded to the pipeline.
        gen_seed: Seed; converted with ``int()`` before being forwarded.
        output_fmt: ``"wav"`` re-saves the segments as WAV, ``"zip"`` exports
            the segments as an archive; any other value returns the
            pipeline's own output file.
        *args: 80 character values laid out as ten consecutive groups of
            eight (one value per character slot): names, descriptions, modes,
            presets, audio samples, reference transcripts, design
            descriptions, instructions, languages, speeds.

    Returns:
        ``(path, path, segments_html, status, progress_text)``. On failure the
        two paths are ``None``, the HTML is empty and the status starts with
        ``"Error: "``.
    """
    # Function-scope import, like the other late imports in this file.
    import html

    if not text or len(text.strip()) < 50:
        return None, None, "", "Error: Please provide at least 50 characters of story text.", ""

    wc = len(text.split())
    if wc > 5000:
        print(f"[WARN] Long text: {wc} words. Generation may take a while or hit timeouts.")

    # Unpack character args (80 values = 8 chars x 10 fields).
    # args[8:16] carries the per-character descriptions, which generation
    # does not need, so that group is intentionally not unpacked.
    names = list(args[0:8])
    modes = list(args[16:24])
    presets = list(args[24:32])
    audios = list(args[32:40])
    ref_texts = list(args[40:48])
    designs = list(args[48:56])
    instructs = list(args[56:64])
    langs = list(args[64:72])
    speeds = list(args[72:80])

    pipe = get_pipeline()

    nar_cfg = VoiceConfig(
        name="Narrator",
        mode=nar_mode,
        preset=nar_preset if nar_mode == "preset" else None,
        ref_audio=nar_audio if nar_mode == "clone" and nar_audio else None,
        ref_text=nar_ref_text if nar_mode == "clone" else None,
        design_desc=nar_design if nar_mode == "design" else None,
        instruct=nar_instruct,
        language=nar_lang,
        speed=float(nar_speed) if nar_speed else 1.0,
    )

    # Only slots with a non-empty name become character voices.
    char_configs = {}
    for i in range(8):
        if not names[i]:
            continue
        vc = VoiceConfig(
            name=names[i],
            mode=modes[i],
            preset=presets[i] if modes[i] == "preset" else None,
            ref_audio=audios[i] if modes[i] == "clone" and audios[i] else None,
            ref_text=ref_texts[i] if modes[i] == "clone" else None,
            design_desc=designs[i] if modes[i] == "design" else None,
            instruct=instructs[i] or "",
            language=langs[i],
            speed=float(speeds[i]) if speeds[i] else 1.0,
        )
        char_configs[names[i]] = vc

    progress_text = ""

    def prog_cb(ratio: float, msg: str):
        # Keep the latest progress line so it can be returned with the result.
        nonlocal progress_text
        progress_text = f"[{ratio*100:.0f}%] {msg}"
        print(progress_text)

    try:
        output_path, seg_paths, seg_meta = pipe.generate(
            text=text,
            narrator_config=nar_cfg,
            character_configs=char_configs,
            progress_callback=prog_cb,
            temperature=gen_temp,
            seed=int(gen_seed),
        )

        # Build the segment preview (first 50 segments only). Segment type,
        # speaker and text originate from user-supplied story content, so they
        # are HTML-escaped before being embedded in the markup.
        parts = ["<div style='max-height: 300px; overflow-y: auto;'>"]
        for s in seg_meta[:50]:
            tclass = "narration" if s['type'] == 'narration' else "dialogue"
            seg_type = html.escape(str(s['type']))
            speaker = html.escape(str(s['speaker']))
            seg_text = html.escape(str(s['text']))
            parts.append(
                f"<div class='seg-item'><span class='seg-type {tclass}'>{seg_type}</span> "
                f"<strong>{speaker}</strong>: {seg_text}</div>"
            )
        if len(seg_meta) > 50:
            parts.append(
                f"<div style='text-align:center;color:#94a3b8;padding:0.5rem;'>... and {len(seg_meta)-50} more segments</div>"
            )
        parts.append("</div>")
        seg_html = "".join(parts)

        extra_path = None
        if output_fmt == "wav":
            extra_path = output_path.replace(".mp3", ".wav")
            from backend import save_audiobook
            save_audiobook(seg_paths, extra_path, fmt="wav")
        elif output_fmt == "zip":
            extra_path = pipe.export_segments_zip(seg_paths)

        final_path = extra_path if extra_path else output_path
        return final_path, final_path, seg_html, f"Done! {len(seg_meta)} segments generated.", progress_text
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, "", f"Error: {str(e)}", progress_text


@spaces.GPU(duration=60)
def preview_narrator_gpu(mode, preset, audio, ref_text, design, instruct, lang, speed):
    """Synthesize a short narrator preview for the given voice settings.

    Returns ``((sr, wav), "Preview ready!")`` built from the pair that
    ``pipe.preview_voice`` yields, or ``(None, "Preview failed: ...")`` when
    the preview raises.
    """
    pipe = get_pipeline()
    cloning = mode == "clone"
    narrator = VoiceConfig(
        name="Narrator",
        mode=mode,
        preset=preset if mode == "preset" else None,
        # An empty/missing sample is normalised to None even in clone mode.
        ref_audio=(audio or None) if cloning else None,
        ref_text=ref_text if cloning else None,
        design_desc=design if mode == "design" else None,
        instruct=instruct,
        language=lang,
        speed=float(speed or 1.0),
    )
    try:
        wav, sr = pipe.preview_voice(narrator)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Preview failed: {e}"
    return (sr, wav), "Preview ready!"


@spaces.GPU(duration=60)
def preview_char_voice_gpu(name, mode, preset, audio, ref_text, design, instruct, lang, speed):
    """Synthesize a short preview line for one character voice.

    The sample sentence introduces the character by *name* (or a generic
    placeholder when the name is empty). Returns ``((sr, wav), status)`` on
    success and ``(None, "Preview failed: ...")`` when the preview raises.
    """
    pipe = get_pipeline()
    cloning = mode == "clone"
    voice = VoiceConfig(
        name=name or "Character",
        mode=mode,
        preset=preset if mode == "preset" else None,
        # An empty/missing sample is normalised to None even in clone mode.
        ref_audio=(audio or None) if cloning else None,
        ref_text=ref_text if cloning else None,
        design_desc=design if mode == "design" else None,
        instruct=instruct,
        language=lang,
        speed=float(speed or 1.0),
    )
    try:
        spoken_line = f"Hello, I am {name or 'your character'}. This is how I sound in the story."
        wav, sr = pipe.preview_voice(voice, sample_text=spoken_line)
        status = f"{name or 'Character'} preview ready!"
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Preview failed: {e}"
    return (sr, wav), status


# ---------------------------------------------------------------------------
# Quick Generate
# ---------------------------------------------------------------------------

@spaces.GPU(duration=180)
def quick_generate_gpu(text, mode, preset, audio, ref_text, design, instruct, lang, speed, gen_temp, output_fmt, gen_seed=42):
    """Generate an audiobook using a single narrator voice for all of *text*.

    No character voices are configured. An empty *instruct* falls back to a
    default narration instruction. Returns ``(path, path, status)``; on
    failure or too-short text both paths are ``None``.
    """
    too_short = not text or len(text.strip()) < 50
    if too_short:
        return None, None, "Error: Text too short."

    word_total = len(text.split())
    if word_total > 5000:
        print(f"[WARN] Long text: {word_total} words. Quick Generate may take a while or hit timeouts.")

    pipe = get_pipeline()
    cloning = mode == "clone"
    narrator = VoiceConfig(
        name="Narrator",
        mode=mode,
        preset=preset if mode == "preset" else None,
        # An empty/missing sample is normalised to None even in clone mode.
        ref_audio=(audio or None) if cloning else None,
        ref_text=ref_text if cloning else None,
        design_desc=design if mode == "design" else None,
        instruct=instruct or "Narrate clearly and expressively.",
        language=lang,
        speed=float(speed or 1.0),
    )

    def report(ratio: float, msg: str):
        # Progress is only logged to stdout here.
        print(f"[{ratio*100:.0f}%] {msg}")

    try:
        output_path, seg_paths, seg_meta = pipe.generate(
            text=text,
            narrator_config=narrator,
            character_configs={},
            progress_callback=report,
            temperature=gen_temp,
            seed=int(gen_seed),
        )

        # Optional alternative deliverable, depending on the requested format.
        alt_path = None
        if output_fmt == "wav":
            alt_path = output_path.replace(".mp3", ".wav")
            from backend import save_audiobook
            save_audiobook(seg_paths, alt_path, fmt="wav")
        elif output_fmt == "zip":
            alt_path = pipe.export_segments_zip(seg_paths)

        result_path = alt_path or output_path
        return result_path, result_path, f"Quick audiobook ready! {len(seg_meta)} segments."
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, f"Error: {str(e)}"


# ---------------------------------------------------------------------------
# Project Save/Load
# ---------------------------------------------------------------------------

def do_save_project(text, nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed, *args):
    """Serialize the story, narrator, character voices and settings.

    ``*args`` holds 80 character values as ten consecutive groups of eight
    (names, descriptions, modes, presets, audio samples, reference
    transcripts, design descriptions, instructions, languages, speeds),
    optionally followed by the temperature (default 0.7) and the seed
    (default 42). Returns whatever ``save_project`` produces.
    """
    # Slice the flat argument list into the ten per-field groups of eight.
    (
        names, descs, modes, presets, audios,
        ref_texts, designs, instructs, langs, speeds,
    ) = [list(args[start:start + 8]) for start in range(0, 80, 8)]

    extras = len(args)
    gen_temp = args[80] if extras > 80 else 0.7
    gen_seed = args[81] if extras > 81 else 42

    narrator_cloning = nar_mode == "clone"
    nar_cfg = VoiceConfig(
        name="Narrator",
        mode=nar_mode,
        preset=nar_preset if nar_mode == "preset" else None,
        ref_audio=(nar_audio or None) if narrator_cloning else None,
        ref_text=nar_ref_text if narrator_cloning else None,
        design_desc=nar_design if nar_mode == "design" else None,
        instruct=nar_instruct,
        language=nar_lang,
        speed=float(nar_speed or 1.0),
    )

    # Slots without a name are treated as unused and left out of the project.
    char_configs = {}
    for slot in range(8):
        char_name = names[slot]
        if not char_name:
            continue
        slot_mode = modes[slot]
        slot_cloning = slot_mode == "clone"
        char_configs[char_name] = VoiceConfig(
            name=char_name,
            mode=slot_mode,
            description=descs[slot] or "",
            preset=presets[slot] if slot_mode == "preset" else None,
            ref_audio=(audios[slot] or None) if slot_cloning else None,
            ref_text=ref_texts[slot] if slot_cloning else None,
            design_desc=designs[slot] if slot_mode == "design" else None,
            instruct=instructs[slot] or "",
            language=langs[slot],
            speed=float(speeds[slot] or 1.0),
        )

    settings = {"temperature": gen_temp, "seed": int(gen_seed)}
    return save_project(text, nar_cfg, char_configs, settings)


def do_load_project(json_str):
    """Turn a saved project into the list of UI updates that restores it.

    The returned list always has 122 entries, in this order:

    1. story text (1)
    2. narrator widgets (8): mode, preset, audio, ref text, design,
       instruct, language, speed
    3. character widgets (8 slots x 14): row, name, description, mode,
       preset, audio, ref text, design, instruct, language, speed,
       preview button, preview audio, preview status
    4. status message (1)

    Slots beyond the characters in the project are hidden and have their
    name cleared, so that stale names from a previous cast are not picked up
    later by handlers that treat any non-empty name as an active character.

    If loading fails, every widget receives a no-op update (the current UI
    state is left untouched) and only the status message reports the error.
    """
    slot_count = 8         # character slots emitted per project
    widgets_per_slot = 14  # updates emitted for each character slot
    name_pos = 1           # position of the name textbox within a slot

    try:
        data = load_project(json_str)
        nar = data["narrator"]
        chars = data.get("characters", {})

        nar_updates = [
            gr.update(value=nar.mode),
            gr.update(value=nar.preset if nar.preset else "Ryan", visible=nar.mode=="preset"),
            gr.update(value=nar.ref_audio, visible=nar.mode=="clone"),
            gr.update(value=nar.ref_text, visible=nar.mode=="clone"),
            gr.update(value=nar.design_desc, visible=nar.mode=="design"),
            gr.update(value=nar.instruct),
            gr.update(value=nar.language),
            gr.update(value=nar.speed),
        ]

        char_updates = []
        # Only the first ``slot_count`` characters fit in the UI.
        char_items = list(chars.items())[:slot_count]
        for i in range(slot_count):
            if i < len(char_items):
                _, c = char_items[i]
                char_updates.extend([
                    gr.update(visible=True),
                    gr.update(value=c.name, visible=True),
                    gr.update(value=c.description, visible=True),
                    gr.update(value=c.mode, visible=True),
                    gr.update(value=c.preset if c.preset else "Ryan", visible=c.mode=="preset"),
                    gr.update(value=c.ref_audio, visible=c.mode=="clone"),
                    gr.update(value=c.ref_text, visible=c.mode=="clone"),
                    gr.update(value=c.design_desc, visible=c.mode=="design"),
                    gr.update(value=c.instruct, visible=True),
                    gr.update(value=c.language, visible=True),
                    gr.update(value=c.speed, visible=True),
                    gr.update(visible=True),
                    gr.update(visible=True),
                    gr.update(visible=True),
                ])
            else:
                # Unused slot: hide everything and blank the name so the slot
                # cannot leak a character from an earlier session.
                hidden = [gr.update(visible=False) for _ in range(widgets_per_slot)]
                hidden[name_pos] = gr.update(value="", visible=False)
                char_updates.extend(hidden)

        text_sample = data.get("text_sample", "")
        return [text_sample] + nar_updates + char_updates + [f"Project loaded! {len(chars)} characters configured."]
    except Exception as e:
        import traceback
        traceback.print_exc()
        # Leave story text, narrator and character widgets exactly as they
        # are; a failed load should not wipe or hide the user's current work.
        untouched = 1 + 8 + slot_count * widgets_per_slot
        return [gr.update() for _ in range(untouched)] + [f"Error loading project: {e}"]


# ---------------------------------------------------------------------------
# Build UI
# ---------------------------------------------------------------------------

def build_app():
    theme = gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="cyan",
        neutral_hue="slate",
    ).set(
        body_background_fill="#0f172a",
        body_background_fill_dark="#0f172a",
        body_text_color="#f8fafc",
        body_text_color_subdued="#94a3b8",
        background_fill_primary="#1e293b",
        background_fill_secondary="#0f172a",
        border_color_accent="#334155",
        color_accent_soft="#22d3ee",
        button_primary_background_fill="linear-gradient(135deg, #6366f1, #4f46e5)",
        button_primary_background_fill_hover="linear-gradient(135deg, #4f46e5, #4338ca)",
        button_primary_text_color="#ffffff",
        input_background_fill="#0f172a",
        input_border_color="#334155",
        block_title_text_color="#f8fafc",
        block_label_text_color="#94a3b8",
    )

    with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="AudioBook Forge") as demo:
        gr.HTML("""
        <div class="ab-header">
            <h1>AudioBook Forge</h1>
            <p>High-fidelity audiobooks with AI character voices. Model-agnostic TTS powered by Qwen3-TTS.</p>
        </div>
        """)

        with gr.Tabs():
            # ==================== TAB 1: Story ====================
            with gr.TabItem("📖 Story"):
                with gr.Row():
                    with gr.Column(scale=2):
                        gr.Markdown("### Upload or Paste")
                        file_upload = gr.File(
                            label="Upload EPUB, PDF, TXT, or HTML",
                            file_types=[".txt", ".epub", ".pdf", ".html", ".htm"],
                        )
                        story_input = gr.TextArea(
                            label="Story Text",
                            placeholder="Paste your book chapter, short story, or script here...",
                            lines=18,
                            max_lines=40,
                        )
                        sample_dropdown = gr.Dropdown(
                            label="Or try a sample story",
                            choices=list(SAMPLE_STORIES.keys()),
                            value=None,
                        )

                    with gr.Column(scale=1):
                        gr.Markdown("### Stats")
                        with gr.Row():
                            stat_words = gr.Textbox(label="Words", value="0", interactive=False)
                            stat_dur = gr.Textbox(label="Est. Duration", value="0 sec", interactive=False)
                        gr.Markdown("---")
                        gr.Markdown("### Quick Generate")
                        quick_mode = gr.Dropdown(choices=["preset", "clone", "design"], value="design", label="Narrator Mode")
                        quick_preset = gr.Dropdown(choices=list(PRESET_SPEAKERS.keys()), value="Ryan", label="Preset Voice", visible=False)
                        quick_audio = gr.Audio(label="Upload Voice Sample (3–10s)", type="filepath", visible=False)
                        quick_ref_text = gr.Textbox(label="Reference Transcript", placeholder="What does the sample say?", visible=False)
                        quick_design = gr.TextArea(label="Voice Description", placeholder="e.g. A warm, raspy baritone with measured pacing...", visible=True, lines=2, value="A clear, warm, expressive audiobook narrator voice with professional pacing and rich tone.")
                        quick_instruct = gr.Textbox(label="Style Instruction", placeholder="e.g. Calm, measured storytelling.", value="")
                        quick_lang = gr.Dropdown(choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", label="Language")
                        quick_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                        quick_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
                        quick_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3", label="Output Format")
                        quick_btn = gr.Button("⚡ Quick Generate", variant="primary")
                        quick_output_audio = gr.Audio(label="Quick Audiobook", type="filepath", interactive=False)
                        quick_output_file = gr.File(label="Download", interactive=False)
                        quick_status = gr.Textbox(show_label=False, interactive=False)
                        gr.Markdown("---")
                        gr.Markdown("**Quick Generate** uses a single narrator voice for the entire text. Supports preset, clone, or AI-designed voices.")

                with gr.Row():
                    chapter_selector = gr.Dropdown(
                        label="Chapter / Section",
                        choices=["All"],
                        value="All",
                        interactive=True,
                    )
                    refresh_chapters_btn = gr.Button("🔄 Detect Chapters")
                    clear_story_btn = gr.Button("🗑️ Clear", variant="secondary")

                def clear_story():
                    return "", gr.update(choices=["All"], value="All"), "0", "0 sec"

                clear_story_btn.click(
                    clear_story,
                    inputs=[],
                    outputs=[story_input, chapter_selector, stat_words, stat_dur],
                )

                with gr.Row():
                    gr.Markdown("### Character Detection")
                    extract_btn = gr.Button("🔍 Extract Characters", variant="primary")

                extract_status = gr.Textbox(label="Status", interactive=False)

                # Wiring
                file_upload.change(handle_upload, inputs=[file_upload], outputs=[story_input, extract_status])
                def load_sample_and_update(name):
                    text = SAMPLE_STORIES.get(name, "")
                    wc = len(text.split()) if text else 0
                    dur = estimate_duration(wc)
                    return text, str(wc), dur, gr.update(choices=["All"], value="All"), ""

                sample_dropdown.change(
                    load_sample_and_update,
                    inputs=[sample_dropdown],
                    outputs=[story_input, stat_words, stat_dur, chapter_selector, extract_status],
                )
                story_input.change(update_stats, inputs=[story_input], outputs=[stat_words, stat_dur])
                quick_btn.click(
                    quick_generate_gpu,
                    inputs=[story_input, quick_mode, quick_preset, quick_audio, quick_ref_text, quick_design, quick_instruct, quick_lang, quick_speed, quick_temp, quick_fmt],
                    outputs=[quick_output_audio, quick_output_file, quick_status],
                )

                quick_mode.change(on_mode_change, inputs=quick_mode, outputs=[quick_preset, quick_audio, quick_ref_text, quick_design])

                def refresh_chapters(text):
                    if not text:
                        return gr.update(choices=["All"], value="All")
                    pipe = get_pipeline()
                    chs = pipe.detect_chapters(text)
                    choices = ["All"] + [f"Ch{c['idx']+1}: {c['title'][:60]}" for c in chs]
                    return gr.update(choices=choices, value="All")

                refresh_chapters_btn.click(refresh_chapters, inputs=[story_input], outputs=[chapter_selector])

            # ==================== TAB 2: Voice Cast ====================
            with gr.TabItem("🎭 Voice Cast"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("## Narrator")
                        with gr.Column(elem_classes="ab-card"):
                            nar_mode = gr.Dropdown(choices=["preset", "clone", "design"], value="design", label="Mode")
                            nar_preset = gr.Dropdown(choices=list(PRESET_SPEAKERS.keys()), value="Ryan", label="Preset Voice", visible=False)
                            nar_audio = gr.Audio(label="Upload Voice Sample (3–10s)", type="filepath", visible=False)
                            nar_ref_text = gr.Textbox(label="Reference Transcript", placeholder="What does the sample say?", visible=False)
                            nar_design = gr.TextArea(label="Voice Description", placeholder="e.g. A warm, raspy baritone with measured pacing...", visible=True, lines=2, value="A clear, warm, expressive audiobook narrator voice with professional pacing and rich tone.")
                            nar_instruct = gr.Textbox(label="Style Instruction", placeholder="e.g. Calm, measured storytelling.")
                            nar_lang = gr.Dropdown(choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", label="Language")
                            nar_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                            nar_preview_btn = gr.Button("🔊 Preview Narrator", variant="secondary")
                            nar_preview_audio = gr.Audio(label="Preview", interactive=False)
                            nar_preview_status = gr.Textbox(show_label=False, interactive=False)

                            nar_mode.change(on_mode_change, inputs=nar_mode, outputs=[nar_preset, nar_audio, nar_ref_text, nar_design])
                            nar_preview_btn.click(
                                preview_narrator_gpu,
                                inputs=[nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed],
                                outputs=[nar_preview_audio, nar_preview_status],
                            )

                    with gr.Column(scale=2):
                        gr.Markdown("## Character Voices")
                        gr.Markdown("""
Configure up to 8 characters. Each character can use one of three voice modes:

- **Preset** — Choose from 9 built-in speakers (Ryan, Aiden, Serena, etc.)
- **Clone** — Upload a 3–10 second voice sample to clone any real voice
- **Design** — Describe a voice in text (e.g. *"A raspy old man with a warm chuckle"*) and the AI will create it
""")

                        # Parallel lists of components: index k in every list belongs to
                        # character row k. They are consumed later by the extract,
                        # generate and project wiring.
                        char_names, char_descs, char_modes, char_presets = [], [], [], []
                        char_audios, char_ref_texts, char_designs, char_instructs, char_langs, char_speeds = [], [], [], [], [], []
                        char_rows, char_preview_btns, char_preview_audios, char_preview_statuses = [], [], [], []

                        # Build all 8 rows up front; rows are shown/hidden afterwards via
                        # visibility updates rather than being created dynamically.
                        for i in range(8):
                            # Only the first row is visible initially.
                            visible_default = (i == 0)
                            with gr.Group(visible=visible_default) as row:
                                with gr.Row():
                                    cn = gr.Textbox(label="Name", placeholder="e.g. Alice", visible=visible_default)
                                    cd = gr.Textbox(label="Description", placeholder="Personality note", visible=visible_default)
                                    # The default mode is "design", so the preset dropdown and the
                                    # clone inputs below start hidden regardless of row visibility.
                                    cm = gr.Dropdown(label="Mode", choices=["preset", "clone", "design"], value="design", visible=visible_default)
                                    cp = gr.Dropdown(label="Preset", choices=list(PRESET_SPEAKERS.keys()), value="Ryan", visible=False)
                                with gr.Row():
                                    ca = gr.Audio(label="Voice Sample", type="filepath", visible=False)
                                    crt = gr.Textbox(label="Ref Transcript", placeholder="What the sample says", visible=False)
                                    cdes = gr.TextArea(label="Voice Description", placeholder="e.g. A shrill, nervous teenager.", visible=visible_default, lines=2)
                                    cinstr = gr.Textbox(label="Style Instruction", placeholder="e.g. Angry and loud.", visible=visible_default)
                                    cl = gr.Dropdown(label="Language", choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", visible=visible_default)
                                    cspd = gr.Slider(label="Speed", minimum=0.5, maximum=2.0, value=1.0, step=0.1, visible=visible_default)
                                with gr.Row():
                                    cpv_btn = gr.Button("🔊 Preview", variant="secondary", visible=visible_default)
                                    cpv_audio = gr.Audio(label="Preview", interactive=False, visible=visible_default)
                                    cpv_status = gr.Textbox(show_label=False, interactive=False, visible=visible_default)

                                # on_mode_change is defined outside this view; presumably it shows
                                # only the widgets relevant to the selected mode — confirm there.
                                cm.change(on_mode_change, inputs=cm, outputs=[cp, ca, crt, cdes])
                                # The preview handler receives only this row's own settings.
                                cpv_btn.click(
                                    preview_char_voice_gpu,
                                    inputs=[cn, cm, cp, ca, crt, cdes, cinstr, cl, cspd],
                                    outputs=[cpv_audio, cpv_status],
                                )

                            # Register this row's components in the parallel lists.
                            char_rows.append(row)
                            char_names.append(cn)
                            char_descs.append(cd)
                            char_modes.append(cm)
                            char_presets.append(cp)
                            char_audios.append(ca)
                            char_ref_texts.append(crt)
                            char_designs.append(cdes)
                            char_instructs.append(cinstr)
                            char_langs.append(cl)
                            char_speeds.append(cspd)
                            char_preview_btns.append(cpv_btn)
                            char_preview_audios.append(cpv_audio)
                            char_preview_statuses.append(cpv_status)

            # ==================== TAB 3: Generate ====================
            with gr.TabItem("⚡ Generate"):
                gr.Markdown("_Note: The first generation downloads Qwen3-TTS 1.7B models (~5 GB) and may take 2–5 minutes. Subsequent runs are much faster._")
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Settings")
                        gen_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
                        gen_seed = gr.Number(value=42, precision=0, label="Seed (fix for consistency)")
                        output_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3", label="Output Format")
                        gen_btn = gr.Button("▶️ Generate Full Audiobook", variant="primary", size="lg")
                        gen_progress = gr.Textbox(label="Progress", interactive=False, value="Ready.")

                    with gr.Column(scale=2):
                        gr.Markdown("### Output")
                        output_audio = gr.Audio(label="Generated Audiobook", type="filepath", interactive=False)
                        output_file = gr.File(label="Download", interactive=False)
                        output_status = gr.Textbox(label="Status", interactive=False)
                        segment_list = gr.HTML(label="Segments")

            # ==================== TAB 4: Project ====================
            with gr.TabItem("💾 Project"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Save Project")
                        save_btn = gr.Button("💾 Save Configuration", variant="primary")
                        project_json = gr.TextArea(label="Project JSON (copy this to save)", lines=10, interactive=True)
                    with gr.Column():
                        gr.Markdown("### Load Project")
                        load_json = gr.TextArea(label="Paste Project JSON here", lines=10, interactive=True)
                        load_btn = gr.Button("📂 Load Configuration", variant="secondary")
                        load_status = gr.Textbox(label="Status", interactive=False)

            # ==================== TAB 5: About ====================
            with gr.TabItem("ℹ️ About"):
                gr.Markdown("""
                ## AudioBook Forge

                **Model-agnostic, high-fidelity audiobook generator** powered by [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS).

                ### Features
                - 📁 **File Upload** — Import EPUB, PDF, TXT, or HTML directly
                - 📖 **Chapter Detection** — Auto-detects chapters/sections for selective generation
                - 🎙️ **Character Voice Mapping** — Auto-extract characters and assign unique voices
                - 🎭 **Three Voice Modes** — Preset (9 speakers), Clone (upload sample), Design (text description)
                - ⚡ **Quick Generate** — One-click audiobook with a single narrator voice
                - 🎚️ **Speed Control** — Adjust playback speed per voice (0.5x–2.0x)
                - 📦 **Multi-format Export** — MP3, WAV, or ZIP of individual segments
                - 💾 **Save/Load Projects** — Export and restore your voice configurations
                - 🌐 **10 Languages** — English, Chinese, Japanese, Korean, German, French, Spanish, Italian, Portuguese, Russian
                - ⚡ **ZeroGPU** — Runs on Hugging Face ZeroGPU (free compute)

                ### Workflow
                1. **Upload or paste** your story text
                2. **Detect chapters** (optional) and select a range
                3. **Extract characters** or use Quick Generate for simple narration
                4. **Assign voices** to narrator and each character
                5. **Generate** and download your audiobook

                ### Tips for Best Quality
                - Use clean, noise-free voice samples for cloning (3–10 seconds)
                - Keep reference transcripts accurate
                - Lower temperature (0.5–0.6) for stable narration; higher (0.8–0.9) for expressive dialogue
                - Use a fixed seed to prevent voice drift across segments
                - Use speed adjustment to fine-tune pacing per character

                ### Note on First Run
                The first time you generate audio, the Space downloads the Qwen3-TTS 1.7B models (~5 GB total). This can take **2–5 minutes** depending on network speed. Subsequent runs are much faster because models are cached. Please be patient — the progress is printed in the server logs.
                """)

        # ---------- Extract wiring ----------
        def do_extract(text):
            """Extract characters from the story and refresh the character rows.

            Calls ``extract_chars(text)`` and returns a flat list: the status
            message first, then 14 ``gr.update`` objects for each of the 8 row
            slots, in the same component order as ``extract_outputs``
            (row group, name, description, mode, preset, voice sample,
            ref transcript, voice description, style instruction, language,
            speed, preview button, preview audio, preview status).

            Slots beyond the number of extracted characters are hidden; only
            the first 8 extracted characters are shown.
            """
            found, status_msg = extract_chars(text)
            result = [status_msg]

            for slot in range(8):
                if slot >= len(found):
                    # Unused slot: hide the row group and every widget in it.
                    result += [gr.update(visible=False) for _ in range(14)]
                    continue

                entry = found[slot]
                voice_mode = entry.get("voice_mode", "design")
                # Both clone widgets (sample + transcript) share one visibility flag.
                show_clone = voice_mode == "clone"

                result += [
                    gr.update(visible=True),  # row group
                    gr.update(value=entry.get("name", ""), visible=True),
                    gr.update(value=entry.get("description", ""), visible=True),
                    gr.update(value=voice_mode, visible=True),
                    gr.update(value=entry.get("voice_preset", "Ryan"), visible=voice_mode == "preset"),
                    gr.update(visible=show_clone),  # voice sample
                    gr.update(visible=show_clone),  # ref transcript
                    gr.update(value=entry.get("voice_description", ""), visible=voice_mode == "design"),
                    gr.update(value=entry.get("voice_instruct", ""), visible=True),
                    gr.update(value=entry.get("language", "English"), visible=True),
                    gr.update(value=entry.get("speed", 1.0), visible=True),
                    gr.update(visible=True),  # preview button
                    gr.update(visible=True),  # preview audio
                    gr.update(visible=True),  # preview status
                ]

            return result

        # Flat output list for the extract handler: the status box first, then
        # each row's 14 components in row-major order. This order must match
        # the updates returned by do_extract.
        extract_outputs = [extract_status]
        for row_components in zip(
            char_rows, char_names, char_descs, char_modes, char_presets,
            char_audios, char_ref_texts, char_designs, char_instructs, char_langs,
            char_speeds, char_preview_btns, char_preview_audios, char_preview_statuses,
        ):
            extract_outputs.extend(row_components)
        extract_btn.click(do_extract, inputs=[story_input], outputs=extract_outputs)

        # ---------- Generate wiring ----------
        # Character inputs are concatenated field by field (all 8 names, then
        # all 8 descriptions, ...), not row by row, so the receiving handler
        # gets 10 consecutive groups of 8 values.
        all_char_inputs = (
            char_names + char_descs + char_modes + char_presets +
            char_audios + char_ref_texts + char_designs + char_instructs + char_langs + char_speeds
        )

        # Positional inputs for the generate handler: story text and chapter
        # selection first, then the narrator settings, the generation settings,
        # and finally every character field.
        gen_inputs = [
            story_input, chapter_selector,
            nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
            gen_temp, gen_seed, output_fmt,
        ] + all_char_inputs

        def wrapped_generate(story_text, chapter_sel, *args):
            """Resolve the chapter selection to text, then run generation.

            ``story_text`` and ``chapter_sel`` are passed to ``get_chapter_text``;
            its result replaces the story text as the first argument of
            ``generate_audiobook_gpu``. All remaining positional values are
            forwarded unchanged, and the generator's result is returned as-is.
            """
            return generate_audiobook_gpu(
                get_chapter_text(story_text, chapter_sel),
                *args,
            )

        gen_btn.click(
            wrapped_generate,
            inputs=gen_inputs,
            outputs=[output_audio, output_file, segment_list, output_status, gen_progress],
        )

        # ---------- Project wiring ----------
        # Saving reads the story, the narrator settings, every character field
        # (field-major, as in all_char_inputs) and finally temperature and seed.
        save_inputs = [
            story_input,
            nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
        ] + all_char_inputs + [gen_temp, gen_seed]
        save_btn.click(do_save_project, inputs=save_inputs, outputs=[project_json])

        # Loading writes back the story and narrator settings, then reuses the
        # per-row component list from extraction (minus its leading status box),
        # and ends with the load status. do_load_project is defined outside this
        # view; it is expected to return one value per component in this order.
        load_outputs = [story_input, nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed] + extract_outputs[1:] + [load_status]
        load_btn.click(do_load_project, inputs=[load_json], outputs=load_outputs)

    return demo


# Build the UI once at import time so `demo` is available as a module-level name.
demo = build_app()

if __name__ == "__main__":
    # When run as a script, serve on all network interfaces at port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)