"""
AudioBook Forge - Enhanced Gradio Frontend

High-fidelity audiobook generator with character voice mapping,
file upload, chapter selection, segment previews, and project save/load.
"""

import html
import json
import os
from pathlib import Path
from typing import Dict, List, Optional

import gradio as gr
import numpy as np

# ---------------------------------------------------------------------------
# spaces / ZeroGPU compatibility
# ---------------------------------------------------------------------------
try:
    import spaces
except ImportError:

    class _SpacesGPU:
        """No-op stand-in for ``spaces.GPU(duration=...)`` when ``spaces`` is missing."""

        def __init__(self, duration=60):
            self.duration = duration

        def __call__(self, fn):
            # Outside ZeroGPU there is nothing to reserve: return fn unchanged.
            return fn

    class spaces:  # noqa: N801 - deliberately shadows the missing module name
        GPU = _SpacesGPU


# ---------------------------------------------------------------------------
# Backend imports
# ---------------------------------------------------------------------------
from backend import (
    AudiobookPipeline,
    VoiceConfig,
    PRESET_SPEAKERS,
    SAMPLE_STORIES,
    save_project,
    load_project,
    estimate_duration,
)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
MAX_CHARACTERS = 8           # character rows available in the Voice Cast tab
CHAR_FIELD_COUNT = 10        # per-character values sent to generate / save
CHAR_WIDGETS_PER_ROW = 14    # per-character components updated by extract / load
NARRATOR_WIDGET_COUNT = 8    # narrator components updated by project load
MAX_SEGMENT_PREVIEWS = 50    # segments listed in the HTML preview
LONG_TEXT_WORDS = 5000       # word count above which a warning is printed
DEFAULT_SEED = 42
DEFAULT_TEMPERATURE = 0.7
DEFAULT_PRESET = "Ryan"
VOICE_MODES = ["preset", "clone", "design"]
LANGUAGES = [
    "English", "Chinese", "Japanese", "Korean", "German",
    "French", "Spanish", "Italian", "Portuguese", "Russian",
]
DEFAULT_NARRATOR_DESIGN = (
    "A clear, warm, expressive audiobook narrator voice with professional "
    "pacing and rich tone."
)

# Sentinel: "do not touch the component value, only its visibility".
_UNSET = object()

# ---------------------------------------------------------------------------
# CSS & Theme
# ---------------------------------------------------------------------------
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

body, .gradio-container {
    font-family: 'Inter', sans-serif !important;
    background: #0f172a !important;
    color: #f8fafc !important;
}
.gradio-container { max-width: 1200px !important; }

.ab-header {
    text-align: center;
    padding: 2.2rem 1rem 1.8rem;
    background: linear-gradient(135deg, rgba(99,102,241,0.12) 0%, rgba(34,211,238,0.06) 100%);
    border-radius: 18px;
    margin-bottom: 1.5rem;
    border: 1px solid rgba(99,102,241,0.18);
}
.ab-header h1 {
    font-size: 2.6rem;
    font-weight: 700;
    margin: 0;
    background: linear-gradient(90deg, #a5b4fc, #22d3ee);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
}
.ab-header p { color: #94a3b8; margin-top: 0.6rem; font-size: 1.05rem; }

.ab-card {
    background: #1e293b !important;
    border: 1px solid #334155 !important;
    border-radius: 14px !important;
    padding: 1.25rem !important;
}
.ab-stat {
    background: #0f172a;
    border: 1px solid #334155;
    border-radius: 10px;
    padding: 0.75rem 1rem;
    text-align: center;
}
.ab-stat .value { font-size: 1.4rem; font-weight: 700; color: #22d3ee; }
.ab-stat .label {
    font-size: 0.75rem;
    color: #94a3b8;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}

button.primary {
    background: linear-gradient(135deg, #6366f1, #4f46e5) !important;
    border: none !important;
    border-radius: 10px !important;
    font-weight: 600 !important;
    transition: all 0.2s ease !important;
}
button.primary:hover {
    transform: translateY(-1px);
    box-shadow: 0 4px 14px rgba(99,102,241,0.4) !important;
}
button.secondary {
    background: #334155 !important;
    border: 1px solid #475569 !important;
    border-radius: 10px !important;
    color: #f8fafc !important;
}

input, textarea, select {
    background: #0f172a !important;
    border: 1px solid #334155 !important;
    border-radius: 8px !important;
    color: #f8fafc !important;
}
input:focus, textarea:focus, select:focus {
    border-color: #6366f1 !important;
    box-shadow: 0 0 0 3px rgba(99,102,241,0.15) !important;
}

.gr-box, .gr-form { background: #1e293b !important; border-color: #334155 !important; }
.gr-panel { background: #1e293b !important; }
.tabitem { background: #1e293b !important; border-color: #334155 !important; }

input[type="checkbox"] + label, .checkbox-label, .gr-checkbox label {
    color: #f8fafc !important;
}

/* Gradio 5+ checkbox checked state - make it clearly visible in dark theme */
.gr-checkbox input[type="checkbox"]:checked + label,
.gr-checkbox-checked label,
.gr-checkbox-input:checked + .gr-checkbox-border,
.gr-checkbox-input:checked + label .gr-checkbox-border,
input[type="checkbox"]:checked + label span {
    background: #6366f1 !important;
    border-color: #818cf8 !important;
    box-shadow: 0 0 0 3px rgba(99,102,241,0.35) !important;
}
.gr-checkbox input[type="checkbox"]:checked + label::after,
.gr-checkbox-input:checked + label::after {
    border-color: #ffffff !important;
}
.gr-checkbox { color: #f8fafc !important; }
.gr-checkbox-input:checked + * {
    background: #6366f1 !important;
    border-color: #818cf8 !important;
}

li, .prose li, .gr-prose li { color: #cbd5e1 !important; }
strong, b { color: #f8fafc !important; }
code {
    background: #334155 !important;
    color: #22d3ee !important;
    padding: 0.1rem 0.3rem !important;
    border-radius: 4px !important;
}

progress { width: 100%; height: 8px; border-radius: 4px; background: #334155; }
progress::-webkit-progress-bar { background: #334155; border-radius: 4px; }
progress::-webkit-progress-value {
    background: linear-gradient(90deg, #6366f1, #22d3ee);
    border-radius: 4px;
}

.seg-item {
    background: #0f172a;
    border: 1px solid #334155;
    border-radius: 8px;
    padding: 0.5rem 0.75rem;
    margin-bottom: 0.4rem;
    font-size: 0.85rem;
}
.seg-item .seg-type {
    display: inline-block;
    padding: 0.1rem 0.4rem;
    border-radius: 4px;
    font-size: 0.7rem;
    font-weight: 600;
    text-transform: uppercase;
}
.seg-type.narration { background: #4f46e5; color: #fff; }
.seg-type.dialogue { background: #22d3ee; color: #0f172a; }
"""

# NOTE(review): the tags of this header were missing from the file; the
# structure below is reconstructed from the `.ab-header`, `.ab-header h1` and
# `.ab-header p` CSS rules above - confirm against the intended markup.
HEADER_HTML = """
<div class="ab-header">
  <h1>AudioBook Forge</h1>
  <p>High-fidelity audiobooks with AI character voices. Model-agnostic TTS powered by Qwen3-TTS.</p>
</div>
"""

CHARACTER_HELP_MARKDOWN = """
Configure up to 8 characters. Each character can use one of three voice modes:

- **Preset** — Choose from 9 built-in speakers (Ryan, Aiden, Serena, etc.)
- **Clone** — Upload a 3–10 second voice sample to clone any real voice
- **Design** — Describe a voice in text (e.g. *"A raspy old man with a warm chuckle"*) and the AI will create it
"""

ABOUT_MARKDOWN = """
## AudioBook Forge

**Model-agnostic, high-fidelity audiobook generator** powered by [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS).

### Features

- 📁 **File Upload** — Import EPUB, PDF, TXT, or HTML directly
- 📖 **Chapter Detection** — Auto-detects chapters/sections for selective generation
- 🎙️ **Character Voice Mapping** — Auto-extract characters and assign unique voices
- 🎭 **Three Voice Modes** — Preset (9 speakers), Clone (upload sample), Design (text description)
- ⚡ **Quick Generate** — One-click audiobook with a single narrator voice
- 🎚️ **Speed Control** — Adjust playback speed per voice (0.5x–2.0x)
- 📦 **Multi-format Export** — MP3, WAV, or ZIP of individual segments
- 💾 **Save/Load Projects** — Export and restore your voice configurations
- 🌐 **10 Languages** — English, Chinese, Japanese, Korean, German, French, Spanish, Italian, Portuguese, Russian
- ⚡ **ZeroGPU** — Runs on Hugging Face ZeroGPU (free compute)

### Workflow

1. **Upload or paste** your story text
2. **Detect chapters** (optional) and select a range
3. **Extract characters** or use Quick Generate for simple narration
4. **Assign voices** to narrator and each character
5. **Generate** and download your audiobook

### Tips for Best Quality

- Use clean, noise-free voice samples for cloning (3–10 seconds)
- Keep reference transcripts accurate
- Lower temperature (0.5–0.6) for stable narration; higher (0.8–0.9) for expressive dialogue
- Use a fixed seed to prevent voice drift across segments
- Use speed adjustment to fine-tune pacing per character

### Note on First Run

The first time you generate audio, the Space downloads the Qwen3-TTS 1.7B models (~5 GB total).
This can take **2–5 minutes** depending on network speed. Subsequent runs are much faster because
models are cached. Please be patient — the progress is printed in the server logs.
"""

# ---------------------------------------------------------------------------
# Global State
# ---------------------------------------------------------------------------
_pipeline: Optional[AudiobookPipeline] = None


def get_pipeline() -> AudiobookPipeline:
    """Return the process-wide ``AudiobookPipeline``, creating it on first use."""
    global _pipeline
    if _pipeline is None:
        _pipeline = AudiobookPipeline()
    return _pipeline


# ---------------------------------------------------------------------------
# Private helpers
# ---------------------------------------------------------------------------
def _coerce_seed(seed, default: int = DEFAULT_SEED) -> int:
    """Return *seed* as an int, or *default* when the seed field is empty."""
    if seed is None or seed == "":
        return default
    return int(seed)


def _make_voice_config(name, mode, preset, audio, ref_text, design, instruct,
                       language, speed, **extra) -> VoiceConfig:
    """Build a ``VoiceConfig``, keeping only the fields relevant to *mode*.

    Preset / clone / design specific values are set to ``None`` unless *mode*
    selects them; a falsy *speed* falls back to 1.0. *extra* is forwarded to
    ``VoiceConfig`` unchanged (used for ``description``).
    """
    return VoiceConfig(
        name=name,
        mode=mode,
        preset=preset if mode == "preset" else None,
        ref_audio=audio if mode == "clone" and audio else None,
        ref_text=ref_text if mode == "clone" else None,
        design_desc=design if mode == "design" else None,
        instruct=instruct,
        language=language,
        speed=float(speed) if speed else 1.0,
        **extra,
    )


def _split_char_columns(args) -> List[list]:
    """Split the flat widget values into ``CHAR_FIELD_COUNT`` per-field columns.

    The UI passes names, descriptions, modes, presets, audios, ref texts,
    designs, instructions, languages and speeds as ten consecutive runs of
    ``MAX_CHARACTERS`` values each.
    """
    return [
        list(args[k * MAX_CHARACTERS:(k + 1) * MAX_CHARACTERS])
        for k in range(CHAR_FIELD_COUNT)
    ]


def _build_character_configs(args, *, include_description: bool = False) -> Dict[str, VoiceConfig]:
    """Create one ``VoiceConfig`` per named character row.

    Rows with an empty name are skipped. ``description`` is only passed to
    ``VoiceConfig`` when *include_description* is true (project save); audio
    generation never passed it.
    """
    configs: Dict[str, VoiceConfig] = {}
    for (name, desc, mode, preset, audio, ref_text,
         design, instruct, lang, speed) in zip(*_split_char_columns(args)):
        if not name:
            continue
        extra = {"description": desc or ""} if include_description else {}
        configs[name] = _make_voice_config(
            name, mode, preset, audio, ref_text, design, instruct or "", lang, speed, **extra
        )
    return configs


def _export_output(pipe, output_path, seg_paths, output_fmt) -> tuple:
    """Produce the requested export and return ``(audio_path, download_path)``.

    * ``"wav"`` - re-renders the segments to a ``.wav`` next to *output_path*.
    * ``"zip"`` - bundles the individual segments; the audio player keeps the
      playable *output_path* because a zip archive cannot be played.
    * anything else - both paths are *output_path*.
    """
    if output_fmt == "wav":
        from backend import save_audiobook

        # with_suffix only touches the final extension, unlike str.replace.
        wav_path = str(Path(output_path).with_suffix(".wav"))
        save_audiobook(seg_paths, wav_path, fmt="wav")
        return wav_path, wav_path
    if output_fmt == "zip":
        zip_path = pipe.export_segments_zip(seg_paths) or output_path
        return output_path, zip_path
    return output_path, output_path


def _render_segments_html(seg_meta) -> str:
    """Render up to ``MAX_SEGMENT_PREVIEWS`` segments as an HTML list.

    Segment fields originate from user-supplied story text, so they are
    HTML-escaped before being embedded.

    NOTE(review): the original tags were missing from the file; this structure
    is reconstructed from the `.seg-item` / `.seg-type` CSS rules - confirm.
    """
    parts = ["<div class='seg-list'>"]
    for seg in seg_meta[:MAX_SEGMENT_PREVIEWS]:
        css_class = "narration" if seg["type"] == "narration" else "dialogue"
        parts.append(
            f"<div class='seg-item'>"
            f"<span class='seg-type {css_class}'>{html.escape(str(seg['type']))}</span> "
            f"<strong>{html.escape(str(seg['speaker']))}</strong>: "
            f"{html.escape(str(seg['text']))}</div>"
        )
    hidden = len(seg_meta) - MAX_SEGMENT_PREVIEWS
    if hidden > 0:
        parts.append(f"<div class='seg-item'>... and {hidden} more segments</div>")
    parts.append("</div>")
    return "\n".join(parts)


def _hidden_row_updates() -> list:
    """Updates that hide every component of one character row."""
    return [gr.update(visible=False) for _ in range(CHAR_WIDGETS_PER_ROW)]


def _character_row_updates(*, name, description, mode, preset, design, instruct,
                           language, speed, ref_audio=_UNSET, ref_text=_UNSET) -> list:
    """Updates that show one character row populated with the given values.

    Order matches the per-row component order used for the extract / load
    outputs: group, name, description, mode, preset, audio, ref text, design,
    instruction, language, speed, preview button, preview audio, preview
    status. When *ref_audio* / *ref_text* are left unset only their visibility
    is changed.
    """
    is_clone = mode == "clone"

    def clone_update(value):
        if value is _UNSET:
            return gr.update(visible=is_clone)
        return gr.update(value=value, visible=is_clone)

    return [
        gr.update(visible=True),
        gr.update(value=name, visible=True),
        gr.update(value=description, visible=True),
        gr.update(value=mode, visible=True),
        gr.update(value=preset, visible=mode == "preset"),
        clone_update(ref_audio),
        clone_update(ref_text),
        gr.update(value=design, visible=mode == "design"),
        gr.update(value=instruct, visible=True),
        gr.update(value=language, visible=True),
        gr.update(value=speed, visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
    ]


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def on_mode_change(mode: str) -> tuple:
    """Return visibility updates for (preset, audio, ref text, design) widgets.

    "preset" shows only the preset dropdown, "clone" shows the audio sample and
    transcript, and any other value shows the design description.
    """
    show_preset = mode == "preset"
    show_clone = mode == "clone"
    show_design = not (show_preset or show_clone)
    return (
        gr.update(visible=show_preset),
        gr.update(visible=show_clone),
        gr.update(visible=show_clone),
        gr.update(visible=show_design),
    )


def update_stats(text: str) -> tuple:
    """Return ``(word count as str, estimated duration)`` for *text*."""
    word_count = len(text.split()) if text else 0
    return str(word_count), estimate_duration(word_count)


def handle_upload(file_obj) -> tuple:
    """Parse an uploaded file and return ``(story text, status message)``.

    On a parse failure the story box is left untouched (instead of being
    cleared) so previously pasted text is not lost.
    """
    if file_obj is None:
        return "", "No file uploaded."
    try:
        pipe = get_pipeline()
        text, fname = pipe.parse_upload(file_obj)
        text = pipe.processor.clean_text(text)
        chapters = pipe.detect_chapters(text)
        ch_info = " | ".join(f"Ch{c['idx']+1}: {c['word_count']}w" for c in chapters[:5])
        if len(chapters) > 5:
            ch_info += f" (+{len(chapters)-5} more)"
        wc = len(text.split())
        dur = estimate_duration(wc)
        return text, f"Loaded {fname} — {wc} words (~{dur}) | {ch_info if chapters else '1 section'}"
    except Exception as e:  # UI boundary: report instead of crashing the app
        return gr.update(), f"Error: {e}"


def extract_chars(text: str) -> tuple:
    """Run character extraction and return ``(characters, status message)``."""
    if not text or len(text.strip()) < 20:
        return [], "Text too short. Please paste at least a paragraph."
    chars = get_pipeline().extract_characters(text, use_ai=True)
    if chars:
        status = f"Found {len(chars)} characters: {', '.join(c['name'] for c in chars)}"
    else:
        status = "No characters auto-detected. Add them manually below."
    return chars, status


def get_chapter_text(text: str, chapter_sel: str) -> str:
    """Return the text of the selected chapter, or all of *text*.

    *chapter_sel* is a dropdown label of the form ``"Ch<N>: <title>"`` or
    ``"All"``. Any failure to parse or look up the chapter falls back to the
    full text (deliberate best effort).
    """
    if not text or chapter_sel == "All" or not chapter_sel:
        return text
    try:
        idx = int(chapter_sel.split(":")[0].replace("Ch", "")) - 1
        return get_pipeline().get_chapter_text(text, idx)
    except Exception:
        return text


# ---------------------------------------------------------------------------
# GPU-wrapped functions (ZeroGPU)
# ---------------------------------------------------------------------------
@spaces.GPU(duration=180)
def generate_audiobook_gpu(
    text, nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design,
    nar_instruct, nar_lang, nar_speed, gen_temp, gen_seed, output_fmt, *args
):
    """Generate the full multi-voice audiobook.

    *args* carries the flattened character widgets
    (``CHAR_FIELD_COUNT`` x ``MAX_CHARACTERS`` values).

    Returns ``(audio path, download path, segment HTML, status, progress)``;
    on failure the two paths are ``None`` and the status holds the error.
    """
    if not text or len(text.strip()) < 50:
        return None, None, "", "Error: Please provide at least 50 characters of story text.", ""

    wc = len(text.split())
    if wc > LONG_TEXT_WORDS:
        print(f"[WARN] Long text: {wc} words. Generation may take a while or hit timeouts.")

    pipe = get_pipeline()
    nar_cfg = _make_voice_config(
        "Narrator", nar_mode, nar_preset, nar_audio, nar_ref_text,
        nar_design, nar_instruct, nar_lang, nar_speed,
    )
    char_configs = _build_character_configs(args)

    progress_text = ""

    def prog_cb(ratio: float, msg: str):
        nonlocal progress_text
        progress_text = f"[{ratio*100:.0f}%] {msg}"
        print(progress_text)

    try:
        output_path, seg_paths, seg_meta = pipe.generate(
            text=text,
            narrator_config=nar_cfg,
            character_configs=char_configs,
            progress_callback=prog_cb,
            temperature=gen_temp,
            seed=_coerce_seed(gen_seed),
        )
        seg_html = _render_segments_html(seg_meta)
        audio_path, download_path = _export_output(pipe, output_path, seg_paths, output_fmt)
        return (
            audio_path,
            download_path,
            seg_html,
            f"Done! {len(seg_meta)} segments generated.",
            progress_text,
        )
    except Exception as e:  # UI boundary: surface the error in the status box
        import traceback
        traceback.print_exc()
        return None, None, "", f"Error: {str(e)}", progress_text


@spaces.GPU(duration=60)
def preview_narrator_gpu(mode, preset, audio, ref_text, design, instruct, lang, speed):
    """Synthesize a short narrator sample; returns ``((sr, wav) | None, status)``."""
    pipe = get_pipeline()
    vc = _make_voice_config("Narrator", mode, preset, audio, ref_text, design, instruct, lang, speed)
    try:
        wav, sr = pipe.preview_voice(vc)
        return (sr, wav), "Preview ready!"
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Preview failed: {e}"


@spaces.GPU(duration=60)
def preview_char_voice_gpu(name, mode, preset, audio, ref_text, design, instruct, lang, speed):
    """Synthesize a short character sample; returns ``((sr, wav) | None, status)``."""
    pipe = get_pipeline()
    vc = _make_voice_config(
        name or "Character", mode, preset, audio, ref_text, design, instruct, lang, speed
    )
    try:
        sample = f"Hello, I am {name or 'your character'}. This is how I sound in the story."
        wav, sr = pipe.preview_voice(vc, sample_text=sample)
        return (sr, wav), f"{name or 'Character'} preview ready!"
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Preview failed: {e}"


# ---------------------------------------------------------------------------
# Quick Generate
# ---------------------------------------------------------------------------
@spaces.GPU(duration=180)
def quick_generate_gpu(text, mode, preset, audio, ref_text, design, instruct, lang,
                       speed, gen_temp, output_fmt, gen_seed=42):
    """Generate an audiobook with a single narrator voice and no characters.

    Returns ``(audio path, download path, status)``; the paths are ``None`` on
    failure.
    """
    if not text or len(text.strip()) < 50:
        return None, None, "Error: Text too short."

    wc = len(text.split())
    if wc > LONG_TEXT_WORDS:
        print(f"[WARN] Long text: {wc} words. Quick Generate may take a while or hit timeouts.")

    pipe = get_pipeline()
    nar_cfg = _make_voice_config(
        "Narrator", mode, preset, audio, ref_text, design,
        instruct or "Narrate clearly and expressively.", lang, speed,
    )

    def prog_cb(ratio: float, msg: str):
        print(f"[{ratio*100:.0f}%] {msg}")

    try:
        output_path, seg_paths, seg_meta = pipe.generate(
            text=text,
            narrator_config=nar_cfg,
            character_configs={},
            progress_callback=prog_cb,
            temperature=gen_temp,
            seed=_coerce_seed(gen_seed),
        )
        audio_path, download_path = _export_output(pipe, output_path, seg_paths, output_fmt)
        return audio_path, download_path, f"Quick audiobook ready! {len(seg_meta)} segments."
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, f"Error: {str(e)}"


# ---------------------------------------------------------------------------
# Project Save/Load
# ---------------------------------------------------------------------------
def do_save_project(text, nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design,
                    nar_instruct, nar_lang, nar_speed, *args):
    """Serialize the current story and voice setup via ``save_project``.

    *args* holds the flattened character widgets followed by the optional
    temperature and seed; missing or empty values fall back to the defaults.
    """
    char_value_count = MAX_CHARACTERS * CHAR_FIELD_COUNT
    gen_temp = args[char_value_count] if len(args) > char_value_count else DEFAULT_TEMPERATURE
    gen_seed = args[char_value_count + 1] if len(args) > char_value_count + 1 else DEFAULT_SEED

    nar_cfg = _make_voice_config(
        "Narrator", nar_mode, nar_preset, nar_audio, nar_ref_text,
        nar_design, nar_instruct, nar_lang, nar_speed,
    )
    char_configs = _build_character_configs(args, include_description=True)
    settings = {"temperature": gen_temp, "seed": _coerce_seed(gen_seed)}
    return save_project(text, nar_cfg, char_configs, settings)


def do_load_project(json_str):
    """Restore a saved project into the UI.

    Returns one update per component in the load outputs: story text, the
    ``NARRATOR_WIDGET_COUNT`` narrator widgets, ``CHAR_WIDGETS_PER_ROW``
    updates for each of the ``MAX_CHARACTERS`` rows, then the status message.
    On failure every component is left unchanged and only the status is set.
    """
    try:
        data = load_project(json_str)
        nar = data["narrator"]
        chars = data.get("characters", {})

        nar_updates = [
            gr.update(value=nar.mode),
            gr.update(value=nar.preset if nar.preset else DEFAULT_PRESET,
                      visible=nar.mode == "preset"),
            gr.update(value=nar.ref_audio, visible=nar.mode == "clone"),
            gr.update(value=nar.ref_text, visible=nar.mode == "clone"),
            gr.update(value=nar.design_desc, visible=nar.mode == "design"),
            gr.update(value=nar.instruct),
            gr.update(value=nar.language),
            gr.update(value=nar.speed),
        ]

        loaded = list(chars.values())[:MAX_CHARACTERS]
        char_updates = []
        for i in range(MAX_CHARACTERS):
            if i >= len(loaded):
                char_updates.extend(_hidden_row_updates())
                continue
            c = loaded[i]
            char_updates.extend(_character_row_updates(
                name=c.name,
                description=c.description,
                mode=c.mode,
                preset=c.preset if c.preset else DEFAULT_PRESET,
                ref_audio=c.ref_audio,
                ref_text=c.ref_text,
                design=c.design_desc,
                instruct=c.instruct,
                language=c.language,
                speed=c.speed,
            ))

        text_sample = data.get("text_sample", "")
        status = f"Project loaded! {len(chars)} characters configured."
        return [text_sample] + nar_updates + char_updates + [status]
    except Exception as e:
        import traceback
        traceback.print_exc()
        # A failed load must not wipe the story or hide the character rows.
        untouched = 1 + NARRATOR_WIDGET_COUNT + MAX_CHARACTERS * CHAR_WIDGETS_PER_ROW
        return [gr.update() for _ in range(untouched)] + [f"Error loading project: {e}"]


# ---------------------------------------------------------------------------
# Build UI
# ---------------------------------------------------------------------------
def build_app():
    """Construct and return the Gradio ``Blocks`` application.

    NOTE(review): the file's indentation was lost, so the nesting of layout
    containers (rows / columns) below is reconstructed from statement order -
    confirm against the intended layout. Event wiring is unaffected by nesting.
    """
    theme = gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="cyan",
        neutral_hue="slate",
    ).set(
        body_background_fill="#0f172a",
        body_background_fill_dark="#0f172a",
        body_text_color="#f8fafc",
        body_text_color_subdued="#94a3b8",
        background_fill_primary="#1e293b",
        background_fill_secondary="#0f172a",
        border_color_accent="#334155",
        color_accent_soft="#22d3ee",
        button_primary_background_fill="linear-gradient(135deg, #6366f1, #4f46e5)",
        button_primary_background_fill_hover="linear-gradient(135deg, #4f46e5, #4338ca)",
        button_primary_text_color="#ffffff",
        input_background_fill="#0f172a",
        input_border_color="#334155",
        block_title_text_color="#f8fafc",
        block_label_text_color="#94a3b8",
    )

    preset_choices = list(PRESET_SPEAKERS.keys())

    with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="AudioBook Forge") as demo:
        gr.HTML(HEADER_HTML)

        with gr.Tabs():
            # ==================== TAB 1: Story ====================
            with gr.TabItem("📖 Story"):
                with gr.Row():
                    with gr.Column(scale=2):
                        gr.Markdown("### Upload or Paste")
                        file_upload = gr.File(
                            label="Upload EPUB, PDF, TXT, or HTML",
                            file_types=[".txt", ".epub", ".pdf", ".html", ".htm"],
                        )
                        story_input = gr.TextArea(
                            label="Story Text",
                            placeholder="Paste your book chapter, short story, or script here...",
                            lines=18,
                            max_lines=40,
                        )
                        sample_dropdown = gr.Dropdown(
                            label="Or try a sample story",
                            choices=list(SAMPLE_STORIES.keys()),
                            value=None,
                        )
                    with gr.Column(scale=1):
                        gr.Markdown("### Stats")
                        with gr.Row():
                            stat_words = gr.Textbox(label="Words", value="0", interactive=False)
                            stat_dur = gr.Textbox(label="Est. Duration", value="0 sec",
                                                  interactive=False)
                        gr.Markdown("---")
                        gr.Markdown("### Quick Generate")
                        quick_mode = gr.Dropdown(choices=VOICE_MODES, value="design",
                                                 label="Narrator Mode")
                        quick_preset = gr.Dropdown(choices=preset_choices, value=DEFAULT_PRESET,
                                                   label="Preset Voice", visible=False)
                        quick_audio = gr.Audio(label="Upload Voice Sample (3–10s)",
                                               type="filepath", visible=False)
                        quick_ref_text = gr.Textbox(label="Reference Transcript",
                                                    placeholder="What does the sample say?",
                                                    visible=False)
                        quick_design = gr.TextArea(
                            label="Voice Description",
                            placeholder="e.g. A warm, raspy baritone with measured pacing...",
                            visible=True,
                            lines=2,
                            value=DEFAULT_NARRATOR_DESIGN,
                        )
                        quick_instruct = gr.Textbox(
                            label="Style Instruction",
                            placeholder="e.g. Calm, measured storytelling.",
                            value="",
                        )
                        quick_lang = gr.Dropdown(choices=LANGUAGES, value="English",
                                                 label="Language")
                        quick_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                                                label="Speed")
                        quick_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05,
                                               label="Temperature")
                        quick_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3",
                                                label="Output Format")
                        quick_btn = gr.Button("⚡ Quick Generate", variant="primary")
                        quick_output_audio = gr.Audio(label="Quick Audiobook", type="filepath",
                                                      interactive=False)
                        quick_output_file = gr.File(label="Download", interactive=False)
                        quick_status = gr.Textbox(show_label=False, interactive=False)
                        gr.Markdown("---")
                        gr.Markdown(
                            "**Quick Generate** uses a single narrator voice for the entire "
                            "text. Supports preset, clone, or AI-designed voices."
                        )

                with gr.Row():
                    chapter_selector = gr.Dropdown(
                        label="Chapter / Section",
                        choices=["All"],
                        value="All",
                        interactive=True,
                    )
                    refresh_chapters_btn = gr.Button("🔄 Detect Chapters")
                    clear_story_btn = gr.Button("🗑️ Clear", variant="secondary")

                def clear_story():
                    """Reset the story text, chapter selector and stats."""
                    return "", gr.update(choices=["All"], value="All"), "0", "0 sec"

                clear_story_btn.click(
                    clear_story,
                    inputs=[],
                    outputs=[story_input, chapter_selector, stat_words, stat_dur],
                )

                with gr.Row():
                    gr.Markdown("### Character Detection")
                    extract_btn = gr.Button("🔍 Extract Characters", variant="primary")
                extract_status = gr.Textbox(label="Status", interactive=False)

                # Wiring
                file_upload.change(handle_upload, inputs=[file_upload],
                                   outputs=[story_input, extract_status])

                def load_sample_and_update(name):
                    """Load a bundled sample story and refresh the dependent widgets."""
                    text = SAMPLE_STORIES.get(name, "")
                    wc = len(text.split()) if text else 0
                    dur = estimate_duration(wc)
                    return text, str(wc), dur, gr.update(choices=["All"], value="All"), ""

                sample_dropdown.change(
                    load_sample_and_update,
                    inputs=[sample_dropdown],
                    outputs=[story_input, stat_words, stat_dur, chapter_selector,
                             extract_status],
                )
                story_input.change(update_stats, inputs=[story_input],
                                   outputs=[stat_words, stat_dur])

                quick_btn.click(
                    quick_generate_gpu,
                    inputs=[story_input, quick_mode, quick_preset, quick_audio,
                            quick_ref_text, quick_design, quick_instruct, quick_lang,
                            quick_speed, quick_temp, quick_fmt],
                    outputs=[quick_output_audio, quick_output_file, quick_status],
                )
                quick_mode.change(
                    on_mode_change,
                    inputs=quick_mode,
                    outputs=[quick_preset, quick_audio, quick_ref_text, quick_design],
                )

                def refresh_chapters(text):
                    """Re-detect chapters and repopulate the chapter dropdown."""
                    if not text:
                        return gr.update(choices=["All"], value="All")
                    chapters = get_pipeline().detect_chapters(text)
                    choices = ["All"] + [f"Ch{c['idx']+1}: {c['title'][:60]}" for c in chapters]
                    return gr.update(choices=choices, value="All")

                refresh_chapters_btn.click(refresh_chapters, inputs=[story_input],
                                           outputs=[chapter_selector])

            # ==================== TAB 2: Voice Cast ====================
            with gr.TabItem("🎭 Voice Cast"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("## Narrator")
                        with gr.Column(elem_classes="ab-card"):
                            nar_mode = gr.Dropdown(choices=VOICE_MODES, value="design",
                                                   label="Mode")
                            nar_preset = gr.Dropdown(choices=preset_choices,
                                                     value=DEFAULT_PRESET,
                                                     label="Preset Voice", visible=False)
                            nar_audio = gr.Audio(label="Upload Voice Sample (3–10s)",
                                                 type="filepath", visible=False)
                            nar_ref_text = gr.Textbox(label="Reference Transcript",
                                                      placeholder="What does the sample say?",
                                                      visible=False)
                            nar_design = gr.TextArea(
                                label="Voice Description",
                                placeholder="e.g. A warm, raspy baritone with measured pacing...",
                                visible=True,
                                lines=2,
                                value=DEFAULT_NARRATOR_DESIGN,
                            )
                            nar_instruct = gr.Textbox(
                                label="Style Instruction",
                                placeholder="e.g. Calm, measured storytelling.",
                            )
                            nar_lang = gr.Dropdown(choices=LANGUAGES, value="English",
                                                   label="Language")
                            nar_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0,
                                                  step=0.1, label="Speed")
                            nar_preview_btn = gr.Button("🔊 Preview Narrator",
                                                        variant="secondary")
                            nar_preview_audio = gr.Audio(label="Preview", interactive=False)
                            nar_preview_status = gr.Textbox(show_label=False,
                                                            interactive=False)

                        nar_mode.change(
                            on_mode_change,
                            inputs=nar_mode,
                            outputs=[nar_preset, nar_audio, nar_ref_text, nar_design],
                        )
                        nar_preview_btn.click(
                            preview_narrator_gpu,
                            inputs=[nar_mode, nar_preset, nar_audio, nar_ref_text,
                                    nar_design, nar_instruct, nar_lang, nar_speed],
                            outputs=[nar_preview_audio, nar_preview_status],
                        )

                    with gr.Column(scale=2):
                        gr.Markdown("## Character Voices")
                        gr.Markdown(CHARACTER_HELP_MARKDOWN)

                        char_names, char_descs, char_modes, char_presets = [], [], [], []
                        char_audios, char_ref_texts, char_designs = [], [], []
                        char_instructs, char_langs, char_speeds = [], [], []
                        char_rows, char_preview_btns = [], []
                        char_preview_audios, char_preview_statuses = [], []

                        for i in range(MAX_CHARACTERS):
                            # Only the first row is shown until characters are
                            # extracted or a project is loaded.
                            visible_default = (i == 0)
                            with gr.Group(visible=visible_default) as row:
                                with gr.Row():
                                    cn = gr.Textbox(label="Name", placeholder="e.g. Alice",
                                                    visible=visible_default)
                                    cd = gr.Textbox(label="Description",
                                                    placeholder="Personality note",
                                                    visible=visible_default)
                                    cm = gr.Dropdown(label="Mode", choices=VOICE_MODES,
                                                     value="design", visible=visible_default)
                                    cp = gr.Dropdown(label="Preset", choices=preset_choices,
                                                     value=DEFAULT_PRESET, visible=False)
                                with gr.Row():
                                    ca = gr.Audio(label="Voice Sample", type="filepath",
                                                  visible=False)
                                    crt = gr.Textbox(label="Ref Transcript",
                                                     placeholder="What the sample says",
                                                     visible=False)
                                    cdes = gr.TextArea(
                                        label="Voice Description",
                                        placeholder="e.g. A shrill, nervous teenager.",
                                        visible=visible_default,
                                        lines=2,
                                    )
                                    cinstr = gr.Textbox(label="Style Instruction",
                                                        placeholder="e.g. Angry and loud.",
                                                        visible=visible_default)
                                    cl = gr.Dropdown(label="Language", choices=LANGUAGES,
                                                     value="English", visible=visible_default)
                                    cspd = gr.Slider(label="Speed", minimum=0.5, maximum=2.0,
                                                     value=1.0, step=0.1,
                                                     visible=visible_default)
                                with gr.Row():
                                    cpv_btn = gr.Button("🔊 Preview", variant="secondary",
                                                        visible=visible_default)
                                    cpv_audio = gr.Audio(label="Preview", interactive=False,
                                                         visible=visible_default)
                                    cpv_status = gr.Textbox(show_label=False,
                                                            interactive=False,
                                                            visible=visible_default)

                            cm.change(on_mode_change, inputs=cm, outputs=[cp, ca, crt, cdes])
                            cpv_btn.click(
                                preview_char_voice_gpu,
                                inputs=[cn, cm, cp, ca, crt, cdes, cinstr, cl, cspd],
                                outputs=[cpv_audio, cpv_status],
                            )

                            char_rows.append(row)
                            char_names.append(cn)
                            char_descs.append(cd)
                            char_modes.append(cm)
                            char_presets.append(cp)
                            char_audios.append(ca)
                            char_ref_texts.append(crt)
                            char_designs.append(cdes)
                            char_instructs.append(cinstr)
                            char_langs.append(cl)
                            char_speeds.append(cspd)
                            char_preview_btns.append(cpv_btn)
                            char_preview_audios.append(cpv_audio)
                            char_preview_statuses.append(cpv_status)

            # ==================== TAB 3: Generate ====================
            with gr.TabItem("⚡ Generate"):
                gr.Markdown(
                    "_Note: The first generation downloads Qwen3-TTS 1.7B models (~5 GB) "
                    "and may take 2–5 minutes. Subsequent runs are much faster._"
                )
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Settings")
                        gen_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05,
                                             label="Temperature")
                        gen_seed = gr.Number(value=42, precision=0,
                                             label="Seed (fix for consistency)")
                        output_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3",
                                                 label="Output Format")
                        gen_btn = gr.Button("▶️ Generate Full Audiobook", variant="primary",
                                            size="lg")
                        gen_progress = gr.Textbox(label="Progress", interactive=False,
                                                  value="Ready.")
                    with gr.Column(scale=2):
                        gr.Markdown("### Output")
                        output_audio = gr.Audio(label="Generated Audiobook", type="filepath",
                                                interactive=False)
                        output_file = gr.File(label="Download", interactive=False)
                        output_status = gr.Textbox(label="Status", interactive=False)
                        segment_list = gr.HTML(label="Segments")

            # ==================== TAB 4: Project ====================
            with gr.TabItem("💾 Project"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Save Project")
                        save_btn = gr.Button("💾 Save Configuration", variant="primary")
                        project_json = gr.TextArea(label="Project JSON (copy this to save)",
                                                   lines=10, interactive=True)
                    with gr.Column():
                        gr.Markdown("### Load Project")
                        load_json = gr.TextArea(label="Paste Project JSON here", lines=10,
                                                interactive=True)
                        load_btn = gr.Button("📂 Load Configuration", variant="secondary")
                        load_status = gr.Textbox(label="Status", interactive=False)

            # ==================== TAB 5: About ====================
            with gr.TabItem("ℹ️ About"):
                gr.Markdown(ABOUT_MARKDOWN)

        # ---------- Extract wiring ----------
        def do_extract(text):
            """Populate the character rows from auto-detected characters.

            When nothing is detected the rows are left exactly as they are, so
            the user can still add characters manually as the status suggests.
            """
            chars, status = extract_chars(text)
            if not chars:
                untouched = MAX_CHARACTERS * CHAR_WIDGETS_PER_ROW
                return [status] + [gr.update() for _ in range(untouched)]

            updates = []
            for i in range(MAX_CHARACTERS):
                if i >= len(chars):
                    updates.extend(_hidden_row_updates())
                    continue
                info = chars[i]
                updates.extend(_character_row_updates(
                    name=info.get("name", ""),
                    description=info.get("description", ""),
                    mode=info.get("voice_mode", "design"),
                    preset=info.get("voice_preset", DEFAULT_PRESET),
                    design=info.get("voice_description", ""),
                    instruct=info.get("voice_instruct", ""),
                    language=info.get("language", "English"),
                    speed=info.get("speed", 1.0),
                ))
            return [status] + updates

        # Per-row component order must match _character_row_updates().
        extract_outputs = [extract_status]
        for i in range(MAX_CHARACTERS):
            extract_outputs.extend([
                char_rows[i], char_names[i], char_descs[i], char_modes[i],
                char_presets[i], char_audios[i], char_ref_texts[i], char_designs[i],
                char_instructs[i], char_langs[i], char_speeds[i],
                char_preview_btns[i], char_preview_audios[i], char_preview_statuses[i],
            ])
        extract_btn.click(do_extract, inputs=[story_input], outputs=extract_outputs)

        # ---------- Generate wiring ----------
        # Column-major order expected by _split_char_columns().
        all_char_inputs = (
            char_names + char_descs + char_modes + char_presets + char_audios
            + char_ref_texts + char_designs + char_instructs + char_langs + char_speeds
        )
        gen_inputs = [
            story_input, chapter_selector,
            nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design,
            nar_instruct, nar_lang, nar_speed,
            gen_temp, gen_seed, output_fmt,
        ] + all_char_inputs

        def wrapped_generate(story_text, chapter_sel, *args):
            """Narrow the story to the selected chapter, then generate."""
            text = get_chapter_text(story_text, chapter_sel)
            return generate_audiobook_gpu(text, *args)

        gen_btn.click(
            wrapped_generate,
            inputs=gen_inputs,
            outputs=[output_audio, output_file, segment_list, output_status, gen_progress],
        )

        # ---------- Project wiring ----------
        narrator_widgets = [nar_mode, nar_preset, nar_audio, nar_ref_text,
                            nar_design, nar_instruct, nar_lang, nar_speed]
        save_inputs = [story_input] + narrator_widgets + all_char_inputs + [gen_temp, gen_seed]
        save_btn.click(do_save_project, inputs=save_inputs, outputs=[project_json])

        load_outputs = [story_input] + narrator_widgets + extract_outputs[1:] + [load_status]
        load_btn.click(do_load_project, inputs=[load_json], outputs=load_outputs)

    return demo


demo = build_app()

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)