| """ |
| AudioBook Forge - Enhanced Gradio Frontend |
| High-fidelity audiobook generator with character voice mapping, |
| file upload, chapter selection, segment previews, and project save/load. |
| """ |
|
|
import html
import json
import os
from pathlib import Path
from typing import Dict, List, Optional

import gradio as gr
import numpy as np
|
|
| |
| |
| |
try:
    import spaces
except ImportError:
    # Running outside Hugging Face Spaces: substitute a no-op decorator so
    # `@spaces.GPU(duration=...)` still works locally.
    class _GPUStub:
        """Stand-in for ``spaces.GPU``; records duration, leaves fn untouched."""

        def __init__(self, duration=60):
            self.duration = duration

        def __call__(self, fn):
            return fn

    class spaces:
        GPU = _GPUStub
|
|
| |
| |
| |
| from backend import ( |
| AudiobookPipeline, |
| VoiceConfig, |
| PRESET_SPEAKERS, |
| SAMPLE_STORIES, |
| save_project, |
| load_project, |
| estimate_duration, |
| ) |
|
|
| |
| |
| |
|
|
# Stylesheet injected via gr.Blocks(css=...): dark slate palette, Inter font,
# header/card/stat styling, checkbox visibility fixes for Gradio 5+, and the
# segment-list item styles used by the generated-segments HTML view.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

body, .gradio-container {
    font-family: 'Inter', sans-serif !important;
    background: #0f172a !important;
    color: #f8fafc !important;
}

.gradio-container {
    max-width: 1200px !important;
}

.ab-header {
    text-align: center;
    padding: 2.2rem 1rem 1.8rem;
    background: linear-gradient(135deg, rgba(99,102,241,0.12) 0%, rgba(34,211,238,0.06) 100%);
    border-radius: 18px;
    margin-bottom: 1.5rem;
    border: 1px solid rgba(99,102,241,0.18);
}
.ab-header h1 {
    font-size: 2.6rem;
    font-weight: 700;
    margin: 0;
    background: linear-gradient(90deg, #a5b4fc, #22d3ee);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
}
.ab-header p {
    color: #94a3b8;
    margin-top: 0.6rem;
    font-size: 1.05rem;
}

.ab-card {
    background: #1e293b !important;
    border: 1px solid #334155 !important;
    border-radius: 14px !important;
    padding: 1.25rem !important;
}

.ab-stat {
    background: #0f172a;
    border: 1px solid #334155;
    border-radius: 10px;
    padding: 0.75rem 1rem;
    text-align: center;
}
.ab-stat .value {
    font-size: 1.4rem;
    font-weight: 700;
    color: #22d3ee;
}
.ab-stat .label {
    font-size: 0.75rem;
    color: #94a3b8;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}

button.primary {
    background: linear-gradient(135deg, #6366f1, #4f46e5) !important;
    border: none !important;
    border-radius: 10px !important;
    font-weight: 600 !important;
    transition: all 0.2s ease !important;
}
button.primary:hover {
    transform: translateY(-1px);
    box-shadow: 0 4px 14px rgba(99,102,241,0.4) !important;
}
button.secondary {
    background: #334155 !important;
    border: 1px solid #475569 !important;
    border-radius: 10px !important;
    color: #f8fafc !important;
}

input, textarea, select {
    background: #0f172a !important;
    border: 1px solid #334155 !important;
    border-radius: 8px !important;
    color: #f8fafc !important;
}
input:focus, textarea:focus, select:focus {
    border-color: #6366f1 !important;
    box-shadow: 0 0 0 3px rgba(99,102,241,0.15) !important;
}

.gr-box, .gr-form {
    background: #1e293b !important;
    border-color: #334155 !important;
}
.gr-panel {
    background: #1e293b !important;
}

.tabitem {
    background: #1e293b !important;
    border-color: #334155 !important;
}

input[type="checkbox"] + label,
.checkbox-label,
.gr-checkbox label {
    color: #f8fafc !important;
}

/* Gradio 5+ checkbox checked state - make it clearly visible in dark theme */
.gr-checkbox input[type="checkbox"]:checked + label,
.gr-checkbox-checked label,
.gr-checkbox-input:checked + .gr-checkbox-border,
.gr-checkbox-input:checked + label .gr-checkbox-border,
input[type="checkbox"]:checked + label span {
    background: #6366f1 !important;
    border-color: #818cf8 !important;
    box-shadow: 0 0 0 3px rgba(99,102,241,0.35) !important;
}
.gr-checkbox input[type="checkbox"]:checked + label::after,
.gr-checkbox-input:checked + label::after {
    border-color: #ffffff !important;
}
.gr-checkbox {
    color: #f8fafc !important;
}
.gr-checkbox-input:checked + * {
    background: #6366f1 !important;
    border-color: #818cf8 !important;
}

li, .prose li, .gr-prose li {
    color: #cbd5e1 !important;
}

strong, b {
    color: #f8fafc !important;
}

code {
    background: #334155 !important;
    color: #22d3ee !important;
    padding: 0.1rem 0.3rem !important;
    border-radius: 4px !important;
}

progress {
    width: 100%;
    height: 8px;
    border-radius: 4px;
    background: #334155;
}
progress::-webkit-progress-bar {
    background: #334155;
    border-radius: 4px;
}
progress::-webkit-progress-value {
    background: linear-gradient(90deg, #6366f1, #22d3ee);
    border-radius: 4px;
}

.seg-item {
    background: #0f172a;
    border: 1px solid #334155;
    border-radius: 8px;
    padding: 0.5rem 0.75rem;
    margin-bottom: 0.4rem;
    font-size: 0.85rem;
}
.seg-item .seg-type {
    display: inline-block;
    padding: 0.1rem 0.4rem;
    border-radius: 4px;
    font-size: 0.7rem;
    font-weight: 600;
    text-transform: uppercase;
}
.seg-type.narration { background: #4f46e5; color: #fff; }
.seg-type.dialogue { background: #22d3ee; color: #0f172a; }
"""
|
|
| |
| |
| |
|
|
# Process-wide pipeline singleton, created on first use (model loading is slow).
_pipeline: Optional[AudiobookPipeline] = None


def get_pipeline() -> AudiobookPipeline:
    """Return the shared AudiobookPipeline, constructing it lazily on first call."""
    global _pipeline
    if _pipeline is not None:
        return _pipeline
    _pipeline = AudiobookPipeline()
    return _pipeline
|
|
|
|
| |
| |
| |
|
|
def on_mode_change(mode: str) -> tuple:
    """Toggle voice-config widget visibility for the selected voice mode.

    Returns four gr.update objects for (preset dropdown, clone audio,
    clone transcript, design description); any unrecognized mode is
    treated as "design".
    """
    visibility = {
        "preset": (True, False, False, False),
        "clone": (False, True, True, False),
    }.get(mode, (False, False, False, True))
    return tuple(gr.update(visible=flag) for flag in visibility)
|
|
|
|
def update_stats(text: str) -> tuple:
    """Return (word-count string, estimated-duration string) for the stats boxes."""
    word_count = len(text.split()) if text else 0
    return str(word_count), estimate_duration(word_count)
|
|
|
|
def handle_upload(file_obj) -> tuple:
    """Parse an uploaded file into cleaned story text plus a status summary.

    Returns (text, status); on failure returns empty text and an error string.
    """
    if file_obj is None:
        return "", "No file uploaded."
    try:
        pipe = get_pipeline()
        raw, fname = pipe.parse_upload(file_obj)
        cleaned = pipe.processor.clean_text(raw)
        chapters = pipe.detect_chapters(cleaned)
        # Summarize at most the first five detected chapters.
        summary = " | ".join(
            f"Ch{ch['idx']+1}: {ch['word_count']}w" for ch in chapters[:5]
        )
        if len(chapters) > 5:
            summary += f" (+{len(chapters)-5} more)"
        words = len(cleaned.split())
        est = estimate_duration(words)
        section_info = summary if chapters else '1 section'
        return cleaned, f"Loaded {fname} β {words} words (~{est}) | {section_info}"
    except Exception as e:
        return "", f"Error: {e}"
|
|
|
|
def extract_chars(text: str) -> tuple:
    """Run AI character extraction on *text*; return (characters, status message)."""
    if not text or len(text.strip()) < 20:
        return [], "Text too short. Please paste at least a paragraph."
    characters = get_pipeline().extract_characters(text, use_ai=True)
    if characters:
        names = ', '.join(c['name'] for c in characters)
        status = f"Found {len(characters)} characters: {names}"
    else:
        status = "No characters auto-detected. Add them manually below."
    return characters, status
|
|
|
|
def get_chapter_text(text: str, chapter_sel: str) -> str:
    """Return the text of the selected chapter, or the full text.

    *chapter_sel* is a label like "Ch3: Title"; "All", empty selection,
    or any parsing/pipeline failure falls back to the whole text.
    """
    if not text or not chapter_sel or chapter_sel == "All":
        return text
    try:
        label = chapter_sel.split(":")[0]
        idx = int(label.replace("Ch", "")) - 1
        return get_pipeline().get_chapter_text(text, idx)
    except Exception:
        return text
|
|
|
|
| |
| |
| |
|
|
@spaces.GPU(duration=180)
def generate_audiobook_gpu(
    text,
    nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
    gen_temp, gen_seed, output_fmt, *args
):
    """Generate the full multi-voice audiobook (runs on ZeroGPU).

    *args is the flattened 8-slot character grid: 10 field groups of 8
    values each (names, descs, modes, presets, audios, ref_texts, designs,
    instructs, langs, speeds), in the order build_app() wires them.

    Returns a 5-tuple matching the Generate tab outputs:
    (audio path, download path, segment HTML, status message, last progress line).
    """
    if not text or len(text.strip()) < 50:
        return None, None, "", "Error: Please provide at least 50 characters of story text.", ""

    wc = len(text.split())
    if wc > 5000:
        # Warn only; generation proceeds but may exceed the GPU time budget.
        print(f"[WARN] Long text: {wc} words. Generation may take a while or hit timeouts.")

    # Unpack the flat *args tuple back into per-field lists of length 8.
    names = list(args[0:8])
    descs = list(args[8:16])  # kept to document the arg layout; unused here
    modes = list(args[16:24])
    presets = list(args[24:32])
    audios = list(args[32:40])
    ref_texts = list(args[40:48])
    designs = list(args[48:56])
    instructs = list(args[56:64])
    langs = list(args[64:72])
    speeds = list(args[72:80])

    pipe = get_pipeline()

    # Narrator voice: only the fields relevant to the chosen mode are kept.
    nar_cfg = VoiceConfig(
        name="Narrator",
        mode=nar_mode,
        preset=nar_preset if nar_mode == "preset" else None,
        ref_audio=nar_audio if nar_mode == "clone" and nar_audio else None,
        ref_text=nar_ref_text if nar_mode == "clone" else None,
        design_desc=nar_design if nar_mode == "design" else None,
        instruct=nar_instruct,
        language=nar_lang,
        speed=float(nar_speed) if nar_speed else 1.0,
    )

    # One VoiceConfig per populated character slot, keyed by character name.
    char_configs = {}
    for i in range(8):
        if not names[i]:
            continue
        vc = VoiceConfig(
            name=names[i],
            mode=modes[i],
            preset=presets[i] if modes[i] == "preset" else None,
            ref_audio=audios[i] if modes[i] == "clone" and audios[i] else None,
            ref_text=ref_texts[i] if modes[i] == "clone" else None,
            design_desc=designs[i] if modes[i] == "design" else None,
            instruct=instructs[i] or "",
            language=langs[i],
            speed=float(speeds[i]) if speeds[i] else 1.0,
        )
        char_configs[names[i]] = vc

    progress_text = ""

    def prog_cb(ratio: float, msg: str):
        # Keep the latest progress line; it is returned to the UI at the end.
        nonlocal progress_text
        progress_text = f"[{ratio*100:.0f}%] {msg}"
        print(progress_text)

    try:
        output_path, seg_paths, seg_meta = pipe.generate(
            text=text,
            narrator_config=nar_cfg,
            character_configs=char_configs,
            progress_callback=prog_cb,
            temperature=gen_temp,
            seed=int(gen_seed),
        )

        # Build a scrollable HTML list of the first 50 segments. Segment text
        # and speaker names come from user input, so escape them to avoid
        # breaking (or injecting into) the rendered HTML.
        seg_html = "<div style='max-height: 300px; overflow-y: auto;'>"
        for s in seg_meta[:50]:
            tclass = "narration" if s['type'] == 'narration' else "dialogue"
            safe_speaker = html.escape(str(s['speaker']))
            safe_text = html.escape(str(s['text']))
            seg_html += f"<div class='seg-item'><span class='seg-type {tclass}'>{s['type']}</span> <strong>{safe_speaker}</strong>: {safe_text}</div>"
        if len(seg_meta) > 50:
            seg_html += f"<div style='text-align:center;color:#94a3b8;padding:0.5rem;'>... and {len(seg_meta)-50} more segments</div>"
        seg_html += "</div>"

        # Optional re-export: the pipeline's primary output is MP3.
        extra_path = None
        if output_fmt == "wav":
            # with_suffix is robust even if the path doesn't end in ".mp3"
            # (str.replace would silently leave the old extension in place).
            extra_path = str(Path(output_path).with_suffix(".wav"))
            from backend import save_audiobook
            save_audiobook(seg_paths, extra_path, fmt="wav")
        elif output_fmt == "zip":
            extra_path = pipe.export_segments_zip(seg_paths)

        final_path = extra_path if extra_path else output_path
        return final_path, final_path, seg_html, f"Done! {len(seg_meta)} segments generated.", progress_text
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, "", f"Error: {str(e)}", progress_text
|
|
|
|
@spaces.GPU(duration=60)
def preview_narrator_gpu(mode, preset, audio, ref_text, design, instruct, lang, speed):
    """Synthesize a short narrator sample for the current voice settings.

    Returns ((sample_rate, waveform), status) on success, (None, error) on failure.
    """
    is_preset = mode == "preset"
    is_clone = mode == "clone"
    cfg = VoiceConfig(
        name="Narrator",
        mode=mode,
        preset=preset if is_preset else None,
        ref_audio=audio if is_clone and audio else None,
        ref_text=ref_text if is_clone else None,
        design_desc=design if mode == "design" else None,
        instruct=instruct,
        language=lang,
        speed=float(speed) if speed else 1.0,
    )
    try:
        wav, sr = get_pipeline().preview_voice(cfg)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Preview failed: {e}"
    return (sr, wav), "Preview ready!"
|
|
|
|
@spaces.GPU(duration=60)
def preview_char_voice_gpu(name, mode, preset, audio, ref_text, design, instruct, lang, speed):
    """Synthesize a short in-character sample for one configured character.

    Returns ((sample_rate, waveform), status) on success, (None, error) on failure.
    """
    who = name or "Character"
    cfg = VoiceConfig(
        name=who,
        mode=mode,
        preset=preset if mode == "preset" else None,
        ref_audio=audio if mode == "clone" and audio else None,
        ref_text=ref_text if mode == "clone" else None,
        design_desc=design if mode == "design" else None,
        instruct=instruct,
        language=lang,
        speed=float(speed) if speed else 1.0,
    )
    try:
        sample = f"Hello, I am {name or 'your character'}. This is how I sound in the story."
        wav, sr = get_pipeline().preview_voice(cfg, sample_text=sample)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Preview failed: {e}"
    return (sr, wav), f"{who} preview ready!"
|
|
|
|
| |
| |
| |
|
|
@spaces.GPU(duration=180)
def quick_generate_gpu(text, mode, preset, audio, ref_text, design, instruct, lang, speed, gen_temp, output_fmt, gen_seed=42):
    """One-click audiobook using a single narrator voice (no character cast).

    Returns (audio path, download path, status message); the first two are
    None on failure.
    """
    if not text or len(text.strip()) < 50:
        return None, None, "Error: Text too short."

    wc = len(text.split())
    if wc > 5000:
        # Warn only; generation still proceeds.
        print(f"[WARN] Long text: {wc} words. Quick Generate may take a while or hit timeouts.")

    pipe = get_pipeline()
    # Only the fields relevant to the chosen voice mode are populated.
    nar_cfg = VoiceConfig(
        name="Narrator",
        mode=mode,
        preset=preset if mode == "preset" else None,
        ref_audio=audio if mode == "clone" and audio else None,
        ref_text=ref_text if mode == "clone" else None,
        design_desc=design if mode == "design" else None,
        instruct=instruct or "Narrate clearly and expressively.",
        language=lang,
        speed=float(speed) if speed else 1.0,
    )

    def prog_cb(ratio: float, msg: str):
        # Quick mode surfaces progress in the server logs only.
        print(f"[{ratio*100:.0f}%] {msg}")

    try:
        output_path, seg_paths, seg_meta = pipe.generate(
            text=text,
            narrator_config=nar_cfg,
            character_configs={},
            progress_callback=prog_cb,
            temperature=gen_temp,
            seed=int(gen_seed),
        )

        # Optional re-export: the pipeline's primary output is MP3.
        extra_path = None
        if output_fmt == "wav":
            # with_suffix is robust even if the path doesn't end in ".mp3"
            # (str.replace would silently leave the old extension in place).
            extra_path = str(Path(output_path).with_suffix(".wav"))
            from backend import save_audiobook
            save_audiobook(seg_paths, extra_path, fmt="wav")
        elif output_fmt == "zip":
            extra_path = pipe.export_segments_zip(seg_paths)

        final_path = extra_path if extra_path else output_path
        return final_path, final_path, f"Quick audiobook ready! {len(seg_meta)} segments."
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, f"Error: {str(e)}"
|
|
|
|
| |
| |
| |
|
|
def do_save_project(text, nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed, *args):
    """Serialize the current story and voice configuration to a project JSON string.

    *args carries the flattened 8-slot character grid (10 field groups of 8
    values, in the order build_app() wires them) followed optionally by the
    generation temperature and seed.
    """
    # Unpack the flat *args tuple back into per-field lists of length 8.
    names = list(args[0:8])
    descs = list(args[8:16])
    modes = list(args[16:24])
    presets = list(args[24:32])
    audios = list(args[32:40])
    ref_texts = list(args[40:48])
    designs = list(args[48:56])
    instructs = list(args[56:64])
    langs = list(args[64:72])
    speeds = list(args[72:80])
    # Generation settings ride at the tail; fall back to defaults when absent.
    gen_temp = args[80] if len(args) > 80 else 0.7
    gen_seed = args[81] if len(args) > 81 else 42

    # Only mode-relevant fields are persisted (e.g. preset only in preset mode).
    nar_cfg = VoiceConfig(
        name="Narrator", mode=nar_mode, preset=nar_preset if nar_mode == "preset" else None,
        ref_audio=nar_audio if nar_mode == "clone" and nar_audio else None,
        ref_text=nar_ref_text if nar_mode == "clone" else None,
        design_desc=nar_design if nar_mode == "design" else None,
        instruct=nar_instruct, language=nar_lang,
        speed=float(nar_speed) if nar_speed else 1.0,
    )
    char_configs = {}
    for i in range(8):
        if not names[i]:
            # An empty name marks an unused character slot.
            continue
        char_configs[names[i]] = VoiceConfig(
            name=names[i], mode=modes[i], description=descs[i] or "",
            preset=presets[i] if modes[i] == "preset" else None,
            ref_audio=audios[i] if modes[i] == "clone" and audios[i] else None,
            ref_text=ref_texts[i] if modes[i] == "clone" else None,
            design_desc=designs[i] if modes[i] == "design" else None,
            instruct=instructs[i] or "", language=langs[i],
            speed=float(speeds[i]) if speeds[i] else 1.0,
        )
    settings = {"temperature": gen_temp, "seed": int(gen_seed)}
    json_str = save_project(text, nar_cfg, char_configs, settings)
    return json_str
|
|
|
|
def do_load_project(json_str):
    """Restore a saved project JSON into the UI components.

    Returns a flat list matching build_app()'s ``load_outputs`` wiring:
    [story text] + 8 narrator updates + 112 character updates
    (14 per slot x 8 slots) + [status message] = 122 entries. The error
    path below must preserve exactly this arity.
    """
    try:
        data = load_project(json_str)
        nar = data["narrator"]
        chars = data.get("characters", {})

        # Narrator widgets: restore values; visibility follows the saved mode.
        nar_updates = [
            gr.update(value=nar.mode),
            gr.update(value=nar.preset if nar.preset else "Ryan", visible=nar.mode=="preset"),
            gr.update(value=nar.ref_audio, visible=nar.mode=="clone"),
            gr.update(value=nar.ref_text, visible=nar.mode=="clone"),
            gr.update(value=nar.design_desc, visible=nar.mode=="design"),
            gr.update(value=nar.instruct),
            gr.update(value=nar.language),
            gr.update(value=nar.speed),
        ]

        # The UI has a fixed grid of 8 slots; extra saved characters are dropped.
        char_updates = []
        char_items = list(chars.items())[:8]
        for i in range(8):
            if i < len(char_items):
                _, c = char_items[i]
                # 14 updates per slot: group row + 13 components, in the same
                # order as extract_outputs.
                char_updates.extend([
                    gr.update(visible=True),
                    gr.update(value=c.name, visible=True),
                    gr.update(value=c.description, visible=True),
                    gr.update(value=c.mode, visible=True),
                    gr.update(value=c.preset if c.preset else "Ryan", visible=c.mode=="preset"),
                    gr.update(value=c.ref_audio, visible=c.mode=="clone"),
                    gr.update(value=c.ref_text, visible=c.mode=="clone"),
                    gr.update(value=c.design_desc, visible=c.mode=="design"),
                    gr.update(value=c.instruct, visible=True),
                    gr.update(value=c.language, visible=True),
                    gr.update(value=c.speed, visible=True),
                    gr.update(visible=True),
                    gr.update(visible=True),
                    gr.update(visible=True),
                ])
            else:
                # Hide all 14 components of an unused slot.
                char_updates.extend([
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                ])

        text_sample = data.get("text_sample", "")
        return [text_sample] + nar_updates + char_updates + [f"Project loaded! {len(chars)} characters configured."]
    except Exception as e:
        import traceback
        traceback.print_exc()
        # Same arity as the success path: 1 + 8 + (8 * 14) + 1 = 122 outputs.
        return [""] + [gr.update()] * 8 + [gr.update(visible=False)] * 112 + [f"Error loading project: {e}"]
|
|
|
|
| |
| |
| |
|
|
def build_app():
    """Assemble the full Gradio Blocks UI and wire every event handler.

    Returns the (unlaunched) gr.Blocks demo. Component ordering matters:
    extract_outputs, gen_inputs, save_inputs and load_outputs are flat,
    position-sensitive lists that must stay in sync with the handlers'
    argument/return layouts.
    """
    # Theme mirrors the slate/indigo/cyan palette used in CUSTOM_CSS.
    theme = gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="cyan",
        neutral_hue="slate",
    ).set(
        body_background_fill="#0f172a",
        body_background_fill_dark="#0f172a",
        body_text_color="#f8fafc",
        body_text_color_subdued="#94a3b8",
        background_fill_primary="#1e293b",
        background_fill_secondary="#0f172a",
        border_color_accent="#334155",
        color_accent_soft="#22d3ee",
        button_primary_background_fill="linear-gradient(135deg, #6366f1, #4f46e5)",
        button_primary_background_fill_hover="linear-gradient(135deg, #4f46e5, #4338ca)",
        button_primary_text_color="#ffffff",
        input_background_fill="#0f172a",
        input_border_color="#334155",
        block_title_text_color="#f8fafc",
        block_label_text_color="#94a3b8",
    )

    with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="AudioBook Forge") as demo:
        gr.HTML("""
        <div class="ab-header">
            <h1>AudioBook Forge</h1>
            <p>High-fidelity audiobooks with AI character voices. Model-agnostic TTS powered by Qwen3-TTS.</p>
        </div>
        """)

        with gr.Tabs():
            # --- Tab 1: story input, stats, chapter tools, quick generate ---
            with gr.TabItem("π Story"):
                with gr.Row():
                    with gr.Column(scale=2):
                        gr.Markdown("### Upload or Paste")
                        file_upload = gr.File(
                            label="Upload EPUB, PDF, TXT, or HTML",
                            file_types=[".txt", ".epub", ".pdf", ".html", ".htm"],
                        )
                        story_input = gr.TextArea(
                            label="Story Text",
                            placeholder="Paste your book chapter, short story, or script here...",
                            lines=18,
                            max_lines=40,
                        )
                        sample_dropdown = gr.Dropdown(
                            label="Or try a sample story",
                            choices=list(SAMPLE_STORIES.keys()),
                            value=None,
                        )

                    with gr.Column(scale=1):
                        gr.Markdown("### Stats")
                        with gr.Row():
                            stat_words = gr.Textbox(label="Words", value="0", interactive=False)
                            stat_dur = gr.Textbox(label="Est. Duration", value="0 sec", interactive=False)
                        gr.Markdown("---")
                        gr.Markdown("### Quick Generate")
                        # Single-narrator controls mirroring the Voice Cast narrator card.
                        quick_mode = gr.Dropdown(choices=["preset", "clone", "design"], value="design", label="Narrator Mode")
                        quick_preset = gr.Dropdown(choices=list(PRESET_SPEAKERS.keys()), value="Ryan", label="Preset Voice", visible=False)
                        quick_audio = gr.Audio(label="Upload Voice Sample (3β10s)", type="filepath", visible=False)
                        quick_ref_text = gr.Textbox(label="Reference Transcript", placeholder="What does the sample say?", visible=False)
                        quick_design = gr.TextArea(label="Voice Description", placeholder="e.g. A warm, raspy baritone with measured pacing...", visible=True, lines=2, value="A clear, warm, expressive audiobook narrator voice with professional pacing and rich tone.")
                        quick_instruct = gr.Textbox(label="Style Instruction", placeholder="e.g. Calm, measured storytelling.", value="")
                        quick_lang = gr.Dropdown(choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", label="Language")
                        quick_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                        quick_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
                        quick_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3", label="Output Format")
                        quick_btn = gr.Button("β‘ Quick Generate", variant="primary")
                        quick_output_audio = gr.Audio(label="Quick Audiobook", type="filepath", interactive=False)
                        quick_output_file = gr.File(label="Download", interactive=False)
                        quick_status = gr.Textbox(show_label=False, interactive=False)
                        gr.Markdown("---")
                        gr.Markdown("**Quick Generate** uses a single narrator voice for the entire text. Supports preset, clone, or AI-designed voices.")

                with gr.Row():
                    chapter_selector = gr.Dropdown(
                        label="Chapter / Section",
                        choices=["All"],
                        value="All",
                        interactive=True,
                    )
                    refresh_chapters_btn = gr.Button("π Detect Chapters")
                    clear_story_btn = gr.Button("ποΈ Clear", variant="secondary")

                def clear_story():
                    # Reset story text, chapter list and the two stat boxes.
                    return "", gr.update(choices=["All"], value="All"), "0", "0 sec"

                clear_story_btn.click(
                    clear_story,
                    inputs=[],
                    outputs=[story_input, chapter_selector, stat_words, stat_dur],
                )

                with gr.Row():
                    gr.Markdown("### Character Detection")
                    extract_btn = gr.Button("π Extract Characters", variant="primary")

                extract_status = gr.Textbox(label="Status", interactive=False)

                file_upload.change(handle_upload, inputs=[file_upload], outputs=[story_input, extract_status])

                def load_sample_and_update(name):
                    # Load a bundled sample story and refresh the stats display.
                    text = SAMPLE_STORIES.get(name, "")
                    wc = len(text.split()) if text else 0
                    dur = estimate_duration(wc)
                    return text, str(wc), dur, gr.update(choices=["All"], value="All"), ""

                sample_dropdown.change(
                    load_sample_and_update,
                    inputs=[sample_dropdown],
                    outputs=[story_input, stat_words, stat_dur, chapter_selector, extract_status],
                )
                story_input.change(update_stats, inputs=[story_input], outputs=[stat_words, stat_dur])
                quick_btn.click(
                    quick_generate_gpu,
                    inputs=[story_input, quick_mode, quick_preset, quick_audio, quick_ref_text, quick_design, quick_instruct, quick_lang, quick_speed, quick_temp, quick_fmt],
                    outputs=[quick_output_audio, quick_output_file, quick_status],
                )

                quick_mode.change(on_mode_change, inputs=quick_mode, outputs=[quick_preset, quick_audio, quick_ref_text, quick_design])

                def refresh_chapters(text):
                    # Re-run chapter detection and repopulate the dropdown.
                    if not text:
                        return gr.update(choices=["All"], value="All")
                    pipe = get_pipeline()
                    chs = pipe.detect_chapters(text)
                    choices = ["All"] + [f"Ch{c['idx']+1}: {c['title'][:60]}" for c in chs]
                    return gr.update(choices=choices, value="All")

                refresh_chapters_btn.click(refresh_chapters, inputs=[story_input], outputs=[chapter_selector])

            # --- Tab 2: narrator configuration + 8 character voice slots ---
            with gr.TabItem("π Voice Cast"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("## Narrator")
                        with gr.Column(elem_classes="ab-card"):
                            nar_mode = gr.Dropdown(choices=["preset", "clone", "design"], value="design", label="Mode")
                            nar_preset = gr.Dropdown(choices=list(PRESET_SPEAKERS.keys()), value="Ryan", label="Preset Voice", visible=False)
                            nar_audio = gr.Audio(label="Upload Voice Sample (3β10s)", type="filepath", visible=False)
                            nar_ref_text = gr.Textbox(label="Reference Transcript", placeholder="What does the sample say?", visible=False)
                            nar_design = gr.TextArea(label="Voice Description", placeholder="e.g. A warm, raspy baritone with measured pacing...", visible=True, lines=2, value="A clear, warm, expressive audiobook narrator voice with professional pacing and rich tone.")
                            nar_instruct = gr.Textbox(label="Style Instruction", placeholder="e.g. Calm, measured storytelling.")
                            nar_lang = gr.Dropdown(choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", label="Language")
                            nar_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                            nar_preview_btn = gr.Button("π Preview Narrator", variant="secondary")
                            nar_preview_audio = gr.Audio(label="Preview", interactive=False)
                            nar_preview_status = gr.Textbox(show_label=False, interactive=False)

                        nar_mode.change(on_mode_change, inputs=nar_mode, outputs=[nar_preset, nar_audio, nar_ref_text, nar_design])
                        nar_preview_btn.click(
                            preview_narrator_gpu,
                            inputs=[nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed],
                            outputs=[nar_preview_audio, nar_preview_status],
                        )

                    with gr.Column(scale=2):
                        gr.Markdown("## Character Voices")
                        gr.Markdown("""
                        Configure up to 8 characters. Each character can use one of three voice modes:

                        - **Preset** β Choose from 9 built-in speakers (Ryan, Aiden, Serena, etc.)
                        - **Clone** β Upload a 3β10 second voice sample to clone any real voice
                        - **Design** β Describe a voice in text (e.g. *"A raspy old man with a warm chuckle"*) and the AI will create it
                        """)

                        # Parallel component lists, one entry per character slot.
                        # Their order defines the flat layouts used by
                        # extract_outputs / gen_inputs / save_inputs below.
                        char_names, char_descs, char_modes, char_presets = [], [], [], []
                        char_audios, char_ref_texts, char_designs, char_instructs, char_langs, char_speeds = [], [], [], [], [], []
                        char_rows, char_preview_btns, char_preview_audios, char_preview_statuses = [], [], [], []

                        for i in range(8):
                            visible_default = (i == 0)  # only slot 0 starts visible
                            with gr.Group(visible=visible_default) as row:
                                with gr.Row():
                                    cn = gr.Textbox(label="Name", placeholder="e.g. Alice", visible=visible_default)
                                    cd = gr.Textbox(label="Description", placeholder="Personality note", visible=visible_default)
                                    cm = gr.Dropdown(label="Mode", choices=["preset", "clone", "design"], value="design", visible=visible_default)
                                    cp = gr.Dropdown(label="Preset", choices=list(PRESET_SPEAKERS.keys()), value="Ryan", visible=False)
                                with gr.Row():
                                    ca = gr.Audio(label="Voice Sample", type="filepath", visible=False)
                                    crt = gr.Textbox(label="Ref Transcript", placeholder="What the sample says", visible=False)
                                    cdes = gr.TextArea(label="Voice Description", placeholder="e.g. A shrill, nervous teenager.", visible=visible_default, lines=2)
                                    cinstr = gr.Textbox(label="Style Instruction", placeholder="e.g. Angry and loud.", visible=visible_default)
                                    cl = gr.Dropdown(label="Language", choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", visible=visible_default)
                                    cspd = gr.Slider(label="Speed", minimum=0.5, maximum=2.0, value=1.0, step=0.1, visible=visible_default)
                                with gr.Row():
                                    cpv_btn = gr.Button("π Preview", variant="secondary", visible=visible_default)
                                    cpv_audio = gr.Audio(label="Preview", interactive=False, visible=visible_default)
                                    cpv_status = gr.Textbox(show_label=False, interactive=False, visible=visible_default)

                            cm.change(on_mode_change, inputs=cm, outputs=[cp, ca, crt, cdes])
                            cpv_btn.click(
                                preview_char_voice_gpu,
                                inputs=[cn, cm, cp, ca, crt, cdes, cinstr, cl, cspd],
                                outputs=[cpv_audio, cpv_status],
                            )

                            char_rows.append(row)
                            char_names.append(cn)
                            char_descs.append(cd)
                            char_modes.append(cm)
                            char_presets.append(cp)
                            char_audios.append(ca)
                            char_ref_texts.append(crt)
                            char_designs.append(cdes)
                            char_instructs.append(cinstr)
                            char_langs.append(cl)
                            char_speeds.append(cspd)
                            char_preview_btns.append(cpv_btn)
                            char_preview_audios.append(cpv_audio)
                            char_preview_statuses.append(cpv_status)

            # --- Tab 3: full generation with per-character voices ---
            with gr.TabItem("β‘ Generate"):
                gr.Markdown("_Note: The first generation downloads Qwen3-TTS 1.7B models (~5 GB) and may take 2β5 minutes. Subsequent runs are much faster._")
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Settings")
                        gen_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
                        gen_seed = gr.Number(value=42, precision=0, label="Seed (fix for consistency)")
                        output_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3", label="Output Format")
                        gen_btn = gr.Button("βΆοΈ Generate Full Audiobook", variant="primary", size="lg")
                        gen_progress = gr.Textbox(label="Progress", interactive=False, value="Ready.")

                    with gr.Column(scale=2):
                        gr.Markdown("### Output")
                        output_audio = gr.Audio(label="Generated Audiobook", type="filepath", interactive=False)
                        output_file = gr.File(label="Download", interactive=False)
                        output_status = gr.Textbox(label="Status", interactive=False)
                        segment_list = gr.HTML(label="Segments")

            # --- Tab 4: save/load project JSON ---
            with gr.TabItem("πΎ Project"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Save Project")
                        save_btn = gr.Button("πΎ Save Configuration", variant="primary")
                        project_json = gr.TextArea(label="Project JSON (copy this to save)", lines=10, interactive=True)
                    with gr.Column():
                        gr.Markdown("### Load Project")
                        load_json = gr.TextArea(label="Paste Project JSON here", lines=10, interactive=True)
                        load_btn = gr.Button("π Load Configuration", variant="secondary")
                        load_status = gr.Textbox(label="Status", interactive=False)

            # --- Tab 5: static help/about text ---
            with gr.TabItem("βΉοΈ About"):
                gr.Markdown("""
                ## AudioBook Forge

                **Model-agnostic, high-fidelity audiobook generator** powered by [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS).

                ### Features
                - π **File Upload** β Import EPUB, PDF, TXT, or HTML directly
                - π **Chapter Detection** β Auto-detects chapters/sections for selective generation
                - ποΈ **Character Voice Mapping** β Auto-extract characters and assign unique voices
                - π **Three Voice Modes** β Preset (9 speakers), Clone (upload sample), Design (text description)
                - β‘ **Quick Generate** β One-click audiobook with a single narrator voice
                - ποΈ **Speed Control** β Adjust playback speed per voice (0.5xβ2.0x)
                - π¦ **Multi-format Export** β MP3, WAV, or ZIP of individual segments
                - πΎ **Save/Load Projects** β Export and restore your voice configurations
                - π **10 Languages** β English, Chinese, Japanese, Korean, German, French, Spanish, Italian, Portuguese, Russian
                - β‘ **ZeroGPU** β Runs on Hugging Face ZeroGPU (free compute)

                ### Workflow
                1. **Upload or paste** your story text
                2. **Detect chapters** (optional) and select a range
                3. **Extract characters** or use Quick Generate for simple narration
                4. **Assign voices** to narrator and each character
                5. **Generate** and download your audiobook

                ### Tips for Best Quality
                - Use clean, noise-free voice samples for cloning (3β10 seconds)
                - Keep reference transcripts accurate
                - Lower temperature (0.5β0.6) for stable narration; higher (0.8β0.9) for expressive dialogue
                - Use a fixed seed to prevent voice drift across segments
                - Use speed adjustment to fine-tune pacing per character

                ### Note on First Run
                The first time you generate audio, the Space downloads the Qwen3-TTS 1.7B models (~5 GB total). This can take **2β5 minutes** depending on network speed. Subsequent runs are much faster because models are cached. Please be patient β the progress is printed in the server logs.
                """)

        def do_extract(text):
            # Run extraction, then fan results out over the 8 slots: 14
            # gr.update objects per slot (group row + 13 components), matching
            # extract_outputs below.
            chars, status = extract_chars(text)
            updates = []
            for i in range(8):
                if i < len(chars):
                    mode = chars[i].get("voice_mode", "design")
                    is_preset = mode == "preset"
                    is_clone = mode == "clone"
                    is_design = mode == "design"
                    updates.extend([
                        gr.update(visible=True),
                        gr.update(value=chars[i].get("name", ""), visible=True),
                        gr.update(value=chars[i].get("description", ""), visible=True),
                        gr.update(value=mode, visible=True),
                        gr.update(value=chars[i].get("voice_preset", "Ryan"), visible=is_preset),
                        gr.update(visible=is_clone),
                        gr.update(visible=is_clone),
                        gr.update(value=chars[i].get("voice_description", ""), visible=is_design),
                        gr.update(value=chars[i].get("voice_instruct", ""), visible=True),
                        gr.update(value=chars[i].get("language", "English"), visible=True),
                        gr.update(value=chars[i].get("speed", 1.0), visible=True),
                        gr.update(visible=True),
                        gr.update(visible=True),
                        gr.update(visible=True),
                    ])
                else:
                    # Hide all 14 components of an unused slot.
                    updates.extend([
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                    ])
            return [status] + updates

        # Flat list: status box, then 14 components per slot in fixed order.
        extract_outputs = [extract_status] + [
            item for sublist in [
                [char_rows[i], char_names[i], char_descs[i], char_modes[i], char_presets[i],
                 char_audios[i], char_ref_texts[i], char_designs[i], char_instructs[i], char_langs[i],
                 char_speeds[i], char_preview_btns[i], char_preview_audios[i], char_preview_statuses[i]]
                for i in range(8)
            ] for item in sublist
        ]
        extract_btn.click(do_extract, inputs=[story_input], outputs=extract_outputs)

        # 80 inputs: 10 field groups of 8 slots, the layout the GPU handlers
        # and do_save_project slice by index.
        all_char_inputs = (
            char_names + char_descs + char_modes + char_presets +
            char_audios + char_ref_texts + char_designs + char_instructs + char_langs + char_speeds
        )

        gen_inputs = [
            story_input, chapter_selector,
            nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
            gen_temp, gen_seed, output_fmt,
        ] + all_char_inputs

        def wrapped_generate(story_text, chapter_sel, *args):
            # Narrow the text to the selected chapter before generating.
            text = get_chapter_text(story_text, chapter_sel)
            return generate_audiobook_gpu(text, *args)

        gen_btn.click(
            wrapped_generate,
            inputs=gen_inputs,
            outputs=[output_audio, output_file, segment_list, output_status, gen_progress],
        )

        save_inputs = [
            story_input,
            nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
        ] + all_char_inputs + [gen_temp, gen_seed]
        save_btn.click(do_save_project, inputs=save_inputs, outputs=[project_json])

        # Mirrors do_load_project's return layout: text, 8 narrator widgets,
        # the 112 character components (extract_outputs minus its status box),
        # then the load status box.
        load_outputs = [story_input, nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed] + extract_outputs[1:] + [load_status]
        load_btn.click(do_load_project, inputs=[load_json], outputs=load_outputs)

    return demo
|
|
|
|
# Build the UI at import time so hosting tools (e.g. HF Spaces) can find `demo`.
demo = build_app()

if __name__ == "__main__":
    # Bind all interfaces; 7860 is the conventional Gradio/Spaces port.
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
|