# AudioBook / app.py — Hugging Face Space (jkorstad)
# Commit 2f4164d: Remove "Use AI enhancement" toggle — always use AI for
# character extraction since Qwen3-TTS is the default engine.
"""
AudioBook Forge - Enhanced Gradio Frontend
High-fidelity audiobook generator with character voice mapping,
file upload, chapter selection, segment previews, and project save/load.
"""
import os
import json
from pathlib import Path
from typing import Dict, List, Optional
import gradio as gr
import numpy as np
# ---------------------------------------------------------------------------
# spaces / ZeroGPU compatibility
# ---------------------------------------------------------------------------
try:
    import spaces
except ImportError:
    # Not running on a Hugging Face Space: install a no-op stand-in so the
    # @spaces.GPU decorators below become pass-throughs. The real spaces.GPU
    # supports both `@spaces.GPU` and `@spaces.GPU(duration=...)`; the
    # original shim only handled the parameterized form, so bare usage would
    # have replaced the function with a _SpacesGPU instance. Both forms are
    # supported here.
    class _SpacesGPU:
        """Pass-through decorator object mimicking spaces.GPU(duration=...)."""

        def __init__(self, duration=60):
            # Kept for API parity with the real decorator; unused locally.
            self.duration = duration

        def __call__(self, fn):
            return fn

    class spaces:
        @staticmethod
        def GPU(fn=None, duration=60, **kwargs):
            """Accept both `@spaces.GPU` and `@spaces.GPU(duration=...)`."""
            if callable(fn):
                # Bare decorator usage: first positional arg is the function.
                return fn
            return _SpacesGPU(duration=duration)
# ---------------------------------------------------------------------------
# Backend imports
# ---------------------------------------------------------------------------
from backend import (
AudiobookPipeline,
VoiceConfig,
PRESET_SPEAKERS,
SAMPLE_STORIES,
save_project,
load_project,
estimate_duration,
)
# ---------------------------------------------------------------------------
# CSS & Theme
# ---------------------------------------------------------------------------
# Dark slate/indigo theme injected via gr.Blocks(css=CUSTOM_CSS). Classes
# prefixed "ab-" are referenced from the gr.HTML header markup; ".seg-item"
# and ".seg-type" style the generated-segments HTML list. The string is
# runtime data — do not reformat it.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
body, .gradio-container {
font-family: 'Inter', sans-serif !important;
background: #0f172a !important;
color: #f8fafc !important;
}
.gradio-container {
max-width: 1200px !important;
}
.ab-header {
text-align: center;
padding: 2.2rem 1rem 1.8rem;
background: linear-gradient(135deg, rgba(99,102,241,0.12) 0%, rgba(34,211,238,0.06) 100%);
border-radius: 18px;
margin-bottom: 1.5rem;
border: 1px solid rgba(99,102,241,0.18);
}
.ab-header h1 {
font-size: 2.6rem;
font-weight: 700;
margin: 0;
background: linear-gradient(90deg, #a5b4fc, #22d3ee);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}
.ab-header p {
color: #94a3b8;
margin-top: 0.6rem;
font-size: 1.05rem;
}
.ab-card {
background: #1e293b !important;
border: 1px solid #334155 !important;
border-radius: 14px !important;
padding: 1.25rem !important;
}
.ab-stat {
background: #0f172a;
border: 1px solid #334155;
border-radius: 10px;
padding: 0.75rem 1rem;
text-align: center;
}
.ab-stat .value {
font-size: 1.4rem;
font-weight: 700;
color: #22d3ee;
}
.ab-stat .label {
font-size: 0.75rem;
color: #94a3b8;
text-transform: uppercase;
letter-spacing: 0.05em;
}
button.primary {
background: linear-gradient(135deg, #6366f1, #4f46e5) !important;
border: none !important;
border-radius: 10px !important;
font-weight: 600 !important;
transition: all 0.2s ease !important;
}
button.primary:hover {
transform: translateY(-1px);
box-shadow: 0 4px 14px rgba(99,102,241,0.4) !important;
}
button.secondary {
background: #334155 !important;
border: 1px solid #475569 !important;
border-radius: 10px !important;
color: #f8fafc !important;
}
input, textarea, select {
background: #0f172a !important;
border: 1px solid #334155 !important;
border-radius: 8px !important;
color: #f8fafc !important;
}
input:focus, textarea:focus, select:focus {
border-color: #6366f1 !important;
box-shadow: 0 0 0 3px rgba(99,102,241,0.15) !important;
}
.gr-box, .gr-form {
background: #1e293b !important;
border-color: #334155 !important;
}
.gr-panel {
background: #1e293b !important;
}
.tabitem {
background: #1e293b !important;
border-color: #334155 !important;
}
input[type="checkbox"] + label,
.checkbox-label,
.gr-checkbox label {
color: #f8fafc !important;
}
/* Gradio 5+ checkbox checked state - make it clearly visible in dark theme */
.gr-checkbox input[type="checkbox"]:checked + label,
.gr-checkbox-checked label,
.gr-checkbox-input:checked + .gr-checkbox-border,
.gr-checkbox-input:checked + label .gr-checkbox-border,
input[type="checkbox"]:checked + label span {
background: #6366f1 !important;
border-color: #818cf8 !important;
box-shadow: 0 0 0 3px rgba(99,102,241,0.35) !important;
}
.gr-checkbox input[type="checkbox"]:checked + label::after,
.gr-checkbox-input:checked + label::after {
border-color: #ffffff !important;
}
.gr-checkbox {
color: #f8fafc !important;
}
.gr-checkbox-input:checked + * {
background: #6366f1 !important;
border-color: #818cf8 !important;
}
li, .prose li, .gr-prose li {
color: #cbd5e1 !important;
}
strong, b {
color: #f8fafc !important;
}
code {
background: #334155 !important;
color: #22d3ee !important;
padding: 0.1rem 0.3rem !important;
border-radius: 4px !important;
}
progress {
width: 100%;
height: 8px;
border-radius: 4px;
background: #334155;
}
progress::-webkit-progress-bar {
background: #334155;
border-radius: 4px;
}
progress::-webkit-progress-value {
background: linear-gradient(90deg, #6366f1, #22d3ee);
border-radius: 4px;
}
.seg-item {
background: #0f172a;
border: 1px solid #334155;
border-radius: 8px;
padding: 0.5rem 0.75rem;
margin-bottom: 0.4rem;
font-size: 0.85rem;
}
.seg-item .seg-type {
display: inline-block;
padding: 0.1rem 0.4rem;
border-radius: 4px;
font-size: 0.7rem;
font-weight: 600;
text-transform: uppercase;
}
.seg-type.narration { background: #4f46e5; color: #fff; }
.seg-type.dialogue { background: #22d3ee; color: #0f172a; }
"""
# ---------------------------------------------------------------------------
# Global State
# ---------------------------------------------------------------------------
# Process-wide pipeline singleton, created on first use.
_pipeline: Optional[AudiobookPipeline] = None


def get_pipeline() -> AudiobookPipeline:
    """Return the shared AudiobookPipeline, instantiating it lazily."""
    global _pipeline
    if _pipeline is not None:
        return _pipeline
    _pipeline = AudiobookPipeline()
    return _pipeline
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def on_mode_change(mode: str) -> tuple:
    """Toggle the four mode-specific inputs (preset, audio, ref-text, design).

    Returns four gr.update objects in that fixed order; any mode other
    than "preset" or "clone" is treated as "design".
    """
    visibility = {
        "preset": (True, False, False, False),
        "clone": (False, True, True, False),
    }.get(mode, (False, False, False, True))
    return tuple(gr.update(visible=flag) for flag in visibility)
def update_stats(text: str) -> tuple:
    """Recompute the word count and estimated duration for the stats panel."""
    word_count = len(text.split()) if text else 0
    return str(word_count), estimate_duration(word_count)
def handle_upload(file_obj) -> tuple:
    """Parse an uploaded document into cleaned story text plus a status line."""
    if file_obj is None:
        return "", "No file uploaded."
    try:
        pipe = get_pipeline()
        raw_text, fname = pipe.parse_upload(file_obj)
        text = pipe.processor.clean_text(raw_text)
        chapters = pipe.detect_chapters(text)
        # Summarize at most the first five detected chapters.
        chapter_summary = " | ".join(
            f"Ch{ch['idx']+1}: {ch['word_count']}w" for ch in chapters[:5]
        )
        if len(chapters) > 5:
            chapter_summary += f" (+{len(chapters)-5} more)"
        words = len(text.split())
        est = estimate_duration(words)
        detail = chapter_summary if chapters else '1 section'
        return text, f"Loaded {fname} — {words} words (~{est}) | {detail}"
    except Exception as e:
        return "", f"Error: {e}"
def extract_chars(text: str) -> tuple:
    """Run AI character extraction and build a status message.

    Returns (characters, status) where characters is a list of dicts.
    """
    if not text or len(text.strip()) < 20:
        return [], "Text too short. Please paste at least a paragraph."
    characters = get_pipeline().extract_characters(text, use_ai=True)
    if characters:
        names = ', '.join(c['name'] for c in characters)
        status = f"Found {len(characters)} characters: {names}"
    else:
        status = "No characters auto-detected. Add them manually below."
    return characters, status
def get_chapter_text(text: str, chapter_sel: str) -> str:
    """Return the selected chapter's text; fall back to the full text.

    `chapter_sel` looks like "Ch3: Title"; "All", empty, or unparseable
    selections yield the whole text.
    """
    if not text or not chapter_sel or chapter_sel == "All":
        return text
    try:
        chapter_idx = int(chapter_sel.split(":")[0].replace("Ch", "")) - 1
        return get_pipeline().get_chapter_text(text, chapter_idx)
    except Exception:
        # Any parsing/pipeline failure degrades gracefully to the full text.
        return text
# ---------------------------------------------------------------------------
# GPU-wrapped functions (ZeroGPU)
# ---------------------------------------------------------------------------
@spaces.GPU(duration=180)
def generate_audiobook_gpu(
    text,
    nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
    gen_temp, gen_seed, output_fmt, *args
):
    """Generate the full multi-voice audiobook on the GPU worker.

    Returns a 5-tuple matching the Generate tab outputs:
    (audio_path, download_path, segments_html, status, progress_text).
    `args` carries the 8 character rows flattened field-by-field
    (80 values; see the slicing below for the field order).
    """
    if not text or len(text.strip()) < 50:
        return None, None, "", "Error: Please provide at least 50 characters of story text.", ""
    wc = len(text.split())
    if wc > 5000:
        # Warn only; generation still proceeds for long texts.
        print(f"[WARN] Long text: {wc} words. Generation may take a while or hit timeouts.")
    # Unpack character args (80 values = 8 chars x 10 fields)
    names = list(args[0:8])
    descs = list(args[8:16])  # collected for project saves; unused for synthesis here
    modes = list(args[16:24])
    presets = list(args[24:32])
    audios = list(args[32:40])
    ref_texts = list(args[40:48])
    designs = list(args[48:56])
    instructs = list(args[56:64])
    langs = list(args[64:72])
    speeds = list(args[72:80])
    pipe = get_pipeline()
    # Narrator config: only the fields relevant to the selected mode are set.
    nar_cfg = VoiceConfig(
        name="Narrator",
        mode=nar_mode,
        preset=nar_preset if nar_mode == "preset" else None,
        ref_audio=nar_audio if nar_mode == "clone" and nar_audio else None,
        ref_text=nar_ref_text if nar_mode == "clone" else None,
        design_desc=nar_design if nar_mode == "design" else None,
        instruct=nar_instruct,
        language=nar_lang,
        speed=float(nar_speed) if nar_speed else 1.0,
    )
    char_configs = {}
    for i in range(8):
        if not names[i]:
            # An empty name means the character slot is unused.
            continue
        vc = VoiceConfig(
            name=names[i],
            mode=modes[i],
            preset=presets[i] if modes[i] == "preset" else None,
            ref_audio=audios[i] if modes[i] == "clone" and audios[i] else None,
            ref_text=ref_texts[i] if modes[i] == "clone" else None,
            design_desc=designs[i] if modes[i] == "design" else None,
            instruct=instructs[i] or "",
            language=langs[i],
            speed=float(speeds[i]) if speeds[i] else 1.0,
        )
        char_configs[names[i]] = vc
    progress_text = ""
    def prog_cb(ratio: float, msg: str):
        # Progress goes to server logs; the last message is also returned to the UI.
        nonlocal progress_text
        progress_text = f"[{ratio*100:.0f}%] {msg}"
        print(progress_text)
    try:
        output_path, seg_paths, seg_meta = pipe.generate(
            text=text,
            narrator_config=nar_cfg,
            character_configs=char_configs,
            progress_callback=prog_cb,
            temperature=gen_temp,
            seed=int(gen_seed),
        )
        # Render at most 50 segments into the scrollable HTML segment list.
        seg_html = "<div style='max-height: 300px; overflow-y: auto;'>"
        for s in seg_meta[:50]:
            tclass = "narration" if s['type'] == 'narration' else "dialogue"
            seg_html += f"<div class='seg-item'><span class='seg-type {tclass}'>{s['type']}</span> <strong>{s['speaker']}</strong>: {s['text']}</div>"
        if len(seg_meta) > 50:
            seg_html += f"<div style='text-align:center;color:#94a3b8;padding:0.5rem;'>... and {len(seg_meta)-50} more segments</div>"
        seg_html += "</div>"
        # Optional re-export: WAV alongside the default MP3, or a ZIP of segments.
        extra_path = None
        if output_fmt == "wav":
            extra_path = output_path.replace(".mp3", ".wav")
            from backend import save_audiobook
            save_audiobook(seg_paths, extra_path, fmt="wav")
        elif output_fmt == "zip":
            extra_path = pipe.export_segments_zip(seg_paths)
        final_path = extra_path if extra_path else output_path
        return final_path, final_path, seg_html, f"Done! {len(seg_meta)} segments generated.", progress_text
    except Exception as e:
        import traceback
        traceback.print_exc()
        # Keep the 5-tuple arity on failure so Gradio outputs stay consistent.
        return None, None, "", f"Error: {str(e)}", progress_text
@spaces.GPU(duration=60)
def preview_narrator_gpu(mode, preset, audio, ref_text, design, instruct, lang, speed):
    """Synthesize a short narrator sample.

    Returns ((sample_rate, waveform), status) on success, or
    (None, error message) on failure.
    """
    pipe = get_pipeline()
    cfg = VoiceConfig(
        name="Narrator",
        mode=mode,
        preset=preset if mode == "preset" else None,
        ref_audio=audio if (mode == "clone" and audio) else None,
        ref_text=ref_text if mode == "clone" else None,
        design_desc=design if mode == "design" else None,
        instruct=instruct,
        language=lang,
        speed=float(speed) if speed else 1.0,
    )
    try:
        wav, sr = pipe.preview_voice(cfg)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Preview failed: {e}"
    return (sr, wav), "Preview ready!"
@spaces.GPU(duration=60)
def preview_char_voice_gpu(name, mode, preset, audio, ref_text, design, instruct, lang, speed):
    """Render a short sample line in the configured character voice.

    Returns ((sample_rate, waveform), status) on success, or
    (None, error message) on failure.
    """
    pipe = get_pipeline()
    cfg = VoiceConfig(
        name=name or "Character",
        mode=mode,
        preset=preset if mode == "preset" else None,
        ref_audio=audio if (mode == "clone" and audio) else None,
        ref_text=ref_text if mode == "clone" else None,
        design_desc=design if mode == "design" else None,
        instruct=instruct,
        language=lang,
        speed=float(speed) if speed else 1.0,
    )
    sample = f"Hello, I am {name or 'your character'}. This is how I sound in the story."
    try:
        wav, sr = pipe.preview_voice(cfg, sample_text=sample)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Preview failed: {e}"
    return (sr, wav), f"{name or 'Character'} preview ready!"
# ---------------------------------------------------------------------------
# Quick Generate
# ---------------------------------------------------------------------------
@spaces.GPU(duration=180)
def quick_generate_gpu(text, mode, preset, audio, ref_text, design, instruct, lang, speed, gen_temp, output_fmt, gen_seed=42):
    """One-click generation using a single narrator voice for the whole text.

    Returns (audio_path, download_path, status) for the Quick Generate
    outputs; on failure the paths are None.
    """
    if not text or len(text.strip()) < 50:
        return None, None, "Error: Text too short."
    wc = len(text.split())
    if wc > 5000:
        # Warn only; generation still proceeds for long texts.
        print(f"[WARN] Long text: {wc} words. Quick Generate may take a while or hit timeouts.")
    pipe = get_pipeline()
    # Single narrator voice; no per-character configs in quick mode.
    nar_cfg = VoiceConfig(
        name="Narrator",
        mode=mode,
        preset=preset if mode == "preset" else None,
        ref_audio=audio if mode == "clone" and audio else None,
        ref_text=ref_text if mode == "clone" else None,
        design_desc=design if mode == "design" else None,
        instruct=instruct or "Narrate clearly and expressively.",
        language=lang,
        speed=float(speed) if speed else 1.0,
    )
    def prog_cb(ratio: float, msg: str):
        # Progress is only surfaced in the server logs for quick mode.
        print(f"[{ratio*100:.0f}%] {msg}")
    try:
        output_path, seg_paths, seg_meta = pipe.generate(
            text=text,
            narrator_config=nar_cfg,
            character_configs={},
            progress_callback=prog_cb,
            temperature=gen_temp,
            seed=int(gen_seed),
        )
        # Optional re-export: WAV alongside the default MP3, or a ZIP of segments.
        extra_path = None
        if output_fmt == "wav":
            extra_path = output_path.replace(".mp3", ".wav")
            from backend import save_audiobook
            save_audiobook(seg_paths, extra_path, fmt="wav")
        elif output_fmt == "zip":
            extra_path = pipe.export_segments_zip(seg_paths)
        final_path = extra_path if extra_path else output_path
        return final_path, final_path, f"Quick audiobook ready! {len(seg_meta)} segments."
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, f"Error: {str(e)}"
# ---------------------------------------------------------------------------
# Project Save/Load
# ---------------------------------------------------------------------------
def do_save_project(text, nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed, *args):
    """Serialize text + narrator + character voice configs to project JSON.

    `args` holds the 80 flattened character fields followed by gen_temp
    and gen_seed (defaults apply if those are missing). Returns the JSON
    string produced by backend.save_project.
    """
    # Unpack character args (80 values) + gen_temp + gen_seed
    names = list(args[0:8])
    descs = list(args[8:16])
    modes = list(args[16:24])
    presets = list(args[24:32])
    audios = list(args[32:40])
    ref_texts = list(args[40:48])
    designs = list(args[48:56])
    instructs = list(args[56:64])
    langs = list(args[64:72])
    speeds = list(args[72:80])
    # Defaults if the generation settings were not wired in.
    gen_temp = args[80] if len(args) > 80 else 0.7
    gen_seed = args[81] if len(args) > 81 else 42
    nar_cfg = VoiceConfig(
        name="Narrator", mode=nar_mode, preset=nar_preset if nar_mode == "preset" else None,
        ref_audio=nar_audio if nar_mode == "clone" and nar_audio else None,
        ref_text=nar_ref_text if nar_mode == "clone" else None,
        design_desc=nar_design if nar_mode == "design" else None,
        instruct=nar_instruct, language=nar_lang,
        speed=float(nar_speed) if nar_speed else 1.0,
    )
    char_configs = {}
    for i in range(8):
        if not names[i]:
            # An empty name means the character slot is unused.
            continue
        char_configs[names[i]] = VoiceConfig(
            name=names[i], mode=modes[i], description=descs[i] or "",
            preset=presets[i] if modes[i] == "preset" else None,
            ref_audio=audios[i] if modes[i] == "clone" and audios[i] else None,
            ref_text=ref_texts[i] if modes[i] == "clone" else None,
            design_desc=designs[i] if modes[i] == "design" else None,
            instruct=instructs[i] or "", language=langs[i],
            speed=float(speeds[i]) if speeds[i] else 1.0,
        )
    settings = {"temperature": gen_temp, "seed": int(gen_seed)}
    json_str = save_project(text, nar_cfg, char_configs, settings)
    return json_str
def do_load_project(json_str):
    """Restore narrator and character configuration from pasted project JSON.

    Returns a flat list of 122 values: [story_text] + 8 narrator component
    updates + 112 character component updates (8 slots x 14 components)
    + [status message]. The order must match the `load_outputs` wiring.
    """
    try:
        data = load_project(json_str)
        nar = data["narrator"]
        chars = data.get("characters", {})
        # Narrator updates: mode-specific inputs are shown only for the
        # loaded mode (preset / clone / design).
        nar_updates = [
            gr.update(value=nar.mode),
            gr.update(value=nar.preset if nar.preset else "Ryan", visible=nar.mode=="preset"),
            gr.update(value=nar.ref_audio, visible=nar.mode=="clone"),
            gr.update(value=nar.ref_text, visible=nar.mode=="clone"),
            gr.update(value=nar.design_desc, visible=nar.mode=="design"),
            gr.update(value=nar.instruct),
            gr.update(value=nar.language),
            gr.update(value=nar.speed),
        ]
        char_updates = []
        char_items = list(chars.items())[:8]  # the UI supports at most 8 characters
        for i in range(8):
            if i < len(char_items):
                _, c = char_items[i]
                # 14 updates per slot: row group, name, description, mode,
                # preset, ref audio, ref transcript, design, instruct,
                # language, speed, preview button, preview audio, preview status.
                char_updates.extend([
                    gr.update(visible=True),
                    gr.update(value=c.name, visible=True),
                    gr.update(value=c.description, visible=True),
                    gr.update(value=c.mode, visible=True),
                    gr.update(value=c.preset if c.preset else "Ryan", visible=c.mode=="preset"),
                    gr.update(value=c.ref_audio, visible=c.mode=="clone"),
                    gr.update(value=c.ref_text, visible=c.mode=="clone"),
                    gr.update(value=c.design_desc, visible=c.mode=="design"),
                    gr.update(value=c.instruct, visible=True),
                    gr.update(value=c.language, visible=True),
                    gr.update(value=c.speed, visible=True),
                    gr.update(visible=True),
                    gr.update(visible=True),
                    gr.update(visible=True),
                ])
            else:
                # Hide every component of an unused slot (same 14-item shape).
                char_updates.extend([
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                ])
        text_sample = data.get("text_sample", "")
        return [text_sample] + nar_updates + char_updates + [f"Project loaded! {len(chars)} characters configured."]
    except Exception as e:
        import traceback
        traceback.print_exc()
        # Error path keeps the same 122-item arity: 1 text + 8 narrator
        # updates + 112 hidden character updates + 1 status.
        return [""] + [gr.update()] * 8 + [gr.update(visible=False)] * 112 + [f"Error loading project: {e}"]
# ---------------------------------------------------------------------------
# Build UI
# ---------------------------------------------------------------------------
def build_app():
    """Construct and return the full Gradio Blocks application.

    Layout: five tabs (Story, Voice Cast, Generate, Project, About) plus
    cross-tab event wiring registered inside the Blocks context.
    """
    # Dark indigo/cyan theme matching CUSTOM_CSS.
    theme = gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="cyan",
        neutral_hue="slate",
    ).set(
        body_background_fill="#0f172a",
        body_background_fill_dark="#0f172a",
        body_text_color="#f8fafc",
        body_text_color_subdued="#94a3b8",
        background_fill_primary="#1e293b",
        background_fill_secondary="#0f172a",
        border_color_accent="#334155",
        color_accent_soft="#22d3ee",
        button_primary_background_fill="linear-gradient(135deg, #6366f1, #4f46e5)",
        button_primary_background_fill_hover="linear-gradient(135deg, #4f46e5, #4338ca)",
        button_primary_text_color="#ffffff",
        input_background_fill="#0f172a",
        input_border_color="#334155",
        block_title_text_color="#f8fafc",
        block_label_text_color="#94a3b8",
    )
    with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="AudioBook Forge") as demo:
        gr.HTML("""
<div class="ab-header">
<h1>AudioBook Forge</h1>
<p>High-fidelity audiobooks with AI character voices. Model-agnostic TTS powered by Qwen3-TTS.</p>
</div>
""")
        with gr.Tabs():
            # ==================== TAB 1: Story ====================
            with gr.TabItem("📖 Story"):
                with gr.Row():
                    with gr.Column(scale=2):
                        gr.Markdown("### Upload or Paste")
                        file_upload = gr.File(
                            label="Upload EPUB, PDF, TXT, or HTML",
                            file_types=[".txt", ".epub", ".pdf", ".html", ".htm"],
                        )
                        story_input = gr.TextArea(
                            label="Story Text",
                            placeholder="Paste your book chapter, short story, or script here...",
                            lines=18,
                            max_lines=40,
                        )
                        sample_dropdown = gr.Dropdown(
                            label="Or try a sample story",
                            choices=list(SAMPLE_STORIES.keys()),
                            value=None,
                        )
                    with gr.Column(scale=1):
                        gr.Markdown("### Stats")
                        with gr.Row():
                            stat_words = gr.Textbox(label="Words", value="0", interactive=False)
                            stat_dur = gr.Textbox(label="Est. Duration", value="0 sec", interactive=False)
                        gr.Markdown("---")
                        # Single-narrator fast path; inputs mirror the Voice Cast narrator.
                        gr.Markdown("### Quick Generate")
                        quick_mode = gr.Dropdown(choices=["preset", "clone", "design"], value="design", label="Narrator Mode")
                        quick_preset = gr.Dropdown(choices=list(PRESET_SPEAKERS.keys()), value="Ryan", label="Preset Voice", visible=False)
                        quick_audio = gr.Audio(label="Upload Voice Sample (3–10s)", type="filepath", visible=False)
                        quick_ref_text = gr.Textbox(label="Reference Transcript", placeholder="What does the sample say?", visible=False)
                        quick_design = gr.TextArea(label="Voice Description", placeholder="e.g. A warm, raspy baritone with measured pacing...", visible=True, lines=2, value="A clear, warm, expressive audiobook narrator voice with professional pacing and rich tone.")
                        quick_instruct = gr.Textbox(label="Style Instruction", placeholder="e.g. Calm, measured storytelling.", value="")
                        quick_lang = gr.Dropdown(choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", label="Language")
                        quick_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                        quick_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
                        quick_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3", label="Output Format")
                        quick_btn = gr.Button("⚡ Quick Generate", variant="primary")
                        quick_output_audio = gr.Audio(label="Quick Audiobook", type="filepath", interactive=False)
                        quick_output_file = gr.File(label="Download", interactive=False)
                        quick_status = gr.Textbox(show_label=False, interactive=False)
                        gr.Markdown("---")
                        gr.Markdown("**Quick Generate** uses a single narrator voice for the entire text. Supports preset, clone, or AI-designed voices.")
                with gr.Row():
                    chapter_selector = gr.Dropdown(
                        label="Chapter / Section",
                        choices=["All"],
                        value="All",
                        interactive=True,
                    )
                    refresh_chapters_btn = gr.Button("🔄 Detect Chapters")
                    clear_story_btn = gr.Button("🗑️ Clear", variant="secondary")
                def clear_story():
                    # Reset text, chapter list, and both stat boxes.
                    return "", gr.update(choices=["All"], value="All"), "0", "0 sec"
                clear_story_btn.click(
                    clear_story,
                    inputs=[],
                    outputs=[story_input, chapter_selector, stat_words, stat_dur],
                )
                with gr.Row():
                    gr.Markdown("### Character Detection")
                    extract_btn = gr.Button("🔍 Extract Characters", variant="primary")
                    extract_status = gr.Textbox(label="Status", interactive=False)
                # Wiring
                file_upload.change(handle_upload, inputs=[file_upload], outputs=[story_input, extract_status])
                def load_sample_and_update(name):
                    # Load a bundled sample story and refresh stats/chapters.
                    text = SAMPLE_STORIES.get(name, "")
                    wc = len(text.split()) if text else 0
                    dur = estimate_duration(wc)
                    return text, str(wc), dur, gr.update(choices=["All"], value="All"), ""
                sample_dropdown.change(
                    load_sample_and_update,
                    inputs=[sample_dropdown],
                    outputs=[story_input, stat_words, stat_dur, chapter_selector, extract_status],
                )
                story_input.change(update_stats, inputs=[story_input], outputs=[stat_words, stat_dur])
                quick_btn.click(
                    quick_generate_gpu,
                    inputs=[story_input, quick_mode, quick_preset, quick_audio, quick_ref_text, quick_design, quick_instruct, quick_lang, quick_speed, quick_temp, quick_fmt],
                    outputs=[quick_output_audio, quick_output_file, quick_status],
                )
                quick_mode.change(on_mode_change, inputs=quick_mode, outputs=[quick_preset, quick_audio, quick_ref_text, quick_design])
                def refresh_chapters(text):
                    # Re-run chapter detection and repopulate the dropdown.
                    if not text:
                        return gr.update(choices=["All"], value="All")
                    pipe = get_pipeline()
                    chs = pipe.detect_chapters(text)
                    choices = ["All"] + [f"Ch{c['idx']+1}: {c['title'][:60]}" for c in chs]
                    return gr.update(choices=choices, value="All")
                refresh_chapters_btn.click(refresh_chapters, inputs=[story_input], outputs=[chapter_selector])
            # ==================== TAB 2: Voice Cast ====================
            with gr.TabItem("🎭 Voice Cast"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("## Narrator")
                        with gr.Column(elem_classes="ab-card"):
                            nar_mode = gr.Dropdown(choices=["preset", "clone", "design"], value="design", label="Mode")
                            nar_preset = gr.Dropdown(choices=list(PRESET_SPEAKERS.keys()), value="Ryan", label="Preset Voice", visible=False)
                            nar_audio = gr.Audio(label="Upload Voice Sample (3–10s)", type="filepath", visible=False)
                            nar_ref_text = gr.Textbox(label="Reference Transcript", placeholder="What does the sample say?", visible=False)
                            nar_design = gr.TextArea(label="Voice Description", placeholder="e.g. A warm, raspy baritone with measured pacing...", visible=True, lines=2, value="A clear, warm, expressive audiobook narrator voice with professional pacing and rich tone.")
                            nar_instruct = gr.Textbox(label="Style Instruction", placeholder="e.g. Calm, measured storytelling.")
                            nar_lang = gr.Dropdown(choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", label="Language")
                            nar_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                            nar_preview_btn = gr.Button("🔊 Preview Narrator", variant="secondary")
                            nar_preview_audio = gr.Audio(label="Preview", interactive=False)
                            nar_preview_status = gr.Textbox(show_label=False, interactive=False)
                        nar_mode.change(on_mode_change, inputs=nar_mode, outputs=[nar_preset, nar_audio, nar_ref_text, nar_design])
                        nar_preview_btn.click(
                            preview_narrator_gpu,
                            inputs=[nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed],
                            outputs=[nar_preview_audio, nar_preview_status],
                        )
                    with gr.Column(scale=2):
                        gr.Markdown("## Character Voices")
                        gr.Markdown("""
Configure up to 8 characters. Each character can use one of three voice modes:
- **Preset** — Choose from 9 built-in speakers (Ryan, Aiden, Serena, etc.)
- **Clone** — Upload a 3–10 second voice sample to clone any real voice
- **Design** — Describe a voice in text (e.g. *"A raspy old man with a warm chuckle"*) and the AI will create it
""")
                        # Component lists, indexed per character slot; the
                        # field order here drives the 80-value flattening
                        # used by generate/save and the 14-per-slot update
                        # lists used by extract/load.
                        char_names, char_descs, char_modes, char_presets = [], [], [], []
                        char_audios, char_ref_texts, char_designs, char_instructs, char_langs, char_speeds = [], [], [], [], [], []
                        char_rows, char_preview_btns, char_preview_audios, char_preview_statuses = [], [], [], []
                        for i in range(8):
                            # Only slot 0 is visible initially; the rest are
                            # revealed by character extraction / project load.
                            visible_default = (i == 0)
                            with gr.Group(visible=visible_default) as row:
                                with gr.Row():
                                    cn = gr.Textbox(label="Name", placeholder="e.g. Alice", visible=visible_default)
                                    cd = gr.Textbox(label="Description", placeholder="Personality note", visible=visible_default)
                                    cm = gr.Dropdown(label="Mode", choices=["preset", "clone", "design"], value="design", visible=visible_default)
                                    cp = gr.Dropdown(label="Preset", choices=list(PRESET_SPEAKERS.keys()), value="Ryan", visible=False)
                                with gr.Row():
                                    ca = gr.Audio(label="Voice Sample", type="filepath", visible=False)
                                    crt = gr.Textbox(label="Ref Transcript", placeholder="What the sample says", visible=False)
                                    cdes = gr.TextArea(label="Voice Description", placeholder="e.g. A shrill, nervous teenager.", visible=visible_default, lines=2)
                                cinstr = gr.Textbox(label="Style Instruction", placeholder="e.g. Angry and loud.", visible=visible_default)
                                cl = gr.Dropdown(label="Language", choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", visible=visible_default)
                                cspd = gr.Slider(label="Speed", minimum=0.5, maximum=2.0, value=1.0, step=0.1, visible=visible_default)
                                with gr.Row():
                                    cpv_btn = gr.Button("🔊 Preview", variant="secondary", visible=visible_default)
                                    cpv_audio = gr.Audio(label="Preview", interactive=False, visible=visible_default)
                                    cpv_status = gr.Textbox(show_label=False, interactive=False, visible=visible_default)
                            cm.change(on_mode_change, inputs=cm, outputs=[cp, ca, crt, cdes])
                            cpv_btn.click(
                                preview_char_voice_gpu,
                                inputs=[cn, cm, cp, ca, crt, cdes, cinstr, cl, cspd],
                                outputs=[cpv_audio, cpv_status],
                            )
                            char_rows.append(row)
                            char_names.append(cn)
                            char_descs.append(cd)
                            char_modes.append(cm)
                            char_presets.append(cp)
                            char_audios.append(ca)
                            char_ref_texts.append(crt)
                            char_designs.append(cdes)
                            char_instructs.append(cinstr)
                            char_langs.append(cl)
                            char_speeds.append(cspd)
                            char_preview_btns.append(cpv_btn)
                            char_preview_audios.append(cpv_audio)
                            char_preview_statuses.append(cpv_status)
            # ==================== TAB 3: Generate ====================
            with gr.TabItem("⚡ Generate"):
                gr.Markdown("_Note: The first generation downloads Qwen3-TTS 1.7B models (~5 GB) and may take 2–5 minutes. Subsequent runs are much faster._")
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Settings")
                        gen_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
                        gen_seed = gr.Number(value=42, precision=0, label="Seed (fix for consistency)")
                        output_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3", label="Output Format")
                        gen_btn = gr.Button("▶️ Generate Full Audiobook", variant="primary", size="lg")
                        gen_progress = gr.Textbox(label="Progress", interactive=False, value="Ready.")
                    with gr.Column(scale=2):
                        gr.Markdown("### Output")
                        output_audio = gr.Audio(label="Generated Audiobook", type="filepath", interactive=False)
                        output_file = gr.File(label="Download", interactive=False)
                        output_status = gr.Textbox(label="Status", interactive=False)
                        segment_list = gr.HTML(label="Segments")
            # ==================== TAB 4: Project ====================
            with gr.TabItem("💾 Project"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Save Project")
                        save_btn = gr.Button("💾 Save Configuration", variant="primary")
                        project_json = gr.TextArea(label="Project JSON (copy this to save)", lines=10, interactive=True)
                    with gr.Column():
                        gr.Markdown("### Load Project")
                        load_json = gr.TextArea(label="Paste Project JSON here", lines=10, interactive=True)
                        load_btn = gr.Button("📂 Load Configuration", variant="secondary")
                        load_status = gr.Textbox(label="Status", interactive=False)
            # ==================== TAB 5: About ====================
            with gr.TabItem("ℹ️ About"):
                gr.Markdown("""
## AudioBook Forge
**Model-agnostic, high-fidelity audiobook generator** powered by [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS).
### Features
- 📁 **File Upload** — Import EPUB, PDF, TXT, or HTML directly
- 📖 **Chapter Detection** — Auto-detects chapters/sections for selective generation
- 🎙️ **Character Voice Mapping** — Auto-extract characters and assign unique voices
- 🎭 **Three Voice Modes** — Preset (9 speakers), Clone (upload sample), Design (text description)
- ⚡ **Quick Generate** — One-click audiobook with a single narrator voice
- 🎚️ **Speed Control** — Adjust playback speed per voice (0.5x–2.0x)
- 📦 **Multi-format Export** — MP3, WAV, or ZIP of individual segments
- 💾 **Save/Load Projects** — Export and restore your voice configurations
- 🌐 **10 Languages** — English, Chinese, Japanese, Korean, German, French, Spanish, Italian, Portuguese, Russian
- ⚡ **ZeroGPU** — Runs on Hugging Face ZeroGPU (free compute)
### Workflow
1. **Upload or paste** your story text
2. **Detect chapters** (optional) and select a range
3. **Extract characters** or use Quick Generate for simple narration
4. **Assign voices** to narrator and each character
5. **Generate** and download your audiobook
### Tips for Best Quality
- Use clean, noise-free voice samples for cloning (3–10 seconds)
- Keep reference transcripts accurate
- Lower temperature (0.5–0.6) for stable narration; higher (0.8–0.9) for expressive dialogue
- Use a fixed seed to prevent voice drift across segments
- Use speed adjustment to fine-tune pacing per character
### Note on First Run
The first time you generate audio, the Space downloads the Qwen3-TTS 1.7B models (~5 GB total). This can take **2–5 minutes** depending on network speed. Subsequent runs are much faster because models are cached. Please be patient — the progress is printed in the server logs.
""")
        # ---------- Extract wiring ----------
        def do_extract(text):
            # Map extracted characters onto the 8 UI slots; each slot gets
            # 14 updates (row + 13 components), hidden when unused.
            chars, status = extract_chars(text)
            updates = []
            for i in range(8):
                if i < len(chars):
                    mode = chars[i].get("voice_mode", "design")
                    is_preset = mode == "preset"
                    is_clone = mode == "clone"
                    is_design = mode == "design"
                    updates.extend([
                        gr.update(visible=True),
                        gr.update(value=chars[i].get("name", ""), visible=True),
                        gr.update(value=chars[i].get("description", ""), visible=True),
                        gr.update(value=mode, visible=True),
                        gr.update(value=chars[i].get("voice_preset", "Ryan"), visible=is_preset),
                        gr.update(visible=is_clone),
                        gr.update(visible=is_clone),
                        gr.update(value=chars[i].get("voice_description", ""), visible=is_design),
                        gr.update(value=chars[i].get("voice_instruct", ""), visible=True),
                        gr.update(value=chars[i].get("language", "English"), visible=True),
                        gr.update(value=chars[i].get("speed", 1.0), visible=True),
                        gr.update(visible=True),
                        gr.update(visible=True),
                        gr.update(visible=True),
                    ])
                else:
                    updates.extend([
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                    ])
            return [status] + updates
        # Flattened output list: status + 14 components per character slot.
        extract_outputs = [extract_status] + [
            item for sublist in [
                [char_rows[i], char_names[i], char_descs[i], char_modes[i], char_presets[i],
                 char_audios[i], char_ref_texts[i], char_designs[i], char_instructs[i], char_langs[i],
                 char_speeds[i], char_preview_btns[i], char_preview_audios[i], char_preview_statuses[i]]
                for i in range(8)
            ] for item in sublist
        ]
        extract_btn.click(do_extract, inputs=[story_input], outputs=extract_outputs)
        # ---------- Generate wiring ----------
        # Field-major order: all names, then all descriptions, etc. — must
        # match the args[0:8], args[8:16], ... slicing in the GPU functions.
        all_char_inputs = (
            char_names + char_descs + char_modes + char_presets +
            char_audios + char_ref_texts + char_designs + char_instructs + char_langs + char_speeds
        )
        gen_inputs = [
            story_input, chapter_selector,
            nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
            gen_temp, gen_seed, output_fmt,
        ] + all_char_inputs
        def wrapped_generate(story_text, chapter_sel, *args):
            # Resolve the chapter selection before handing off to the GPU fn.
            text = get_chapter_text(story_text, chapter_sel)
            return generate_audiobook_gpu(text, *args)
        gen_btn.click(
            wrapped_generate,
            inputs=gen_inputs,
            outputs=[output_audio, output_file, segment_list, output_status, gen_progress],
        )
        # ---------- Project wiring ----------
        save_inputs = [
            story_input,
            nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
        ] + all_char_inputs + [gen_temp, gen_seed]
        save_btn.click(do_save_project, inputs=save_inputs, outputs=[project_json])
        # 122 outputs: story text + 9 narrator components... minus the mode
        # handled via value update; see do_load_project's return shape.
        load_outputs = [story_input, nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed] + extract_outputs[1:] + [load_status]
        load_btn.click(do_load_project, inputs=[load_json], outputs=load_outputs)
    return demo
# Build the UI at import time so Hugging Face Spaces can pick up `demo`.
demo = build_app()

if __name__ == "__main__":
    # Listen on all interfaces on the standard Spaces port.
    demo.launch(server_name="0.0.0.0", server_port=7860)