Polish pass: add a File download component for ZIP support, a Clear button, sample loading that updates stats/chapters, GPU detection cleanup, character descriptions that persist across save/load, long-text warnings, and a first-run note on the Generate tab.
Browse files
- app.py +44 -18
- backend.py +4 -4
app.py
CHANGED
|
@@ -234,8 +234,7 @@ _pipeline: Optional[AudiobookPipeline] = None
|
|
| 234 |
def get_pipeline() -> AudiobookPipeline:
|
| 235 |
global _pipeline
|
| 236 |
if _pipeline is None:
|
| 237 |
-
|
| 238 |
-
_pipeline = AudiobookPipeline(device=device)
|
| 239 |
return _pipeline
|
| 240 |
|
| 241 |
|
|
@@ -258,10 +257,6 @@ def update_stats(text: str) -> tuple:
|
|
| 258 |
return str(wc), dur
|
| 259 |
|
| 260 |
|
| 261 |
-
def load_sample(name: str) -> str:
|
| 262 |
-
return SAMPLE_STORIES.get(name, "")
|
| 263 |
-
|
| 264 |
-
|
| 265 |
def handle_upload(file_obj) -> tuple:
|
| 266 |
if file_obj is None:
|
| 267 |
return "", "No file uploaded."
|
|
@@ -311,7 +306,11 @@ def generate_audiobook_gpu(
|
|
| 311 |
gen_temp, gen_seed, output_fmt, *args
|
| 312 |
):
|
| 313 |
if not text or len(text.strip()) < 50:
|
| 314 |
-
return None, "", "Error: Please provide at least 50 characters of story text.", ""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
|
| 316 |
# Unpack character args (80 values = 8 chars x 10 fields)
|
| 317 |
names = list(args[0:8])
|
|
@@ -390,11 +389,11 @@ def generate_audiobook_gpu(
|
|
| 390 |
extra_path = pipe.export_segments_zip(seg_paths)
|
| 391 |
|
| 392 |
final_path = extra_path if extra_path else output_path
|
| 393 |
-
return final_path, seg_html, f"Done! {len(seg_meta)} segments generated.", progress_text
|
| 394 |
except Exception as e:
|
| 395 |
import traceback
|
| 396 |
traceback.print_exc()
|
| 397 |
-
return None, "", f"Error: {str(e)}", progress_text
|
| 398 |
|
| 399 |
|
| 400 |
@spaces.GPU(duration=60)
|
|
@@ -451,7 +450,11 @@ def preview_char_voice_gpu(name, mode, preset, audio, ref_text, design, instruct
|
|
| 451 |
@spaces.GPU(duration=180)
|
| 452 |
def quick_generate_gpu(text, mode, preset, audio, ref_text, design, instruct, lang, speed, gen_temp, output_fmt, gen_seed=42):
|
| 453 |
if not text or len(text.strip()) < 50:
|
| 454 |
-
return None, "Error: Text too short."
|
|
|
|
|
|
|
|
|
|
|
|
|
| 455 |
|
| 456 |
pipe = get_pipeline()
|
| 457 |
nar_cfg = VoiceConfig(
|
|
@@ -488,11 +491,11 @@ def quick_generate_gpu(text, mode, preset, audio, ref_text, design, instruct, la
|
|
| 488 |
extra_path = pipe.export_segments_zip(seg_paths)
|
| 489 |
|
| 490 |
final_path = extra_path if extra_path else output_path
|
| 491 |
-
return final_path, f"Quick audiobook ready! {len(seg_meta)} segments."
|
| 492 |
except Exception as e:
|
| 493 |
import traceback
|
| 494 |
traceback.print_exc()
|
| 495 |
-
return None, f"Error: {str(e)}"
|
| 496 |
|
| 497 |
|
| 498 |
# ---------------------------------------------------------------------------
|
|
@@ -527,7 +530,7 @@ def do_save_project(text, nar_mode, nar_preset, nar_audio, nar_ref_text, nar_des
|
|
| 527 |
if not names[i]:
|
| 528 |
continue
|
| 529 |
char_configs[names[i]] = VoiceConfig(
|
| 530 |
-
name=names[i], mode=modes[i],
|
| 531 |
preset=presets[i] if modes[i] == "preset" else None,
|
| 532 |
ref_audio=audios[i] if modes[i] == "clone" and audios[i] else None,
|
| 533 |
ref_text=ref_texts[i] if modes[i] == "clone" else None,
|
|
@@ -565,7 +568,7 @@ def do_load_project(json_str):
|
|
| 565 |
char_updates.extend([
|
| 566 |
gr.update(visible=True),
|
| 567 |
gr.update(value=c.name, visible=True),
|
| 568 |
-
gr.update(value=
|
| 569 |
gr.update(value=c.mode, visible=True),
|
| 570 |
gr.update(value=c.preset if c.preset else "Ryan", visible=c.mode=="preset"),
|
| 571 |
gr.update(value=c.ref_audio, visible=c.mode=="clone"),
|
|
@@ -679,7 +682,8 @@ def build_app():
|
|
| 679 |
quick_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
|
| 680 |
quick_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3", label="Output Format")
|
| 681 |
quick_btn = gr.Button("⚡ Quick Generate", variant="primary")
|
| 682 |
-
quick_output_audio = gr.Audio(label="Quick Audiobook", interactive=False)
|
|
|
|
| 683 |
quick_status = gr.Textbox(show_label=False, interactive=False)
|
| 684 |
gr.Markdown("---")
|
| 685 |
gr.Markdown("**Quick Generate** uses a single narrator voice for the entire text. Supports preset, clone, or AI-designed voices.")
|
|
@@ -692,6 +696,16 @@ def build_app():
|
|
| 692 |
interactive=True,
|
| 693 |
)
|
| 694 |
refresh_chapters_btn = gr.Button("🔄 Detect Chapters")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
|
| 696 |
with gr.Row():
|
| 697 |
gr.Markdown("### Character Detection")
|
|
@@ -702,12 +716,22 @@ def build_app():
|
|
| 702 |
|
| 703 |
# Wiring
|
| 704 |
file_upload.change(handle_upload, inputs=[file_upload], outputs=[story_input, extract_status])
|
| 705 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 706 |
story_input.change(update_stats, inputs=[story_input], outputs=[stat_words, stat_dur])
|
| 707 |
quick_btn.click(
|
| 708 |
quick_generate_gpu,
|
| 709 |
inputs=[story_input, quick_mode, quick_preset, quick_audio, quick_ref_text, quick_design, quick_instruct, quick_lang, quick_speed, quick_temp, quick_fmt],
|
| 710 |
-
outputs=[quick_output_audio, quick_status],
|
| 711 |
)
|
| 712 |
|
| 713 |
quick_mode.change(on_mode_change, inputs=quick_mode, outputs=[quick_preset, quick_audio, quick_ref_text, quick_design])
|
|
@@ -805,6 +829,7 @@ Configure up to 8 characters. Each character can use one of three voice modes:
|
|
| 805 |
|
| 806 |
# ==================== TAB 3: Generate ====================
|
| 807 |
with gr.TabItem("⚡ Generate"):
|
|
|
|
| 808 |
with gr.Row():
|
| 809 |
with gr.Column(scale=1):
|
| 810 |
gr.Markdown("### Settings")
|
|
@@ -817,6 +842,7 @@ Configure up to 8 characters. Each character can use one of three voice modes:
|
|
| 817 |
with gr.Column(scale=2):
|
| 818 |
gr.Markdown("### Output")
|
| 819 |
output_audio = gr.Audio(label="Generated Audiobook", type="filepath", interactive=False)
|
|
|
|
| 820 |
output_status = gr.Textbox(label="Status", interactive=False)
|
| 821 |
segment_list = gr.HTML(label="Segments")
|
| 822 |
|
|
@@ -944,7 +970,7 @@ Configure up to 8 characters. Each character can use one of three voice modes:
|
|
| 944 |
gen_btn.click(
|
| 945 |
wrapped_generate,
|
| 946 |
inputs=gen_inputs,
|
| 947 |
-
outputs=[output_audio, segment_list, output_status, gen_progress],
|
| 948 |
)
|
| 949 |
|
| 950 |
# ---------- Project wiring ----------
|
|
|
|
| 234 |
def get_pipeline() -> AudiobookPipeline:
|
| 235 |
global _pipeline
|
| 236 |
if _pipeline is None:
|
| 237 |
+
_pipeline = AudiobookPipeline()
|
|
|
|
| 238 |
return _pipeline
|
| 239 |
|
| 240 |
|
|
|
|
| 257 |
return str(wc), dur
|
| 258 |
|
| 259 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
def handle_upload(file_obj) -> tuple:
|
| 261 |
if file_obj is None:
|
| 262 |
return "", "No file uploaded."
|
|
|
|
| 306 |
gen_temp, gen_seed, output_fmt, *args
|
| 307 |
):
|
| 308 |
if not text or len(text.strip()) < 50:
|
| 309 |
+
return None, None, "", "Error: Please provide at least 50 characters of story text.", ""
|
| 310 |
+
|
| 311 |
+
wc = len(text.split())
|
| 312 |
+
if wc > 5000:
|
| 313 |
+
print(f"[WARN] Long text: {wc} words. Generation may take a while or hit timeouts.")
|
| 314 |
|
| 315 |
# Unpack character args (80 values = 8 chars x 10 fields)
|
| 316 |
names = list(args[0:8])
|
|
|
|
| 389 |
extra_path = pipe.export_segments_zip(seg_paths)
|
| 390 |
|
| 391 |
final_path = extra_path if extra_path else output_path
|
| 392 |
+
return final_path, final_path, seg_html, f"Done! {len(seg_meta)} segments generated.", progress_text
|
| 393 |
except Exception as e:
|
| 394 |
import traceback
|
| 395 |
traceback.print_exc()
|
| 396 |
+
return None, None, "", f"Error: {str(e)}", progress_text
|
| 397 |
|
| 398 |
|
| 399 |
@spaces.GPU(duration=60)
|
|
|
|
| 450 |
@spaces.GPU(duration=180)
|
| 451 |
def quick_generate_gpu(text, mode, preset, audio, ref_text, design, instruct, lang, speed, gen_temp, output_fmt, gen_seed=42):
|
| 452 |
if not text or len(text.strip()) < 50:
|
| 453 |
+
return None, None, "Error: Text too short."
|
| 454 |
+
|
| 455 |
+
wc = len(text.split())
|
| 456 |
+
if wc > 5000:
|
| 457 |
+
print(f"[WARN] Long text: {wc} words. Quick Generate may take a while or hit timeouts.")
|
| 458 |
|
| 459 |
pipe = get_pipeline()
|
| 460 |
nar_cfg = VoiceConfig(
|
|
|
|
| 491 |
extra_path = pipe.export_segments_zip(seg_paths)
|
| 492 |
|
| 493 |
final_path = extra_path if extra_path else output_path
|
| 494 |
+
return final_path, final_path, f"Quick audiobook ready! {len(seg_meta)} segments."
|
| 495 |
except Exception as e:
|
| 496 |
import traceback
|
| 497 |
traceback.print_exc()
|
| 498 |
+
return None, None, f"Error: {str(e)}"
|
| 499 |
|
| 500 |
|
| 501 |
# ---------------------------------------------------------------------------
|
|
|
|
| 530 |
if not names[i]:
|
| 531 |
continue
|
| 532 |
char_configs[names[i]] = VoiceConfig(
|
| 533 |
+
name=names[i], mode=modes[i], description=descs[i] or "",
|
| 534 |
preset=presets[i] if modes[i] == "preset" else None,
|
| 535 |
ref_audio=audios[i] if modes[i] == "clone" and audios[i] else None,
|
| 536 |
ref_text=ref_texts[i] if modes[i] == "clone" else None,
|
|
|
|
| 568 |
char_updates.extend([
|
| 569 |
gr.update(visible=True),
|
| 570 |
gr.update(value=c.name, visible=True),
|
| 571 |
+
gr.update(value=c.description, visible=True),
|
| 572 |
gr.update(value=c.mode, visible=True),
|
| 573 |
gr.update(value=c.preset if c.preset else "Ryan", visible=c.mode=="preset"),
|
| 574 |
gr.update(value=c.ref_audio, visible=c.mode=="clone"),
|
|
|
|
| 682 |
quick_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
|
| 683 |
quick_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3", label="Output Format")
|
| 684 |
quick_btn = gr.Button("⚡ Quick Generate", variant="primary")
|
| 685 |
+
quick_output_audio = gr.Audio(label="Quick Audiobook", type="filepath", interactive=False)
|
| 686 |
+
quick_output_file = gr.File(label="Download", interactive=False)
|
| 687 |
quick_status = gr.Textbox(show_label=False, interactive=False)
|
| 688 |
gr.Markdown("---")
|
| 689 |
gr.Markdown("**Quick Generate** uses a single narrator voice for the entire text. Supports preset, clone, or AI-designed voices.")
|
|
|
|
| 696 |
interactive=True,
|
| 697 |
)
|
| 698 |
refresh_chapters_btn = gr.Button("🔄 Detect Chapters")
|
| 699 |
+
clear_story_btn = gr.Button("🗑️ Clear", variant="secondary")
|
| 700 |
+
|
| 701 |
+
def clear_story():
|
| 702 |
+
return "", gr.update(choices=["All"], value="All"), "0", "0 sec", ""
|
| 703 |
+
|
| 704 |
+
clear_story_btn.click(
|
| 705 |
+
clear_story,
|
| 706 |
+
inputs=[],
|
| 707 |
+
outputs=[story_input, chapter_selector, stat_words, stat_dur, extract_status],
|
| 708 |
+
)
|
| 709 |
|
| 710 |
with gr.Row():
|
| 711 |
gr.Markdown("### Character Detection")
|
|
|
|
| 716 |
|
| 717 |
# Wiring
|
| 718 |
file_upload.change(handle_upload, inputs=[file_upload], outputs=[story_input, extract_status])
|
| 719 |
+
def load_sample_and_update(name):
|
| 720 |
+
text = SAMPLE_STORIES.get(name, "")
|
| 721 |
+
wc = len(text.split()) if text else 0
|
| 722 |
+
dur = estimate_duration(wc)
|
| 723 |
+
return text, str(wc), dur, gr.update(choices=["All"], value="All"), ""
|
| 724 |
+
|
| 725 |
+
sample_dropdown.change(
|
| 726 |
+
load_sample_and_update,
|
| 727 |
+
inputs=[sample_dropdown],
|
| 728 |
+
outputs=[story_input, stat_words, stat_dur, chapter_selector, extract_status],
|
| 729 |
+
)
|
| 730 |
story_input.change(update_stats, inputs=[story_input], outputs=[stat_words, stat_dur])
|
| 731 |
quick_btn.click(
|
| 732 |
quick_generate_gpu,
|
| 733 |
inputs=[story_input, quick_mode, quick_preset, quick_audio, quick_ref_text, quick_design, quick_instruct, quick_lang, quick_speed, quick_temp, quick_fmt],
|
| 734 |
+
outputs=[quick_output_audio, quick_output_file, quick_status],
|
| 735 |
)
|
| 736 |
|
| 737 |
quick_mode.change(on_mode_change, inputs=quick_mode, outputs=[quick_preset, quick_audio, quick_ref_text, quick_design])
|
|
|
|
| 829 |
|
| 830 |
# ==================== TAB 3: Generate ====================
|
| 831 |
with gr.TabItem("⚡ Generate"):
|
| 832 |
+
gr.Markdown("_Note: The first generation downloads Qwen3-TTS 1.7B models (~5 GB) and may take 2–5 minutes. Subsequent runs are much faster._")
|
| 833 |
with gr.Row():
|
| 834 |
with gr.Column(scale=1):
|
| 835 |
gr.Markdown("### Settings")
|
|
|
|
| 842 |
with gr.Column(scale=2):
|
| 843 |
gr.Markdown("### Output")
|
| 844 |
output_audio = gr.Audio(label="Generated Audiobook", type="filepath", interactive=False)
|
| 845 |
+
output_file = gr.File(label="Download", interactive=False)
|
| 846 |
output_status = gr.Textbox(label="Status", interactive=False)
|
| 847 |
segment_list = gr.HTML(label="Segments")
|
| 848 |
|
|
|
|
| 970 |
gen_btn.click(
|
| 971 |
wrapped_generate,
|
| 972 |
inputs=gen_inputs,
|
| 973 |
+
outputs=[output_audio, output_file, segment_list, output_status, gen_progress],
|
| 974 |
)
|
| 975 |
|
| 976 |
# ---------- Project wiring ----------
|
backend.py
CHANGED
|
@@ -127,6 +127,7 @@ class VoiceConfig:
|
|
| 127 |
instruct: str = ""
|
| 128 |
language: str = "English"
|
| 129 |
speed: float = 1.0 # 0.5 to 2.0
|
|
|
|
| 130 |
|
| 131 |
def to_dict(self) -> dict:
|
| 132 |
return asdict(self)
|
|
@@ -401,8 +402,7 @@ class TextProcessor:
|
|
| 401 |
# ---------------------------------------------------------------------------
|
| 402 |
|
| 403 |
class TTSEngine:
|
| 404 |
-
def __init__(self
|
| 405 |
-
self.device = device
|
| 406 |
self._custom_voice_model = None
|
| 407 |
self._base_model = None
|
| 408 |
self._design_model = None
|
|
@@ -706,8 +706,8 @@ def ai_extract_characters(text: str, api_token: Optional[str] = None) -> List[Ch
|
|
| 706 |
# ---------------------------------------------------------------------------
|
| 707 |
|
| 708 |
class AudiobookPipeline:
|
| 709 |
-
def __init__(self
|
| 710 |
-
self.tts = TTSEngine(
|
| 711 |
self.processor = TextProcessor()
|
| 712 |
self.temp_dir = Path(tempfile.gettempdir()) / "audiobook_segments"
|
| 713 |
self.temp_dir.mkdir(exist_ok=True)
|
|
|
|
| 127 |
instruct: str = ""
|
| 128 |
language: str = "English"
|
| 129 |
speed: float = 1.0 # 0.5 to 2.0
|
| 130 |
+
description: str = "" # UI-only, not used for TTS
|
| 131 |
|
| 132 |
def to_dict(self) -> dict:
|
| 133 |
return asdict(self)
|
|
|
|
| 402 |
# ---------------------------------------------------------------------------
|
| 403 |
|
| 404 |
class TTSEngine:
|
| 405 |
+
def __init__(self):
|
|
|
|
| 406 |
self._custom_voice_model = None
|
| 407 |
self._base_model = None
|
| 408 |
self._design_model = None
|
|
|
|
| 706 |
# ---------------------------------------------------------------------------
|
| 707 |
|
| 708 |
class AudiobookPipeline:
|
| 709 |
+
def __init__(self):
|
| 710 |
+
self.tts = TTSEngine()
|
| 711 |
self.processor = TextProcessor()
|
| 712 |
self.temp_dir = Path(tempfile.gettempdir()) / "audiobook_segments"
|
| 713 |
self.temp_dir.mkdir(exist_ok=True)
|