jkorstad committed on
Commit
8b010d4
·
1 Parent(s): 799f207

Polish pass: add File download for ZIP support, Clear button, sample load updates stats/chapters, GPU detection cleanup, character descriptions persist in save/load, long-text warnings, first-run notes on Generate tab.

Browse files
Files changed (2) hide show
  1. app.py +44 -18
  2. backend.py +4 -4
app.py CHANGED
@@ -234,8 +234,7 @@ _pipeline: Optional[AudiobookPipeline] = None
234
  def get_pipeline() -> AudiobookPipeline:
235
  global _pipeline
236
  if _pipeline is None:
237
- device = "cuda" if os.system("nvidia-smi > /dev/null 2>&1") == 0 else "cpu"
238
- _pipeline = AudiobookPipeline(device=device)
239
  return _pipeline
240
 
241
 
@@ -258,10 +257,6 @@ def update_stats(text: str) -> tuple:
258
  return str(wc), dur
259
 
260
 
261
- def load_sample(name: str) -> str:
262
- return SAMPLE_STORIES.get(name, "")
263
-
264
-
265
  def handle_upload(file_obj) -> tuple:
266
  if file_obj is None:
267
  return "", "No file uploaded."
@@ -311,7 +306,11 @@ def generate_audiobook_gpu(
311
  gen_temp, gen_seed, output_fmt, *args
312
  ):
313
  if not text or len(text.strip()) < 50:
314
- return None, "", "Error: Please provide at least 50 characters of story text.", ""
 
 
 
 
315
 
316
  # Unpack character args (80 values = 8 chars x 10 fields)
317
  names = list(args[0:8])
@@ -390,11 +389,11 @@ def generate_audiobook_gpu(
390
  extra_path = pipe.export_segments_zip(seg_paths)
391
 
392
  final_path = extra_path if extra_path else output_path
393
- return final_path, seg_html, f"Done! {len(seg_meta)} segments generated.", progress_text
394
  except Exception as e:
395
  import traceback
396
  traceback.print_exc()
397
- return None, "", f"Error: {str(e)}", progress_text
398
 
399
 
400
  @spaces.GPU(duration=60)
@@ -451,7 +450,11 @@ def preview_char_voice_gpu(name, mode, preset, audio, ref_text, design, instruct
451
  @spaces.GPU(duration=180)
452
  def quick_generate_gpu(text, mode, preset, audio, ref_text, design, instruct, lang, speed, gen_temp, output_fmt, gen_seed=42):
453
  if not text or len(text.strip()) < 50:
454
- return None, "Error: Text too short."
 
 
 
 
455
 
456
  pipe = get_pipeline()
457
  nar_cfg = VoiceConfig(
@@ -488,11 +491,11 @@ def quick_generate_gpu(text, mode, preset, audio, ref_text, design, instruct, la
488
  extra_path = pipe.export_segments_zip(seg_paths)
489
 
490
  final_path = extra_path if extra_path else output_path
491
- return final_path, f"Quick audiobook ready! {len(seg_meta)} segments."
492
  except Exception as e:
493
  import traceback
494
  traceback.print_exc()
495
- return None, f"Error: {str(e)}"
496
 
497
 
498
  # ---------------------------------------------------------------------------
@@ -527,7 +530,7 @@ def do_save_project(text, nar_mode, nar_preset, nar_audio, nar_ref_text, nar_des
527
  if not names[i]:
528
  continue
529
  char_configs[names[i]] = VoiceConfig(
530
- name=names[i], mode=modes[i],
531
  preset=presets[i] if modes[i] == "preset" else None,
532
  ref_audio=audios[i] if modes[i] == "clone" and audios[i] else None,
533
  ref_text=ref_texts[i] if modes[i] == "clone" else None,
@@ -565,7 +568,7 @@ def do_load_project(json_str):
565
  char_updates.extend([
566
  gr.update(visible=True),
567
  gr.update(value=c.name, visible=True),
568
- gr.update(value="", visible=True),
569
  gr.update(value=c.mode, visible=True),
570
  gr.update(value=c.preset if c.preset else "Ryan", visible=c.mode=="preset"),
571
  gr.update(value=c.ref_audio, visible=c.mode=="clone"),
@@ -679,7 +682,8 @@ def build_app():
679
  quick_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
680
  quick_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3", label="Output Format")
681
  quick_btn = gr.Button("⚡ Quick Generate", variant="primary")
682
- quick_output_audio = gr.Audio(label="Quick Audiobook", interactive=False)
 
683
  quick_status = gr.Textbox(show_label=False, interactive=False)
684
  gr.Markdown("---")
685
  gr.Markdown("**Quick Generate** uses a single narrator voice for the entire text. Supports preset, clone, or AI-designed voices.")
@@ -692,6 +696,16 @@ def build_app():
692
  interactive=True,
693
  )
694
  refresh_chapters_btn = gr.Button("🔄 Detect Chapters")
 
 
 
 
 
 
 
 
 
 
695
 
696
  with gr.Row():
697
  gr.Markdown("### Character Detection")
@@ -702,12 +716,22 @@ def build_app():
702
 
703
  # Wiring
704
  file_upload.change(handle_upload, inputs=[file_upload], outputs=[story_input, extract_status])
705
- sample_dropdown.change(load_sample, inputs=[sample_dropdown], outputs=[story_input])
 
 
 
 
 
 
 
 
 
 
706
  story_input.change(update_stats, inputs=[story_input], outputs=[stat_words, stat_dur])
707
  quick_btn.click(
708
  quick_generate_gpu,
709
  inputs=[story_input, quick_mode, quick_preset, quick_audio, quick_ref_text, quick_design, quick_instruct, quick_lang, quick_speed, quick_temp, quick_fmt],
710
- outputs=[quick_output_audio, quick_status],
711
  )
712
 
713
  quick_mode.change(on_mode_change, inputs=quick_mode, outputs=[quick_preset, quick_audio, quick_ref_text, quick_design])
@@ -805,6 +829,7 @@ Configure up to 8 characters. Each character can use one of three voice modes:
805
 
806
  # ==================== TAB 3: Generate ====================
807
  with gr.TabItem("⚡ Generate"):
 
808
  with gr.Row():
809
  with gr.Column(scale=1):
810
  gr.Markdown("### Settings")
@@ -817,6 +842,7 @@ Configure up to 8 characters. Each character can use one of three voice modes:
817
  with gr.Column(scale=2):
818
  gr.Markdown("### Output")
819
  output_audio = gr.Audio(label="Generated Audiobook", type="filepath", interactive=False)
 
820
  output_status = gr.Textbox(label="Status", interactive=False)
821
  segment_list = gr.HTML(label="Segments")
822
 
@@ -944,7 +970,7 @@ Configure up to 8 characters. Each character can use one of three voice modes:
944
  gen_btn.click(
945
  wrapped_generate,
946
  inputs=gen_inputs,
947
- outputs=[output_audio, segment_list, output_status, gen_progress],
948
  )
949
 
950
  # ---------- Project wiring ----------
 
234
  def get_pipeline() -> AudiobookPipeline:
235
  global _pipeline
236
  if _pipeline is None:
237
+ _pipeline = AudiobookPipeline()
 
238
  return _pipeline
239
 
240
 
 
257
  return str(wc), dur
258
 
259
 
 
 
 
 
260
  def handle_upload(file_obj) -> tuple:
261
  if file_obj is None:
262
  return "", "No file uploaded."
 
306
  gen_temp, gen_seed, output_fmt, *args
307
  ):
308
  if not text or len(text.strip()) < 50:
309
+ return None, None, "", "Error: Please provide at least 50 characters of story text.", ""
310
+
311
+ wc = len(text.split())
312
+ if wc > 5000:
313
+ print(f"[WARN] Long text: {wc} words. Generation may take a while or hit timeouts.")
314
 
315
  # Unpack character args (80 values = 8 chars x 10 fields)
316
  names = list(args[0:8])
 
389
  extra_path = pipe.export_segments_zip(seg_paths)
390
 
391
  final_path = extra_path if extra_path else output_path
392
+ return final_path, final_path, seg_html, f"Done! {len(seg_meta)} segments generated.", progress_text
393
  except Exception as e:
394
  import traceback
395
  traceback.print_exc()
396
+ return None, None, "", f"Error: {str(e)}", progress_text
397
 
398
 
399
  @spaces.GPU(duration=60)
 
450
  @spaces.GPU(duration=180)
451
  def quick_generate_gpu(text, mode, preset, audio, ref_text, design, instruct, lang, speed, gen_temp, output_fmt, gen_seed=42):
452
  if not text or len(text.strip()) < 50:
453
+ return None, None, "Error: Text too short."
454
+
455
+ wc = len(text.split())
456
+ if wc > 5000:
457
+ print(f"[WARN] Long text: {wc} words. Quick Generate may take a while or hit timeouts.")
458
 
459
  pipe = get_pipeline()
460
  nar_cfg = VoiceConfig(
 
491
  extra_path = pipe.export_segments_zip(seg_paths)
492
 
493
  final_path = extra_path if extra_path else output_path
494
+ return final_path, final_path, f"Quick audiobook ready! {len(seg_meta)} segments."
495
  except Exception as e:
496
  import traceback
497
  traceback.print_exc()
498
+ return None, None, f"Error: {str(e)}"
499
 
500
 
501
  # ---------------------------------------------------------------------------
 
530
  if not names[i]:
531
  continue
532
  char_configs[names[i]] = VoiceConfig(
533
+ name=names[i], mode=modes[i], description=descs[i] or "",
534
  preset=presets[i] if modes[i] == "preset" else None,
535
  ref_audio=audios[i] if modes[i] == "clone" and audios[i] else None,
536
  ref_text=ref_texts[i] if modes[i] == "clone" else None,
 
568
  char_updates.extend([
569
  gr.update(visible=True),
570
  gr.update(value=c.name, visible=True),
571
+ gr.update(value=c.description, visible=True),
572
  gr.update(value=c.mode, visible=True),
573
  gr.update(value=c.preset if c.preset else "Ryan", visible=c.mode=="preset"),
574
  gr.update(value=c.ref_audio, visible=c.mode=="clone"),
 
682
  quick_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
683
  quick_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3", label="Output Format")
684
  quick_btn = gr.Button("⚡ Quick Generate", variant="primary")
685
+ quick_output_audio = gr.Audio(label="Quick Audiobook", type="filepath", interactive=False)
686
+ quick_output_file = gr.File(label="Download", interactive=False)
687
  quick_status = gr.Textbox(show_label=False, interactive=False)
688
  gr.Markdown("---")
689
  gr.Markdown("**Quick Generate** uses a single narrator voice for the entire text. Supports preset, clone, or AI-designed voices.")
 
696
  interactive=True,
697
  )
698
  refresh_chapters_btn = gr.Button("🔄 Detect Chapters")
699
+ clear_story_btn = gr.Button("🗑️ Clear", variant="secondary")
700
+
701
+ def clear_story():
702
+ return "", gr.update(choices=["All"], value="All"), "0", "0 sec", ""
703
+
704
+ clear_story_btn.click(
705
+ clear_story,
706
+ inputs=[],
707
+ outputs=[story_input, chapter_selector, stat_words, stat_dur, extract_status],
708
+ )
709
 
710
  with gr.Row():
711
  gr.Markdown("### Character Detection")
 
716
 
717
  # Wiring
718
  file_upload.change(handle_upload, inputs=[file_upload], outputs=[story_input, extract_status])
719
+ def load_sample_and_update(name):
720
+ text = SAMPLE_STORIES.get(name, "")
721
+ wc = len(text.split()) if text else 0
722
+ dur = estimate_duration(wc)
723
+ return text, str(wc), dur, gr.update(choices=["All"], value="All"), ""
724
+
725
+ sample_dropdown.change(
726
+ load_sample_and_update,
727
+ inputs=[sample_dropdown],
728
+ outputs=[story_input, stat_words, stat_dur, chapter_selector, extract_status],
729
+ )
730
  story_input.change(update_stats, inputs=[story_input], outputs=[stat_words, stat_dur])
731
  quick_btn.click(
732
  quick_generate_gpu,
733
  inputs=[story_input, quick_mode, quick_preset, quick_audio, quick_ref_text, quick_design, quick_instruct, quick_lang, quick_speed, quick_temp, quick_fmt],
734
+ outputs=[quick_output_audio, quick_output_file, quick_status],
735
  )
736
 
737
  quick_mode.change(on_mode_change, inputs=quick_mode, outputs=[quick_preset, quick_audio, quick_ref_text, quick_design])
 
829
 
830
  # ==================== TAB 3: Generate ====================
831
  with gr.TabItem("⚡ Generate"):
832
+ gr.Markdown("_Note: The first generation downloads Qwen3-TTS 1.7B models (~5 GB) and may take 2–5 minutes. Subsequent runs are much faster._")
833
  with gr.Row():
834
  with gr.Column(scale=1):
835
  gr.Markdown("### Settings")
 
842
  with gr.Column(scale=2):
843
  gr.Markdown("### Output")
844
  output_audio = gr.Audio(label="Generated Audiobook", type="filepath", interactive=False)
845
+ output_file = gr.File(label="Download", interactive=False)
846
  output_status = gr.Textbox(label="Status", interactive=False)
847
  segment_list = gr.HTML(label="Segments")
848
 
 
970
  gen_btn.click(
971
  wrapped_generate,
972
  inputs=gen_inputs,
973
+ outputs=[output_audio, output_file, segment_list, output_status, gen_progress],
974
  )
975
 
976
  # ---------- Project wiring ----------
backend.py CHANGED
@@ -127,6 +127,7 @@ class VoiceConfig:
127
  instruct: str = ""
128
  language: str = "English"
129
  speed: float = 1.0 # 0.5 to 2.0
 
130
 
131
  def to_dict(self) -> dict:
132
  return asdict(self)
@@ -401,8 +402,7 @@ class TextProcessor:
401
  # ---------------------------------------------------------------------------
402
 
403
  class TTSEngine:
404
- def __init__(self, device: str = "cuda"):
405
- self.device = device
406
  self._custom_voice_model = None
407
  self._base_model = None
408
  self._design_model = None
@@ -706,8 +706,8 @@ def ai_extract_characters(text: str, api_token: Optional[str] = None) -> List[Ch
706
  # ---------------------------------------------------------------------------
707
 
708
  class AudiobookPipeline:
709
- def __init__(self, device: str = "cuda"):
710
- self.tts = TTSEngine(device=device)
711
  self.processor = TextProcessor()
712
  self.temp_dir = Path(tempfile.gettempdir()) / "audiobook_segments"
713
  self.temp_dir.mkdir(exist_ok=True)
 
127
  instruct: str = ""
128
  language: str = "English"
129
  speed: float = 1.0 # 0.5 to 2.0
130
+ description: str = "" # UI-only, not used for TTS
131
 
132
  def to_dict(self) -> dict:
133
  return asdict(self)
 
402
  # ---------------------------------------------------------------------------
403
 
404
  class TTSEngine:
405
+ def __init__(self):
 
406
  self._custom_voice_model = None
407
  self._base_model = None
408
  self._design_model = None
 
706
  # ---------------------------------------------------------------------------
707
 
708
  class AudiobookPipeline:
709
+ def __init__(self):
710
+ self.tts = TTSEngine()
711
  self.processor = TextProcessor()
712
  self.temp_dir = Path(tempfile.gettempdir()) / "audiobook_segments"
713
  self.temp_dir.mkdir(exist_ok=True)