PlotweaverModel committed on
Commit
bfd700b
·
verified ·
1 Parent(s): 3a7a3f7

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -36
app.py CHANGED
@@ -1,14 +1,11 @@
1
  """
2
  Audiobook Generator - English Source to Multi-Language Audio
3
- Powered by:
4
- - Qwen3.5-Omni-Plus (preset voices, 36 languages)
5
- - Qwen3-TTS-VC (voice cloning, 10 languages)
6
- - YourVoic API (1000+ emotional voices, 93+ languages including Arabic, Swahili, Indian languages)
7
 
8
  Deploy as a Hugging Face Space:
9
  1. Create a new Space (SDK: Gradio)
10
  2. Upload app.py and requirements.txt
11
- 3. Add secrets: DASHSCOPE_API_KEY (required), YOURVOIC_API_KEY (optional)
12
  """
13
 
14
  import os
@@ -315,11 +312,11 @@ def generate_speech_yourvoic_with_retry(client, text, voice, yv_model, emotion,
315
  return None, text, f"No valid voice found for {language}. This language may not be supported on your plan. Tried: {candidates[:8]}"
316
 
317
  YOURVOIC_MODELS = [
318
- "aura-prime -- Balanced quality and speed (recommended)",
319
- "aura-lite -- Fast, good for previews",
320
- "aura-max -- Premium quality (paid plans only)",
321
- "rapid-max -- Fast with good quality",
322
- "rapid-flash -- Fastest, real-time apps",
323
  ]
324
 
325
  YOURVOIC_EMOTIONS = [
@@ -333,7 +330,16 @@ def get_voice_name(label):
333
 
334
 
335
  def get_yourvoic_model(label):
336
- return label.split("--")[0].strip()
 
 
 
 
 
 
 
 
 
337
 
338
 
339
  # ==========================================
@@ -447,7 +453,7 @@ def split_text_into_chunks(text, max_chars=MAX_CHARS_PER_CHUNK):
447
 
448
 
449
  # ==========================================
450
- # VOICE CLONING (Qwen)
451
  # ==========================================
452
  def prepare_clone_audio(audio_path):
453
  result = subprocess.run(
@@ -491,7 +497,7 @@ def clone_voice(audio_path, api_key):
491
 
492
 
493
  # ==========================================
494
- # TRANSLATION (Qwen text-only)
495
  # ==========================================
496
  def translate_text(client, text, target_language, lang_config):
497
  response = client.chat.completions.create(
@@ -505,7 +511,7 @@ def translate_text(client, text, target_language, lang_config):
505
 
506
 
507
  # ==========================================
508
- # TTS MODE 1: PRESET VOICE (Qwen Omni)
509
  # ==========================================
510
  def generate_speech_preset(client, text, voice, language, lang_config, translate, chunk_index, output_dir):
511
  output_wav = os.path.join(output_dir, f"chunk_{chunk_index:04d}.wav")
@@ -545,7 +551,7 @@ def generate_speech_preset(client, text, voice, language, lang_config, translate
545
 
546
 
547
  # ==========================================
548
- # TTS MODE 2: CLONED VOICE (Qwen TTS-VC)
549
  # ==========================================
550
  def generate_speech_cloned(client, text, voice_id, language, lang_config, translate, api_key, chunk_index, output_dir):
551
  output_wav = os.path.join(output_dir, f"vc_chunk_{chunk_index:04d}.wav")
@@ -573,11 +579,11 @@ def generate_speech_cloned(client, text, voice_id, language, lang_config, transl
573
 
574
 
575
  # ==========================================
576
- # TTS MODE 3: YOURVOIC (emotional voices, 93+ languages)
577
  # ==========================================
578
  def generate_speech_yourvoic(client, text, voice, yv_model, emotion, language, lang_config, translate,
579
  api_key, chunk_index, output_dir):
580
- """Generate speech using YourVoic API. Handles translation via Qwen then TTS via YourVoic."""
581
  output_file = os.path.join(output_dir, f"yv_chunk_{chunk_index:04d}.mp3")
582
 
583
  # Translate if needed
@@ -675,7 +681,7 @@ def generate_audiobook(text_input, file_input, target_language, voice_mode,
675
  lang_config = LANGUAGES[target_language]
676
  lang_engine = lang_config["engine"]
677
  use_clone = voice_mode == "Clone a Voice"
678
- use_yourvoic = voice_mode == "YourVoic (Emotional AI)"
679
  translate = target_language != "English"
680
 
681
  # Auto-correct engine if language requires it
@@ -691,12 +697,12 @@ def generate_audiobook(text_input, file_input, target_language, voice_mode,
691
  # Validate keys
692
  if use_yourvoic:
693
  if not yv_key:
694
- raise gr.Error("YOURVOIC_API_KEY not set. Add it in Settings > Secrets. Get one at yourvoic.com/api/user")
695
  if translate and not ds_key:
696
- raise gr.Error("DASHSCOPE_API_KEY needed for translation. Add it in Settings > Secrets.")
697
  else:
698
  if not ds_key:
699
- raise gr.Error("DASHSCOPE_API_KEY not set. Add it in Settings > Secrets.")
700
 
701
  client = OpenAI(api_key=ds_key, base_url=DASHSCOPE_BASE_URL) if ds_key else None
702
  tmp_dir = tempfile.mkdtemp(prefix="audiobook_")
@@ -782,14 +788,14 @@ def generate_audiobook(text_input, file_input, target_language, voice_mode,
782
 
783
  audio_size = os.path.getsize(final_mp3) / (1024 * 1024)
784
  if use_yourvoic:
785
- voice_info = f"YourVoic: {yourvoic_voice_label} ({yourvoic_emotion})"
786
- mode_info = f"YourVoic API ({yourvoic_model_label})"
787
  elif use_clone:
788
  voice_info = f"Cloned (ID: {cloned_voice_id[:20]}...)"
789
- mode_info = "Qwen3-TTS-VC"
790
  else:
791
  voice_info = preset_voice_label
792
- mode_info = "Qwen3.5-Omni-Plus"
793
 
794
  stats = (
795
  f"**Audiobook Generated!**\n\n"
@@ -828,7 +834,7 @@ And he would smile - that slow, careful smile that seemed to cost him something
828
 
829
  DESCRIPTION = """
830
  # Audiobook Generator
831
- ### English Text to Multi-Language Audiobook
832
 
833
  """
834
 
@@ -899,7 +905,7 @@ def on_language_change(lang_choice):
899
  gr.update(visible=True, choices=voice_choices, value=default_voice), # yv_voice
900
  gr.update(visible=True), # yv_model
901
  gr.update(visible=True), # yv_emotion
902
- gr.update(value=f"Engine: YourVoic (1000+ emotional voices)"), # engine_label
903
  gr.update(visible=False, value=False), # use_clone
904
  gr.update(visible=False), # clone_audio
905
  gr.update(visible=False), # clone_info
@@ -910,7 +916,7 @@ def on_language_change(lang_choice):
910
  gr.update(visible=False), # yv_voice
911
  gr.update(visible=False), # yv_model
912
  gr.update(visible=False), # yv_emotion
913
- gr.update(value=f"Engine: Qwen3.5-Omni-Plus (translate + speak)"), # engine_label
914
  gr.update(visible=True), # use_clone
915
  gr.update(visible=False), # clone_audio
916
  gr.update(visible=False), # clone_info
@@ -934,7 +940,7 @@ def generate_wrapper(text_input, file_input, language_choice, use_clone,
934
  if use_clone:
935
  voice_mode = "Clone a Voice"
936
  elif engine == "yourvoic":
937
- voice_mode = "YourVoic (Emotional AI)"
938
  else:
939
  voice_mode = "Preset Voice"
940
 
@@ -960,7 +966,7 @@ with gr.Blocks(title="Audiobook Generator") as demo:
960
  target_lang = gr.Dropdown(choices=lang_choices, value="English", label="Target Language",
961
  info="The right voice engine is selected automatically based on language.")
962
 
963
- engine_label = gr.Markdown(value="Engine: Qwen3.5-Omni-Plus (translate + speak)")
964
 
965
  # Qwen preset voice (visible for Qwen languages)
966
  preset_voice = gr.Dropdown(choices=PRESET_VOICES, value="Jennifer -- Cinematic narrator",
@@ -968,16 +974,16 @@ with gr.Blocks(title="Audiobook Generator") as demo:
968
 
969
  # YourVoic controls (visible for YourVoic languages)
970
  yv_voice = gr.Dropdown(choices=YOURVOIC_VOICES_DEFAULT, value="Peter -- Universal fallback",
971
- label="YourVoic Voice", visible=False, allow_custom_value=True,
972
- info="Voices update automatically per language. Peter works for all.")
973
- yv_model = gr.Dropdown(choices=YOURVOIC_MODELS, value="aura-prime -- Balanced quality and speed (recommended)",
974
- label="YourVoic Model", visible=False)
975
  yv_emotion = gr.Dropdown(choices=YOURVOIC_EMOTIONS, value="friendly",
976
  label="Emotion Style", visible=False,
977
  info="Add emotional expression to the narration")
978
 
979
  # Voice cloning toggle (optional, works for Qwen languages only)
980
- use_clone = gr.Checkbox(value=False, label="Use Voice Cloning (Qwen, 10 languages only)",
981
  info="Clone a voice from audio sample instead of using preset")
982
  clone_audio = gr.Audio(label="Voice Sample (10s-3min)", type="filepath", visible=False)
983
  clone_info = gr.Markdown(
@@ -1015,7 +1021,7 @@ with gr.Blocks(title="Audiobook Generator") as demo:
1015
 
1016
  gr.Markdown(
1017
  "---\n"
1018
- )
1019
 
1020
  if __name__ == "__main__":
1021
  demo.launch()
 
1
  """
2
  Audiobook Generator - English Source to Multi-Language Audio
3
+ Supports 51 languages with preset voices, voice cloning, and emotional AI voices.
 
 
 
4
 
5
  Deploy as a Hugging Face Space:
6
  1. Create a new Space (SDK: Gradio)
7
  2. Upload app.py and requirements.txt
8
+ 3. Add required API secrets in Settings
9
  """
10
 
11
  import os
 
312
  return None, text, f"No valid voice found for {language}. This language may not be supported on your plan. Tried: {candidates[:8]}"
313
 
314
  YOURVOIC_MODELS = [
315
+ "balanced -- Balanced quality and speed (recommended)",
316
+ "lite -- Fast, good for previews",
317
+ "premium -- Premium quality (paid plans only)",
318
+ "fast -- Fast with good quality",
319
+ "realtime -- Fastest, real-time apps",
320
  ]
321
 
322
  YOURVOIC_EMOTIONS = [
 
330
 
331
 
332
  def get_yourvoic_model(label):
333
+ """Map anonymous model label to actual API model name."""
334
+ name = label.split("--")[0].strip()
335
+ model_map = {
336
+ "balanced": "aura-prime",
337
+ "lite": "aura-lite",
338
+ "premium": "aura-max",
339
+ "fast": "rapid-max",
340
+ "realtime": "rapid-flash",
341
+ }
342
+ return model_map.get(name, "aura-prime")
343
 
344
 
345
  # ==========================================
 
453
 
454
 
455
  # ==========================================
456
+ # VOICE CLONING
457
  # ==========================================
458
  def prepare_clone_audio(audio_path):
459
  result = subprocess.run(
 
497
 
498
 
499
  # ==========================================
500
+ # TRANSLATION
501
  # ==========================================
502
  def translate_text(client, text, target_language, lang_config):
503
  response = client.chat.completions.create(
 
511
 
512
 
513
  # ==========================================
514
+ # TTS MODE 1: PRESET VOICE
515
  # ==========================================
516
  def generate_speech_preset(client, text, voice, language, lang_config, translate, chunk_index, output_dir):
517
  output_wav = os.path.join(output_dir, f"chunk_{chunk_index:04d}.wav")
 
551
 
552
 
553
  # ==========================================
554
+ # TTS MODE 2: CLONED VOICE
555
  # ==========================================
556
  def generate_speech_cloned(client, text, voice_id, language, lang_config, translate, api_key, chunk_index, output_dir):
557
  output_wav = os.path.join(output_dir, f"vc_chunk_{chunk_index:04d}.wav")
 
579
 
580
 
581
  # ==========================================
582
+ # TTS MODE 3: EMOTIONAL AI VOICES
583
  # ==========================================
584
  def generate_speech_yourvoic(client, text, voice, yv_model, emotion, language, lang_config, translate,
585
  api_key, chunk_index, output_dir):
586
+ """Generate speech using emotional AI voice API."""
587
  output_file = os.path.join(output_dir, f"yv_chunk_{chunk_index:04d}.mp3")
588
 
589
  # Translate if needed
 
681
  lang_config = LANGUAGES[target_language]
682
  lang_engine = lang_config["engine"]
683
  use_clone = voice_mode == "Clone a Voice"
684
+ use_yourvoic = voice_mode == "Emotional AI"
685
  translate = target_language != "English"
686
 
687
  # Auto-correct engine if language requires it
 
697
  # Validate keys
698
  if use_yourvoic:
699
  if not yv_key:
700
+ raise gr.Error("Voice API key for emotional voices not set. Add YOURVOIC_API_KEY in Settings > Secrets.")
701
  if translate and not ds_key:
702
+ raise gr.Error("Translation API key not set. Add DASHSCOPE_API_KEY in Settings > Secrets.")
703
  else:
704
  if not ds_key:
705
+ raise gr.Error("Voice API key not set. Add DASHSCOPE_API_KEY in Settings > Secrets.")
706
 
707
  client = OpenAI(api_key=ds_key, base_url=DASHSCOPE_BASE_URL) if ds_key else None
708
  tmp_dir = tempfile.mkdtemp(prefix="audiobook_")
 
788
 
789
  audio_size = os.path.getsize(final_mp3) / (1024 * 1024)
790
  if use_yourvoic:
791
+ voice_info = f"Emotional AI: {yourvoic_voice_label} ({yourvoic_emotion})"
792
+ mode_info = f"Emotional AI Engine"
793
  elif use_clone:
794
  voice_info = f"Cloned (ID: {cloned_voice_id[:20]}...)"
795
+ mode_info = "Voice Clone Engine"
796
  else:
797
  voice_info = preset_voice_label
798
+ mode_info = "Premium AI Engine"
799
 
800
  stats = (
801
  f"**Audiobook Generated!**\n\n"
 
834
 
835
  DESCRIPTION = """
836
  # Audiobook Generator
837
+ ### English Text to Multi-Language Audiobook (51 Languages)
838
 
839
  """
840
 
 
905
  gr.update(visible=True, choices=voice_choices, value=default_voice), # yv_voice
906
  gr.update(visible=True), # yv_model
907
  gr.update(visible=True), # yv_emotion
908
+ gr.update(value=f"Engine: Emotional AI Voices"), # engine_label
909
  gr.update(visible=False, value=False), # use_clone
910
  gr.update(visible=False), # clone_audio
911
  gr.update(visible=False), # clone_info
 
916
  gr.update(visible=False), # yv_voice
917
  gr.update(visible=False), # yv_model
918
  gr.update(visible=False), # yv_emotion
919
+ gr.update(value=f"Engine: Premium AI Voices"), # engine_label
920
  gr.update(visible=True), # use_clone
921
  gr.update(visible=False), # clone_audio
922
  gr.update(visible=False), # clone_info
 
940
  if use_clone:
941
  voice_mode = "Clone a Voice"
942
  elif engine == "yourvoic":
943
+ voice_mode = "Emotional AI"
944
  else:
945
  voice_mode = "Preset Voice"
946
 
 
966
  target_lang = gr.Dropdown(choices=lang_choices, value="English", label="Target Language",
967
  info="The right voice engine is selected automatically based on language.")
968
 
969
+ engine_label = gr.Markdown(value="Engine: Premium AI Voices")
970
 
971
  # Qwen preset voice (visible for Qwen languages)
972
  preset_voice = gr.Dropdown(choices=PRESET_VOICES, value="Jennifer -- Cinematic narrator",
 
974
 
975
  # YourVoic controls (visible for YourVoic languages)
976
  yv_voice = gr.Dropdown(choices=YOURVOIC_VOICES_DEFAULT, value="Peter -- Universal fallback",
977
+ label="Voice", visible=False, allow_custom_value=True,
978
+ info="Voices update automatically per language.")
979
+ yv_model = gr.Dropdown(choices=YOURVOIC_MODELS, value="balanced -- Balanced quality and speed (recommended)",
980
+ label="AI Model", visible=False)
981
  yv_emotion = gr.Dropdown(choices=YOURVOIC_EMOTIONS, value="friendly",
982
  label="Emotion Style", visible=False,
983
  info="Add emotional expression to the narration")
984
 
985
  # Voice cloning toggle (optional, works for Qwen languages only)
986
+ use_clone = gr.Checkbox(value=False, label="Use Voice Cloning (10 core languages only)",
987
  info="Clone a voice from audio sample instead of using preset")
988
  clone_audio = gr.Audio(label="Voice Sample (10s-3min)", type="filepath", visible=False)
989
  clone_info = gr.Markdown(
 
1021
 
1022
  gr.Markdown(
1023
  "---\n"
1024
+ )
1025
 
1026
  if __name__ == "__main__":
1027
  demo.launch()