Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
|
@@ -1,14 +1,11 @@
|
|
| 1 |
"""
|
| 2 |
Audiobook Generator - English Source to Multi-Language Audio
|
| 3 |
-
|
| 4 |
-
- Qwen3.5-Omni-Plus (preset voices, 36 languages)
|
| 5 |
-
- Qwen3-TTS-VC (voice cloning, 10 languages)
|
| 6 |
-
- YourVoic API (1000+ emotional voices, 93+ languages including Arabic, Swahili, Indian languages)
|
| 7 |
|
| 8 |
Deploy as a Hugging Face Space:
|
| 9 |
1. Create a new Space (SDK: Gradio)
|
| 10 |
2. Upload app.py and requirements.txt
|
| 11 |
-
3. Add
|
| 12 |
"""
|
| 13 |
|
| 14 |
import os
|
|
@@ -315,11 +312,11 @@ def generate_speech_yourvoic_with_retry(client, text, voice, yv_model, emotion,
|
|
| 315 |
return None, text, f"No valid voice found for {language}. This language may not be supported on your plan. Tried: {candidates[:8]}"
|
| 316 |
|
| 317 |
YOURVOIC_MODELS = [
|
| 318 |
-
"
|
| 319 |
-
"
|
| 320 |
-
"
|
| 321 |
-
"
|
| 322 |
-
"
|
| 323 |
]
|
| 324 |
|
| 325 |
YOURVOIC_EMOTIONS = [
|
|
@@ -333,7 +330,16 @@ def get_voice_name(label):
|
|
| 333 |
|
| 334 |
|
| 335 |
def get_yourvoic_model(label):
|
| 336 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
|
| 338 |
|
| 339 |
# ==========================================
|
|
@@ -447,7 +453,7 @@ def split_text_into_chunks(text, max_chars=MAX_CHARS_PER_CHUNK):
|
|
| 447 |
|
| 448 |
|
| 449 |
# ==========================================
|
| 450 |
-
# VOICE CLONING
|
| 451 |
# ==========================================
|
| 452 |
def prepare_clone_audio(audio_path):
|
| 453 |
result = subprocess.run(
|
|
@@ -491,7 +497,7 @@ def clone_voice(audio_path, api_key):
|
|
| 491 |
|
| 492 |
|
| 493 |
# ==========================================
|
| 494 |
-
# TRANSLATION
|
| 495 |
# ==========================================
|
| 496 |
def translate_text(client, text, target_language, lang_config):
|
| 497 |
response = client.chat.completions.create(
|
|
@@ -505,7 +511,7 @@ def translate_text(client, text, target_language, lang_config):
|
|
| 505 |
|
| 506 |
|
| 507 |
# ==========================================
|
| 508 |
-
# TTS MODE 1: PRESET VOICE
|
| 509 |
# ==========================================
|
| 510 |
def generate_speech_preset(client, text, voice, language, lang_config, translate, chunk_index, output_dir):
|
| 511 |
output_wav = os.path.join(output_dir, f"chunk_{chunk_index:04d}.wav")
|
|
@@ -545,7 +551,7 @@ def generate_speech_preset(client, text, voice, language, lang_config, translate
|
|
| 545 |
|
| 546 |
|
| 547 |
# ==========================================
|
| 548 |
-
# TTS MODE 2: CLONED VOICE
|
| 549 |
# ==========================================
|
| 550 |
def generate_speech_cloned(client, text, voice_id, language, lang_config, translate, api_key, chunk_index, output_dir):
|
| 551 |
output_wav = os.path.join(output_dir, f"vc_chunk_{chunk_index:04d}.wav")
|
|
@@ -573,11 +579,11 @@ def generate_speech_cloned(client, text, voice_id, language, lang_config, transl
|
|
| 573 |
|
| 574 |
|
| 575 |
# ==========================================
|
| 576 |
-
# TTS MODE 3:
|
| 577 |
# ==========================================
|
| 578 |
def generate_speech_yourvoic(client, text, voice, yv_model, emotion, language, lang_config, translate,
|
| 579 |
api_key, chunk_index, output_dir):
|
| 580 |
-
"""Generate speech using
|
| 581 |
output_file = os.path.join(output_dir, f"yv_chunk_{chunk_index:04d}.mp3")
|
| 582 |
|
| 583 |
# Translate if needed
|
|
@@ -675,7 +681,7 @@ def generate_audiobook(text_input, file_input, target_language, voice_mode,
|
|
| 675 |
lang_config = LANGUAGES[target_language]
|
| 676 |
lang_engine = lang_config["engine"]
|
| 677 |
use_clone = voice_mode == "Clone a Voice"
|
| 678 |
-
use_yourvoic = voice_mode == "
|
| 679 |
translate = target_language != "English"
|
| 680 |
|
| 681 |
# Auto-correct engine if language requires it
|
|
@@ -691,12 +697,12 @@ def generate_audiobook(text_input, file_input, target_language, voice_mode,
|
|
| 691 |
# Validate keys
|
| 692 |
if use_yourvoic:
|
| 693 |
if not yv_key:
|
| 694 |
-
raise gr.Error("
|
| 695 |
if translate and not ds_key:
|
| 696 |
-
raise gr.Error("
|
| 697 |
else:
|
| 698 |
if not ds_key:
|
| 699 |
-
raise gr.Error("
|
| 700 |
|
| 701 |
client = OpenAI(api_key=ds_key, base_url=DASHSCOPE_BASE_URL) if ds_key else None
|
| 702 |
tmp_dir = tempfile.mkdtemp(prefix="audiobook_")
|
|
@@ -782,14 +788,14 @@ def generate_audiobook(text_input, file_input, target_language, voice_mode,
|
|
| 782 |
|
| 783 |
audio_size = os.path.getsize(final_mp3) / (1024 * 1024)
|
| 784 |
if use_yourvoic:
|
| 785 |
-
voice_info = f"
|
| 786 |
-
mode_info = f"
|
| 787 |
elif use_clone:
|
| 788 |
voice_info = f"Cloned (ID: {cloned_voice_id[:20]}...)"
|
| 789 |
-
mode_info = "
|
| 790 |
else:
|
| 791 |
voice_info = preset_voice_label
|
| 792 |
-
mode_info = "
|
| 793 |
|
| 794 |
stats = (
|
| 795 |
f"**Audiobook Generated!**\n\n"
|
|
@@ -828,7 +834,7 @@ And he would smile - that slow, careful smile that seemed to cost him something
|
|
| 828 |
|
| 829 |
DESCRIPTION = """
|
| 830 |
# Audiobook Generator
|
| 831 |
-
### English Text to Multi-Language Audiobook
|
| 832 |
|
| 833 |
"""
|
| 834 |
|
|
@@ -899,7 +905,7 @@ def on_language_change(lang_choice):
|
|
| 899 |
gr.update(visible=True, choices=voice_choices, value=default_voice), # yv_voice
|
| 900 |
gr.update(visible=True), # yv_model
|
| 901 |
gr.update(visible=True), # yv_emotion
|
| 902 |
-
gr.update(value=f"Engine:
|
| 903 |
gr.update(visible=False, value=False), # use_clone
|
| 904 |
gr.update(visible=False), # clone_audio
|
| 905 |
gr.update(visible=False), # clone_info
|
|
@@ -910,7 +916,7 @@ def on_language_change(lang_choice):
|
|
| 910 |
gr.update(visible=False), # yv_voice
|
| 911 |
gr.update(visible=False), # yv_model
|
| 912 |
gr.update(visible=False), # yv_emotion
|
| 913 |
-
gr.update(value=f"Engine:
|
| 914 |
gr.update(visible=True), # use_clone
|
| 915 |
gr.update(visible=False), # clone_audio
|
| 916 |
gr.update(visible=False), # clone_info
|
|
@@ -934,7 +940,7 @@ def generate_wrapper(text_input, file_input, language_choice, use_clone,
|
|
| 934 |
if use_clone:
|
| 935 |
voice_mode = "Clone a Voice"
|
| 936 |
elif engine == "yourvoic":
|
| 937 |
-
voice_mode = "
|
| 938 |
else:
|
| 939 |
voice_mode = "Preset Voice"
|
| 940 |
|
|
@@ -960,7 +966,7 @@ with gr.Blocks(title="Audiobook Generator") as demo:
|
|
| 960 |
target_lang = gr.Dropdown(choices=lang_choices, value="English", label="Target Language",
|
| 961 |
info="The right voice engine is selected automatically based on language.")
|
| 962 |
|
| 963 |
-
engine_label = gr.Markdown(value="Engine:
|
| 964 |
|
| 965 |
# Qwen preset voice (visible for Qwen languages)
|
| 966 |
preset_voice = gr.Dropdown(choices=PRESET_VOICES, value="Jennifer -- Cinematic narrator",
|
|
@@ -968,16 +974,16 @@ with gr.Blocks(title="Audiobook Generator") as demo:
|
|
| 968 |
|
| 969 |
# YourVoic controls (visible for YourVoic languages)
|
| 970 |
yv_voice = gr.Dropdown(choices=YOURVOIC_VOICES_DEFAULT, value="Peter -- Universal fallback",
|
| 971 |
-
label="
|
| 972 |
-
info="Voices update automatically per language.
|
| 973 |
-
yv_model = gr.Dropdown(choices=YOURVOIC_MODELS, value="
|
| 974 |
-
label="
|
| 975 |
yv_emotion = gr.Dropdown(choices=YOURVOIC_EMOTIONS, value="friendly",
|
| 976 |
label="Emotion Style", visible=False,
|
| 977 |
info="Add emotional expression to the narration")
|
| 978 |
|
| 979 |
# Voice cloning toggle (optional, works for Qwen languages only)
|
| 980 |
-
use_clone = gr.Checkbox(value=False, label="Use Voice Cloning (
|
| 981 |
info="Clone a voice from audio sample instead of using preset")
|
| 982 |
clone_audio = gr.Audio(label="Voice Sample (10s-3min)", type="filepath", visible=False)
|
| 983 |
clone_info = gr.Markdown(
|
|
@@ -1015,7 +1021,7 @@ with gr.Blocks(title="Audiobook Generator") as demo:
|
|
| 1015 |
|
| 1016 |
gr.Markdown(
|
| 1017 |
"---\n"
|
| 1018 |
-
|
| 1019 |
|
| 1020 |
if __name__ == "__main__":
|
| 1021 |
demo.launch()
|
|
|
|
| 1 |
"""
|
| 2 |
Audiobook Generator - English Source to Multi-Language Audio
|
| 3 |
+
Supports 51 languages with preset voices, voice cloning, and emotional AI voices.
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
Deploy as a Hugging Face Space:
|
| 6 |
1. Create a new Space (SDK: Gradio)
|
| 7 |
2. Upload app.py and requirements.txt
|
| 8 |
+
3. Add required API secrets in Settings
|
| 9 |
"""
|
| 10 |
|
| 11 |
import os
|
|
|
|
| 312 |
return None, text, f"No valid voice found for {language}. This language may not be supported on your plan. Tried: {candidates[:8]}"
|
| 313 |
|
| 314 |
YOURVOIC_MODELS = [
|
| 315 |
+
"balanced -- Balanced quality and speed (recommended)",
|
| 316 |
+
"lite -- Fast, good for previews",
|
| 317 |
+
"premium -- Premium quality (paid plans only)",
|
| 318 |
+
"fast -- Fast with good quality",
|
| 319 |
+
"realtime -- Fastest, real-time apps",
|
| 320 |
]
|
| 321 |
|
| 322 |
YOURVOIC_EMOTIONS = [
|
|
|
|
| 330 |
|
| 331 |
|
| 332 |
def get_yourvoic_model(label):
    """Map an anonymized model label to the actual API model name.

    The label is expected in the form ``"<key> -- <description>"``; only the
    text before the first ``--`` is used as the lookup key.

    Args:
        label: Model label such as ``"lite -- Fast, good for previews"``.
            ``None`` or any non-string value is tolerated.

    Returns:
        The API model identifier for the label, or ``"aura-prime"`` when the
        label is missing or its key is not recognised.
    """
    default_model = "aura-prime"
    model_map = {
        "balanced": default_model,
        "lite": "aura-lite",
        "premium": "aura-max",
        "fast": "rapid-max",
        "realtime": "rapid-flash",
    }
    # NOTE(review): presumably a cleared dropdown can pass None here; fall
    # back to the default instead of raising AttributeError on .split().
    if not isinstance(label, str):
        return default_model
    # Keys are lowercase, so normalise case to avoid a silent fallback on
    # labels that differ only in capitalisation.
    name = label.split("--")[0].strip().lower()
    return model_map.get(name, default_model)
|
| 343 |
|
| 344 |
|
| 345 |
# ==========================================
|
|
|
|
| 453 |
|
| 454 |
|
| 455 |
# ==========================================
|
| 456 |
+
# VOICE CLONING
|
| 457 |
# ==========================================
|
| 458 |
def prepare_clone_audio(audio_path):
|
| 459 |
result = subprocess.run(
|
|
|
|
| 497 |
|
| 498 |
|
| 499 |
# ==========================================
|
| 500 |
+
# TRANSLATION
|
| 501 |
# ==========================================
|
| 502 |
def translate_text(client, text, target_language, lang_config):
|
| 503 |
response = client.chat.completions.create(
|
|
|
|
| 511 |
|
| 512 |
|
| 513 |
# ==========================================
|
| 514 |
+
# TTS MODE 1: PRESET VOICE
|
| 515 |
# ==========================================
|
| 516 |
def generate_speech_preset(client, text, voice, language, lang_config, translate, chunk_index, output_dir):
|
| 517 |
output_wav = os.path.join(output_dir, f"chunk_{chunk_index:04d}.wav")
|
|
|
|
| 551 |
|
| 552 |
|
| 553 |
# ==========================================
|
| 554 |
+
# TTS MODE 2: CLONED VOICE
|
| 555 |
# ==========================================
|
| 556 |
def generate_speech_cloned(client, text, voice_id, language, lang_config, translate, api_key, chunk_index, output_dir):
|
| 557 |
output_wav = os.path.join(output_dir, f"vc_chunk_{chunk_index:04d}.wav")
|
|
|
|
| 579 |
|
| 580 |
|
| 581 |
# ==========================================
|
| 582 |
+
# TTS MODE 3: EMOTIONAL AI VOICES
|
| 583 |
# ==========================================
|
| 584 |
def generate_speech_yourvoic(client, text, voice, yv_model, emotion, language, lang_config, translate,
|
| 585 |
api_key, chunk_index, output_dir):
|
| 586 |
+
"""Generate speech using emotional AI voice API."""
|
| 587 |
output_file = os.path.join(output_dir, f"yv_chunk_{chunk_index:04d}.mp3")
|
| 588 |
|
| 589 |
# Translate if needed
|
|
|
|
| 681 |
lang_config = LANGUAGES[target_language]
|
| 682 |
lang_engine = lang_config["engine"]
|
| 683 |
use_clone = voice_mode == "Clone a Voice"
|
| 684 |
+
use_yourvoic = voice_mode == "Emotional AI"
|
| 685 |
translate = target_language != "English"
|
| 686 |
|
| 687 |
# Auto-correct engine if language requires it
|
|
|
|
| 697 |
# Validate keys
|
| 698 |
if use_yourvoic:
|
| 699 |
if not yv_key:
|
| 700 |
+
raise gr.Error("Voice API key for emotional voices not set. Add YOURVOIC_API_KEY in Settings > Secrets.")
|
| 701 |
if translate and not ds_key:
|
| 702 |
+
raise gr.Error("Translation API key not set. Add DASHSCOPE_API_KEY in Settings > Secrets.")
|
| 703 |
else:
|
| 704 |
if not ds_key:
|
| 705 |
+
raise gr.Error("Voice API key not set. Add DASHSCOPE_API_KEY in Settings > Secrets.")
|
| 706 |
|
| 707 |
client = OpenAI(api_key=ds_key, base_url=DASHSCOPE_BASE_URL) if ds_key else None
|
| 708 |
tmp_dir = tempfile.mkdtemp(prefix="audiobook_")
|
|
|
|
| 788 |
|
| 789 |
audio_size = os.path.getsize(final_mp3) / (1024 * 1024)
|
| 790 |
if use_yourvoic:
|
| 791 |
+
voice_info = f"Emotional AI: {yourvoic_voice_label} ({yourvoic_emotion})"
|
| 792 |
+
mode_info = f"Emotional AI Engine"
|
| 793 |
elif use_clone:
|
| 794 |
voice_info = f"Cloned (ID: {cloned_voice_id[:20]}...)"
|
| 795 |
+
mode_info = "Voice Clone Engine"
|
| 796 |
else:
|
| 797 |
voice_info = preset_voice_label
|
| 798 |
+
mode_info = "Premium AI Engine"
|
| 799 |
|
| 800 |
stats = (
|
| 801 |
f"**Audiobook Generated!**\n\n"
|
|
|
|
| 834 |
|
| 835 |
DESCRIPTION = """
|
| 836 |
# Audiobook Generator
|
| 837 |
+
### English Text to Multi-Language Audiobook (51 Languages)
|
| 838 |
|
| 839 |
"""
|
| 840 |
|
|
|
|
| 905 |
gr.update(visible=True, choices=voice_choices, value=default_voice), # yv_voice
|
| 906 |
gr.update(visible=True), # yv_model
|
| 907 |
gr.update(visible=True), # yv_emotion
|
| 908 |
+
gr.update(value=f"Engine: Emotional AI Voices"), # engine_label
|
| 909 |
gr.update(visible=False, value=False), # use_clone
|
| 910 |
gr.update(visible=False), # clone_audio
|
| 911 |
gr.update(visible=False), # clone_info
|
|
|
|
| 916 |
gr.update(visible=False), # yv_voice
|
| 917 |
gr.update(visible=False), # yv_model
|
| 918 |
gr.update(visible=False), # yv_emotion
|
| 919 |
+
gr.update(value=f"Engine: Premium AI Voices"), # engine_label
|
| 920 |
gr.update(visible=True), # use_clone
|
| 921 |
gr.update(visible=False), # clone_audio
|
| 922 |
gr.update(visible=False), # clone_info
|
|
|
|
| 940 |
if use_clone:
|
| 941 |
voice_mode = "Clone a Voice"
|
| 942 |
elif engine == "yourvoic":
|
| 943 |
+
voice_mode = "Emotional AI"
|
| 944 |
else:
|
| 945 |
voice_mode = "Preset Voice"
|
| 946 |
|
|
|
|
| 966 |
target_lang = gr.Dropdown(choices=lang_choices, value="English", label="Target Language",
|
| 967 |
info="The right voice engine is selected automatically based on language.")
|
| 968 |
|
| 969 |
+
engine_label = gr.Markdown(value="Engine: Premium AI Voices")
|
| 970 |
|
| 971 |
# Qwen preset voice (visible for Qwen languages)
|
| 972 |
preset_voice = gr.Dropdown(choices=PRESET_VOICES, value="Jennifer -- Cinematic narrator",
|
|
|
|
| 974 |
|
| 975 |
# YourVoic controls (visible for YourVoic languages)
|
| 976 |
yv_voice = gr.Dropdown(choices=YOURVOIC_VOICES_DEFAULT, value="Peter -- Universal fallback",
|
| 977 |
+
label="Voice", visible=False, allow_custom_value=True,
|
| 978 |
+
info="Voices update automatically per language.")
|
| 979 |
+
yv_model = gr.Dropdown(choices=YOURVOIC_MODELS, value="balanced -- Balanced quality and speed (recommended)",
|
| 980 |
+
label="AI Model", visible=False)
|
| 981 |
yv_emotion = gr.Dropdown(choices=YOURVOIC_EMOTIONS, value="friendly",
|
| 982 |
label="Emotion Style", visible=False,
|
| 983 |
info="Add emotional expression to the narration")
|
| 984 |
|
| 985 |
# Voice cloning toggle (optional, works for Qwen languages only)
|
| 986 |
+
use_clone = gr.Checkbox(value=False, label="Use Voice Cloning (10 core languages only)",
|
| 987 |
info="Clone a voice from audio sample instead of using preset")
|
| 988 |
clone_audio = gr.Audio(label="Voice Sample (10s-3min)", type="filepath", visible=False)
|
| 989 |
clone_info = gr.Markdown(
|
|
|
|
| 1021 |
|
| 1022 |
gr.Markdown(
|
| 1023 |
"---\n"
|
| 1024 |
+
)
|
| 1025 |
|
| 1026 |
if __name__ == "__main__":
|
| 1027 |
demo.launch()
|