# AudioDubbingAI — app.py
# Uploaded by JyuViole (commit cf135b0, verified)
import gradio as gr
import spaces
import uuid
import os
import asyncio
import edge_tts
from deep_translator import GoogleTranslator
from patch_tts import tts # Import patched TTS
import logging
import torch
# Configure logging once at module import; all functions share this logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Maps the UI-facing language name to a pair:
#   (ISO-639-1 code used by GoogleTranslator/XTTS, edge-tts neural voice id
#    used only on the no-clone fallback path in text_to_speech).
# Dropdown choices in the UI are built from these keys, so adding an entry
# here is all that is needed to support a new language.
language_mapping = {
    "English": ("en", "en-US-ChristopherNeural"),
    "Spanish": ("es", "es-ES-AlvaroNeural"),
    "French": ("fr", "fr-FR-DeniseNeural"),
    "German": ("de", "de-DE-KatjaNeural"),
    "Italian": ("it", "it-IT-IsabellaNeural"),
    "Portuguese": ("pt", "pt-PT-DuarteNeural"),
    "Polish": ("pl", "pl-PL-AgnieszkaNeural"),
    "Turkish": ("tr", "tr-TR-AhmetNeural"),
    "Russian": ("ru", "ru-RU-DmitryNeural"),
    "Dutch": ("nl", "nl-NL-ColetteNeural"),
    "Czech": ("cs", "cs-CZ-VlastaNeural"),
    "Arabic": ("ar", "ar-SA-HamedNeural"),
    "Chinese": ("zh", "zh-CN-XiaoxiaoNeural"),
    "Japanese": ("ja", "ja-JP-NanamiNeural"),
    "Hungarian": ("hu", "hu-HU-TamasNeural"),
    "Korean": ("ko", "ko-KR-SunHiNeural")
}
def text_to_speech(text, voice, output_file, speaker_wav=None, language="en"):
    """Synthesize ``text`` into ``output_file`` as audio.

    When ``speaker_wav`` is given, the patched Coqui XTTS-v2 model clones
    that voice sample; otherwise edge-tts is used as a fallback with the
    supplied neural ``voice``.

    Args:
        text: Text to synthesize (already translated by the caller).
        voice: edge-tts voice id; used only on the fallback path.
        output_file: Path the generated audio file is written to.
        speaker_wav: Optional path to a short reference clip for cloning.
        language: Language code passed to XTTS (e.g. "en", "ru").

    Raises:
        RuntimeError: If Coqui TTS fails; the original exception is chained.
    """
    if speaker_wav:
        try:
            logger.info("Using patched Coqui TTS with XTTS-v2 model")
            # Prefer GPU when available; XTTS also runs (slowly) on CPU.
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info("Using device: %s", device)
            logger.info(
                "Generating speech with text: %s... and speaker_wav: %s",
                text[:50], speaker_wav,
            )
            tts.tts_to_file(
                text=text,
                speaker_wav=speaker_wav,
                language=language.lower(),
                file_path=output_file,
                speed=1.0,
            )
            logger.info("Generated audio saved to %s", output_file)
        except Exception as e:
            # Chain the cause so the original traceback survives the re-raise
            # (the old `raise Exception(str(e))` discarded it).
            logger.exception("Coqui TTS error: %s", e)
            raise RuntimeError(f"Coqui TTS error: {str(e)}") from e
    else:
        logger.info("Using edge-tts as fallback")
        communicate = edge_tts.Communicate(text, voice)
        # edge_tts is async-only; drive the coroutine to completion here.
        asyncio.run(communicate.save(output_file))
@spaces.GPU
def process_audio(input_text, target_language, speaker_wav=None):
    """Translate ``input_text`` into ``target_language`` and synthesize it
    in the cloned voice from ``speaker_wav``.

    This is the Gradio click handler, so it never raises: all failures are
    folded into the returned status message.

    Args:
        input_text: Source text in any language (auto-detected).
        target_language: Key into ``language_mapping`` (UI dropdown value).
        speaker_wav: Filepath of the uploaded reference voice sample.

    Returns:
        ``(output_path, "")`` on success, or ``(None, error_message)`` on
        failure — matching the ``[output_audio, error_message]`` outputs.
    """
    try:
        # Validate inputs up front so the user gets an actionable message.
        if target_language is None:
            raise ValueError("Please select a Target Language.")
        if not input_text:
            raise ValueError("Please provide text to synthesize.")
        if not speaker_wav:
            raise ValueError("Please upload a voice sample for cloning.")
        # Guard the lookup: a stale/unknown dropdown value would otherwise
        # surface as an opaque KeyError.
        if target_language not in language_mapping:
            raise ValueError(f"Error: Unsupported language {target_language}")
        # Short random prefix keeps concurrent runs from clobbering each other.
        run_uuid = uuid.uuid4().hex[:6]
        output_filename = f"{run_uuid}_output_synth.wav"
        target_language_code, voice = language_mapping[target_language]
        translator = GoogleTranslator(source='auto', target=target_language_code)
        translated_text = translator.translate(input_text)
        logger.info("Translated text: %s", translated_text)
        text_to_speech(
            translated_text, voice, output_filename,
            speaker_wav=speaker_wav, language=target_language_code,
        )
        if not os.path.exists(output_filename):
            raise FileNotFoundError(f"Error: {output_filename} was not generated.")
        return output_filename, ""
    except Exception as e:
        # Boundary handler: log the full traceback (logger.error lost it),
        # then surface only the message to the UI.
        logger.exception("Error in process_audio: %s", e)
        return None, f"Error: {str(e)}"
# Build the Gradio UI. NOTE: component creation order inside each context
# manager defines the on-screen layout, so statement order matters here.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Audio Dubbing AI")
    gr.Markdown("Upload a voice sample (2-3 seconds), provide text to synthesize, and select a target language.")
    with gr.Row():
        # Left column: all user inputs.
        with gr.Column(scale=2):
            input_text = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text you want to synthesize")
            # type="filepath" makes Gradio hand process_audio a path on disk,
            # which is what the XTTS speaker_wav argument expects.
            speaker_wav = gr.Audio(label="Upload Voice Sample (2-3 seconds)", type="filepath")
            # Choices are driven by language_mapping so UI and backend agree.
            target_language = gr.Dropdown(
                choices=list(language_mapping.keys()),
                label="Target Language",
                value="Russian"
            )
            submit_button = gr.Button("Generate Audio", variant="primary")
        # Right column: results and status.
        with gr.Column(scale=3):
            output_audio = gr.Audio(label="Synthesized Audio")
            error_message = gr.Textbox(label="Status / Error Message", interactive=False)
    # process_audio returns (audio_path_or_None, status_string), matching
    # the two outputs below in order.
    submit_button.click(
        process_audio,
        inputs=[input_text, target_language, speaker_wav],
        outputs=[output_audio, error_message]
    )
# Launch only when run as a script (HF Spaces also executes this path).
if __name__ == "__main__":
    demo.launch()