import gradio as gr
import spaces
import uuid
import os
import asyncio
import edge_tts
from deep_translator import GoogleTranslator
from patch_tts import tts  # Import patched TTS
import logging
import torch

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Target-language name -> (translator language code, edge-tts fallback voice).
# The code is passed to deep_translator's GoogleTranslator as the target and,
# lowercased, to XTTS-v2 as its `language` argument.
# NOTE: Chinese must be "zh-CN" — GoogleTranslator rejects bare "zh", and
# XTTS-v2 expects "zh-cn" (produced here by language.lower()).
language_mapping = {
    "English": ("en", "en-US-ChristopherNeural"),
    "Spanish": ("es", "es-ES-AlvaroNeural"),
    "French": ("fr", "fr-FR-DeniseNeural"),
    "German": ("de", "de-DE-KatjaNeural"),
    "Italian": ("it", "it-IT-IsabellaNeural"),
    "Portuguese": ("pt", "pt-PT-DuarteNeural"),
    "Polish": ("pl", "pl-PL-AgnieszkaNeural"),
    "Turkish": ("tr", "tr-TR-AhmetNeural"),
    "Russian": ("ru", "ru-RU-DmitryNeural"),
    "Dutch": ("nl", "nl-NL-ColetteNeural"),
    "Czech": ("cs", "cs-CZ-VlastaNeural"),
    "Arabic": ("ar", "ar-SA-HamedNeural"),
    "Chinese": ("zh-CN", "zh-CN-XiaoxiaoNeural"),
    "Japanese": ("ja", "ja-JP-NanamiNeural"),
    "Hungarian": ("hu", "hu-HU-TamasNeural"),
    "Korean": ("ko", "ko-KR-SunHiNeural"),
}


def text_to_speech(text, voice, output_file, speaker_wav=None, language="en"):
    """Synthesize *text* to *output_file*.

    Uses the patched Coqui XTTS-v2 model for voice cloning when a
    *speaker_wav* reference sample is given; otherwise falls back to the
    edge-tts neural *voice*.

    Args:
        text: Text to synthesize.
        voice: edge-tts voice name used only on the fallback path.
        output_file: Path of the WAV file to write.
        speaker_wav: Optional path to a reference voice sample for cloning.
        language: Language code for XTTS-v2 (lowercased before use).

    Raises:
        RuntimeError: If Coqui TTS synthesis fails.
    """
    if speaker_wav:
        try:
            logger.info("Using patched Coqui TTS with XTTS-v2 model")
            # Get device safely
            device = "cpu" if not torch.cuda.is_available() else "cuda"
            logger.info(f"Using device: {device}")
            logger.info(
                f"Generating speech with text: {text[:50]}... and speaker_wav: {speaker_wav}"
            )
            tts.tts_to_file(
                text=text,
                speaker_wav=speaker_wav,
                language=language.lower(),
                file_path=output_file,
                speed=1.0,
            )
            logger.info(f"Generated audio saved to {output_file}")
        except Exception as e:
            logger.error(f"Coqui TTS error: {str(e)}")
            # Chain the original exception so the root cause stays visible.
            raise RuntimeError(f"Coqui TTS error: {str(e)}") from e
    else:
        logger.info("Using edge-tts as fallback")
        communicate = edge_tts.Communicate(text, voice)
        asyncio.run(communicate.save(output_file))


@spaces.GPU
def process_audio(input_text, target_language, speaker_wav=None):
    """Translate *input_text* into *target_language* and voice-clone it.

    Args:
        input_text: Source text to translate and synthesize.
        target_language: Human-readable key into ``language_mapping``.
        speaker_wav: Path to the uploaded reference voice sample (required).

    Returns:
        Tuple ``(output_path, "")`` on success, or ``(None, error_message)``
        on failure — matching the two Gradio outputs (audio, status box).
    """
    try:
        if target_language is None:
            raise ValueError("Please select a Target Language.")
        if not input_text:
            raise ValueError("Please provide text to synthesize.")
        if not speaker_wav:
            raise ValueError("Please upload a voice sample for cloning.")

        # Short random prefix keeps concurrent runs from clobbering each other.
        run_uuid = uuid.uuid4().hex[:6]
        output_filename = f"{run_uuid}_output_synth.wav"

        target_language_code, voice = language_mapping[target_language]
        translator = GoogleTranslator(source='auto', target=target_language_code)
        translated_text = translator.translate(input_text)
        logger.info(f"Translated text: {translated_text}")

        text_to_speech(
            translated_text,
            voice,
            output_filename,
            speaker_wav=speaker_wav,
            language=target_language_code,
        )

        if not os.path.exists(output_filename):
            raise FileNotFoundError(f"Error: {output_filename} was not generated.")
        return output_filename, ""
    except Exception as e:
        # Boundary handler: surface the message in the UI instead of crashing.
        logger.error(f"Error in process_audio: {str(e)}")
        return None, f"Error: {str(e)}"


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Audio Dubbing AI")
    gr.Markdown("Upload a voice sample (2-3 seconds), provide text to synthesize, and select a target language.")
    with gr.Row():
        with gr.Column(scale=2):
            input_text = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text you want to synthesize")
            speaker_wav = gr.Audio(label="Upload Voice Sample (2-3 seconds)", type="filepath")
            target_language = gr.Dropdown(
                choices=list(language_mapping.keys()),
                label="Target Language",
                value="Russian",
            )
            submit_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column(scale=3):
            output_audio = gr.Audio(label="Synthesized Audio")
            error_message = gr.Textbox(label="Status / Error Message", interactive=False)
    submit_button.click(
        process_audio,
        inputs=[input_text, target_language, speaker_wav],
        outputs=[output_audio, error_message],
    )

if __name__ == "__main__":
    demo.launch()