Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import spaces | |
| import uuid | |
| import os | |
| import asyncio | |
| import edge_tts | |
| from deep_translator import GoogleTranslator | |
| from patch_tts import tts # Import patched TTS | |
| import logging | |
| import torch | |
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Maps the UI language label to a pair of
# (Google-Translate target code, edge-tts fallback voice name).
# The translate code is also forwarded to the XTTS cloning model.
language_mapping = {
    "English": ("en", "en-US-ChristopherNeural"),
    "Spanish": ("es", "es-ES-AlvaroNeural"),
    "French": ("fr", "fr-FR-DeniseNeural"),
    "German": ("de", "de-DE-KatjaNeural"),
    "Italian": ("it", "it-IT-IsabellaNeural"),
    "Portuguese": ("pt", "pt-PT-DuarteNeural"),
    "Polish": ("pl", "pl-PL-AgnieszkaNeural"),
    "Turkish": ("tr", "tr-TR-AhmetNeural"),
    "Russian": ("ru", "ru-RU-DmitryNeural"),
    "Dutch": ("nl", "nl-NL-ColetteNeural"),
    "Czech": ("cs", "cs-CZ-VlastaNeural"),
    "Arabic": ("ar", "ar-SA-HamedNeural"),
    "Chinese": ("zh", "zh-CN-XiaoxiaoNeural"),
    "Japanese": ("ja", "ja-JP-NanamiNeural"),
    "Hungarian": ("hu", "hu-HU-TamasNeural"),
    "Korean": ("ko", "ko-KR-SunHiNeural")
}
def text_to_speech(text, voice, output_file, speaker_wav=None, language="en"):
    """Synthesize *text* into *output_file*.

    When *speaker_wav* is provided, uses the patched Coqui XTTS-v2 model
    for voice cloning; otherwise falls back to edge-tts with *voice*.

    Args:
        text: Text to synthesize (already translated by the caller).
        voice: edge-tts voice name, used only on the fallback path.
        output_file: Path the generated audio file is written to.
        speaker_wav: Optional path to a reference voice sample for cloning.
        language: ISO language code (e.g. "en"); normalized to the code
            XTTS-v2 expects where they differ ("zh" -> "zh-cn").

    Raises:
        RuntimeError: If Coqui TTS fails to generate audio.
    """
    if speaker_wav:
        try:
            logger.info("Using patched Coqui TTS with XTTS-v2 model")
            # Logged for diagnostics only; the patched model manages its own device.
            device = "cpu" if not torch.cuda.is_available() else "cuda"
            logger.info(f"Using device: {device}")
            # XTTS-v2 expects "zh-cn", not the bare "zh" from language_mapping;
            # without this remap Chinese synthesis fails inside the model.
            xtts_language = language.lower()
            if xtts_language == "zh":
                xtts_language = "zh-cn"
            logger.info(f"Generating speech with text: {text[:50]}... and speaker_wav: {speaker_wav}")
            tts.tts_to_file(
                text=text,
                speaker_wav=speaker_wav,
                language=xtts_language,
                file_path=output_file,
                speed=1.0
            )
            logger.info(f"Generated audio saved to {output_file}")
        except Exception as e:
            logger.error(f"Coqui TTS error: {str(e)}")
            # Chain the original exception so the traceback is preserved
            # (the bare `raise Exception(...)` discarded it).
            raise RuntimeError(f"Coqui TTS error: {str(e)}") from e
    else:
        logger.info("Using edge-tts as fallback")
        communicate = edge_tts.Communicate(text, voice)
        # edge_tts.Communicate.save is a coroutine; run it to completion here.
        asyncio.run(communicate.save(output_file))
def process_audio(input_text, target_language, speaker_wav=None):
    """Translate *input_text* into *target_language* and synthesize it with the cloned voice.

    Args:
        input_text: Source text to translate and speak.
        target_language: Key into ``language_mapping`` (UI dropdown value).
        speaker_wav: Path to the uploaded reference voice sample.

    Returns:
        ``(output_filename, "")`` on success, or ``(None, error_message)``
        on failure so the Gradio UI can display the error without raising.
    """
    try:
        if target_language is None:
            raise ValueError("Please select a Target Language.")
        # Reject whitespace-only text too, not just the empty string.
        if not input_text or not input_text.strip():
            raise ValueError("Please provide text to synthesize.")
        if not speaker_wav:
            raise ValueError("Please upload a voice sample for cloning.")
        # Short random prefix keeps concurrent runs from clobbering each other's output.
        run_uuid = uuid.uuid4().hex[:6]
        output_filename = f"{run_uuid}_output_synth.wav"
        target_language_code, voice = language_mapping[target_language]
        translator = GoogleTranslator(source='auto', target=target_language_code)
        translated_text = translator.translate(input_text)
        if not translated_text:
            # GoogleTranslator can return None/"" on failure; don't feed that to TTS.
            raise ValueError("Translation failed; please try again.")
        logger.info(f"Translated text: {translated_text}")
        text_to_speech(translated_text, voice, output_filename, speaker_wav=speaker_wav, language=target_language_code)
        if not os.path.exists(output_filename):
            raise FileNotFoundError(f"Error: {output_filename} was not generated.")
        return output_filename, ""
    except Exception as e:
        # Top-level boundary for the UI: log and report instead of crashing the app.
        logger.error(f"Error in process_audio: {str(e)}")
        return None, f"Error: {str(e)}"
# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Audio Dubbing AI")
    gr.Markdown("Upload a voice sample (2-3 seconds), provide text to synthesize, and select a target language.")

    with gr.Row():
        # Left column: user inputs.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to synthesize",
            )
            voice_sample = gr.Audio(label="Upload Voice Sample (2-3 seconds)", type="filepath")
            language_choice = gr.Dropdown(
                choices=list(language_mapping.keys()),
                label="Target Language",
                value="Russian",
            )
            generate_btn = gr.Button("Generate Audio", variant="primary")
        # Right column: synthesized result and status/error feedback.
        with gr.Column(scale=3):
            result_audio = gr.Audio(label="Synthesized Audio")
            status_box = gr.Textbox(label="Status / Error Message", interactive=False)

    generate_btn.click(
        process_audio,
        inputs=[text_input, language_choice, voice_sample],
        outputs=[result_audio, status_box],
    )

if __name__ == "__main__":
    demo.launch()