import gradio as gr
import spaces
import uuid
import os
import asyncio
import edge_tts
from deep_translator import GoogleTranslator
from patch_tts import tts  # Import patched TTS
import logging
import torch

# Configure root logging once at import time; module-level logger per convention.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Maps the UI display name -> (ISO 639-1 code for translation,
# edge-tts neural voice name used on the non-cloning fallback path).
language_mapping = {
    "English": ("en", "en-US-ChristopherNeural"),
    "Spanish": ("es", "es-ES-AlvaroNeural"),
    "French": ("fr", "fr-FR-DeniseNeural"),
    "German": ("de", "de-DE-KatjaNeural"),
    "Italian": ("it", "it-IT-IsabellaNeural"),
    "Portuguese": ("pt", "pt-PT-DuarteNeural"),
    "Polish": ("pl", "pl-PL-AgnieszkaNeural"),
    "Turkish": ("tr", "tr-TR-AhmetNeural"),
    "Russian": ("ru", "ru-RU-DmitryNeural"),
    "Dutch": ("nl", "nl-NL-ColetteNeural"),
    "Czech": ("cs", "cs-CZ-VlastaNeural"),
    "Arabic": ("ar", "ar-SA-HamedNeural"),
    "Chinese": ("zh", "zh-CN-XiaoxiaoNeural"),
    "Japanese": ("ja", "ja-JP-NanamiNeural"),
    "Hungarian": ("hu", "hu-HU-TamasNeural"),
    "Korean": ("ko", "ko-KR-SunHiNeural")
}

def text_to_speech(text, voice, output_file, speaker_wav=None, language="en"):
    """Synthesize *text* into the WAV file *output_file*.

    When *speaker_wav* is given, uses the patched Coqui XTTS-v2 model for
    voice cloning; otherwise falls back to edge-tts with *voice*.

    Args:
        text: Text to synthesize (already translated by the caller).
        voice: edge-tts voice name, used only on the fallback path.
        output_file: Path of the WAV file to write.
        speaker_wav: Optional path to a short reference recording for cloning.
        language: Language code passed to the XTTS model (lower-cased here).

    Raises:
        RuntimeError: If Coqui TTS fails; the original error is chained.
    """
    if speaker_wav:
        try:
            logger.info("Using patched Coqui TTS with XTTS-v2 model")
            # Fall back to CPU when no GPU is visible to this process.
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info("Using device: %s", device)
            logger.info("Generating speech with text: %s... and speaker_wav: %s",
                        text[:50], speaker_wav)
            tts.tts_to_file(
                text=text,
                speaker_wav=speaker_wav,
                language=language.lower(),
                file_path=output_file,
                speed=1.0,
            )
            logger.info("Generated audio saved to %s", output_file)
        except Exception as e:
            # Keep the traceback (logger.exception) and chain the cause so
            # callers can still see the underlying Coqui error.
            logger.exception("Coqui TTS error")
            raise RuntimeError(f"Coqui TTS error: {str(e)}") from e
    else:
        logger.info("Using edge-tts as fallback")
        communicate = edge_tts.Communicate(text, voice)
        # edge-tts is async-only; run its save coroutine to completion here.
        asyncio.run(communicate.save(output_file))

@spaces.GPU
def process_audio(input_text, target_language, speaker_wav=None):
    """Translate *input_text* into *target_language* and synthesize it in the
    cloned voice taken from *speaker_wav*.

    Returns:
        A ``(output_filename, "")`` pair on success, or ``(None, error_message)``
        on failure so the Gradio UI can display the error without crashing.
    """
    try:
        if target_language is None:
            raise ValueError("Please select a Target Language.")
        # Reject empty AND whitespace-only text, not just the empty string.
        if not input_text or not input_text.strip():
            raise ValueError("Please provide text to synthesize.")
        if not speaker_wav:
            raise ValueError("Please upload a voice sample for cloning.")
        # Guard the mapping lookup so an unknown choice yields a friendly
        # message instead of a bare KeyError.
        if target_language not in language_mapping:
            raise ValueError(f"Unsupported Target Language: {target_language}")

        # Short per-run id so concurrent requests don't clobber each other's file.
        run_uuid = uuid.uuid4().hex[:6]
        output_filename = f"{run_uuid}_output_synth.wav"

        target_language_code, voice = language_mapping[target_language]
        translator = GoogleTranslator(source='auto', target=target_language_code)
        translated_text = translator.translate(input_text)
        logger.info("Translated text: %s", translated_text)

        text_to_speech(translated_text, voice, output_filename, speaker_wav=speaker_wav, language=target_language_code)

        if not os.path.exists(output_filename):
            raise FileNotFoundError(f"Error: {output_filename} was not generated.")

        return output_filename, ""
    except Exception as e:
        # logger.exception records the full traceback; the UI gets a short message.
        logger.exception("Error in process_audio")
        return None, f"Error: {str(e)}"

# Gradio UI: two-column layout — inputs on the left, results on the right.
# Component declaration order inside the context managers determines layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Audio Dubbing AI")
    gr.Markdown("Upload a voice sample (2-3 seconds), provide text to synthesize, and select a target language.")
    with gr.Row():
        with gr.Column(scale=2):
            # Input column: text to speak, reference voice sample, language, submit.
            input_text = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text you want to synthesize")
            speaker_wav = gr.Audio(label="Upload Voice Sample (2-3 seconds)", type="filepath")
            target_language = gr.Dropdown(
                choices=list(language_mapping.keys()), 
                label="Target Language", 
                value="Russian"
            )
            submit_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column(scale=3):
            # Output column: synthesized audio plus a status/error textbox.
            output_audio = gr.Audio(label="Synthesized Audio")
            error_message = gr.Textbox(label="Status / Error Message", interactive=False)
    
    # Wire the button to the pipeline; process_audio returns (audio, message)
    # so errors are surfaced in the textbox rather than raised to the UI.
    submit_button.click(
        process_audio, 
        inputs=[input_text, target_language, speaker_wav], 
        outputs=[output_audio, error_message]
    )

if __name__ == "__main__":
    demo.launch()