JyuViole committed on
Commit
cf135b0
·
verified ·
1 Parent(s): ae88530

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +42 -0
  2. app.py +111 -0
  3. patch_tts.py +28 -0
  4. requirements.txt +9 -0
Dockerfile ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the XTTS-v2 voice-cloning Gradio app.
FROM python:3.10

# Install system dependencies
RUN apt-get update && apt-get install -y \
    ffmpeg \
    libsndfile1 \
    espeak-ng \
    libportaudio2 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Accept the Coqui model license non-interactively.
# COQUI_TOS_AGREED=1 is the switch checked by the Coqui model manager;
# COQUI_TTS_ACCEPT_LICENSE is kept because the rest of the project sets it.
ENV COQUI_TTS_ACCEPT_LICENSE=y \
    COQUI_TOS_AGREED=1

# Debug: Print environment variable
RUN echo "COQUI_TTS_ACCEPT_LICENSE=$COQUI_TTS_ACCEPT_LICENSE" >> /tmp/env.log

# Create user (each instruction must be on its own line)
RUN useradd -m -u 1000 user
USER user
# pip falls back to a per-user install for a non-root user; put its
# script directory on PATH so installed entry points are found.
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR /home/user/app

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir torch==2.4.0 torchaudio==2.4.0 \
    && pip install --no-cache-dir -r requirements.txt \
    && pip cache purge

# Clear Coqui TTS model cache and pre-download model
# (the piped "y" answers the license prompt if it is still shown)
RUN rm -rf ~/.local/share/tts && \
    echo "y" | python -c "import os; os.environ['COQUI_TTS_ACCEPT_LICENSE']='y'; from TTS.api import TTS; tts = TTS(model_name='tts_models/multilingual/multi-dataset/xtts_v2', progress_bar=False)"

# Copy application code
COPY --chown=user:user . .

# Gradio listens on 127.0.0.1 unless told otherwise, which is not reachable
# from outside the container; bind to all interfaces on the default port.
ENV GRADIO_SERVER_NAME=0.0.0.0
EXPOSE 7860

# Run the application
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spaces
3
+ import uuid
4
+ import os
5
+ import asyncio
6
+ import edge_tts
7
+ from deep_translator import GoogleTranslator
8
+ from patch_tts import tts # Import patched TTS
9
+ import logging
10
+ import torch
11
+
12
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# UI language name -> (language code, edge-tts voice name).
# The language code is passed to GoogleTranslator as the translation target
# and, lower-cased, to the TTS model as its `language` argument. Chinese
# therefore uses "zh-CN": GoogleTranslator rejects a bare "zh", and the
# lower-cased form "zh-cn" is the code the XTTS-v2 model expects.
language_mapping = {
    "English": ("en", "en-US-ChristopherNeural"),
    "Spanish": ("es", "es-ES-AlvaroNeural"),
    "French": ("fr", "fr-FR-DeniseNeural"),
    "German": ("de", "de-DE-KatjaNeural"),
    "Italian": ("it", "it-IT-IsabellaNeural"),
    "Portuguese": ("pt", "pt-PT-DuarteNeural"),
    "Polish": ("pl", "pl-PL-AgnieszkaNeural"),
    "Turkish": ("tr", "tr-TR-AhmetNeural"),
    "Russian": ("ru", "ru-RU-DmitryNeural"),
    "Dutch": ("nl", "nl-NL-ColetteNeural"),
    "Czech": ("cs", "cs-CZ-VlastaNeural"),
    "Arabic": ("ar", "ar-SA-HamedNeural"),
    "Chinese": ("zh-CN", "zh-CN-XiaoxiaoNeural"),
    "Japanese": ("ja", "ja-JP-NanamiNeural"),
    "Hungarian": ("hu", "hu-HU-TamasNeural"),
    "Korean": ("ko", "ko-KR-SunHiNeural"),
}
34
+
35
def text_to_speech(text, voice, output_file, speaker_wav=None, language="en"):
    """Synthesize ``text`` into the audio file ``output_file``.

    When ``speaker_wav`` is provided, the module-level ``tts`` object is used
    with ``speaker_wav`` as the reference voice. Otherwise edge-tts is used
    with the named ``voice``.

    Args:
        text: Text to speak.
        voice: edge-tts voice name; only used when ``speaker_wav`` is falsy.
        output_file: Path the audio is written to.
        speaker_wav: Path to a reference voice sample, or ``None``.
        language: Language code; lower-cased before being passed to ``tts``.

    Raises:
        RuntimeError: If the Coqui TTS call fails. The original exception is
            chained as ``__cause__`` so its traceback is preserved.
    """
    if not speaker_wav:
        logger.info("Using edge-tts as fallback")
        communicate = edge_tts.Communicate(text, voice)
        asyncio.run(communicate.save(output_file))
        return

    try:
        logger.info("Using patched Coqui TTS with XTTS-v2 model")
        # The device is only reported here; this function does not move the
        # model between devices.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {device}")
        logger.info(f"Generating speech with text: {text[:50]}... and speaker_wav: {speaker_wav}")
        tts.tts_to_file(
            text=text,
            speaker_wav=speaker_wav,
            language=language.lower(),
            file_path=output_file,
            speed=1.0,
        )
        logger.info(f"Generated audio saved to {output_file}")
    except Exception as e:
        # logger.exception records the traceback; `from e` keeps the cause.
        logger.exception("Coqui TTS error: %s", e)
        raise RuntimeError(f"Coqui TTS error: {e}") from e
58
+
59
@spaces.GPU
def process_audio(input_text, target_language, speaker_wav=None):
    """Translate ``input_text`` and synthesize it in the uploaded voice.

    Args:
        input_text: Text entered by the user.
        target_language: A key of ``language_mapping`` (the dropdown value).
        speaker_wav: Path to the uploaded reference voice sample.

    Returns:
        A ``(audio_path, message)`` tuple: ``(path, "")`` on success, or
        ``(None, "Error: ...")`` when validation or any processing step
        fails. Exceptions are reported through the message, not raised.
    """
    # Local import so this block does not depend on the file's import header.
    import tempfile

    try:
        if target_language is None:
            raise ValueError("Please select a Target Language.")
        if target_language not in language_mapping:
            # Avoid surfacing a bare KeyError as the user-facing message.
            raise ValueError(f"Unsupported Target Language: {target_language}")
        if not input_text or not input_text.strip():
            raise ValueError("Please provide text to synthesize.")
        if not speaker_wav:
            raise ValueError("Please upload a voice sample for cloning.")

        target_language_code, voice = language_mapping[target_language]
        translator = GoogleTranslator(source='auto', target=target_language_code)
        translated_text = translator.translate(input_text)
        if not translated_text:
            raise ValueError("Translation returned no text.")
        logger.info(f"Translated text: {translated_text}")

        # Write to the system temp directory under a full-length random name:
        # keeps generated audio out of the app directory and avoids the
        # collisions possible with a 6-character id.
        output_filename = os.path.join(
            tempfile.gettempdir(),
            f"{uuid.uuid4().hex}_output_synth.wav",
        )

        text_to_speech(
            translated_text,
            voice,
            output_filename,
            speaker_wav=speaker_wav,
            language=target_language_code,
        )

        if not os.path.exists(output_filename):
            raise FileNotFoundError(f"Error: {output_filename} was not generated.")

        return output_filename, ""
    except Exception as e:
        # UI boundary: log with traceback and report the failure as text.
        logger.exception("Error in process_audio: %s", e)
        return None, f"Error: {str(e)}"
86
+
87
# Build the UI: inputs on the left, result and status on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Audio Dubbing AI")
    gr.Markdown("Upload a voice sample (2-3 seconds), provide text to synthesize, and select a target language.")
    with gr.Row():
        with gr.Column(scale=2):
            input_text = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text you want to synthesize")
            speaker_wav = gr.Audio(label="Upload Voice Sample (2-3 seconds)", type="filepath")
            target_language = gr.Dropdown(
                choices=list(language_mapping.keys()),
                label="Target Language",
                value="Russian",
            )
            submit_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column(scale=3):
            output_audio = gr.Audio(label="Synthesized Audio")
            error_message = gr.Textbox(label="Status / Error Message", interactive=False)

    # process_audio returns (audio_path, message), matching the two outputs.
    submit_button.click(
        process_audio,
        inputs=[input_text, target_language, speaker_wav],
        outputs=[output_audio, error_message],
    )

if __name__ == "__main__":
    # Gradio binds to 127.0.0.1 by default, which is unreachable from outside
    # a container. Bind to all interfaces unless GRADIO_SERVER_NAME overrides.
    demo.launch(server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"))
patch_tts.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from unittest.mock import patch
4
+ from io import StringIO
5
+ from TTS.api import TTS
6
+ import logging
7
+
8
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Force license acceptance.
# COQUI_TOS_AGREED=1 is the switch the Coqui model manager checks before
# prompting; COQUI_TTS_ACCEPT_LICENSE is kept for compatibility with the
# rest of the project, which also sets it.
os.environ["COQUI_TTS_ACCEPT_LICENSE"] = "y"
os.environ["COQUI_TOS_AGREED"] = "1"


def mock_input(prompt):
    """Stand in for ``input`` and answer "y" to the license prompt.

    Args:
        prompt: The prompt text passed by the caller; ignored.

    Returns:
        The string ``"y"``.
    """
    logger.info("Mocking input for license prompt")
    return "y"


# Fallback: if the library still asks interactively, answer the prompt
# automatically so a non-interactive start-up cannot block on stdin.
with patch('builtins.input', mock_input):
    try:
        logger.info("Initializing TTS with XTTS-v2 model")
        tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False)
        logger.info("TTS initialized successfully")
    except Exception as e:
        # Log with traceback, then re-raise so the import fails loudly.
        logger.exception("Failed to initialize TTS: %s", e)
        raise
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
deep-translator==1.11.4
edge-tts==6.1.10
huggingface-hub==0.27.1
gradio==4.44.0
coqui-tts==0.24.2
torch==2.4.0
torchaudio==2.4.0
cached-path==1.7.2
# Pinned like every other dependency so image builds are reproducible.
pydub==0.25.1