JyuViole committed on
Commit
1a99414
·
verified ·
1 Parent(s): 2760843

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +42 -0
  2. app.py +218 -0
  3. patch_tts.py +28 -0
  4. requirements.txt +9 -0
Dockerfile ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10

# Install system dependencies (audio codecs, espeak for TTS phonemes, portaudio)
RUN apt-get update && apt-get install -y \
    ffmpeg \
    libsndfile1 \
    espeak-ng \
    libportaudio2 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Set environment variable to accept Coqui TTS license
ENV COQUI_TTS_ACCEPT_LICENSE=y

# Debug: Print environment variable
RUN echo "COQUI_TTS_ACCEPT_LICENSE=$COQUI_TTS_ACCEPT_LICENSE" >> /tmp/env.log

# Create an unprivileged user and switch to it.
# NOTE: the original fused RUN/USER/WORKDIR onto one RUN line, so the
# container kept running as root and the workdir was never set.
RUN useradd -m -u 1000 user
USER user
WORKDIR /home/user/app

# Install Python dependencies (torch pinned first so the requirements
# resolver does not pull a different build).
# NOTE: the original fused COPY and RUN onto one line, which is invalid.
COPY requirements.txt .
RUN pip install --no-cache-dir torch==2.4.0 torchaudio==2.4.0 \
    && pip install --no-cache-dir -r requirements.txt \
    && pip cache purge

# Clear Coqui TTS model cache and pre-download the XTTS-v2 model at build
# time so the first request does not block on a multi-GB download.
RUN rm -rf ~/.local/share/tts && \
    echo "y" | python -c "import os; os.environ['COQUI_TTS_ACCEPT_LICENSE']='y'; from TTS.api import TTS; tts = TTS(model_name='tts_models/multilingual/multi-dataset/xtts_v2', progress_bar=False)"

# Copy application code
COPY --chown=user:user . .

# Run the application
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import spaces
import uuid
import os
import asyncio
import edge_tts
from deep_translator import GoogleTranslator
from patch_tts import tts  # module-level XTTS-v2 instance (model is loaded on import)
import logging
import torch
import zipfile
from pathlib import Path
import tempfile
import shutil

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Maps the UI language label to a pair:
#   (ISO language code used for translation and XTTS synthesis,
#    edge-tts neural voice id used for the no-clone fallback path)
language_mapping = {
    "English": ("en", "en-US-ChristopherNeural"),
    "Spanish": ("es", "es-ES-AlvaroNeural"),
    "French": ("fr", "fr-FR-DeniseNeural"),
    "German": ("de", "de-DE-KatjaNeural"),
    "Italian": ("it", "it-IT-IsabellaNeural"),
    "Portuguese": ("pt", "pt-PT-DuarteNeural"),
    "Polish": ("pl", "pl-PL-AgnieszkaNeural"),
    "Turkish": ("tr", "tr-TR-AhmetNeural"),
    "Russian": ("ru", "ru-RU-DmitryNeural"),
    "Dutch": ("nl", "nl-NL-ColetteNeural"),
    "Czech": ("cs", "cs-CZ-VlastaNeural"),
    "Arabic": ("ar", "ar-SA-HamedNeural"),
    "Chinese": ("zh", "zh-CN-XiaoxiaoNeural"),
    "Japanese": ("ja", "ja-JP-NanamiNeural"),
    "Hungarian": ("hu", "hu-HU-TamasNeural"),
    "Korean": ("ko", "ko-KR-SunHiNeural")
}
38
+
39
def text_to_speech(text, voice, output_file, speaker_wav=None, language="en"):
    """Synthesize `text` into `output_file` (a WAV path).

    When `speaker_wav` is given, clones that voice with the Coqui XTTS-v2
    model; otherwise falls back to edge-tts using the given `voice` id.

    Args:
        text: Text to synthesize.
        voice: edge-tts voice id; only used on the fallback path.
        output_file: Destination audio file path.
        speaker_wav: Optional path to a reference voice sample for cloning.
        language: ISO language code passed to XTTS (lower-cased here).

    Raises:
        Exception: wraps any Coqui TTS failure, chained to the original cause.
    """
    if speaker_wav:
        try:
            logger.info("Using patched Coqui TTS with XTTS-v2 model")
            # Prefer CUDA when present; XTTS also runs (slowly) on CPU.
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"Using device: {device}")
            logger.info(f"Generating speech with text: {text[:50]}... and speaker_wav: {speaker_wav}")
            tts.tts_to_file(
                text=text,
                speaker_wav=speaker_wav,
                language=language.lower(),
                file_path=output_file,
                speed=1.0
            )
            logger.info(f"Generated audio saved to {output_file}")
        except Exception as e:
            logger.error(f"Coqui TTS error: {str(e)}")
            # Chain the original exception (`from e`) so the real traceback
            # is preserved instead of being swallowed by the re-raise.
            raise Exception(f"Coqui TTS error: {str(e)}") from e
    else:
        logger.info("Using edge-tts as fallback")
        communicate = edge_tts.Communicate(text, voice)
        asyncio.run(communicate.save(output_file))
61
+
62
@spaces.GPU
def process_audio(input_text, target_language, speaker_wav=None):
    """Translate `input_text` to `target_language` and speak it in the
    cloned voice from `speaker_wav`.

    Returns a `(wav_path, status)` pair; on failure `wav_path` is None and
    `status` carries the error text for the UI.
    """
    try:
        # Validate inputs up front with user-facing messages.
        if target_language is None:
            raise ValueError("Please select a Target Language.")
        if not input_text:
            raise ValueError("Please provide text to synthesize.")
        if not speaker_wav:
            raise ValueError("Please upload a voice sample for cloning.")

        # Short random prefix keeps concurrent runs from clobbering output.
        wav_path = f"{uuid.uuid4().hex[:6]}_output_synth.wav"

        lang_code, edge_voice = language_mapping[target_language]

        # Translate first, then synthesize the translated text.
        translated = GoogleTranslator(source='auto', target=lang_code).translate(input_text)
        logger.info(f"Translated text: {translated}")

        text_to_speech(translated, edge_voice, wav_path, speaker_wav=speaker_wav, language=lang_code)

        if not os.path.exists(wav_path):
            raise FileNotFoundError(f"Error: {wav_path} was not generated.")

        return wav_path, ""
    except Exception as e:
        logger.error(f"Error in process_audio: {str(e)}")
        return None, f"Error: {str(e)}"
89
+
90
@spaces.GPU
def process_batch_audio(audio_files, text_input, target_language, progress=gr.Progress()):
    """Batch voice dubbing: one uploaded WAV per line of `text_input`.

    Each text line is translated to `target_language` and synthesized in the
    voice of the corresponding uploaded file; results are packed into a ZIP.

    Returns a `(zip_path, status_message)` pair; on failure `zip_path` is
    None and the message carries the error text.
    """
    try:
        if not audio_files:
            return None, "Error: No audio files uploaded."
        if not text_input:
            return None, "Error: No text provided."
        if target_language is None:
            return None, "Error: Please select a Target Language."

        # Parse text input (expecting one text per line)
        texts = text_input.strip().split("\n")
        texts = [t.strip() for t in texts if t.strip()]  # Remove empty lines

        if len(audio_files) != len(texts):
            return None, f"Error: Number of audio files ({len(audio_files)}) does not match number of text lines ({len(texts)})."

        if len(audio_files) > 100:
            return None, "Error: Maximum 100 audio files allowed."

        target_language_code, _ = language_mapping[target_language]
        translator = GoogleTranslator(source='auto', target=target_language_code)

        # Create temporary directory for output files
        with tempfile.TemporaryDirectory() as temp_dir:
            output_files = []
            seen_filenames = set()  # Track filenames to handle duplicates
            for idx, (audio_file, text) in enumerate(zip(audio_files, texts), 1):
                # Resolve the source filename BEFORE entering the try block:
                # the except clause below logs it, and the original code
                # raised NameError when translation failed before this
                # variable was assigned.
                original_filename = Path(audio_file).stem
                progress(idx / len(audio_files), desc=f"Processing file {idx}/{len(audio_files)}")
                try:
                    translated_text = translator.translate(text)
                    if not translated_text:
                        raise ValueError(f"Translation failed for text: {text[:50]}...")

                    # Handle duplicate filenames by appending index if needed
                    base_filename = original_filename
                    suffix = 0
                    while base_filename in seen_filenames:
                        suffix += 1
                        base_filename = f"{original_filename}_{suffix}"
                    seen_filenames.add(base_filename)

                    output_filename = os.path.join(temp_dir, f"{base_filename}.wav")

                    text_to_speech(
                        text=translated_text,
                        voice=None,  # Not used with speaker_wav
                        output_file=output_filename,
                        speaker_wav=audio_file,
                        language=target_language_code
                    )

                    if not os.path.exists(output_filename):
                        raise FileNotFoundError(f"Output file {output_filename} was not generated.")

                    output_files.append((output_filename, f"{base_filename}.wav"))
                except Exception as e:
                    logger.error(f"Error processing file {idx} ({original_filename}): {str(e)}")
                    return None, f"Error processing file {idx} ({original_filename}): {str(e)}"

            # Create ZIP archive (inside the `with` so the temp files still exist)
            zip_filename = f"batch_output_{uuid.uuid4().hex[:6]}.zip"
            with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
                for output_file, zip_name in output_files:
                    zipf.write(output_file, zip_name)

            return zip_filename, f"Successfully processed {len(output_files)} files. Download the ZIP archive."

    except Exception as e:
        logger.error(f"Error in batch processing: {str(e)}")
        return None, f"Error: {str(e)}"
163
+
164
# --- Gradio UI: two tabs, single-file and batch voice dubbing ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Audio Dubbing AI")
    gr.Markdown("Upload voice samples and provide text to synthesize. Supports single or batch processing (up to 100 files).")

    with gr.Tabs():
        # Tab 1: one voice sample + one text -> one synthesized WAV.
        with gr.Tab("Single Audio"):
            gr.Markdown("Process one audio file with one text.")
            with gr.Row():
                with gr.Column(scale=2):
                    single_input_text = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text you want to synthesize")
                    single_speaker_wav = gr.Audio(label="Upload Voice Sample (2-3 seconds)", type="filepath")
                    single_target_language = gr.Dropdown(
                        choices=list(language_mapping.keys()),
                        label="Target Language",
                        value="Russian"
                    )
                    single_submit_button = gr.Button("Generate Audio", variant="primary")
                with gr.Column(scale=3):
                    single_output_audio = gr.Audio(label="Synthesized Audio")
                    single_error_message = gr.Textbox(label="Status / Error Message", interactive=False)

            # Wire the single-file handler; outputs map to (audio, status).
            single_submit_button.click(
                process_audio,
                inputs=[single_input_text, single_target_language, single_speaker_wav],
                outputs=[single_output_audio, single_error_message]
            )

        # Tab 2: N voice samples + N text lines -> ZIP of synthesized WAVs.
        with gr.Tab("Batch Audio"):
            gr.Markdown("Upload multiple WAV files and provide one text per file (one per line).")
            with gr.Row():
                with gr.Column(scale=2):
                    batch_audio_files = gr.Files(label="Upload WAV Files (up to 100)", file_types=[".wav"], file_count="multiple")
                    batch_text_input = gr.Textbox(
                        label="Text for Each File (one per line)",
                        placeholder="Text for file 1\nText for file 2\n...",
                        lines=5
                    )
                    batch_target_language = gr.Dropdown(
                        choices=list(language_mapping.keys()),
                        label="Target Language",
                        value="Russian"
                    )
                    batch_submit_button = gr.Button("Generate Batch Audio", variant="primary")
                with gr.Column(scale=3):
                    batch_output_file = gr.File(label="Download ZIP Archive")
                    batch_status_message = gr.Textbox(label="Status / Error Message", interactive=False)

            # Wire the batch handler; outputs map to (zip file, status).
            batch_submit_button.click(
                process_batch_audio,
                inputs=[batch_audio_files, batch_text_input, batch_target_language],
                outputs=[batch_output_file, batch_status_message]
            )

if __name__ == "__main__":
    demo.launch()
patch_tts.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Initialize the Coqui XTTS-v2 model once, auto-accepting its license.

Importing this module loads the model and exposes it as the module-level
`tts` object (used by app.py). Statement order matters here: the license
env var must be set and input() must be patched BEFORE the TTS constructor
runs, because model initialization may show an interactive license prompt.
"""
import os
import sys
from unittest.mock import patch
from io import StringIO
from TTS.api import TTS
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Force license acceptance
os.environ["COQUI_TTS_ACCEPT_LICENSE"] = "y"

# Mock input to return 'y' for license prompt
def mock_input(prompt):
    logger.info("Mocking input for license prompt")
    return "y"

# Patch input function only for the duration of model construction;
# builtins.input is restored as soon as this block exits.
with patch('builtins.input', mock_input):
    try:
        logger.info("Initializing TTS with XTTS-v2 model")
        # Downloads the model on first run; COQUI_TTS_ACCEPT_LICENSE and the
        # patched input() together cover both license-prompt code paths.
        tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False)
        logger.info("TTS initialized successfully")
    except Exception as e:
        logger.error(f"Failed to initialize TTS: {str(e)}")
        raise
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
deep-translator==1.11.4
edge-tts==6.1.10
huggingface-hub==0.27.1
gradio==4.44.0
coqui-tts==0.24.2
torch==2.4.0
torchaudio==2.4.0
cached-path==1.7.2
pydub