Spaces:

JyuViole
/

AudioDubbingAI

Sleeping

App Files Files Community

JyuViole committed on Apr 21, 2025

Commit

cf135b0

verified ·

1 Parent(s): ae88530

Upload 4 files

Browse files

Files changed (4) hide show

Dockerfile +42 -0
app.py +111 -0
patch_tts.py +28 -0
requirements.txt +9 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,42 @@

+FROM python:3.10
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+ffmpeg \
+libsndfile1 \
+espeak-ng \
+libportaudio2 \
+&& rm -rf /var/lib/apt/lists/* \
+&& apt-get clean
+# Set environment variable to accept Coqui TTS license
+ENV COQUI_TTS_ACCEPT_LICENSE=y
+# COQUI_TOS_AGREED=1 is the variable Coqui's model manager checks before
+# prompting for the license; set it so the download never blocks on input.
+ENV COQUI_TOS_AGREED=1
+# Debug: Print environment variable
+RUN echo "COQUI_TTS_ACCEPT_LICENSE=$COQUI_TTS_ACCEPT_LICENSE" >> /tmp/env.log
+# Create user
+RUN useradd -m -u 1000 user
+USER user
+WORKDIR /home/user/app
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir torch==2.4.0 torchaudio==2.4.0 \
+&& pip install --no-cache-dir -r requirements.txt \
+&& pip cache purge
+# Clear Coqui TTS model cache and pre-download model
+RUN rm -rf ~/.local/share/tts && \
+echo "y" | python -c "import os; os.environ['COQUI_TTS_ACCEPT_LICENSE']='y'; from TTS.api import TTS; tts = TTS(model_name='tts_models/multilingual/multi-dataset/xtts_v2', progress_bar=False)"
+# Copy application code
+COPY --chown=user:user . .
+# Run the application
+CMD ["python", "app.py"]

app.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import gradio as gr
+import spaces
+import uuid
+import os
+import asyncio
+import edge_tts
+from deep_translator import GoogleTranslator
+from patch_tts import tts  # Import patched TTS
+import logging
+import torch
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+language_mapping = {
+    "English": ("en", "en-US-ChristopherNeural"),
+    "Spanish": ("es", "es-ES-AlvaroNeural"),
+    "French": ("fr", "fr-FR-DeniseNeural"),
+    "German": ("de", "de-DE-KatjaNeural"),
+    "Italian": ("it", "it-IT-IsabellaNeural"),
+    "Portuguese": ("pt", "pt-PT-DuarteNeural"),
+    "Polish": ("pl", "pl-PL-AgnieszkaNeural"),
+    "Turkish": ("tr", "tr-TR-AhmetNeural"),
+    "Russian": ("ru", "ru-RU-DmitryNeural"),
+    "Dutch": ("nl", "nl-NL-ColetteNeural"),
+    "Czech": ("cs", "cs-CZ-VlastaNeural"),
+    "Arabic": ("ar", "ar-SA-HamedNeural"),
+    "Chinese": ("zh", "zh-CN-XiaoxiaoNeural"),
+    "Japanese": ("ja", "ja-JP-NanamiNeural"),
+    "Hungarian": ("hu", "hu-HU-TamasNeural"),
+    "Korean": ("ko", "ko-KR-SunHiNeural")
+}
+def text_to_speech(text, voice, output_file, speaker_wav=None, language="en"):
+    if speaker_wav:
+        try:
+            logger.info("Using patched Coqui TTS with XTTS-v2 model")
+            # Get device safely
+            device = "cpu" if not torch.cuda.is_available() else "cuda"
+            logger.info(f"Using device: {device}")
+            logger.info(f"Generating speech with text: {text[:50]}... and speaker_wav: {speaker_wav}")
+            tts.tts_to_file(
+                text=text,
+                speaker_wav=speaker_wav,
+                language=language.lower(),
+                file_path=output_file,
+                speed=1.0
+            )
+            logger.info(f"Generated audio saved to {output_file}")
+        except Exception as e:
+            logger.error(f"Coqui TTS error: {str(e)}")
+            raise Exception(f"Coqui TTS error: {str(e)}")
+    else:
+        logger.info("Using edge-tts as fallback")
+        communicate = edge_tts.Communicate(text, voice)
+        asyncio.run(communicate.save(output_file))
+@spaces.GPU
+def process_audio(input_text, target_language, speaker_wav=None):
+    try:
+        if target_language is None:
+            raise ValueError("Please select a Target Language.")
+        if not input_text:
+            raise ValueError("Please provide text to synthesize.")
+        if not speaker_wav:
+            raise ValueError("Please upload a voice sample for cloning.")
+        run_uuid = uuid.uuid4().hex[:6]
+        output_filename = f"{run_uuid}_output_synth.wav"
+        target_language_code, voice = language_mapping[target_language]
+        translator = GoogleTranslator(source='auto', target=target_language_code)
+        translated_text = translator.translate(input_text)
+        logger.info(f"Translated text: {translated_text}")
+        text_to_speech(translated_text, voice, output_filename, speaker_wav=speaker_wav, language=target_language_code)
+        if not os.path.exists(output_filename):
+            raise FileNotFoundError(f"Error: {output_filename} was not generated.")
+        return output_filename, ""
+    except Exception as e:
+        logger.error(f"Error in process_audio: {str(e)}")
+        return None, f"Error: {str(e)}"
+# Gradio UI: a two-column layout with the inputs on the left and the
+# synthesized audio plus a status box on the right.
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# Audio Dubbing AI")
+    gr.Markdown("Upload a voice sample (2-3 seconds), provide text to synthesize, and select a target language.")
+    with gr.Row():
+        # Left column: text, reference voice sample and target language.
+        with gr.Column(scale=2):
+            input_text = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text you want to synthesize")
+            # type="filepath" hands process_audio a path rather than raw audio data.
+            speaker_wav = gr.Audio(label="Upload Voice Sample (2-3 seconds)", type="filepath")
+            target_language = gr.Dropdown(
+                choices=list(language_mapping.keys()),
+                label="Target Language",
+                value="Russian"
+            )
+            submit_button = gr.Button("Generate Audio", variant="primary")
+        # Right column: generated audio and the error/status message.
+        with gr.Column(scale=3):
+            output_audio = gr.Audio(label="Synthesized Audio")
+            error_message = gr.Textbox(label="Status / Error Message", interactive=False)
+    # process_audio returns (audio_path, message), matching the two outputs.
+    submit_button.click(
+        process_audio,
+        inputs=[input_text, target_language, speaker_wav],
+        outputs=[output_audio, error_message]
+    )
+# Launch the app only when run as a script.
+if __name__ == "__main__":
+    demo.launch()

patch_tts.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import os
+import sys
+from unittest.mock import patch
+from io import StringIO
+from TTS.api import TTS
+import logging
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Force license acceptance
+os.environ["COQUI_TTS_ACCEPT_LICENSE"] = "y"
+# Mock input to return 'y' for license prompt
+def mock_input(prompt):
+    logger.info("Mocking input for license prompt")
+    return "y"
+# Patch input function
+# While the context manager is active, builtins.input is replaced by
+# mock_input, so any interactive prompt during TTS construction gets "y".
+with patch('builtins.input', mock_input):
+    try:
+        logger.info("Initializing TTS with XTTS-v2 model")
+        # Module-level object; app.py imports it via `from patch_tts import tts`.
+        tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False)
+        logger.info("TTS initialized successfully")
+    except Exception as e:
+        # Log the failure, then re-raise so importing this module fails.
+        logger.error(f"Failed to initialize TTS: {str(e)}")
+        raise

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+deep-translator==1.11.4
+edge-tts==6.1.10
+huggingface-hub==0.27.1
+gradio==4.44.0
+coqui-tts==0.24.2
+torch==2.4.0
+torchaudio==2.4.0
+cached-path==1.7.2
+pydub