# AudioDubbingAI — app.py
# Uploaded by JyuViole (commit cf135b0, verified)
import gradio as gr
import spaces
import uuid
import os
import asyncio
import edge_tts
from deep_translator import GoogleTranslator
from patch_tts import tts # Import patched TTS
import logging
import torch
# Configure logging once at module import; all functions share this logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Maps the UI-facing language name to a pair:
#   (ISO-639-1 code used by GoogleTranslator/XTTS, edge-tts neural voice id
#    used only on the no-clone fallback path in text_to_speech).
# Dropdown choices in the UI are built from these keys, so adding an entry
# here is all that is needed to support a new language.
language_mapping = {
    "English": ("en", "en-US-ChristopherNeural"),
    "Spanish": ("es", "es-ES-AlvaroNeural"),
    "French": ("fr", "fr-FR-DeniseNeural"),
    "German": ("de", "de-DE-KatjaNeural"),
    "Italian": ("it", "it-IT-IsabellaNeural"),
    "Portuguese": ("pt", "pt-PT-DuarteNeural"),
    "Polish": ("pl", "pl-PL-AgnieszkaNeural"),
    "Turkish": ("tr", "tr-TR-AhmetNeural"),
    "Russian": ("ru", "ru-RU-DmitryNeural"),
    "Dutch": ("nl", "nl-NL-ColetteNeural"),
    "Czech": ("cs", "cs-CZ-VlastaNeural"),
    "Arabic": ("ar", "ar-SA-HamedNeural"),
    "Chinese": ("zh", "zh-CN-XiaoxiaoNeural"),
    "Japanese": ("ja", "ja-JP-NanamiNeural"),
    "Hungarian": ("hu", "hu-HU-TamasNeural"),
    "Korean": ("ko", "ko-KR-SunHiNeural")
}
def text_to_speech(text, voice, output_file, speaker_wav=None, language="en"):
    """Synthesize ``text`` into ``output_file`` as audio.

    When ``speaker_wav`` is given, the patched Coqui XTTS-v2 model clones
    that voice sample; otherwise edge-tts is used as a fallback with the
    supplied neural ``voice``.

    Args:
        text: Text to synthesize (already translated by the caller).
        voice: edge-tts voice id; used only on the fallback path.
        output_file: Path the generated audio file is written to.
        speaker_wav: Optional path to a short reference clip for cloning.
        language: Language code passed to XTTS (e.g. "en", "ru").

    Raises:
        RuntimeError: If Coqui TTS fails; the original exception is chained.
    """
    if speaker_wav:
        try:
            logger.info("Using patched Coqui TTS with XTTS-v2 model")
            # Prefer GPU when available; XTTS also runs (slowly) on CPU.
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info("Using device: %s", device)
            logger.info(
                "Generating speech with text: %s... and speaker_wav: %s",
                text[:50], speaker_wav,
            )
            tts.tts_to_file(
                text=text,
                speaker_wav=speaker_wav,
                language=language.lower(),
                file_path=output_file,
                speed=1.0,
            )
            logger.info("Generated audio saved to %s", output_file)
        except Exception as e:
            # Chain the cause so the original traceback survives the re-raise
            # (the old `raise Exception(str(e))` discarded it).
            logger.exception("Coqui TTS error: %s", e)
            raise RuntimeError(f"Coqui TTS error: {str(e)}") from e
    else:
        logger.info("Using edge-tts as fallback")
        communicate = edge_tts.Communicate(text, voice)
        # edge_tts is async-only; drive the coroutine to completion here.
        asyncio.run(communicate.save(output_file))
@spaces.GPU
def process_audio(input_text, target_language, speaker_wav=None):
    """Translate ``input_text`` into ``target_language`` and synthesize it
    in the cloned voice from ``speaker_wav``.

    This is the Gradio click handler, so it never raises: all failures are
    folded into the returned status message.

    Args:
        input_text: Source text in any language (auto-detected).
        target_language: Key into ``language_mapping`` (UI dropdown value).
        speaker_wav: Filepath of the uploaded reference voice sample.

    Returns:
        ``(output_path, "")`` on success, or ``(None, error_message)`` on
        failure — matching the ``[output_audio, error_message]`` outputs.
    """
    try:
        # Validate inputs up front so the user gets an actionable message.
        if target_language is None:
            raise ValueError("Please select a Target Language.")
        if not input_text:
            raise ValueError("Please provide text to synthesize.")
        if not speaker_wav:
            raise ValueError("Please upload a voice sample for cloning.")
        # Guard the lookup: a stale/unknown dropdown value would otherwise
        # surface as an opaque KeyError.
        if target_language not in language_mapping:
            raise ValueError(f"Error: Unsupported language {target_language}")
        # Short random prefix keeps concurrent runs from clobbering each other.
        run_uuid = uuid.uuid4().hex[:6]
        output_filename = f"{run_uuid}_output_synth.wav"
        target_language_code, voice = language_mapping[target_language]
        translator = GoogleTranslator(source='auto', target=target_language_code)
        translated_text = translator.translate(input_text)
        logger.info("Translated text: %s", translated_text)
        text_to_speech(
            translated_text, voice, output_filename,
            speaker_wav=speaker_wav, language=target_language_code,
        )
        if not os.path.exists(output_filename):
            raise FileNotFoundError(f"Error: {output_filename} was not generated.")
        return output_filename, ""
    except Exception as e:
        # Boundary handler: log the full traceback (logger.error lost it),
        # then surface only the message to the UI.
        logger.exception("Error in process_audio: %s", e)
        return None, f"Error: {str(e)}"
# Build the Gradio UI. NOTE: component creation order inside each context
# manager defines the on-screen layout, so statement order matters here.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Audio Dubbing AI")
    gr.Markdown("Upload a voice sample (2-3 seconds), provide text to synthesize, and select a target language.")
    with gr.Row():
        # Left column: all user inputs.
        with gr.Column(scale=2):
            input_text = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text you want to synthesize")
            # type="filepath" makes Gradio hand process_audio a path on disk,
            # which is what the XTTS speaker_wav argument expects.
            speaker_wav = gr.Audio(label="Upload Voice Sample (2-3 seconds)", type="filepath")
            # Choices are driven by language_mapping so UI and backend agree.
            target_language = gr.Dropdown(
                choices=list(language_mapping.keys()),
                label="Target Language",
                value="Russian"
            )
            submit_button = gr.Button("Generate Audio", variant="primary")
        # Right column: results and status.
        with gr.Column(scale=3):
            output_audio = gr.Audio(label="Synthesized Audio")
            error_message = gr.Textbox(label="Status / Error Message", interactive=False)
    # process_audio returns (audio_path_or_None, status_string), matching
    # the two outputs below in order.
    submit_button.click(
        process_audio,
        inputs=[input_text, target_language, speaker_wav],
        outputs=[output_audio, error_message]
    )
# Launch only when run as a script (HF Spaces also executes this path).
if __name__ == "__main__":
    demo.launch()