Spaces:

Nishur
/

video_translator

Sleeping

App Files Files Community

video_translator / app.py

Nishur

Update app.py

45248e1 verified about 1 year ago

raw

history blame contribute delete

17.4 kB

	import gradio as gr
	import os
	import subprocess
	import torch
	from TTS.api import TTS
	from deep_translator import GoogleTranslator
	import pysrt
	import whisper
	import webvtt
	import shutil
	import time
	from tqdm import tqdm
	from typing import Dict, List, Optional
	import logging

	# Set up logging: INFO and above, each record rendered as
	# "<timestamp> - <logger name> - <level> - <message>"
	logging.basicConfig(
	    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)
	logger = logging.getLogger(__name__)

	# Configuration
	# Supported languages, keyed by the display name shown in the UI.
	#   "code"     - language code used for translation targets, TTS lookup and file names
	#   "speakers" - voice names offered in the UI ("default" means no choice is offered)
	#   "whisper"  - language code passed to / compared against Whisper
	LANGUAGES = {
	    "English": {"code": "en", "speakers": ["default"], "whisper": "en"},
	    "Spanish": {"code": "es", "speakers": ["default"], "whisper": "es"},
	    "French": {"code": "fr", "speakers": ["default"], "whisper": "fr"},
	    "German": {"code": "de", "speakers": ["thorsten", "eva_k"], "whisper": "de"},
	    "Japanese": {"code": "ja", "speakers": ["default"], "whisper": "ja"},
	    "Hindi": {"code": "hi", "speakers": ["default"], "whisper": "hi"},
	}

	# Subtitle appearance presets: UI label -> CSS declarations applied to the cue
	# (an empty string means "no extra styling").
	SUBTITLE_STYLES = {
	    "Default": "",
	    "White Text": "color: white;",
	    "Yellow Text": "color: yellow;",
	    "Large Text": "font-size: 24px;",
	    "Bold Text": "font-weight: bold;",
	    "Black Background": "background-color: black; padding: 5px;",
	}

	# All generated files (audio, subtitles, videos, HTML player) are written to this
	# directory; the path is relative to the working directory and is created at
	# import time if it does not exist yet.
	OUTPUT_DIR = "outputs"
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	# Run TTS on the GPU when torch reports one, otherwise on the CPU
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Cache of loaded TTS models, keyed by language code
	tts_models = {}

	def load_tts_model(model_name: str, lang_code: str):
	    """Load the TTS model ``model_name`` onto the configured device.

	    Args:
	        model_name: Model identifier handed to ``TTS(...)``.
	        lang_code: Language code the model is loaded for (not used by the body).

	    Returns:
	        The loaded model, or ``None`` when loading raised; the failure is logged.
	    """
	    try:
	        model = TTS(model_name).to(device)
	        synthesizer = model.synthesizer
	        # Switch the phonemizer to gruut whenever the synthesizer exposes a TTS config
	        if hasattr(synthesizer, 'tts_config'):
	            synthesizer.tts_config.phonemizer = "gruut"
	    except Exception as exc:
	        logger.error(f"Failed to load {model_name}: {str(exc)}")
	        return None
	    return model

	# Initialize models only when needed
	def get_tts_model(lang_code: str):
	    """Return the TTS model for ``lang_code``, loading it on first use.

	    Successfully loaded models are kept in ``tts_models`` and reused. A failed
	    load is not cached, so a later call retries instead of returning ``None``
	    for the rest of the process lifetime.

	    Args:
	        lang_code: Language code such as ``"en"`` or ``"de"``.

	    Returns:
	        The loaded model, or ``None`` when no model is configured for
	        ``lang_code`` or loading failed.
	    """
	    cached = tts_models.get(lang_code)
	    if cached is not None:
	        return cached

	    model_map = {
	        "en": "tts_models/en/ljspeech/tacotron2-DDC",
	        "es": "tts_models/es/css10/vits",
	        "fr": "tts_models/fr/css10/vits",
	        "de": "tts_models/de/thorsten/vits",  # Using VITS instead of tacotron2
	        "ja": "tts_models/ja/kokoro/tacotron2-DDC",
	        "hi": "tts_models/hi/kb/tacotron2-DDC",
	    }
	    model_name = model_map.get(lang_code)
	    if model_name is None:
	        # Report "unavailable" the same way a failed load does instead of
	        # surfacing a bare KeyError to the caller.
	        logger.warning(f"No TTS model configured for language code {lang_code!r}")
	        return None

	    model = load_tts_model(model_name, lang_code)
	    if model is not None:
	        tts_models[lang_code] = model
	    return model

	# Whisper is loaded lazily: the module-level slot stays None until first use
	whisper_model = None

	def get_whisper_model():
	    """Return the shared Whisper model, loading the "small" checkpoint on first call."""
	    global whisper_model
	    if whisper_model is not None:
	        return whisper_model
	    whisper_model = whisper.load_model("small")
	    return whisper_model

	def extract_audio(video_path: str) -> str:
	    """Extract the audio track of ``video_path`` into ``audio.wav`` with ffmpeg.

	    The audio is written to the output directory as 16 kHz mono 16-bit PCM,
	    overwriting any previous file.

	    Returns:
	        Path of the written WAV file.

	    Raises:
	        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
	    """
	    wav_path = os.path.join(OUTPUT_DIR, "audio.wav")

	    ffmpeg_args = ['ffmpeg', '-i', video_path, '-vn']          # input, drop video
	    ffmpeg_args += ['-acodec', 'pcm_s16le', '-ar', '16000']    # 16-bit PCM at 16 kHz
	    ffmpeg_args += ['-ac', '1', '-y', wav_path]                # mono, overwrite output

	    subprocess.run(
	        ffmpeg_args,
	        stdout=subprocess.PIPE,
	        stderr=subprocess.PIPE,
	        check=True,
	    )
	    return wav_path

	def transcribe_with_whisper(audio_path: str, language: Optional[str] = None) -> dict:
	    """Transcribe ``audio_path`` with the shared Whisper model.

	    Args:
	        audio_path: Path of the audio file to transcribe.
	        language: Whisper language code, or ``None`` to leave the choice to
	            Whisper.

	    Returns:
	        The result of ``model.transcribe``. Other functions in this file read
	        its ``"segments"`` and ``"language"`` entries, so it is a mapping
	        rather than a plain string.
	    """
	    model = get_whisper_model()
	    return model.transcribe(audio_path, language=language, word_timestamps=True)

	def generate_srt_from_whisper(audio_path: str, language: str) -> str:
	    """Transcribe ``audio_path`` and save the segments as ``subtitles.srt``.

	    Args:
	        audio_path: Audio file to transcribe.
	        language: Whisper language code of the speech.

	    Returns:
	        Path of the SRT file written to the output directory (UTF-8).
	    """
	    transcription = transcribe_with_whisper(audio_path, language)

	    subs = pysrt.SubRipFile()
	    # SRT cue numbers start at 1
	    for number, seg in enumerate(transcription["segments"], start=1):
	        cue = pysrt.SubRipItem(
	            index=number,
	            start=pysrt.SubRipTime(seconds=seg["start"]),
	            end=pysrt.SubRipTime(seconds=seg["end"]),
	            text=seg["text"],
	        )
	        subs.append(cue)

	    srt_path = os.path.join(OUTPUT_DIR, "subtitles.srt")
	    subs.save(srt_path, encoding='utf-8')
	    return srt_path

	def detect_language(audio_path: str) -> str:
	    """Detect the spoken language of ``audio_path`` with Whisper.

	    Only a single 30-second window is analysed (Whisper's language
	    identification recipe), so the file is not transcribed in full just to
	    learn its language.

	    Args:
	        audio_path: Path of the audio file to analyse.

	    Returns:
	        The display name from ``LANGUAGES`` whose Whisper code matches the most
	        probable language, or ``"English"`` when the detected language is not
	        one of the supported ones.
	    """
	    model = get_whisper_model()

	    # Load the audio and pad/trim it to the 30 s window Whisper works on
	    audio = whisper.pad_or_trim(whisper.load_audio(audio_path))
	    # Use the mel-bin count the loaded checkpoint was trained with
	    mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

	    _, probabilities = model.detect_language(mel)
	    detected_code = max(probabilities, key=probabilities.get)

	    for name, data in LANGUAGES.items():
	        if data["whisper"] == detected_code:
	            return name

	    logger.info(f"Detected language {detected_code!r} is not supported; falling back to English")
	    return "English"

	def translate_subtitles(srt_path: str, target_langs: List[str]) -> Dict[str, str]:
	    """Translate an SRT file into every language in ``target_langs``.

	    Each language is translated from the ORIGINAL subtitle text. A cue that
	    cannot be translated keeps its original text and a warning is logged.

	    Args:
	        srt_path: Path of the source SRT file.
	        target_langs: Display names of the target languages (keys of
	            ``LANGUAGES``).

	    Returns:
	        Mapping of language code to the path of the translated SRT file
	        (``subtitles_<code>.srt`` in the output directory).

	    Raises:
	        KeyError: If a name in ``target_langs`` is not a key of ``LANGUAGES``.
	    """
	    subs = pysrt.open(srt_path)
	    # Snapshot the source text once. The cue objects are reused for every
	    # language (a slice copy shares them too), so without this each language
	    # would be translated from the previous language's output.
	    source_texts = [sub.text for sub in subs]
	    results = {}

	    for lang_name in target_langs:
	        lang_code = LANGUAGES[lang_name]["code"]
	        translator = GoogleTranslator(source='auto', target=lang_code)

	        for sub, source_text in zip(subs, source_texts):
	            sub.text = source_text  # fallback if translation fails
	            if not source_text.strip():
	                continue
	            try:
	                translated = translator.translate(source_text)
	            except Exception as e:
	                logger.warning(f"Translation failed: {str(e)}")
	                continue
	            # Keep the original text if the translator returned nothing
	            if translated:
	                sub.text = translated

	        output_path = os.path.join(OUTPUT_DIR, f"subtitles_{lang_code}.srt")
	        subs.save(output_path, encoding='utf-8')
	        results[lang_code] = output_path

	    return results

	def generate_webvtt_subtitles(srt_path: str, style: str = "") -> str:
	    """Convert the SRT file at ``srt_path`` to WebVTT, optionally with cue styling.

	    The language code is taken from the part of the file name after the last
	    underscore (``subtitles_es.srt`` -> ``es``) and the result is written to
	    ``subtitles_<code>.vtt`` in the output directory.

	    Args:
	        srt_path: Path of the SRT file to convert.
	        style: CSS declarations for a ``::cue`` STYLE block; empty for none.

	    Returns:
	        Path of the written VTT file.
	    """
	    def vtt_timestamp(moment) -> str:
	        # strftime emits microseconds; dropping three digits leaves milliseconds
	        return moment.to_time().strftime('%H:%M:%S.%f')[:-3]

	    subs = pysrt.open(srt_path)
	    file_name = os.path.basename(srt_path)
	    lang_code = file_name.split('_')[-1].replace('.srt', '')
	    vtt_path = os.path.join(OUTPUT_DIR, f"subtitles_{lang_code}.vtt")

	    with open(vtt_path, 'w', encoding='utf-8') as vtt_file:
	        header = "WEBVTT\n\n"
	        if style:
	            header += f"STYLE\n::cue {{\n{style}\n}}\n\n"
	        vtt_file.write(header)

	        for cue in subs:
	            begin = vtt_timestamp(cue.start)
	            finish = vtt_timestamp(cue.end)
	            vtt_file.write(f"{begin} --> {finish}\n{cue.text}\n\n")

	    return vtt_path

	def _build_mix_filter(start_times: List[float]) -> str:
	    """Build the ffmpeg filter graph that overlays delayed chunks on input 0.

	    Input 0 is the silent base track; input ``i + 1`` is the chunk that must
	    start ``start_times[i]`` seconds into the output. The mixed stream is
	    labelled ``[aout]``.
	    """
	    steps = []
	    mix_inputs = ["[0:a]"]
	    for i, start in enumerate(start_times):
	        delay_ms = max(0, int(start * 1000))
	        # One delay value per channel so stereo inputs are shifted as a whole
	        steps.append(f"[{i + 1}:a]adelay={delay_ms}|{delay_ms}[a{i}]")
	        mix_inputs.append(f"[a{i}]")
	    # duration=first keeps the length of the silent base track; normalize=0
	    # stops amix from scaling every input down by the number of inputs.
	    steps.append(
	        "".join(mix_inputs)
	        + f"amix=inputs={len(mix_inputs)}:duration=first:normalize=0[aout]"
	    )
	    return ";".join(steps)

	def generate_translated_audio(
	    srt_path: str,
	    target_lang: str,
	    speaker: str = "default"
	) -> str:
	    """Synthesize a dubbed audio track from a translated SRT file.

	    Every non-empty subtitle is spoken by the TTS model for ``target_lang``
	    and placed at the subtitle's start time on top of a silent track as long
	    as ``base_video.mp4`` in the output directory.

	    Args:
	        srt_path: Path of the translated SRT file.
	        target_lang: Language code used to pick the TTS model and to name the
	            output file.
	        speaker: Voice name; ``"default"`` lets the model use its own voice.

	    Returns:
	        Path of ``translated_audio_<target_lang>.wav`` in the output directory.

	    Raises:
	        Exception: If no TTS model is available or no chunk could be synthesized.
	        subprocess.CalledProcessError: If an ffmpeg invocation fails.

	    Note:
	        The mix relies on the ``normalize`` option of ffmpeg's ``amix`` filter
	        (presumably FFmpeg 4.4 or newer - TODO confirm against the deployed build).
	    """
	    subs = pysrt.open(srt_path)
	    tts = get_tts_model(target_lang)
	    if tts is None:
	        raise Exception(f"TTS model for {target_lang} not available")

	    temp_dir = os.path.join(OUTPUT_DIR, f"temp_audio_{target_lang}")
	    output_path = os.path.join(OUTPUT_DIR, f"translated_audio_{target_lang}.wav")
	    os.makedirs(temp_dir, exist_ok=True)

	    try:
	        # Only forward a speaker to models that report being multi-speaker;
	        # NOTE(review): Coqui TTS appears to reject `speaker` for
	        # single-speaker models - confirm against the installed version.
	        use_speaker = speaker != "default" and getattr(tts, "is_multi_speaker", False)
	        kwargs = {"speaker": speaker} if use_speaker else {}

	        chunks = []  # (start time in seconds, wav path) per synthesized subtitle
	        for i, sub in enumerate(tqdm(subs, desc=f"Generating {target_lang} audio")):
	            text = sub.text.strip()
	            if not text:
	                continue

	            audio_file = os.path.join(temp_dir, f"chunk_{i:04d}.wav")
	            try:
	                tts.tts_to_file(text=text, file_path=audio_file, **kwargs)
	            except Exception as e:
	                # Best effort: a failed chunk simply stays silent in the mix
	                logger.warning(f"TTS failed: {str(e)}")
	                continue
	            chunks.append((sub.start.ordinal / 1000, audio_file))

	        if not chunks:
	            raise Exception("No audio generated")

	        # Create silent audio spanning the whole video as the base of the mix
	        video_duration = get_video_duration(os.path.join(OUTPUT_DIR, "base_video.mp4"))
	        silence_file = os.path.join(temp_dir, "silence.wav")
	        subprocess.run([
	            'ffmpeg', '-f', 'lavfi', '-i', 'anullsrc=r=44100:cl=stereo',
	            '-t', str(video_duration), '-y', silence_file
	        ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

	        # Mix audio. Arguments are passed as a list (no shell) because the
	        # filter graph contains ';', '|' and '[' which a shell would interpret.
	        cmd = ['ffmpeg', '-y', '-i', silence_file]
	        for _, audio_file in chunks:
	            cmd += ['-i', audio_file]
	        cmd += [
	            '-filter_complex', _build_mix_filter([start for start, _ in chunks]),
	            '-map', '[aout]',
	            output_path,
	        ]
	        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	    finally:
	        # Remove the per-language scratch directory even when a step failed
	        shutil.rmtree(temp_dir, ignore_errors=True)

	    return output_path

	def get_video_duration(video_path: str, default: float = 180.0) -> float:
	    """Return the duration of ``video_path`` in seconds, as reported by ffprobe.

	    Args:
	        video_path: Path of the media file to probe.
	        default: Value returned when the duration cannot be determined
	            (ffprobe missing, no output, or output that is not a positive
	            finite number such as ``N/A``).

	    Returns:
	        The duration in seconds, or ``default`` as a float.
	    """
	    log = logging.getLogger(__name__)
	    try:
	        result = subprocess.run([
	            'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
	            '-of', 'default=noprint_wrappers=1:nokey=1', video_path
	        ], capture_output=True, text=True)
	    except OSError as exc:
	        # ffprobe is not installed or could not be started
	        log.warning(f"ffprobe could not be run: {exc}; assuming {default} s")
	        return float(default)

	    raw = result.stdout.strip()
	    try:
	        duration = float(raw)
	    except ValueError:
	        duration = 0.0

	    # Rejects 0, negatives, NaN (comparisons are False) and infinity
	    if not 0 < duration < float("inf"):
	        log.warning(f"Could not read duration of {video_path} (got {raw!r}); assuming {default} s")
	        return float(default)
	    return duration

	def create_html_player(
	    video_path: str,
	    subtitle_paths: Dict[str, str],
	    style: str = ""
	) -> str:
	    """Write ``player.html``: a video element with subtitle tracks and download links.

	    Video and subtitle files are referenced by base name only, so the page
	    expects them next to it.

	    Args:
	        video_path: Path of the video to embed.
	        subtitle_paths: Mapping of language code to subtitle file path; the
	            ``"en"`` track, if present, is marked as the default track.
	        style: CSS declarations for ``video::cue``; empty for none.

	    Returns:
	        Path of the written HTML file in the output directory.
	    """
	    html_path = os.path.join(OUTPUT_DIR, "player.html")
	    video_name = os.path.basename(video_path)

	    # One <track> element per subtitle language
	    track_tags = []
	    for lang, path in subtitle_paths.items():
	        default_attr = "default" if lang == "en" else ""
	        track_tags.append(
	            f'<track kind="subtitles" src="{os.path.basename(path)}" '
	            f'srclang="{lang}" label="{lang.capitalize()}" '
	            f'{default_attr}>'
	        )
	    subtitle_tracks = "\n".join(track_tags)

	    # One download link per subtitle file
	    download_links = "".join(
	        f'<a href="{os.path.basename(path)}" download>'
	        f'{lang.upper()} Subtitles (.vtt)</a><br>'
	        for lang, path in subtitle_paths.items()
	    )

	    if style:
	        style_block = f"video::cue {{ {style} }}"
	    else:
	        style_block = ""

	    html_content = f"""<!DOCTYPE html>
	<html>
	<head>
	<title>Video Player</title>
	<style>
	body {{ font-family: Arial, sans-serif; margin: 20px; }}
	.container {{ max-width: 800px; margin: 0 auto; }}
	video {{ width: 100%; background: #000; }}
	.downloads {{ margin-top: 20px; }}
	{style_block}
	</style>
	</head>
	<body>
	<div class="container">
	<h2>Video Player with Subtitles</h2>
	<video controls>
	<source src="{video_name}" type="video/mp4">
	{subtitle_tracks}
	</video>

	<div class="downloads">
	<h3>Download Subtitles:</h3>
	{download_links}
	</div>
	</div>
	</body>
	</html>"""

	    with open(html_path, 'w', encoding='utf-8') as html_file:
	        html_file.write(html_content)

	    return html_path

	def process_video(
	    video_file: str,
	    source_lang: str,
	    target_langs: List[str],
	    subtitle_style: str,
	    speaker_settings: Dict[str, str],
	    progress: gr.Progress = gr.Progress()
	) -> tuple:
	    """Run the complete translation pipeline for one uploaded video.

	    Steps: extract audio, detect the language if requested, transcribe to
	    SRT, translate the subtitles, then for every target language synthesize
	    dubbed audio, write WebVTT subtitles and mux a dubbed video; finally
	    write the HTML player.

	    Args:
	        video_file: Path of the uploaded video.
	        source_lang: Key of ``LANGUAGES`` or ``"Auto-detect"``.
	        target_langs: Keys of ``LANGUAGES`` to translate into.
	        subtitle_style: Key of ``SUBTITLE_STYLES``; unknown keys mean no styling.
	        speaker_settings: Mapping of language code to voice name.
	        progress: Gradio progress tracker.

	    Returns:
	        A ``(files, status)`` pair. On success ``files`` lists the HTML
	        player, the base video, the VTT files and the dubbed videos; on
	        failure ``files`` is ``None`` and ``status`` starts with ``"Error: "``.
	        Exceptions are logged and reported through ``status``, not raised.
	    """
	    # Reject obviously incomplete requests with a readable message instead of
	    # letting ffmpeg fail on a missing input later.
	    if not video_file:
	        return None, "Error: Please upload a video file first"
	    if not target_langs:
	        return None, "Error: Please select at least one target language"

	    try:
	        progress(0.05, "Initializing...")
	        style_css = SUBTITLE_STYLES.get(subtitle_style, "")
	        speaker_settings = speaker_settings or {}

	        # 1. Extract audio
	        progress(0.1, "Extracting audio...")
	        audio_path = extract_audio(video_file)

	        # 2. Detect language if needed
	        if source_lang == "Auto-detect":
	            source_lang = detect_language(audio_path)
	            progress(0.15, f"Detected language: {source_lang}")

	        # 3. Generate subtitles
	        progress(0.2, "Generating subtitles...")
	        srt_path = generate_srt_from_whisper(
	            audio_path,
	            LANGUAGES[source_lang]["whisper"]
	        )

	        # 4. Translate subtitles
	        progress(0.3, "Translating subtitles...")
	        translated_subs = translate_subtitles(srt_path, target_langs)

	        # 5. Save original video (the audio step reads its duration from this copy)
	        base_video = os.path.join(OUTPUT_DIR, "base_video.mp4")
	        shutil.copy(video_file, base_video)

	        # 6. Process each target language
	        translated_vtts = {}
	        output_videos = []
	        for i, lang_name in enumerate(target_langs, 1):
	            lang_code = LANGUAGES[lang_name]["code"]
	            progress(0.4 + (i * 0.5 / len(target_langs)), f"Processing {lang_name}...")

	            # Only a plain string is a usable voice name; anything else falls
	            # back to the model's own voice.
	            speaker = speaker_settings.get(lang_code, "default")
	            if not isinstance(speaker, str) or not speaker:
	                speaker = "default"

	            # Generate audio
	            translated_audio = generate_translated_audio(
	                translated_subs[lang_code],
	                lang_code,
	                speaker
	            )

	            # Generate subtitles
	            translated_vtts[lang_code] = generate_webvtt_subtitles(
	                translated_subs[lang_code],
	                style_css
	            )

	            # Create translated video version: original video stream (copied)
	            # plus the dubbed audio encoded as AAC
	            output_video = os.path.join(OUTPUT_DIR, f"output_{lang_code}.mp4")
	            subprocess.run([
	                'ffmpeg', '-i', base_video, '-i', translated_audio,
	                '-map', '0:v', '-map', '1:a', '-c:v', 'copy', '-c:a', 'aac',
	                '-y', output_video
	            ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	            output_videos.append(output_video)

	        # 7. Create HTML player
	        progress(0.9, "Creating HTML player...")
	        html_path = create_html_player(base_video, translated_vtts, style_css)

	        # Prepare all output files
	        output_files = [html_path, base_video] + list(translated_vtts.values()) + output_videos

	        progress(1.0, "Done!")
	        return output_files, "Processing completed successfully!"

	    except Exception as e:
	        logger.error(f"Processing failed: {str(e)}", exc_info=True)
	        return None, f"Error: {str(e)}"

	def get_speaker_settings(*args) -> Dict[str, str]:
	    """Map positional speaker values onto language codes.

	    The values are matched to ``LANGUAGES`` in its iteration order; falsy
	    values are left out, and values beyond the number of languages are ignored.

	    Returns:
	        Mapping of language code to the supplied speaker value.
	    """
	    return {
	        lang_data["code"]: value
	        for lang_data, value in zip(LANGUAGES.values(), args)
	        if value
	    }

	def create_interface():
	    """Create the Gradio interface.

	    The left column holds the inputs (video, source/target languages, subtitle
	    style, per-language voice pickers); the right column shows the generated
	    files and a status message.

	    Returns:
	        The ``gr.Blocks`` application, ready to be launched.
	    """
	    with gr.Blocks(title="Video Translator") as demo:
	        gr.Markdown("# Free Video Translation System")
	        gr.Markdown("Translate videos with subtitles and audio dubbing using free/open-source tools")

	        with gr.Row():
	            with gr.Column(scale=1):
	                video_input = gr.Video(label="Upload Video")

	                with gr.Accordion("Source Settings", open=True):
	                    source_lang = gr.Dropdown(
	                        label="Source Language",
	                        choices=["Auto-detect"] + list(LANGUAGES.keys()),
	                        value="Auto-detect"
	                    )

	                with gr.Accordion("Target Languages", open=True):
	                    target_langs = gr.CheckboxGroup(
	                        label="Select target languages",
	                        choices=list(LANGUAGES.keys()),
	                        value=["English", "Spanish"]
	                    )

	                with gr.Accordion("Subtitle Styling", open=False):
	                    subtitle_style = gr.Dropdown(
	                        label="Subtitle Appearance",
	                        choices=list(SUBTITLE_STYLES.keys()),
	                        value="Default"
	                    )

	                with gr.Accordion("Voice Settings", open=False):
	                    # One component per language, in LANGUAGES order, so the
	                    # values line up with what get_speaker_settings expects.
	                    speaker_inputs = []
	                    for lang_name, lang_cfg in LANGUAGES.items():
	                        speakers = lang_cfg["speakers"]
	                        if len(speakers) > 1:
	                            speaker_inputs.append(
	                                gr.Dropdown(
	                                    label=f"{lang_name} Speaker",
	                                    choices=speakers,
	                                    value=speakers[0],
	                                    visible=False
	                                )
	                            )
	                        else:
	                            # Hidden placeholder keeps the positions aligned
	                            speaker_inputs.append(gr.Textbox(visible=False))

	                submit_btn = gr.Button("Translate Video", variant="primary")

	            with gr.Column(scale=2):
	                output_files = gr.Files(label="Download Files")
	                status = gr.Textbox(label="Status")

	                gr.Markdown("""
	Instructions:
	1. Upload a video file
	2. Select source and target languages
	3. Customize subtitles and voices
	4. Click Translate
	5. Download the HTML player and open in browser
	""")

	        def update_speaker_ui(selected_langs):
	            # Show a voice picker only for selected languages that offer a choice.
	            # gr.update is component-agnostic, so it also fits the hidden
	            # Textbox placeholders in speaker_inputs.
	            chosen = set(selected_langs or [])
	            return [
	                gr.update(visible=name in chosen and len(cfg["speakers"]) > 1)
	                for name, cfg in LANGUAGES.items()
	            ]

	        target_langs.change(
	            update_speaker_ui,
	            inputs=target_langs,
	            outputs=speaker_inputs
	        )

	        def run_translation(values, progress=gr.Progress()):
	            # `values` maps each input component to its CURRENT value, so the
	            # speaker settings reflect what the user picked at click time.
	            speaker_settings = get_speaker_settings(
	                *(values[component] for component in speaker_inputs)
	            )
	            return process_video(
	                values[video_input],
	                values[source_lang],
	                values[target_langs],
	                values[subtitle_style],
	                speaker_settings,
	                progress,
	            )

	        submit_btn.click(
	            run_translation,
	            # A set of components makes Gradio pass one dict keyed by component
	            inputs={video_input, source_lang, target_langs, subtitle_style, *speaker_inputs},
	            outputs=[output_files, status]
	        )

	    return demo

	if __name__ == "__main__":
	    # Start every run from an empty output directory so files left over from
	    # an earlier run are not offered for download.
	    if os.path.exists(OUTPUT_DIR):
	        shutil.rmtree(OUTPUT_DIR)
	    os.makedirs(OUTPUT_DIR, exist_ok=True)

	    # Build the UI and start the server; share=True asks Gradio for a public link
	    demo = create_interface()
	    demo.launch(share=True)