# Source: video-dubbing-agent/services/translator.py
# Uploaded by dashhdata using huggingface_hub (commit 4ec3855, verified).
"""
Stage 5 — Translation Service
Translates transcribed segments into target language.
Uses deep-translator (Google Translate free tier) — no API key needed.
Batches nearby segments for better translation quality.
"""
import logging
import json
from pathlib import Path
from typing import List, Dict, Optional
from config import TRANSLATION_BATCH_SIZE
logger = logging.getLogger(__name__)
def translate_segments(
    segments: List[Dict],
    source_language: str,
    target_language: str,
    output_dir: Path,
    progress_callback=None
) -> List[Dict]:
    """
    Translate segments into the target language and save the result as JSON.

    Segments are grouped by ``_create_translation_batches``. Each group is
    joined with a ``|||`` marker, sent through one ``translate()`` call and
    split back into per-segment parts. When the call fails, or the split does
    not yield exactly one part per segment, the group is re-translated one
    segment at a time via ``_translate_individually``.

    Args:
        segments: Segment dicts carrying at least a "text" key. Segments the
            batching helper leaves out (it skips blank text) are not part of
            the returned list.
        source_language: Source language passed to ``GoogleTranslator``.
        target_language: Target language passed to ``GoogleTranslator``.
        output_dir: Directory that receives ``translated_segments.json``;
            it is created when missing.
        progress_callback: Optional callable that receives an integer
            percentage after every finished batch.

    Returns:
        Shallow copies of the batched segments, each with an added
        "translated_text" field. An empty translation falls back to the
        segment's original text.

    Raises:
        RuntimeError: If the ``deep-translator`` package cannot be imported.
    """
    import re

    try:
        from deep_translator import GoogleTranslator
    except ImportError as exc:
        raise RuntimeError(
            "deep-translator not installed. Run: pip install deep-translator"
        ) from exc

    translator = GoogleTranslator(source=source_language, target=target_language)
    total = len(segments)
    translated_segments = []

    # Batch translate for efficiency and context
    batches = _create_translation_batches(segments)
    # Progress is measured against the segments that were actually batched,
    # otherwise skipped (blank) segments would keep it from ever reaching 100.
    batched_total = sum(len(batch) for batch in batches)
    logger.info(f"Translating {total} segments in {len(batches)} batches → {target_language}")

    # The translator may change the whitespace around or inside the marker,
    # so the split tolerates that instead of requiring the exact " ||| ".
    marker_pattern = re.compile(r"\s*\|\s*\|\s*\|\s*")

    done_count = 0
    for batch in batches:
        # Combine batch texts with separator
        combined_text = " ||| ".join(seg["text"] for seg in batch)

        translated_parts = None
        try:
            translated_combined = translator.translate(combined_text)
        except Exception as e:
            # Best effort: a failed batch request must not abort the whole run.
            logger.warning(f"Batch translation failed: {e}. Translating individually.")
        else:
            if isinstance(translated_combined, str):
                parts = marker_pattern.split(translated_combined)
                # Only trust the split when it maps one-to-one onto the batch.
                if len(parts) == len(batch):
                    translated_parts = parts

        if translated_parts is None:
            translated_parts = _translate_individually(translator, batch)

        # Assign translations back to (copies of) the segments
        for seg, translated_text in zip(batch, translated_parts):
            seg_copy = seg.copy()
            cleaned = (translated_text or "").strip()
            # Keep original if translation empty
            seg_copy["translated_text"] = cleaned if cleaned else seg["text"]
            translated_segments.append(seg_copy)
            done_count += 1

        if progress_callback:
            progress_callback(int(done_count / batched_total * 100))

    # Save translated transcript
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / "translated_segments.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump({
            "source_language": source_language,
            "target_language": target_language,
            "segments": translated_segments,
            "total": len(translated_segments),
        }, f, ensure_ascii=False, indent=2)

    logger.info(f"Translation complete: {len(translated_segments)} segments → {output_path}")
    return translated_segments
def _create_translation_batches(
segments: List[Dict],
max_batch_chars: int = 4000
) -> List[List[Dict]]:
"""
Group consecutive segments from same speaker into batches.
Respects Google Translate's character limits (~5000 chars).
"""
batches = []
current_batch = []
current_chars = 0
current_speaker = None
for seg in segments:
text = seg.get("text", "").strip()
if not text:
continue
# Start new batch if speaker changes or char limit reached
if (current_speaker and seg["speaker"] != current_speaker) or \
(current_chars + len(text) > max_batch_chars) or \
len(current_batch) >= TRANSLATION_BATCH_SIZE:
if current_batch:
batches.append(current_batch)
current_batch = []
current_chars = 0
current_batch.append(seg)
current_chars += len(text) + 5 # +5 for separator
current_speaker = seg["speaker"]
if current_batch:
batches.append(current_batch)
return batches
def _translate_individually(translator, batch: List[Dict]) -> List[str]:
"""Fallback: translate each segment one by one."""
results = []
for seg in batch:
try:
translated = translator.translate(seg["text"])
results.append(translated or seg["text"])
except Exception as e:
logger.warning(f"Individual translation failed for '{seg['text'][:50]}...': {e}")
results.append(seg["text"]) # Keep original on failure
return results