# clearwave-ai / translator.py
# Source: Hugging Face Space file view — "testingfaces", commit e86fdec (verified),
# "Update translator.py" (raw / history / blame, 7.86 kB).
"""
Department 3 - Translator
Primary : NLLB-200-distilled-1.3B (Meta)
Fallback : deep-translator (Google Translate)
βœ… UPGRADED:
- Text chunking for long transcripts (fixes repetition bug)
- Splits by sentence, translates in 400-token chunks
- Rejoins cleanly into full translation
"""
import time
import logging
logger = logging.getLogger(__name__)
# ISO 639-1 code -> FLORES-200 language tag expected by NLLB-200.
# Unknown sources fall back to eng_Latn and unknown targets to tel_Telu
# (see _translate_chunks_nllb).
NLLB_CODES = {
    "en": "eng_Latn",
    "te": "tel_Telu",
    "hi": "hin_Deva",
    "ta": "tam_Taml",
    "kn": "kan_Knda",
    "es": "spa_Latn",
    "fr": "fra_Latn",
    "de": "deu_Latn",
    "ja": "jpn_Jpan",
    "zh": "zho_Hans",
    "ar": "arb_Arab",
    "pt": "por_Latn",
    "ru": "rus_Cyrl",
}
# Hugging Face model id of the primary translation model.
MODEL_ID = "facebook/nllb-200-distilled-1.3B"
# Token budget for both tokenization (truncation) and generation.
MAX_LENGTH = 512
CHUNK_WORDS = 80 # ~400 tokens, safe for NLLB
class Translator:
    """Translate text with NLLB-200, falling back to Google Translate.

    The heavy NLLB model is loaded lazily on the first ``translate()`` call.
    Long inputs are split into sentence-aware chunks of at most
    ``CHUNK_WORDS`` words so the model never sees input near its context
    limit (which triggers NLLB's repetition bug).
    """

    def __init__(self):
        # Lazy NLLB handles: either an HF pipeline, or a tokenizer/model
        # pair when pipeline construction fails.
        self._pipeline = None
        self._tokenizer = None
        self._model = None
        # True once a load was *attempted* (even if it failed) so a broken
        # environment does not retry the expensive load on every call.
        self._nllb_loaded = False
        print("[Translator] Ready (NLLB loads on first use)")

    # ── Public ───────────────────────────────────────────────────────
    def translate(self, text: str, src_lang: str, tgt_lang: str):
        """Translate ``text`` from ``src_lang`` to ``tgt_lang`` (ISO 639-1).

        Returns a ``(translated_text, engine_description)`` tuple.  Empty
        and same-language inputs are short-circuited without loading any
        model.  NLLB failures fall back to Google Translate.
        """
        if not text or not text.strip():
            return "", "skipped (empty)"
        if src_lang == tgt_lang:
            return text, "skipped (same language)"
        # Load NLLB on first use
        if not self._nllb_loaded:
            self._init_nllb()
            self._nllb_loaded = True
        # Split long text into chunks
        chunks = self._split_into_chunks(text, CHUNK_WORDS)
        print(f"[Translator] Translating {len(chunks)} chunks ({len(text)} chars)")
        if self._pipeline is not None or self._model is not None:
            try:
                return self._translate_chunks_nllb(chunks, src_lang, tgt_lang)
            except Exception as e:
                logger.warning(f"[Translator] NLLB failed ({e}), trying Google...")
        return self._translate_chunks_google(chunks, src_lang, tgt_lang)

    # ── Chunking ─────────────────────────────────────────────────────
    def _split_into_chunks(self, text: str, max_words: int):
        """Split text into sentence-aware chunks of at most ``max_words`` words.

        FIX: a single sentence longer than ``max_words`` (e.g. an
        unpunctuated transcript) is now hard-split by word count instead of
        being passed through whole, where the tokenizer's truncation would
        silently drop its tail.
        """
        import re
        # Split on whitespace that follows sentence-ending punctuation.
        sentences = re.split(r'(?<=[.!?])\s+', text.strip())
        chunks = []
        current = []
        count = 0
        for sentence in sentences:
            words = sentence.split()
            if len(words) > max_words:
                # Flush whatever is pending, then hard-split the oversized
                # sentence into max_words-sized pieces.
                if current:
                    chunks.append(" ".join(current))
                    current, count = [], 0
                for start in range(0, len(words), max_words):
                    chunks.append(" ".join(words[start:start + max_words]))
                continue
            if count + len(words) > max_words and current:
                chunks.append(" ".join(current))
                current, count = [], 0
            current.append(sentence)
            count += len(words)
        if current:
            chunks.append(" ".join(current))
        return chunks

    # ── NLLB chunked translation ──────────────────────────────────────
    def _translate_chunks_nllb(self, chunks, src_lang, tgt_lang):
        """Translate chunks with NLLB; a failed chunk keeps its original text."""
        t0 = time.time()
        results = []
        src_code = NLLB_CODES.get(src_lang, "eng_Latn")
        tgt_code = NLLB_CODES.get(tgt_lang, "tel_Telu")
        for i, chunk in enumerate(chunks):
            if not chunk.strip():
                continue
            try:
                if self._pipeline is not None:
                    result = self._pipeline(
                        chunk,
                        src_lang=src_code,
                        tgt_lang=tgt_code,
                        max_length=MAX_LENGTH,
                    )
                    results.append(result[0]["translation_text"])
                else:
                    import torch
                    # FIX: the NLLB tokenizer defaults to eng_Latn; the
                    # source language must be set explicitly or non-English
                    # input is encoded with the wrong language tag.
                    self._tokenizer.src_lang = src_code
                    inputs = self._tokenizer(
                        chunk,
                        return_tensors="pt",
                        padding=True,
                        truncation=True,
                        max_length=MAX_LENGTH,
                    )
                    if torch.cuda.is_available():
                        inputs = {k: v.cuda() for k, v in inputs.items()}
                    # Force generation to begin with the target-language token.
                    tgt_lang_id = self._tokenizer.convert_tokens_to_ids(tgt_code)
                    with torch.no_grad():
                        output_ids = self._model.generate(
                            **inputs,
                            forced_bos_token_id=tgt_lang_id,
                            max_length=MAX_LENGTH,
                            num_beams=4,
                            early_stopping=True,
                        )
                    translated = self._tokenizer.batch_decode(
                        output_ids, skip_special_tokens=True)[0]
                    results.append(translated)
            except Exception as e:
                logger.warning(f"[Translator] Chunk {i+1} failed: {e}")
                results.append(chunk)  # fallback: keep original
        translated = " ".join(results)
        elapsed = time.time() - t0
        logger.info(f"[Translator] NLLB done in {elapsed:.2f}s: {src_code}->{tgt_code}")
        print(f"[Translator] ✅ Done in {elapsed:.2f}s ({len(chunks)} chunks)")
        return translated, f"NLLB-200-distilled-1.3B ({len(chunks)} chunks)"

    # ── Google chunked translation ────────────────────────────────────
    def _translate_chunks_google(self, chunks, src_lang, tgt_lang):
        """Fallback: translate chunks via deep-translator's Google backend.

        Returns ``(text, engine)`` on success; on failure returns an error
        marker string instead of raising, so callers always get a tuple.
        """
        t0 = time.time()
        try:
            from deep_translator import GoogleTranslator
            results = []
            for chunk in chunks:
                if not chunk.strip():
                    continue
                translated = GoogleTranslator(
                    source=src_lang if src_lang != "auto" else "auto",
                    target=tgt_lang,
                ).translate(chunk)
                results.append(translated)
            full = " ".join(results)
            logger.info(f"[Translator] Google done in {time.time()-t0:.2f}s")
            return full, f"Google Translate ({len(chunks)} chunks)"
        except Exception as e:
            logger.error(f"[Translator] Google fallback failed: {e}")
            return f"[Translation failed: {str(e)}]", "error"

    # ── NLLB init ────────────────────────────────────────────────────
    def _init_nllb(self):
        """Try the HF translation pipeline first; fall back to manual load."""
        try:
            from transformers import pipeline as hf_pipeline
            self._pipeline = hf_pipeline(
                "translation",
                model=MODEL_ID,
                device_map="auto",
                max_length=MAX_LENGTH,
            )
            print(f"[Translator] ✅ {MODEL_ID} loaded")
        except Exception as e:
            logger.warning(f"[Translator] Pipeline init failed: {e}, trying manual...")
            self._init_nllb_manual()

    def _init_nllb_manual(self):
        """Load tokenizer + model directly (fp16 on GPU, fp32 on CPU)."""
        try:
            from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
            import torch
            self._tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
            self._model = AutoModelForSeq2SeqLM.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            )
            if torch.cuda.is_available():
                self._model = self._model.cuda()
            self._model.eval()
            print(f"[Translator] ✅ {MODEL_ID} loaded manually")
        except Exception as e:
            logger.error(f"[Translator] NLLB manual load failed: {e}")
            # FIX: also clear the tokenizer so translate() does not see a
            # half-initialized NLLB and skip the Google fallback.
            self._model = None
            self._tokenizer = None