"""
LocaleNLP Translation Service
============================
A multi-language translation application supporting English, Wolof, Hausa, and Darija.
Features text, audio, and document translation, with automatic chaining through English
for language pairs that lack a dedicated model.
Author: LocaleNLP
"""
import os
import re
import logging
import tempfile
from typing import Optional, Dict, Tuple, Any, Union
from pathlib import Path
from dataclasses import dataclass
from enum import Enum
import gradio as gr
import torch
import whisper
import fitz # PyMuPDF
import docx
from bs4 import BeautifulSoup
from markdown import markdown
import chardet
from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM
from huggingface_hub import login
# ================================
# Configuration & Constants
# ================================
class Language(str, Enum):
"""Supported languages for translation."""
ENGLISH = "English"
WOLOF = "Wolof"
HAUSA = "Hausa"
DARIJA = "Darija"
class InputMode(str, Enum):
"""Supported input modes."""
TEXT = "Text"
AUDIO = "Audio"
FILE = "File"
@dataclass
class ModelConfig:
"""Configuration for translation models."""
model_name: str
language_tag: str
# Language pair configurations
TRANSLATION_MODELS: Dict[Tuple[Language, Language], ModelConfig] = {
(Language.ENGLISH, Language.WOLOF): ModelConfig(
"LocaleNLP/localenlp-eng-wol-0.03", ">>wol<<"
),
(Language.WOLOF, Language.ENGLISH): ModelConfig(
"LocaleNLP/localenlp-wol-eng-0.03", ">>eng<<"
),
(Language.ENGLISH, Language.HAUSA): ModelConfig(
"LocaleNLP/localenlp-eng-hau-0.01", ">>hau<<"
),
(Language.HAUSA, Language.ENGLISH): ModelConfig(
"LocaleNLP/localenlp-hau-eng-0.01", ">>eng<<"
),
(Language.ENGLISH, Language.DARIJA): ModelConfig(
"LocaleNLP/english_darija", ">>dar<<"
)
}
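# Illustration (comments only; values taken from the table above):
#   TRANSLATION_MODELS[(Language.ENGLISH, Language.WOLOF)].model_name
#       -> "LocaleNLP/localenlp-eng-wol-0.03"
#   TRANSLATION_MODELS[(Language.ENGLISH, Language.WOLOF)].language_tag
#       -> ">>wol<<"
# Pairs with no entry here (e.g. Hausa -> Darija) are handled by chaining through
# English in TranslationService.translate.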
# File type support
SUPPORTED_FILE_TYPES = [
".pdf", ".docx", ".html", ".htm", ".md",
".srt", ".txt", ".text"
]
# Audio file extensions
AUDIO_EXTENSIONS = [".wav", ".mp3", ".m4a"]
# ================================
# Logging Configuration
# ================================
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# ================================
# Model Management
# ================================
class ModelManager:
"""Centralized model management for translation and transcription."""
def __init__(self):
self._translation_pipeline = None
self._whisper_model = None
self._current_model_name = None
def get_translation_pipeline(
self,
source_lang: Language,
target_lang: Language
) -> Tuple[Any, str]:
"""
Load and return translation pipeline for given language pair.
Args:
source_lang: Source language
target_lang: Target language
Returns:
Tuple of (pipeline, language_tag)
Raises:
ValueError: If language pair is not supported
"""
key = (source_lang, target_lang)
if key not in TRANSLATION_MODELS:
raise ValueError(f"Unsupported translation pair: {source_lang} -> {target_lang}")
config = TRANSLATION_MODELS[key]
# Load model if not loaded or different model needed
if (self._translation_pipeline is None or
self._current_model_name != config.model_name):
logger.info(f"Loading translation model: {config.model_name}")
# Authenticate with Hugging Face if token provided
if hf_token := os.getenv("hffff"):
login(token=hf_token)
model = AutoModelForSeq2SeqLM.from_pretrained(
config.model_name,
token=hf_token
).to(self._get_device())
tokenizer = MarianTokenizer.from_pretrained(
config.model_name,
token=hf_token
)
self._translation_pipeline = pipeline(
"translation",
model=model,
tokenizer=tokenizer,
device=0 if self._get_device().type == "cuda" else -1
)
self._current_model_name = config.model_name
return self._translation_pipeline, config.language_tag
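    # Usage sketch (comments only, since running it downloads the real model;
    # `manager = ModelManager()` and the sample sentence are hypothetical):
    #   pipe, tag = manager.get_translation_pipeline(Language.ENGLISH, Language.HAUSA)
    #   out = pipe(f"{tag} Good morning")[0]["translation_text"]
    # Prepending the ">>xx<<" tag follows the Marian multilingual convention; adapt
    # this if the calling code applies the tag differently.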
def get_whisper_model(self) -> Any:
"""
Load and return Whisper transcription model.
Returns:
Whisper model instance
"""
if self._whisper_model is None:
logger.info("Loading Whisper base model...")
self._whisper_model = whisper.load_model("base")
return self._whisper_model
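    # Usage sketch (comments only; "sample.wav" is a hypothetical file):
    #   transcript = manager.get_whisper_model().transcribe("sample.wav")["text"]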
def _get_device(self) -> torch.device:
"""Get appropriate device for model execution."""
return torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ================================
# Content Processing
# ================================
class ContentProcessor:
"""Handles extraction and processing of content from various sources."""
@staticmethod
def extract_text_from_file(file_path: Union[str, Path]) -> str:
"""
Extract text content from various file formats.
Args:
file_path: Path to the file
Returns:
Extracted text content
Raises:
ValueError: If file type is unsupported
Exception: If file processing fails
"""
file_path = Path(file_path)
extension = file_path.suffix.lower()
try:
content = file_path.read_bytes()
if extension == ".pdf":
return ContentProcessor._extract_pdf_text(content)
elif extension == ".docx":
return ContentProcessor._extract_docx_text(file_path)
elif extension in (".html", ".htm"):
return ContentProcessor._extract_html_text(content)
elif extension == ".md":
return ContentProcessor._extract_markdown_text(content)
elif extension == ".srt":
return ContentProcessor._extract_srt_text(content)
elif extension in (".txt", ".text"):
return ContentProcessor._extract_plain_text(content)
else:
raise ValueError(f"Unsupported file type: {extension}")
except Exception as e:
logger.error(f"Failed to extract text from {file_path}: {e}")
raise
@staticmethod
def _extract_pdf_text(content: bytes) -> str:
"""Extract text from PDF file."""
with fitz.open(stream=content, filetype="pdf") as doc:
return "\n".join(page.get_text() for page in doc)
@staticmethod
def _extract_docx_text(file_path: Path) -> str:
"""Extract text from DOCX file."""
doc = docx.Document(str(file_path))
return "\n".join(paragraph.text for paragraph in doc.paragraphs)
@staticmethod
def _extract_html_text(content: bytes) -> str:
"""Extract text from HTML file."""
encoding = chardet.detect(content)["encoding"] or "utf-8"
text = content.decode(encoding, errors="ignore")
soup = BeautifulSoup(text, "html.parser")
return soup.get_text()
@staticmethod
def _extract_markdown_text(content: bytes) -> str:
"""Extract text from Markdown file."""
encoding = chardet.detect(content)["encoding"] or "utf-8"
text = content.decode(encoding, errors="ignore")
html = markdown(text)
soup = BeautifulSoup(html, "html.parser")
return soup.get_text()
@staticmethod
def _extract_srt_text(content: bytes) -> str:
"""Extract text from SRT subtitle file."""
encoding = chardet.detect(content)["encoding"] or "utf-8"
text = content.decode(encoding, errors="ignore")
        # Remove cue indices and timestamp lines
return re.sub(r"\d+\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\n", "", text)
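    # Illustration (hypothetical SRT fragment): the substitution strips the cue
    # index and timestamp line, keeping only the subtitle text:
    #   "1\n00:00:01,000 --> 00:00:03,500\nHello there\n"  ->  "Hello there\n"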
@staticmethod
def _extract_plain_text(content: bytes) -> str:
"""Extract text from plain text file."""
encoding = chardet.detect(content)["encoding"] or "utf-8"
return content.decode(encoding, errors="ignore")
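# Usage sketch ("report.pdf" is a hypothetical path):
#   text = ContentProcessor.extract_text_from_file("report.pdf")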
# ================================
# Translation Service
# ================================
class TranslationService:
"""Core translation service with advanced processing capabilities."""
def __init__(self, model_manager: ModelManager):
self.model_manager = model_manager
def translate(
self,
text: str,
source_lang: Language,
target_lang: Language
) -> str:
"""
Translate text from source to target language with automatic chaining.
Args:
text: Input text to translate
source_lang: Source language
target_lang: Target language
Returns:
Translated text
"""
if not text.strip():
return "No input text to translate."
# Direct translation if model exists
if (source_lang, target_lang) in TRANSLATION_MODELS:
return self._direct_translate(text, source_lang, target_lang)
# Automatic chaining through English
return self._chained_translate(text, source_lang, target_lang)
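    # Example routing (sketch): (Language.WOLOF, Language.HAUSA) has no entry in
    # TRANSLATION_MODELS, so translate() chains Wolof -> English
    # ("LocaleNLP/localenlp-wol-eng-0.03") and then English -> Hausa
    # ("LocaleNLP/localenlp-eng-hau-0.01") via _chained_translate.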
def _direct_translate(
self,
text: str,
source_lang: Language,
target_lang: Language
) -> str:
"""Perform direct translation using available model."""
pipeline_obj, lang_tag = self.model_manager.get_translation_pipeline(
source_lang, target_lang
)