Spaces:
Running
Running
"""
LocaleNLP Translation Service
=============================

A multi-language translation application supporting English, Wolof, Hausa, and Darija.
Features text, audio, and document translation with automatic chaining for all language pairs.

Author: LocaleNLP
"""
| import os | |
| import re | |
| import logging | |
| import tempfile | |
| from typing import Optional, Dict, Tuple, Any, Union | |
| from pathlib import Path | |
| from dataclasses import dataclass | |
| from enum import Enum | |
| import gradio as gr | |
| import torch | |
| import whisper | |
| import fitz # PyMuPDF | |
| import docx | |
| from bs4 import BeautifulSoup | |
| from markdown import markdown | |
| import chardet | |
| from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM | |
| from huggingface_hub import login | |
| # ================================ | |
| # Configuration & Constants | |
| # ================================ | |
class Language(str, Enum):
    """Languages this service can translate between.

    Mixing in ``str`` makes each member compare equal to its plain label,
    so UI widgets can hand back the raw string and it still matches.
    """

    ENGLISH = "English"
    WOLOF = "Wolof"
    HAUSA = "Hausa"
    DARIJA = "Darija"
class InputMode(str, Enum):
    """The ways a user can supply content for translation.

    ``str`` mix-in keeps members interchangeable with their UI labels.
    """

    TEXT = "Text"
    AUDIO = "Audio"
    FILE = "File"
@dataclass
class ModelConfig:
    """Configuration for a single translation model.

    The ``@dataclass`` decorator is required: the class only declares field
    annotations, and TRANSLATION_MODELS instantiates it positionally
    (``ModelConfig("repo-id", ">>tag<<")``) — without the decorator no such
    ``__init__`` exists and construction raises TypeError at import time.

    Attributes:
        model_name: Hugging Face Hub repository id of the translation model.
        language_tag: Marian-style target-language prefix token, e.g. ">>wol<<".
    """

    model_name: str
    language_tag: str
# (source, target) language pairs mapped to their translation model configs.
# Pairs without a direct model here are presumably served by chaining through
# English (see TranslationService) — note there is no Darija -> English entry.
TRANSLATION_MODELS: Dict[Tuple[Language, Language], ModelConfig] = {
    (src, dst): ModelConfig(repo_id, tag)
    for src, dst, repo_id, tag in (
        (Language.ENGLISH, Language.WOLOF, "LocaleNLP/localenlp-eng-wol-0.03", ">>wol<<"),
        (Language.WOLOF, Language.ENGLISH, "LocaleNLP/localenlp-wol-eng-0.03", ">>eng<<"),
        (Language.ENGLISH, Language.HAUSA, "LocaleNLP/localenlp-eng-hau-0.01", ">>hau<<"),
        (Language.HAUSA, Language.ENGLISH, "LocaleNLP/localenlp-hau-eng-0.01", ">>eng<<"),
        (Language.ENGLISH, Language.DARIJA, "LocaleNLP/english_darija", ">>dar<<"),
    )
}

# Document formats the file-translation path knows how to parse.
SUPPORTED_FILE_TYPES = [
    ".pdf", ".docx", ".html", ".htm", ".md",
    ".srt", ".txt", ".text",
]

# Audio container formats accepted for transcription.
AUDIO_EXTENSIONS = [".wav", ".mp3", ".m4a"]
| # ================================ | |
| # Logging Configuration | |
| # ================================ | |
# One-time root-logger configuration at import time; individual modules
# obtain their own named logger below.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

logger = logging.getLogger(__name__)
| # ================================ | |
| # Model Management | |
| # ================================ | |
class ModelManager:
    """Centralized model management for translation and transcription.

    Lazily loads models on first use and keeps at most one translation
    pipeline resident at a time, swapping it out when a different language
    pair requires a different checkpoint.
    """

    def __init__(self) -> None:
        # Caches start empty; nothing heavyweight is loaded at construction.
        self._translation_pipeline = None  # current HF "translation" pipeline
        self._whisper_model = None         # Whisper ASR model, loaded on demand
        self._current_model_name = None    # Hub id backing the cached pipeline

    def get_translation_pipeline(
        self,
        source_lang: Language,
        target_lang: Language
    ) -> Tuple[Any, str]:
        """
        Load and return translation pipeline for given language pair.

        Args:
            source_lang: Source language
            target_lang: Target language

        Returns:
            Tuple of (pipeline, language_tag)

        Raises:
            ValueError: If language pair is not supported
        """
        key = (source_lang, target_lang)
        if key not in TRANSLATION_MODELS:
            raise ValueError(f"Unsupported translation pair: {source_lang} -> {target_lang}")
        config = TRANSLATION_MODELS[key]
        # Reload only when nothing is cached or the cached pipeline was built
        # from a different checkpoint than this pair requires.
        if (self._translation_pipeline is None or
            self._current_model_name != config.model_name):
            logger.info(f"Loading translation model: {config.model_name}")
            # Authenticate with Hugging Face if token provided.
            # NOTE(review): env var name "hffff" looks like a placeholder —
            # confirm the deployment actually sets this secret. The walrus
            # binds hf_token even when the var is unset (None), in which case
            # login() is skipped and token=None is passed below.
            if hf_token := os.getenv("hffff"):
                login(token=hf_token)
            model = AutoModelForSeq2SeqLM.from_pretrained(
                config.model_name,
                token=hf_token
            ).to(self._get_device())
            tokenizer = MarianTokenizer.from_pretrained(
                config.model_name,
                token=hf_token
            )
            self._translation_pipeline = pipeline(
                "translation",
                model=model,
                tokenizer=tokenizer,
                # pipeline() takes a device index: 0 = first CUDA device, -1 = CPU
                device=0 if self._get_device().type == "cuda" else -1
            )
            self._current_model_name = config.model_name
        return self._translation_pipeline, config.language_tag

    def get_whisper_model(self) -> Any:
        """
        Load and return Whisper transcription model.

        Returns:
            Whisper model instance (loaded once, then cached)
        """
        if self._whisper_model is None:
            logger.info("Loading Whisper base model...")
            self._whisper_model = whisper.load_model("base")
        return self._whisper_model

    def _get_device(self) -> torch.device:
        """Get appropriate device for model execution (CUDA if available, else CPU)."""
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")
| # ================================ | |
| # Content Processing | |
| # ================================ | |
class ContentProcessor:
    """Handles extraction and processing of content from various sources.

    All extraction methods are stateless, so they are declared as
    ``@staticmethod``. The original definitions omitted both ``self`` and the
    decorator: calls through the class object happened to work, but any call
    on an instance (``ContentProcessor().extract_text_from_file(p)``) bound
    the path argument to the implicit self slot and failed.
    """

    @staticmethod
    def extract_text_from_file(file_path: Union[str, Path]) -> str:
        """
        Extract text content from various file formats.

        Args:
            file_path: Path to the file

        Returns:
            Extracted text content

        Raises:
            ValueError: If file type is unsupported
            Exception: If file processing fails
        """
        file_path = Path(file_path)
        extension = file_path.suffix.lower()
        try:
            content = file_path.read_bytes()
            if extension == ".pdf":
                return ContentProcessor._extract_pdf_text(content)
            elif extension == ".docx":
                return ContentProcessor._extract_docx_text(file_path)
            elif extension in (".html", ".htm"):
                return ContentProcessor._extract_html_text(content)
            elif extension == ".md":
                return ContentProcessor._extract_markdown_text(content)
            elif extension == ".srt":
                return ContentProcessor._extract_srt_text(content)
            elif extension in (".txt", ".text"):
                return ContentProcessor._extract_plain_text(content)
            else:
                raise ValueError(f"Unsupported file type: {extension}")
        except Exception as e:
            # Log with context, then re-raise so callers can surface the error.
            logger.error(f"Failed to extract text from {file_path}: {e}")
            raise

    @staticmethod
    def _decode_bytes(content: bytes) -> str:
        """Decode raw bytes using chardet-detected encoding, defaulting to UTF-8."""
        encoding = chardet.detect(content)["encoding"] or "utf-8"
        return content.decode(encoding, errors="ignore")

    @staticmethod
    def _extract_pdf_text(content: bytes) -> str:
        """Extract text from PDF file (one string per page, newline-joined)."""
        with fitz.open(stream=content, filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)

    @staticmethod
    def _extract_docx_text(file_path: Path) -> str:
        """Extract text from DOCX file (paragraph texts, newline-joined)."""
        doc = docx.Document(str(file_path))
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)

    @staticmethod
    def _extract_html_text(content: bytes) -> str:
        """Extract visible text from HTML file."""
        text = ContentProcessor._decode_bytes(content)
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()

    @staticmethod
    def _extract_markdown_text(content: bytes) -> str:
        """Extract plain text from Markdown by rendering to HTML, then stripping tags."""
        text = ContentProcessor._decode_bytes(content)
        html = markdown(text)
        soup = BeautifulSoup(html, "html.parser")
        return soup.get_text()

    @staticmethod
    def _extract_srt_text(content: bytes) -> str:
        """Extract text from SRT subtitle file."""
        text = ContentProcessor._decode_bytes(content)
        # Strip cue numbers and "HH:MM:SS,mmm --> ..." timestamp lines,
        # keeping only the subtitle text itself.
        return re.sub(r"\d+\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\n", "", text)

    @staticmethod
    def _extract_plain_text(content: bytes) -> str:
        """Extract text from plain text file."""
        return ContentProcessor._decode_bytes(content)
| # ================================ | |
| # Translation Service | |
| # ================================ | |
| class TranslationService: | |
| """Core translation service with advanced processing capabilities.""" | |
    def __init__(self, model_manager: ModelManager):
        """Create a translation service backed by the given (shared) model manager."""
        self.model_manager = model_manager
| def translate( | |
| self, | |
| text: str, | |
| source_lang: Language, | |
| target_lang: Language | |
| ) -> str: | |
| """ | |
| Translate text from source to target language with automatic chaining. | |
| Args: | |
| text: Input text to translate | |
| source_lang: Source language | |
| target_lang: Target language | |
| Returns: | |
| Translated text | |
| """ | |
| if not text.strip(): | |
| return "No input text to translate." | |
| # Direct translation if model exists | |
| if (source_lang, target_lang) in TRANSLATION_MODELS: | |
| return self._direct_translate(text, source_lang, target_lang) | |
| # Automatic chaining through English | |
| return self._chained_translate(text, source_lang, target_lang) | |
| def _direct_translate( | |
| self, | |
| text: str, | |
| source_lang: Language, | |
| target_lang: Language | |
| ) -> str: | |
| """Perform direct translation using available model.""" | |
| pipeline_obj, lang_tag = self.model_manager.get_translation_pipeline( | |
| source_lang, target_lang | |
| ) | |