"""
LocaleNLP Translation Service
============================
A multi-language translation application supporting English, Wolof, Hausa, and Darija.
Features text, audio, and document translation, with automatic chaining through English
for language pairs that lack a dedicated model.
Author: LocaleNLP
"""
import os
import re
import logging
import tempfile
from typing import Optional, Dict, Tuple, Any, Union
from pathlib import Path
from dataclasses import dataclass
from enum import Enum
import gradio as gr
import torch
import whisper
import fitz # PyMuPDF
import docx
from bs4 import BeautifulSoup
from markdown import markdown
import chardet
from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM
from huggingface_hub import login
# ================================
# Configuration & Constants
# ================================
class Language(str, Enum):
"""Supported languages for translation."""
ENGLISH = "English"
WOLOF = "Wolof"
HAUSA = "Hausa"
DARIJA = "Darija"
class InputMode(str, Enum):
"""Supported input modes."""
TEXT = "Text"
AUDIO = "Audio"
FILE = "File"
@dataclass
class ModelConfig:
"""Configuration for translation models."""
model_name: str
language_tag: str
# Language pair configurations
TRANSLATION_MODELS: Dict[Tuple[Language, Language], ModelConfig] = {
(Language.ENGLISH, Language.WOLOF): ModelConfig(
"LocaleNLP/localenlp-eng-wol-0.03", ">>wol<<"
),
(Language.WOLOF, Language.ENGLISH): ModelConfig(
"LocaleNLP/localenlp-wol-eng-0.03", ">>eng<<"
),
(Language.ENGLISH, Language.HAUSA): ModelConfig(
"LocaleNLP/localenlp-eng-hau-0.01", ">>hau<<"
),
(Language.HAUSA, Language.ENGLISH): ModelConfig(
"LocaleNLP/localenlp-hau-eng-0.01", ">>eng<<"
),
(Language.ENGLISH, Language.DARIJA): ModelConfig(
"LocaleNLP/english_darija", ">>dar<<"
)
}
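# Illustration (comments only; values taken from the table above):
#   TRANSLATION_MODELS[(Language.ENGLISH, Language.WOLOF)].model_name
#       -> "LocaleNLP/localenlp-eng-wol-0.03"
#   TRANSLATION_MODELS[(Language.ENGLISH, Language.WOLOF)].language_tag
#       -> ">>wol<<"
# Pairs with no entry here (e.g. Hausa -> Darija) are handled by chaining through
# English in TranslationService.translate.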
# File type support
SUPPORTED_FILE_TYPES = [
".pdf", ".docx", ".html", ".htm", ".md",
".srt", ".txt", ".text"
]
# Audio file extensions
AUDIO_EXTENSIONS = [".wav", ".mp3", ".m4a"]
# ================================
# Logging Configuration
# ================================
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# ================================
# Model Management
# ================================
class ModelManager:
"""Centralized model management for translation and transcription."""
def __init__(self):
self._translation_pipeline = None
self._whisper_model = None
self._current_model_name = None
def get_translation_pipeline(
self,
source_lang: Language,
target_lang: Language
) -> Tuple[Any, str]:
"""
Load and return translation pipeline for given language pair.
Args:
source_lang: Source language
target_lang: Target language
Returns:
Tuple of (pipeline, language_tag)
Raises:
ValueError: If language pair is not supported
"""
key = (source_lang, target_lang)
if key not in TRANSLATION_MODELS:
raise ValueError(f"Unsupported translation pair: {source_lang} -> {target_lang}")
config = TRANSLATION_MODELS[key]
# Load model if not loaded or different model needed
if (self._translation_pipeline is None or
self._current_model_name != config.model_name):
logger.info(f"Loading translation model: {config.model_name}")
# Authenticate with Hugging Face if token provided
if hf_token := os.getenv("hffff"):
login(token=hf_token)
model = AutoModelForSeq2SeqLM.from_pretrained(
config.model_name,
token=hf_token
).to(self._get_device())
tokenizer = MarianTokenizer.from_pretrained(
config.model_name,
token=hf_token
)
self._translation_pipeline = pipeline(
"translation",
model=model,
tokenizer=tokenizer,
device=0 if self._get_device().type == "cuda" else -1
)
self._current_model_name = config.model_name
return self._translation_pipeline, config.language_tag
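    # Usage sketch (comments only, since running it downloads the real model;
    # `manager = ModelManager()` and the sample sentence are hypothetical):
    #   pipe, tag = manager.get_translation_pipeline(Language.ENGLISH, Language.HAUSA)
    #   out = pipe(f"{tag} Good morning")[0]["translation_text"]
    # Prepending the ">>xx<<" tag follows the Marian multilingual convention; adapt
    # this if the calling code applies the tag differently.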
def get_whisper_model(self) -> Any:
"""
Load and return Whisper transcription model.
Returns:
Whisper model instance
"""
if self._whisper_model is None:
logger.info("Loading Whisper base model...")
self._whisper_model = whisper.load_model("base")
return self._whisper_model
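    # Usage sketch (comments only; "sample.wav" is a hypothetical file):
    #   transcript = manager.get_whisper_model().transcribe("sample.wav")["text"]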
def _get_device(self) -> torch.device:
"""Get appropriate device for model execution."""
return torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ================================
# Content Processing
# ================================
class ContentProcessor:
"""Handles extraction and processing of content from various sources."""
@staticmethod
def extract_text_from_file(file_path: Union[str, Path]) -> str:
"""
Extract text content from various file formats.
Args:
file_path: Path to the file
Returns:
Extracted text content
Raises:
ValueError: If file type is unsupported
Exception: If file processing fails
"""
file_path = Path(file_path)
extension = file_path.suffix.lower()
try:
content = file_path.read_bytes()
if extension == ".pdf":
return ContentProcessor._extract_pdf_text(content)
elif extension == ".docx":
return ContentProcessor._extract_docx_text(file_path)
elif extension in (".html", ".htm"):
return ContentProcessor._extract_html_text(content)
elif extension == ".md":
return ContentProcessor._extract_markdown_text(content)
elif extension == ".srt":
return ContentProcessor._extract_srt_text(content)
elif extension in (".txt", ".text"):
return ContentProcessor._extract_plain_text(content)
else:
raise ValueError(f"Unsupported file type: {extension}")
except Exception as e:
logger.error(f"Failed to extract text from {file_path}: {e}")
raise
@staticmethod
def _extract_pdf_text(content: bytes) -> str:
"""Extract text from PDF file."""
with fitz.open(stream=content, filetype="pdf") as doc:
return "\n".join(page.get_text() for page in doc)
@staticmethod
def _extract_docx_text(file_path: Path) -> str:
"""Extract text from DOCX file."""
doc = docx.Document(str(file_path))
return "\n".join(paragraph.text for paragraph in doc.paragraphs)
@staticmethod
def _extract_html_text(content: bytes) -> str:
"""Extract text from HTML file."""
encoding = chardet.detect(content)["encoding"] or "utf-8"
text = content.decode(encoding, errors="ignore")
soup = BeautifulSoup(text, "html.parser")
return soup.get_text()
@staticmethod
def _extract_markdown_text(content: bytes) -> str:
"""Extract text from Markdown file."""
encoding = chardet.detect(content)["encoding"] or "utf-8"
text = content.decode(encoding, errors="ignore")
html = markdown(text)
soup = BeautifulSoup(html, "html.parser")
return soup.get_text()
@staticmethod
def _extract_srt_text(content: bytes) -> str:
"""Extract text from SRT subtitle file."""
encoding = chardet.detect(content)["encoding"] or "utf-8"
text = content.decode(encoding, errors="ignore")
        # Remove cue indices and timestamp lines
return re.sub(r"\d+\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\n", "", text)
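    # Illustration (hypothetical SRT fragment): the substitution strips the cue
    # index and timestamp line, keeping only the subtitle text:
    #   "1\n00:00:01,000 --> 00:00:03,500\nHello there\n"  ->  "Hello there\n"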
@staticmethod
def _extract_plain_text(content: bytes) -> str:
"""Extract text from plain text file."""
encoding = chardet.detect(content)["encoding"] or "utf-8"
return content.decode(encoding, errors="ignore")
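# Usage sketch ("report.pdf" is a hypothetical path):
#   text = ContentProcessor.extract_text_from_file("report.pdf")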
# ================================
# Translation Service
# ================================
class TranslationService:
"""Core translation service with advanced processing capabilities."""
def __init__(self, model_manager: ModelManager):
self.model_manager = model_manager
def translate(
self,
text: str,
source_lang: Language,
target_lang: Language
) -> str:
"""
Translate text from source to target language with automatic chaining.
Args:
text: Input text to translate
source_lang: Source language
target_lang: Target language
Returns:
Translated text
"""
if not text.strip():
return "No input text to translate."
# Direct translation if model exists
if (source_lang, target_lang) in TRANSLATION_MODELS:
return self._direct_translate(text, source_lang, target_lang)
# Automatic chaining through English
return self._chained_translate(text, source_lang, target_lang)
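    # Example routing (sketch): (Language.WOLOF, Language.HAUSA) has no entry in
    # TRANSLATION_MODELS, so translate() chains Wolof -> English
    # ("LocaleNLP/localenlp-wol-eng-0.03") and then English -> Hausa
    # ("LocaleNLP/localenlp-eng-hau-0.01") via _chained_translate.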
def _direct_translate(
self,
text: str,
source_lang: Language,
target_lang: Language
) -> str:
"""Perform direct translation using available model."""
pipeline_obj, lang_tag = self.model_manager.get_translation_pipeline(
source_lang, target_lang
)