hmc-rag / src /language.py
webmuppet
Initial commit — health marketing compliance RAG
bad8b6c
"""
Language detection and translation layer.
Uses langdetect for detection and Qwen 2.5 via Ollama (called through
litellm with the configured MODEL) for translation.
"""
import litellm
from langdetect import detect, LangDetectException
from src.config import MODEL
from src.usage import _extract_usage, _empty_usage
# Languages the project brief asks us to support explicitly.
# Maps a language code (the form detect_language() returns, e.g. 'zh-cn')
# to the human-readable name used when prompting the translation model.
SUPPORTED_LANGUAGES = {
    "en": "English",
    "mi": "te reo Māori",
    "tl": "Filipino",
    "hi": "Hindi",
    "sm": "Samoan",
    "zh-cn": "Mandarin Chinese",
    "zh-tw": "Cantonese/Traditional Chinese",
}
def detect_language(text: str) -> str:
    """Guess which language ``text`` is written in.

    The result of ``langdetect.detect`` is passed through unchanged, so
    callers receive codes such as 'en', 'tl', 'hi' or 'zh-cn'.
    If langdetect raises ``LangDetectException`` the function reports
    'en' instead of propagating the error.
    """
    try:
        return detect(text)
    except LangDetectException:
        # Detection failed: treat the text as English.
        return "en"
def get_language_name(lang_code: str) -> str:
    """Return the display name for ``lang_code``.

    Codes missing from ``SUPPORTED_LANGUAGES`` are returned as-is, so the
    caller always gets a string it can show or put in a prompt.
    """
    if lang_code in SUPPORTED_LANGUAGES:
        return SUPPORTED_LANGUAGES[lang_code]
    return lang_code
def translate_to_english(text: str, source_lang: str) -> tuple:
    """Translate a query from the source language to English.

    ``source_lang`` is a language code such as 'tl' or 'zh-cn'; it is
    turned into a display name with ``get_language_name`` before being
    placed in the prompt.

    Returns a ``(text, usage)`` tuple, not a bare string:

    * ``source_lang == "en"``: the original text and ``_empty_usage()``;
      no model call is made.
    * successful call: the stripped model reply and the usage extracted
      from the response by ``_extract_usage``.
    * empty model reply: the original text (an empty string is not a
      usable translation) together with the usage of the call that was
      actually made.
    * any exception during the call: the error is printed and the
      original text is returned with ``_empty_usage()``.

    The function is best-effort and does not raise on model failures.
    """
    if source_lang == "en":
        return text, _empty_usage()

    lang_name = get_language_name(source_lang)
    # Built with explicit newlines so the prompt content does not depend on
    # how this function happens to be indented.
    prompt = (
        f"Translate the following text from {lang_name} to English.\n"
        "Return ONLY the English translation, nothing else.\n"
        f"Text: {text}"
    )

    try:
        response = litellm.completion(
            model=MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=500,
        )
        usage = _extract_usage(response)
        # message.content may be None; normalise to a stripped string.
        translated = (response.choices[0].message.content or "").strip()
    except Exception as e:
        # Deliberately broad: translation must never take the caller down.
        print(f"Translation error: {e}")
        return text, _empty_usage()  # Fall back to original

    # An empty reply would silently erase the query; keep the original text
    # instead, but still report the usage of the call.
    return (translated or text), usage