Spaces:

webmuppetnz
/

hmc-rag

Running

hmc-rag / src /language.py

webmuppet

Initial commit — health marketing compliance RAG

bad8b6c 5 days ago

1.95 kB

	"""
	Language detection and translation layer.
	Uses langdetect for detection and, for translation, the model configured as
	src.config.MODEL (Qwen 2.5 via Ollama), called through litellm.
	"""

	import litellm
	from langdetect import detect, LangDetectException
	from src.config import MODEL
	from src.usage import _extract_usage, _empty_usage

	# Language codes this layer explicitly supports (per the project brief),
	# mapped to the human-readable names that get_language_name() hands to the
	# translation prompt.
	# NOTE(review): confirm langdetect can actually emit "mi" and "sm"; its
	# documented language list may not cover te reo Māori or Samoan.
	SUPPORTED_LANGUAGES = {
	    "en": "English",
	    "mi": "te reo Māori",
	    "tl": "Filipino",
	    "hi": "Hindi",
	    "sm": "Samoan",
	    "zh-cn": "Mandarin Chinese",
	    "zh-tw": "Cantonese/Traditional Chinese",
	}


	def detect_language(text: str) -> str:
	    """Guess which language ``text`` is written in.

	    Delegates to ``langdetect.detect`` and passes its answer through
	    unchanged, so the result is a langdetect code such as 'en', 'tl',
	    'hi' or 'zh-cn'.

	    Returns 'en' when langdetect raises ``LangDetectException``.

	    NOTE(review): langdetect documents its results as non-deterministic
	    unless ``DetectorFactory.seed`` is set — confirm whether stable
	    results matter here.
	    """
	    try:
	        detected_code = detect(text)
	    except LangDetectException:
	        # Detection failed; treat the input as English.
	        return "en"
	    else:
	        # No remapping: e.g. Chinese comes back as 'zh-cn', Filipino as 'tl'.
	        return detected_code


	def get_language_name(lang_code: str) -> str:
	    """Return the display name for ``lang_code``.

	    Codes missing from ``SUPPORTED_LANGUAGES`` are returned as-is, so the
	    caller always gets something printable.
	    """
	    if lang_code in SUPPORTED_LANGUAGES:
	        return SUPPORTED_LANGUAGES[lang_code]
	    return lang_code


	def translate_to_english(text: str, source_lang: str) -> tuple:
	    """Translate a query from the source language to English.

	    Args:
	        text: The query text to translate.
	        source_lang: Language code of ``text`` (e.g. 'en', 'tl', 'zh-cn').
	            It is turned into a display name with ``get_language_name``
	            before being placed in the prompt.

	    Returns:
	        A ``(translation, usage)`` tuple — not a bare string.
	        ``translation`` is the model's English text, or the original
	        ``text`` unchanged when ``source_lang`` is 'en', when the model
	        call raises, or when the model replies with nothing but
	        whitespace. ``usage`` is the value of ``_extract_usage(response)``
	        when a completion was obtained, otherwise ``_empty_usage()``.
	    """
	    if source_lang == "en":
	        # Nothing to translate, and no model call means no usage to report.
	        return text, _empty_usage()

	    lang_name = get_language_name(source_lang)

	    # Assembled line by line so the prompt never picks up source-file
	    # indentation the way an indented triple-quoted literal would.
	    prompt = (
	        f"Translate the following text from {lang_name} to English.\n"
	        "Return ONLY the English translation, nothing else.\n"
	        "\n"
	        f"Text: {text}"
	    )

	    try:
	        response = litellm.completion(
	            model=MODEL,
	            messages=[{"role": "user", "content": prompt}],
	            temperature=0,
	            max_tokens=500,
	        )
	        usage = _extract_usage(response)
	        translation = (response.choices[0].message.content or "").strip()
	    except Exception as e:
	        # Deliberate best-effort boundary: any failure in the model call or
	        # in reading its response degrades to the untranslated query
	        # instead of propagating to the caller.
	        print(f"Translation error: {e}")
	        return text, _empty_usage()  # Fall back to original

	    if not translation:
	        # An empty reply is no translation at all; keep the original text
	        # but still report the usage of the completion that was made.
	        return text, usage
	    return translation, usage