agroadvisor-bd / src /language.py
Borhan72's picture
Initial commit
84a9a2a
Raw
History Blame Contribute Delete
1.75 kB
import re
from langdetect import detect, LangDetectException
# Common romanized (Latin-script) Bengali words. detect_language() matches
# these against the lower-cased word tokens of a query to spot transliterated
# Bengali, so every entry must be lowercase.
# Kept in alphabetical order, one per line, so duplicates are easy to spot.
ROMANIZED_BENGALI = {
    'achen',
    'acho',
    'alu',
    'amar',
    'ami',
    'amn',
    'apnar',
    'apni',
    'begun',
    'bishi',
    'bolo',
    'bolun',
    'boro',
    'chash',
    'dhan',
    'dhaner',
    'ektu',
    'foshol',
    'jano',
    'janon',
    'ke',
    'kemon',
    'keno',
    'kharap',
    'kharip',
    'khub',
    'ki',
    'kivabe',
    'korbo',
    'kore',
    'korte',
    'kothai',
    'kothake',
    'kothay',
    'krishi',
    'onek',
    'poka',
    'rabi',
    'rog',
    'sar',
    'shech',
    'shobji',
    'sundor',
    'tmi',
    'tomar',
    'tomr',
    'tumi',
    'valo',
    'vhalo',
}
def detect_language(text: str) -> str:
    """Classify *text* as Bengali (``'bn'``) or English (``'en'``).

    Three checks run in order; the first one that matches decides:

    1. Bengali script: more than 15% of the characters fall in the
       Unicode range U+0980..U+09FF.
    2. Romanized Bengali: the text has at most six distinct word tokens
       and at least one of them is listed in ``ROMANIZED_BENGALI``.
    3. ``langdetect``: its guess is kept only when it is ``'bn'``;
       a guess of ``'hi'``, ``'ur'``, ``'ne'`` or ``'si'`` is treated as
       Bengali when the text contains any Bengali-script character.

    Everything else, including a ``LangDetectException`` from the
    detector, yields ``'en'``.
    """
    # Step 1: share of Bengali-script characters (strongest signal).
    bengali_count = sum('\u0980' <= ch <= '\u09FF' for ch in text)
    if bengali_count > len(text) * 0.15:
        return 'bn'

    # Step 2: short query containing a known transliterated Bengali word.
    tokens = set(re.findall(r'\b\w+\b', text.lower()))
    if len(tokens) <= 6 and not tokens.isdisjoint(ROMANIZED_BENGALI):
        return 'bn'

    # Step 3: fall back to the statistical detector.
    try:
        guess = detect(text)
    except LangDetectException:
        return 'en'

    if guess == 'bn':
        return 'bn'
    # A South Asian guess on text that contains some Bengali script is
    # treated as a misdetection of Bengali.
    if bengali_count > 0 and guess in ('hi', 'ur', 'ne', 'si'):
        return 'bn'
    return 'en'
def get_language_instruction(lang_code: str) -> str:
    """Return the response-language instruction for *lang_code*.

    ``'bn'`` selects the Bengali instruction; any other value selects the
    English one.
    """
    bengali_instruction = "Respond entirely in Bengali (বাংলায় উত্তর দিন). Use clear, simple Bengali."
    english_instruction = "Respond entirely in English."
    return bengali_instruction if lang_code == 'bn' else english_instruction