Spaces:
Running
Running
| import re | |
| from langdetect import detect, LangDetectException | |
# Common romanized (Latin-script) Bengali words, used to spot transliterated
# Bengali in short queries. Membership is tested against lower-cased tokens,
# so every entry must be lower-case.
# NOTE(review): 'begun' is also an ordinary English word, so a short English
# query containing it will be classified as Bengali by detect_language —
# confirm this trade-off is intended.
ROMANIZED_BENGALI = {
    'tumi', 'tmi', 'ami', 'amar', 'apnar', 'apni', 'tomar', 'tomr',
    'ki', 'ke', 'keno', 'kothay', 'boro', 'chash', 'dhan', 'poka',
    'rog', 'foshol', 'krishi', 'kemon', 'acho', 'achen', 'bolo',
    'bolun', 'jano', 'janon', 'dhaner', 'shobji', 'alu', 'begun',
    'amn', 'rabi', 'kharip', 'shech', 'sar', 'bishi',
    'kharap', 'valo', 'vhalo', 'sundor', 'khub', 'onek', 'ektu',
    'kothake', 'kothai', 'kivabe', 'kore', 'korbo', 'korte',
}
def detect_language(text: str) -> str:
    """Classify *text* as Bengali (``'bn'``) or English (``'en'``).

    Checks run from the strongest signal to the weakest:

    1. Share of characters in the Bengali Unicode block (U+0980-U+09FF).
    2. A romanized (transliterated) Bengali keyword in a short query.
    3. ``langdetect`` as the general fallback.

    Args:
        text: User input to classify. ``None``, empty and whitespace-only
            input are treated as English.

    Returns:
        ``'bn'`` or ``'en'``. Any other detected language collapses to
        ``'en'``.
    """
    # Nothing to analyse: answer directly instead of raising TypeError on
    # None or relying on langdetect to raise for feature-less input.
    if not text or not text.strip():
        return 'en'

    # 1. Bengali script is the strongest signal. The ratio is taken over
    #    every character (spaces and punctuation included), so mixed
    #    Bengali/English input still qualifies above 15 %.
    bengali_chars = sum(1 for c in text if '\u0980' <= c <= '\u09FF')
    if bengali_chars > len(text) * 0.15:
        return 'bn'

    # 2. Romanized Bengali: one known keyword is enough, but only when the
    #    input has at most six distinct words, to limit false positives on
    #    longer text.
    words = set(re.findall(r'\b\w+\b', text.lower()))
    if len(words) <= 6 and not words.isdisjoint(ROMANIZED_BENGALI):
        return 'bn'  # short romanized Bengali query

    # 3. Standard language detection. Only the call that can raise sits in
    #    the try body.
    # NOTE(review): langdetect documents its results as non-deterministic on
    # short or ambiguous text unless DetectorFactory.seed is set — consider
    # seeding once at import time.
    try:
        lang = detect(text)
    except LangDetectException:
        return 'en'

    # Correct South Asian language misdetections: if any Bengali script is
    # present, trust it over a Hindi/Urdu/Nepali/Sinhala guess.
    if lang in ('hi', 'ur', 'ne', 'si') and bengali_chars > 0:
        return 'bn'
    return 'bn' if lang == 'bn' else 'en'
def get_language_instruction(lang_code: str) -> str:
    """Return the response-language directive for *lang_code*.

    Only the exact code ``'bn'`` selects the Bengali directive; every other
    value falls back to the English one.
    """
    bengali_directive = "Respond entirely in Bengali (বাংলায় উত্তর দিন). Use clear, simple Bengali."
    english_directive = "Respond entirely in English."
    return bengali_directive if lang_code == 'bn' else english_directive