import unicodedata
from collections import Counter

from app.schemas import LanguageProfile, LanguageSignal, ScriptRatio


class HeuristicLanguageDetector:
    """Rule-based language guesser for English / Hindi / Hinglish text.

    Classification relies on two signals only: which Unicode scripts the
    characters of the text belong to, and how many distinct romanised-Hindi
    marker words appear among its whitespace-separated tokens.
    """

    # Lower-case words treated as evidence of Hindi written in Latin script.
    roman_hindi_markers = {
        "hai",
        "nahi",
        "nahin",
        "tum",
        "aap",
        "mera",
        "meri",
        "kya",
        "kyu",
        "kyun",
        "mat",
        "kar",
        "kr",
        "bhai",
        "yaar",
        "acha",
        "accha",
        "bakwas",
        "bewakoof",
    }

    # Punctuation trimmed from both ends of a token before marker lookup.
    _TOKEN_PUNCTUATION = ".,!?;:()[]{}\"'"

    # Script names (as they appear in Unicode character names) that are
    # grouped together under the "indic_other" label.
    _OTHER_INDIC_SCRIPTS = (
        "BENGALI",
        "GURMUKHI",
        "GUJARATI",
        "ORIYA",
        "TAMIL",
        "TELUGU",
        "KANNADA",
        "MALAYALAM",
    )

    def detect(self, text: str) -> LanguageProfile:
        """Build a language profile for *text*.

        The profile's ``scripts`` list holds one ``ScriptRatio`` per script
        seen, ordered from most to least frequent, each ratio rounded to
        three decimals; it is empty when no character is attributed to a
        script.

        Rules are evaluated in this order and the first match wins:

        1. Devanagari and Latin both present -> ``hinglish`` (code-mixed).
        2. Devanagari present -> ``hindi``.
        3. Latin present with at least two distinct romanised-Hindi
           markers -> ``hinglish`` (code-mixed).
        4. Latin present -> ``english``.
        5. Another listed Indic script present -> ``indic_other``.
        6. Otherwise -> ``unknown``.

        The confidence attached to each candidate is a fixed constant per
        rule, not a computed probability.
        """
        script_counts = Counter(
            script
            for script in map(self._script_name, text)
            if script is not None
        )

        total = sum(script_counts.values())
        scripts = []
        if total:
            scripts = [
                ScriptRatio(name=name, ratio=round(count / total, 3))
                for name, count in script_counts.most_common()
            ]

        # A set is used on purpose: a marker repeated in the text counts once.
        lowered_tokens = {
            token.strip(self._TOKEN_PUNCTUATION).lower()
            for token in text.split()
        }
        roman_hindi_hits = len(
            self.roman_hindi_markers.intersection(lowered_tokens)
        )

        # Counter returns 0 for absent keys, so these lookups never raise.
        has_latin = script_counts["latin"] > 0
        has_devanagari = script_counts["devanagari"] > 0
        has_other_indic = script_counts["indic_other"] > 0

        if has_devanagari and has_latin:
            return self._profile(
                "hinglish",
                True,
                scripts,
                [("hinglish", 0.9), ("hindi", 0.72), ("english", 0.63)],
            )
        if has_devanagari:
            return self._profile(
                "hindi",
                False,
                scripts,
                [("hindi", 0.92), ("hinglish", 0.28)],
            )
        if has_latin and roman_hindi_hits >= 2:
            return self._profile(
                "hinglish",
                True,
                scripts,
                [("hinglish", 0.82), ("english", 0.58)],
            )
        if has_latin:
            return self._profile(
                "english",
                False,
                scripts,
                [("english", 0.9), ("hinglish", 0.25)],
            )
        if has_other_indic:
            return self._profile(
                "indic_other",
                False,
                scripts,
                [("indic_other", 0.8)],
            )
        return self._profile("unknown", False, scripts, [("unknown", 0.4)])

    @staticmethod
    def _profile(primary, code_mixed, scripts, candidates):
        """Assemble a ``LanguageProfile`` from ``(name, confidence)`` pairs."""
        return LanguageProfile(
            primary_language=primary,
            code_mixed=code_mixed,
            scripts=scripts,
            candidates=[
                LanguageSignal(name=name, confidence=confidence)
                for name, confidence in candidates
            ],
        )

    def _script_name(self, char: str) -> str | None:
        """Return the script bucket for *char*, or ``None`` to skip it.

        Buckets are ``"latin"``, ``"devanagari"``, ``"indic_other"`` and
        ``"other"``, chosen by substring match on the character's Unicode
        name.

        Letters are always bucketed. Combining marks (general category
        ``M*``) are bucketed only when their name identifies one of the
        recognised scripts: ``str.isalpha()`` is False for them, yet Indic
        vowel signs, virama and anusvara are marks, and ignoring them
        under-counts those scripts in the ratios. Script-neutral marks
        (for example the accents of decomposed Latin text) and every
        non-letter, non-mark character yield ``None``.
        """
        if char.isalpha():
            is_mark = False
        elif len(char) == 1 and unicodedata.category(char).startswith("M"):
            is_mark = True
        else:
            return None

        name = unicodedata.name(char, "")
        if "LATIN" in name:
            return "latin"
        if "DEVANAGARI" in name:
            return "devanagari"
        if any(block in name for block in self._OTHER_INDIC_SCRIPTS):
            return "indic_other"

        # Unattributed marks are dropped rather than inflating "other".
        return None if is_mark else "other"