Spaces:
Runtime error
Runtime error
| import unicodedata | |
| from collections import Counter | |
| from app.schemas import LanguageProfile, LanguageSignal, ScriptRatio | |
| class HeuristicLanguageDetector: | |
| roman_hindi_markers = { | |
| "hai", | |
| "nahi", | |
| "nahin", | |
| "tum", | |
| "aap", | |
| "mera", | |
| "meri", | |
| "kya", | |
| "kyu", | |
| "kyun", | |
| "mat", | |
| "kar", | |
| "kr", | |
| "bhai", | |
| "yaar", | |
| "acha", | |
| "accha", | |
| "bakwas", | |
| "bewakoof", | |
| } | |
| def detect(self, text: str) -> LanguageProfile: | |
| script_counts = Counter() | |
| for char in text: | |
| script_name = self._script_name(char) | |
| if script_name is not None: | |
| script_counts[script_name] += 1 | |
| total = sum(script_counts.values()) | |
| scripts = [] | |
| if total: | |
| scripts = [ | |
| ScriptRatio(name=name, ratio=round(count / total, 3)) | |
| for name, count in script_counts.most_common() | |
| ] | |
| lowered_tokens = {token.strip(".,!?;:()[]{}\"'").lower() for token in text.split()} | |
| roman_hindi_hits = len(self.roman_hindi_markers.intersection(lowered_tokens)) | |
| has_latin = script_counts["latin"] > 0 | |
| has_devanagari = script_counts["devanagari"] > 0 | |
| has_other_indic = script_counts["indic_other"] > 0 | |
| if has_devanagari and has_latin: | |
| return LanguageProfile( | |
| primary_language="hinglish", | |
| code_mixed=True, | |
| scripts=scripts, | |
| candidates=[ | |
| LanguageSignal(name="hinglish", confidence=0.9), | |
| LanguageSignal(name="hindi", confidence=0.72), | |
| LanguageSignal(name="english", confidence=0.63), | |
| ], | |
| ) | |
| if has_devanagari: | |
| return LanguageProfile( | |
| primary_language="hindi", | |
| code_mixed=False, | |
| scripts=scripts, | |
| candidates=[ | |
| LanguageSignal(name="hindi", confidence=0.92), | |
| LanguageSignal(name="hinglish", confidence=0.28), | |
| ], | |
| ) | |
| if has_latin and roman_hindi_hits >= 2: | |
| return LanguageProfile( | |
| primary_language="hinglish", | |
| code_mixed=True, | |
| scripts=scripts, | |
| candidates=[ | |
| LanguageSignal(name="hinglish", confidence=0.82), | |
| LanguageSignal(name="english", confidence=0.58), | |
| ], | |
| ) | |
| if has_latin: | |
| return LanguageProfile( | |
| primary_language="english", | |
| code_mixed=False, | |
| scripts=scripts, | |
| candidates=[ | |
| LanguageSignal(name="english", confidence=0.9), | |
| LanguageSignal(name="hinglish", confidence=0.25), | |
| ], | |
| ) | |
| if has_other_indic: | |
| return LanguageProfile( | |
| primary_language="indic_other", | |
| code_mixed=False, | |
| scripts=scripts, | |
| candidates=[LanguageSignal(name="indic_other", confidence=0.8)], | |
| ) | |
| return LanguageProfile( | |
| primary_language="unknown", | |
| code_mixed=False, | |
| scripts=scripts, | |
| candidates=[LanguageSignal(name="unknown", confidence=0.4)], | |
| ) | |
| def _script_name(self, char: str) -> str | None: | |
| if not char.isalpha(): | |
| return None | |
| name = unicodedata.name(char, "") | |
| if "LATIN" in name: | |
| return "latin" | |
| if "DEVANAGARI" in name: | |
| return "devanagari" | |
| if any( | |
| block in name | |
| for block in ( | |
| "BENGALI", | |
| "GURMUKHI", | |
| "GUJARATI", | |
| "ORIYA", | |
| "TAMIL", | |
| "TELUGU", | |
| "KANNADA", | |
| "MALAYALAM", | |
| ) | |
| ): | |
| return "indic_other" | |
| return "other" | |