vineet88's picture
Deploy standalone ML service
16f57d9 verified
import unicodedata
from collections import Counter
from app.schemas import LanguageProfile, LanguageSignal, ScriptRatio
class HeuristicLanguageDetector:
    """Rule-based detector for Hindi, English, Hinglish and other Indic text.

    Detection relies on two signals only:

    * the Unicode script of every letter and combining sign in the text, and
    * how many distinct romanised-Hindi marker words occur in the text.

    The confidence values attached to each candidate are fixed heuristic
    constants chosen per branch, not calibrated probabilities.
    """

    # Romanised (Latin-script) Hindi words. Two or more distinct hits in
    # Latin-only text are treated as evidence of Hinglish rather than English.
    roman_hindi_markers = {
        "hai",
        "nahi",
        "nahin",
        "tum",
        "aap",
        "mera",
        "meri",
        "kya",
        "kyu",
        "kyun",
        "mat",
        "kar",
        "kr",
        "bhai",
        "yaar",
        "acha",
        "accha",
        "bakwas",
        "bewakoof",
    }

    # Substrings of Unicode character names that identify the non-Devanagari
    # Indic scripts lumped together under the "indic_other" bucket.
    _other_indic_blocks = (
        "BENGALI",
        "GURMUKHI",
        "GUJARATI",
        "ORIYA",
        "TAMIL",
        "TELUGU",
        "KANNADA",
        "MALAYALAM",
    )

    def detect(self, text: str) -> LanguageProfile:
        """Build a language profile for ``text``.

        Branches are evaluated in this order; the first match wins:

        1. Devanagari and Latin both present -> "hinglish" (code-mixed).
        2. Devanagari present -> "hindi".
        3. Latin present with at least two distinct romanised-Hindi marker
           words -> "hinglish" (code-mixed).
        4. Latin present -> "english".
        5. Another recognised Indic script present -> "indic_other".
        6. Anything else (including empty text) -> "unknown".

        Args:
            text: Raw input text.

        Returns:
            A ``LanguageProfile`` carrying the primary language, the
            code-mixing flag, per-script ratios ordered from most to least
            frequent, and the ranked candidate languages.
        """
        script_counts = Counter(
            script
            for script in map(self._script_name, text)
            if script is not None
        )
        total = sum(script_counts.values())
        # With no classified characters the counter is empty, so the
        # comprehension body (and its division by ``total``) never runs.
        scripts = [
            ScriptRatio(name=name, ratio=round(count / total, 3))
            for name, count in script_counts.most_common()
        ]

        # A set of tokens: each marker word counts once however often it
        # is repeated in the text.
        lowered_tokens = {
            token.strip(".,!?;:()[]{}\"'").lower() for token in text.split()
        }
        roman_hindi_hits = len(
            self.roman_hindi_markers.intersection(lowered_tokens)
        )

        # Counter returns 0 for scripts that never occurred.
        has_latin = script_counts["latin"] > 0
        has_devanagari = script_counts["devanagari"] > 0
        has_other_indic = script_counts["indic_other"] > 0

        if has_devanagari and has_latin:
            return LanguageProfile(
                primary_language="hinglish",
                code_mixed=True,
                scripts=scripts,
                candidates=[
                    LanguageSignal(name="hinglish", confidence=0.9),
                    LanguageSignal(name="hindi", confidence=0.72),
                    LanguageSignal(name="english", confidence=0.63),
                ],
            )
        if has_devanagari:
            return LanguageProfile(
                primary_language="hindi",
                code_mixed=False,
                scripts=scripts,
                candidates=[
                    LanguageSignal(name="hindi", confidence=0.92),
                    LanguageSignal(name="hinglish", confidence=0.28),
                ],
            )
        if has_latin and roman_hindi_hits >= 2:
            return LanguageProfile(
                primary_language="hinglish",
                code_mixed=True,
                scripts=scripts,
                candidates=[
                    LanguageSignal(name="hinglish", confidence=0.82),
                    LanguageSignal(name="english", confidence=0.58),
                ],
            )
        if has_latin:
            return LanguageProfile(
                primary_language="english",
                code_mixed=False,
                scripts=scripts,
                candidates=[
                    LanguageSignal(name="english", confidence=0.9),
                    LanguageSignal(name="hinglish", confidence=0.25),
                ],
            )
        if has_other_indic:
            return LanguageProfile(
                primary_language="indic_other",
                code_mixed=False,
                scripts=scripts,
                candidates=[LanguageSignal(name="indic_other", confidence=0.8)],
            )
        return LanguageProfile(
            primary_language="unknown",
            code_mixed=False,
            scripts=scripts,
            candidates=[LanguageSignal(name="unknown", confidence=0.4)],
        )

    def _script_name(self, char: str) -> str | None:
        """Classify one character into a coarse script bucket.

        Args:
            char: A single character.

        Returns:
            "latin", "devanagari", "indic_other" or "other" for letters;
            the same script buckets for combining marks that belong to a
            recognised script; ``None`` for everything else (digits,
            punctuation, whitespace, symbols, script-neutral combining marks).
        """
        is_letter = char.isalpha()
        # Indic vowel signs, virama, anusvara and nukta are combining marks
        # (Unicode category Mn/Mc), for which str.isalpha() is False. They
        # are part of the written word, so skipping them would under-count
        # Indic scripts relative to Latin in the script ratios.
        is_mark = (
            not is_letter
            and len(char) == 1
            and unicodedata.category(char).startswith("M")
        )
        if not (is_letter or is_mark):
            return None

        name = unicodedata.name(char, "")
        if "LATIN" in name:
            return "latin"
        if "DEVANAGARI" in name:
            return "devanagari"
        if any(block in name for block in self._other_indic_blocks):
            return "indic_other"
        # A mark outside the recognised scripts (e.g. U+0301 COMBINING ACUTE
        # ACCENT) belongs to no script of its own; do not count it as "other".
        return None if is_mark else "other"