x2aqq's picture
Upload folder using huggingface_hub
47bc13b verified
"""Hindi transliteration and script handling for multilingual addresses."""
import re
class HindiTransliterator:
    """
    Handles Hindi (Devanagari) to Latin transliteration and script detection.

    Supports:
    - Devanagari to Latin conversion
    - Common Hindi address terms
    - Mixed script (code-switched) addresses

    Known limitations (kept for backward compatibility):
    - Known terms are matched as substrings, so a term embedded in a longer
      word is replaced as well.
    - The inherent vowel is always written, including at the end of a word
      (no schwa deletion).
    """

    # Devanagari Unicode range
    DEVANAGARI_START = 0x0900
    DEVANAGARI_END = 0x097F

    # Common Hindi address terms with transliterations
    HINDI_TERMS = {
        # Devanagari -> Latin
        'गली': 'GALI',
        'गलि': 'GALI',
        'मोहल्ला': 'MOHALLA',
        'नगर': 'NAGAR',
        'विहार': 'VIHAR',
        'पुरी': 'PURI',
        'पुर': 'PUR',
        'बाग': 'BAGH',
        'मार्ग': 'MARG',
        'रोड': 'ROAD',
        'मंजिल': 'FLOOR',
        'पहली': 'FIRST',
        'दूसरी': 'SECOND',
        'तीसरी': 'THIRD',
        'चौथी': 'FOURTH',
        'भूतल': 'GROUND FLOOR',
        'तहखाना': 'BASEMENT',
        'मकान': 'HOUSE',
        'प्लॉट': 'PLOT',
        'खसरा': 'KHASRA',
        'ब्लॉक': 'BLOCK',
        'सेक्टर': 'SECTOR',
        'कॉलोनी': 'COLONY',
        'इलाका': 'AREA',
        'क्षेत्र': 'AREA',
        'दिल्ली': 'DELHI',
        'नई दिल्ली': 'NEW DELHI',
        'नम्बर': 'NUMBER',
        'नंबर': 'NUMBER',
        'संख्या': 'NUMBER',
        'पास': 'NEAR',
        'सामने': 'OPPOSITE',
        'पीछे': 'BEHIND',
        'के पास': 'NEAR',
        'के सामने': 'OPPOSITE',
        'चौक': 'CHOWK',
        'बाजार': 'BAZAAR',
        'बस्ती': 'BASTI',
        'पार्क': 'PARK',
        'एक्सटेंशन': 'EXTENSION',
        'फेज': 'PHASE',
        'वार्ड': 'WARD',
        'जोन': 'ZONE',
    }

    # Devanagari consonants to Latin (basic ITRANS-like mapping)
    CONSONANT_MAP = {
        'क': 'k', 'ख': 'kh', 'ग': 'g', 'घ': 'gh', 'ङ': 'ng',
        'च': 'ch', 'छ': 'chh', 'ज': 'j', 'झ': 'jh', 'ञ': 'ny',
        'ट': 't', 'ठ': 'th', 'ड': 'd', 'ढ': 'dh', 'ण': 'n',
        'त': 't', 'थ': 'th', 'द': 'd', 'ध': 'dh', 'न': 'n',
        'प': 'p', 'फ': 'ph', 'ब': 'b', 'भ': 'bh', 'म': 'm',
        'य': 'y', 'र': 'r', 'ल': 'l', 'व': 'v', 'श': 'sh',
        'ष': 'sh', 'स': 's', 'ह': 'h',
        'क़': 'q', 'ख़': 'kh', 'ग़': 'g', 'ज़': 'z', 'ड़': 'd',
        'ढ़': 'dh', 'फ़': 'f', 'य़': 'y',
    }

    # Devanagari vowels/matras
    VOWEL_MAP = {
        'अ': 'a', 'आ': 'aa', 'इ': 'i', 'ई': 'ee', 'उ': 'u', 'ऊ': 'oo',
        'ए': 'e', 'ऐ': 'ai', 'ओ': 'o', 'औ': 'au', 'अं': 'an', 'अः': 'ah',
        'ा': 'a', 'ि': 'i', 'ी': 'ee', 'ु': 'u', 'ू': 'oo',
        'े': 'e', 'ै': 'ai', 'ो': 'o', 'ौ': 'au',
        'ं': 'n', 'ः': 'h', '्': '',  # Halant (vowel killer)
        'ँ': 'n',  # Chandrabindu
        '\u090b': 'ri',  # Independent vocalic R
        '\u0911': 'o',   # Independent candra O (English loan words)
        '\u0943': 'ri',  # Vocalic R matra
        '\u0949': 'o',   # Candra O matra (English loan words)
    }

    # Devanagari digits
    DIGIT_MAP = {
        '०': '0', '१': '1', '२': '2', '३': '3', '४': '4',
        '५': '5', '६': '6', '७': '7', '८': '8', '९': '9',
    }

    # Combining nukta sign. A nukta consonant may arrive either as one
    # precomposed code point (U+0958..U+095F) or as base consonant + nukta,
    # so both spellings are spelled out explicitly below.
    _NUKTA = '\u093c'
    _NUKTA_BY_BASE = {
        '\u0915': 'q', '\u0916': 'kh', '\u0917': 'g', '\u091c': 'z',
        '\u0921': 'd', '\u0922': 'dh', '\u092b': 'f', '\u092f': 'y',
    }
    _PRECOMPOSED_NUKTA = {
        '\u0958': 'q', '\u0959': 'kh', '\u095a': 'g', '\u095b': 'z',
        '\u095c': 'd', '\u095d': 'dh', '\u095e': 'f', '\u095f': 'y',
    }

    # A consonant followed by a code point in this range (dependent vowel
    # signs and the halant) does not receive the inherent 'a'.
    _DEPENDENT_SIGN_START = 0x093E
    _DEPENDENT_SIGN_END = 0x094D

    def __init__(self, use_known_terms: bool = True):
        """
        Initialize transliterator.

        Args:
            use_known_terms: Use dictionary of known Hindi address terms
        """
        self.use_known_terms = use_known_terms

    def _is_devanagari(self, char: str) -> bool:
        """Return True if the single character lies in the Devanagari block."""
        return self.DEVANAGARI_START <= ord(char) <= self.DEVANAGARI_END

    def contains_devanagari(self, text: str) -> bool:
        """Check if text contains Devanagari script."""
        return any(self._is_devanagari(char) for char in text)

    def get_script_ratio(self, text: str) -> dict[str, float]:
        """
        Get ratio of different scripts in text.

        Whitespace is ignored. Every code point in the Devanagari block,
        including Devanagari digits, counts as 'devanagari'.

        Returns dict with 'latin', 'devanagari', 'numeric', 'other' ratios.
        All ratios are 0.0 when the text is empty or whitespace only.
        """
        counts = {'latin': 0, 'devanagari': 0, 'numeric': 0, 'other': 0}
        total = 0
        for char in text:
            if char.isspace():
                continue
            total += 1
            if self._is_devanagari(char):
                counts['devanagari'] += 1
            elif char.isascii() and char.isalpha():
                counts['latin'] += 1
            elif char.isdigit():
                counts['numeric'] += 1
            else:
                counts['other'] += 1
        if total == 0:
            # Always hand back floats, matching the annotated return type.
            return {name: 0.0 for name in counts}
        return {name: count / total for name, count in counts.items()}

    def _replace_known_terms(self, text: str) -> str:
        """Substitute dictionary terms, longest first, padded with spaces."""
        ordered = sorted(self.HINDI_TERMS.items(), key=lambda x: -len(x[0]))
        for hindi, latin in ordered:
            text = text.replace(hindi, f' {latin} ')
        return text

    def _keeps_inherent_vowel(self, text: str, index: int) -> bool:
        """Return True if the consonant ending just before index keeps 'a'."""
        if index >= len(text):
            return True
        code = ord(text[index])
        return not (
            self._DEPENDENT_SIGN_START <= code <= self._DEPENDENT_SIGN_END
        )

    def transliterate(self, text: str) -> str:
        """
        Transliterate Devanagari text to Latin script.

        Text without any Devanagari is returned unchanged. Otherwise the
        result is upper-cased and runs of whitespace are collapsed.

        Args:
            text: Input text (may be mixed script)

        Returns:
            Transliterated text in Latin script
        """
        if not self.contains_devanagari(text):
            return text

        # First, try to match known terms
        if self.use_known_terms:
            text = self._replace_known_terms(text)

        # Then transliterate remaining Devanagari
        result = []
        length = len(text)
        i = 0
        while i < length:
            char = text[i]
            if not self._is_devanagari(char):
                result.append(char)
            elif char in self.DIGIT_MAP:
                result.append(self.DIGIT_MAP[char])
            elif char in self.VOWEL_MAP:
                result.append(self.VOWEL_MAP[char])
            else:
                latin = self._PRECOMPOSED_NUKTA.get(
                    char, self.CONSONANT_MAP.get(char)
                )
                if latin is None:
                    # Unknown Devanagari character
                    result.append(char)
                else:
                    if i + 1 < length and text[i + 1] == self._NUKTA:
                        # Decomposed nukta consonant: consume the combining
                        # sign so it neither leaks into the output nor hides
                        # a following matra from the inherent-vowel check.
                        latin = self._NUKTA_BY_BASE.get(char, latin)
                        i += 1
                    result.append(latin)
                    # Add implicit 'a' unless followed by matra or halant
                    if self._keeps_inherent_vowel(text, i + 1):
                        result.append('a')
            i += 1

        # Clean up
        output = re.sub(r'\s+', ' ', ''.join(result))
        return output.strip().upper()

    def normalize_mixed_script(self, text: str) -> str:
        """
        Handle code-mixed (Hindi + English) addresses.

        Transliterates Hindi portions while preserving English (upper-cased).
        Known terms are matched on whole words, longest phrase first, so
        multi-word entries such as 'नई दिल्ली' are recognised.
        """
        words = text.split()
        # Longest dictionary phrase, measured in whitespace-separated words.
        max_span = max(
            (len(term.split()) for term in self.HINDI_TERMS), default=1
        )
        result = []
        i = 0
        while i < len(words):
            consumed = 0
            if self.use_known_terms:
                for span in range(min(max_span, len(words) - i), 0, -1):
                    phrase = ' '.join(words[i:i + span])
                    if phrase in self.HINDI_TERMS:
                        result.append(self.HINDI_TERMS[phrase])
                        consumed = span
                        break
            if consumed:
                i += consumed
                continue
            word = words[i]
            if self.contains_devanagari(word):
                result.append(self.transliterate(word))
            else:
                result.append(word.upper())
            i += 1
        return ' '.join(result)
def detect_language(text: str) -> str:
    """
    Simple language detection for address text.

    The decision is based on the script ratios reported by
    HindiTransliterator.get_script_ratio, which are computed over all
    non-whitespace characters (digits and punctuation included).

    Returns: 'hindi', 'english', or 'mixed'
    """
    ratios = HindiTransliterator().get_script_ratio(text)
    devanagari = ratios['devanagari']
    latin = ratios['latin']

    if devanagari > 0.5:
        return 'hindi'
    if latin > 0.5:
        return 'english'
    if devanagari > 0:
        # Neither script holds a majority. With no Latin letters at all the
        # text is Hindi diluted by digits/punctuation (house numbers, pin
        # codes), not English.
        return 'mixed' if latin > 0 else 'hindi'
    return 'english'