| """Hindi transliteration and script handling for multilingual addresses."""
|
|
|
| import re
|
|
|
|
|
class HindiTransliterator:
    """
    Handles Hindi (Devanagari) to Latin transliteration and script detection.

    Supports:
    - Devanagari to Latin conversion
    - Common Hindi address terms
    - Mixed script (code-switched) addresses

    The romanization is a simple character-by-character scheme: every
    consonant that is not followed by a vowel sign or virama receives an
    inherent ``a`` (including at the end of a word), and the final output of
    a transliteration is upper-cased.
    """

    # Bounds (inclusive) of the Unicode Devanagari block.
    DEVANAGARI_START = 0x0900
    DEVANAGARI_END = 0x097F

    # Bounds (inclusive) of the dependent vowel signs plus virama. A consonant
    # followed by one of these does NOT receive the inherent 'a'.
    DEPENDENT_SIGN_START = 0x093E
    DEPENDENT_SIGN_END = 0x094D

    # Combining nukta; turns e.g. JA into ZA when it follows the consonant.
    NUKTA = '\u093c'

    # Known address vocabulary, mapped straight to its canonical Latin form.
    # Insertion order matters: equal-length terms are replaced in this order.
    HINDI_TERMS = {
        # Localities and streets
        'गली': 'GALI',
        'गलि': 'GALI',
        'मोहल्ला': 'MOHALLA',
        'नगर': 'NAGAR',
        'विहार': 'VIHAR',
        'पुरी': 'PURI',
        'पुर': 'PUR',
        'बाग': 'BAGH',
        'मार्ग': 'MARG',
        'रोड': 'ROAD',
        # Floors
        'मंजिल': 'FLOOR',
        'पहली': 'FIRST',
        'दूसरी': 'SECOND',
        'तीसरी': 'THIRD',
        'चौथी': 'FOURTH',
        'भूतल': 'GROUND FLOOR',
        'तहखाना': 'BASEMENT',
        # Property identifiers
        'मकान': 'HOUSE',
        'प्लॉट': 'PLOT',
        'खसरा': 'KHASRA',
        'ब्लॉक': 'BLOCK',
        'सेक्टर': 'SECTOR',
        'कॉलोनी': 'COLONY',
        'इलाका': 'AREA',
        'क्षेत्र': 'AREA',
        # Cities
        'दिल्ली': 'DELHI',
        'नई दिल्ली': 'NEW DELHI',
        # Numbering
        'नम्बर': 'NUMBER',
        'नंबर': 'NUMBER',
        'संख्या': 'NUMBER',
        # Relative position / landmarks
        'पास': 'NEAR',
        'सामने': 'OPPOSITE',
        'पीछे': 'BEHIND',
        'के पास': 'NEAR',
        'के सामने': 'OPPOSITE',
        # Other common address words
        'चौक': 'CHOWK',
        'बाजार': 'BAZAAR',
        'बस्ती': 'BASTI',
        'पार्क': 'PARK',
        'एक्सटेंशन': 'EXTENSION',
        'फेज': 'PHASE',
        'वार्ड': 'WARD',
        'जोन': 'ZONE',
    }

    CONSONANT_MAP = {
        'क': 'k', 'ख': 'kh', 'ग': 'g', 'घ': 'gh', 'ङ': 'ng',
        'च': 'ch', 'छ': 'chh', 'ज': 'j', 'झ': 'jh', 'ञ': 'ny',
        'ट': 't', 'ठ': 'th', 'ड': 'd', 'ढ': 'dh', 'ण': 'n',
        'त': 't', 'थ': 'th', 'द': 'd', 'ध': 'dh', 'न': 'n',
        'प': 'p', 'फ': 'ph', 'ब': 'b', 'भ': 'bh', 'म': 'm',
        'य': 'y', 'र': 'r', 'ल': 'l', 'व': 'v', 'श': 'sh',
        'ष': 'sh', 'स': 's', 'ह': 'h',
        # Nukta consonants exist in two Unicode spellings; both are listed
        # explicitly (as escapes, so the two forms cannot be confused).
        # 1) Precomposed code points U+0958..U+095F.
        '\u0958': 'q', '\u0959': 'kh', '\u095a': 'g', '\u095b': 'z',
        '\u095c': 'd', '\u095d': 'dh', '\u095e': 'f', '\u095f': 'y',
        # 2) Base consonant followed by the combining nukta U+093C.
        '\u0915\u093c': 'q', '\u0916\u093c': 'kh', '\u0917\u093c': 'g',
        '\u091c\u093c': 'z', '\u0921\u093c': 'd', '\u0922\u093c': 'dh',
        '\u092b\u093c': 'f', '\u092f\u093c': 'y',
    }

    VOWEL_MAP = {
        # Independent vowels. The two-character keys are never hit by the
        # per-character lookup (their parts map to the same letters anyway);
        # they are kept so the table stays backward-compatible.
        'अ': 'a', 'आ': 'aa', 'इ': 'i', 'ई': 'ee', 'उ': 'u', 'ऊ': 'oo',
        'ए': 'e', 'ऐ': 'ai', 'ओ': 'o', 'औ': 'au', 'अं': 'an', 'अः': 'ah',
        # Dependent vowel signs (matras)
        'ा': 'a', 'ि': 'i', 'ी': 'ee', 'ु': 'u', 'ू': 'oo',
        'े': 'e', 'ै': 'ai', 'ो': 'o', 'ौ': 'au',
        # Anusvara, visarga, virama (virama only suppresses the inherent 'a')
        'ं': 'n', 'ः': 'h', '्': '',
        # Chandrabindu
        'ँ': 'n',
        # Vocalic R and the "candra" vowels used for English loanwords
        # (e.g. the O in DOCTOR); without these the raw sign leaked through.
        '\u090b': 'ri',  # independent vocalic R
        '\u090d': 'e',   # independent candra E
        '\u0911': 'o',   # independent candra O
        '\u0943': 'ri',  # vowel sign vocalic R
        '\u0945': 'e',   # vowel sign candra E
        '\u0949': 'o',   # vowel sign candra O
    }

    DIGIT_MAP = {
        '०': '0', '१': '1', '२': '2', '३': '3', '४': '4',
        '५': '5', '६': '6', '७': '7', '८': '8', '९': '9',
    }

    def __init__(self, use_known_terms: bool = True):
        """
        Initialize transliterator.

        Args:
            use_known_terms: Use dictionary of known Hindi address terms
        """
        self.use_known_terms = use_known_terms

    def _is_devanagari(self, char: str) -> bool:
        """Return True if *char* lies in the Devanagari Unicode block."""
        return self.DEVANAGARI_START <= ord(char) <= self.DEVANAGARI_END

    def _is_dependent_sign(self, char: str) -> bool:
        """Return True if *char* is a vowel sign or virama (no inherent 'a')."""
        return self.DEPENDENT_SIGN_START <= ord(char) <= self.DEPENDENT_SIGN_END

    def contains_devanagari(self, text: str) -> bool:
        """Check if text contains Devanagari script."""
        return any(self._is_devanagari(char) for char in text)

    def get_script_ratio(self, text: str) -> dict[str, float]:
        """
        Get ratio of different scripts in text.

        Whitespace is ignored. Devanagari digits count as 'devanagari';
        'latin' means ASCII letters only.

        Returns dict with 'latin', 'devanagari', 'numeric', 'other' ratios.
        All values are floats; they are all 0.0 for empty or
        whitespace-only input.
        """
        counts = {'latin': 0, 'devanagari': 0, 'numeric': 0, 'other': 0}
        total = 0

        for char in text:
            if char.isspace():
                continue
            total += 1

            if self._is_devanagari(char):
                counts['devanagari'] += 1
            elif char.isascii() and char.isalpha():
                counts['latin'] += 1
            elif char.isdigit():
                counts['numeric'] += 1
            else:
                counts['other'] += 1

        if total == 0:
            # Keep the value type consistent (float) even with nothing to count.
            return {script: 0.0 for script in counts}

        return {script: count / total for script, count in counts.items()}

    def transliterate(self, text: str) -> str:
        """
        Transliterate Devanagari text to Latin script.

        Known address terms (when enabled) are substituted first, longest
        term first, as plain substrings. The remaining Devanagari characters
        are romanized one by one; characters with no mapping are passed
        through unchanged.

        Args:
            text: Input text (may be mixed script)

        Returns:
            Transliterated text in Latin script, whitespace-collapsed and
            upper-cased. Text without any Devanagari is returned unchanged.
        """
        if not self.contains_devanagari(text):
            return text

        if self.use_known_terms:
            # Longest first so that e.g. the PURI term wins over PUR.
            longest_first = sorted(
                self.HINDI_TERMS.items(), key=lambda item: -len(item[0])
            )
            for hindi, latin in longest_first:
                text = text.replace(hindi, f' {latin} ')

        pieces: list[str] = []
        length = len(text)
        position = 0

        while position < length:
            char = text[position]
            position += 1

            if not self._is_devanagari(char):
                pieces.append(char)
            elif char in self.DIGIT_MAP:
                pieces.append(self.DIGIT_MAP[char])
            elif char in self.VOWEL_MAP:
                pieces.append(self.VOWEL_MAP[char])
            elif char in self.CONSONANT_MAP:
                latin = self.CONSONANT_MAP[char]

                # A nukta belongs to the consonant before it: consume it and
                # prefer the dedicated mapping, falling back to the base
                # consonant when no nukta form is defined.
                if position < length and text[position] == self.NUKTA:
                    latin = self.CONSONANT_MAP.get(char + self.NUKTA, latin)
                    position += 1
                pieces.append(latin)

                # Inherent vowel, unless a vowel sign or virama follows.
                at_end = position >= length
                if at_end or not self._is_dependent_sign(text[position]):
                    pieces.append('a')
            else:
                # Unmapped Devanagari (e.g. danda) is passed through as-is.
                pieces.append(char)

        output = ''.join(pieces)
        output = re.sub(r'\s+', ' ', output)
        return output.strip().upper()

    def normalize_mixed_script(self, text: str) -> str:
        """
        Handle code-mixed (Hindi + English) addresses.

        Transliterates Hindi portions while preserving English. The text is
        split on whitespace; runs of tokens that exactly spell a known term
        (including multi-word terms) are replaced by that term, other
        Devanagari tokens are transliterated, and everything else is
        upper-cased.

        Returns:
            The normalized tokens joined by single spaces.
        """
        words = text.split()
        terms = self.HINDI_TERMS if self.use_known_terms else {}
        # Longest known term, counted in whitespace-separated tokens.
        max_span = max((len(term.split()) for term in terms), default=1)

        normalized = []
        index = 0
        while index < len(words):
            word = words[index]

            if not self.contains_devanagari(word):
                normalized.append(word.upper())
                index += 1
                continue

            # Try the longest run of tokens first so a multi-word term is
            # matched as a unit before its parts are looked at.
            for span in range(min(max_span, len(words) - index), 0, -1):
                candidate = ' '.join(words[index:index + span])
                if candidate in terms:
                    normalized.append(terms[candidate])
                    index += span
                    break
            else:
                normalized.append(self.transliterate(word))
                index += 1

        return ' '.join(normalized)
|
|
|
|
|
def detect_language(text: str) -> str:
    """
    Simple language detection for address text.

    The decision is based on the share of Devanagari and ASCII-Latin
    characters among all non-whitespace characters:

    - more than half Devanagari -> 'hindi'
    - more than half Latin -> 'english'
    - neither dominates but both scripts are present -> 'mixed'
    - neither dominates and only Devanagari is present (the rest being
      digits or punctuation, e.g. a house number) -> 'hindi'
    - anything else, including empty text -> 'english'

    Returns: 'hindi', 'english', or 'mixed'
    """
    ratios = HindiTransliterator().get_script_ratio(text)
    devanagari = ratios['devanagari']
    latin = ratios['latin']

    if devanagari > 0.5:
        return 'hindi'
    if latin > 0.5:
        return 'english'
    if devanagari > 0 and latin > 0:
        return 'mixed'
    if devanagari > 0:
        # Devanagari diluted only by digits/punctuation is still Hindi;
        # previously this case fell through to 'english'.
        return 'hindi'
    return 'english'
|
|
|