Spaces:
Sleeping
Sleeping
| """Hindi transliteration and script handling for multilingual addresses.""" | |
| import re | |
class HindiTransliterator:
    """
    Handles Hindi (Devanagari) to Latin transliteration and script detection.

    Supports:
    - Devanagari to Latin conversion
    - Common Hindi address terms
    - Mixed script (code-switched) addresses
    """

    # Devanagari Unicode block (inclusive bounds)
    DEVANAGARI_START = 0x0900
    DEVANAGARI_END = 0x097F

    # Common Hindi address terms with transliterations
    HINDI_TERMS = {
        # Devanagari -> Latin
        'गली': 'GALI',
        'गलि': 'GALI',
        'मोहल्ला': 'MOHALLA',
        'नगर': 'NAGAR',
        'विहार': 'VIHAR',
        'पुरी': 'PURI',
        'पुर': 'PUR',
        'बाग': 'BAGH',
        'मार्ग': 'MARG',
        'रोड': 'ROAD',
        'मंजिल': 'FLOOR',
        'पहली': 'FIRST',
        'दूसरी': 'SECOND',
        'तीसरी': 'THIRD',
        'चौथी': 'FOURTH',
        'भूतल': 'GROUND FLOOR',
        'तहखाना': 'BASEMENT',
        'मकान': 'HOUSE',
        'प्लॉट': 'PLOT',
        'खसरा': 'KHASRA',
        'ब्लॉक': 'BLOCK',
        'सेक्टर': 'SECTOR',
        'कॉलोनी': 'COLONY',
        'इलाका': 'AREA',
        'क्षेत्र': 'AREA',
        'दिल्ली': 'DELHI',
        'नई दिल्ली': 'NEW DELHI',
        'नम्बर': 'NUMBER',
        'नंबर': 'NUMBER',
        'संख्या': 'NUMBER',
        'पास': 'NEAR',
        'सामने': 'OPPOSITE',
        'पीछे': 'BEHIND',
        'के पास': 'NEAR',
        'के सामने': 'OPPOSITE',
        'चौक': 'CHOWK',
        'बाजार': 'BAZAAR',
        'बस्ती': 'BASTI',
        'पार्क': 'PARK',
        'एक्सटेंशन': 'EXTENSION',
        'फेज': 'PHASE',
        'वार्ड': 'WARD',
        'जोन': 'ZONE',
    }

    # Devanagari consonants to Latin (basic ITRANS-like mapping)
    CONSONANT_MAP = {
        'क': 'k', 'ख': 'kh', 'ग': 'g', 'घ': 'gh', 'ङ': 'ng',
        'च': 'ch', 'छ': 'chh', 'ज': 'j', 'झ': 'jh', 'ञ': 'ny',
        'ट': 't', 'ठ': 'th', 'ड': 'd', 'ढ': 'dh', 'ण': 'n',
        'त': 't', 'थ': 'th', 'द': 'd', 'ध': 'dh', 'न': 'n',
        'प': 'p', 'फ': 'ph', 'ब': 'b', 'भ': 'bh', 'म': 'm',
        'य': 'y', 'र': 'r', 'ल': 'l', 'व': 'v', 'श': 'sh',
        'ष': 'sh', 'स': 's', 'ह': 'h',
        # Nukta letters. Depending on how this file was saved these keys are
        # either one precomposed code point or base + nukta, so transliterate()
        # does not rely on them; it uses the explicit tables further down.
        'क़': 'q', 'ख़': 'kh', 'ग़': 'g', 'ज़': 'z', 'ड़': 'd',
        'ढ़': 'dh', 'फ़': 'f', 'य़': 'y',
    }

    # Devanagari vowels/matras
    VOWEL_MAP = {
        'अ': 'a', 'आ': 'aa', 'इ': 'i', 'ई': 'ee', 'उ': 'u', 'ऊ': 'oo',
        'ए': 'e', 'ऐ': 'ai', 'ओ': 'o', 'औ': 'au', 'अं': 'an', 'अः': 'ah',
        'ा': 'a', 'ि': 'i', 'ी': 'ee', 'ु': 'u', 'ू': 'oo',
        'े': 'e', 'ै': 'ai', 'ो': 'o', 'ौ': 'au',
        'ं': 'n', 'ः': 'h', '्': '',  # Halant (vowel killer)
        'ँ': 'n',  # Chandrabindu
        # Vocalic R (letter U+090B, sign U+0943) and candra O (letter U+0911,
        # sign U+0949), spelled as escapes. They occur in names such as
        # Krishna and in English loanwords (hospital, college).
        '\u090b': 'ri', '\u0943': 'ri',
        '\u0911': 'o', '\u0949': 'o',
    }

    # Devanagari digits
    DIGIT_MAP = {
        '०': '0', '१': '1', '२': '2', '३': '3', '४': '4',
        '५': '5', '६': '6', '७': '7', '८': '8', '९': '9',
    }

    # Combining nukta sign (the dot written under a consonant)
    _NUKTA = '\u093c'

    # Base consonant -> Latin when that consonant is followed by a nukta.
    # Values mirror the nukta entries of CONSONANT_MAP.
    _NUKTA_CONSONANTS = {
        '\u0915': 'q', '\u0916': 'kh', '\u0917': 'g', '\u091c': 'z',
        '\u0921': 'd', '\u0922': 'dh', '\u092b': 'f', '\u092f': 'y',
    }

    # Precomposed nukta letters (U+0958..U+095F) -> base consonant + nukta,
    # in str.translate() form, so a single code path handles both encodings.
    _NUKTA_DECOMPOSE = {
        0x0958: '\u0915\u093c', 0x0959: '\u0916\u093c',
        0x095A: '\u0917\u093c', 0x095B: '\u091c\u093c',
        0x095C: '\u0921\u093c', 0x095D: '\u0922\u093c',
        0x095E: '\u092b\u093c', 0x095F: '\u092f\u093c',
    }

    # Code points that suppress a consonant's implicit 'a': the dependent
    # vowel signs plus the halant at U+094D (inclusive bounds).
    _DEPENDENT_SIGN_START = 0x093E
    _DEPENDENT_SIGN_END = 0x094D

    def __init__(self, use_known_terms: bool = True):
        """
        Initialize transliterator.

        Args:
            use_known_terms: Use dictionary of known Hindi address terms
        """
        self.use_known_terms = use_known_terms

    def _is_devanagari(self, char: str) -> bool:
        """Return True if the single character lies in the Devanagari block."""
        return self.DEVANAGARI_START <= ord(char) <= self.DEVANAGARI_END

    def contains_devanagari(self, text: str) -> bool:
        """Check if text contains Devanagari script."""
        return any(self._is_devanagari(char) for char in text)

    def get_script_ratio(self, text: str) -> dict[str, float]:
        """
        Get ratio of different scripts in text.

        Whitespace is ignored. Devanagari digits fall inside the Devanagari
        block and are therefore counted as 'devanagari', not 'numeric'.

        Args:
            text: Input text

        Returns:
            Dict with 'latin', 'devanagari', 'numeric', 'other' ratios. All
            values are 0.0 when the text has no non-whitespace characters.
        """
        counts = {'latin': 0, 'devanagari': 0, 'numeric': 0, 'other': 0}
        total = 0
        for char in text:
            if char.isspace():
                continue
            total += 1
            if self._is_devanagari(char):
                counts['devanagari'] += 1
            elif char.isascii() and char.isalpha():
                counts['latin'] += 1
            elif char.isdigit():
                counts['numeric'] += 1
            else:
                counts['other'] += 1

        if total == 0:
            # Empty or whitespace-only input: honour the float contract
            return {name: 0.0 for name in counts}
        return {name: count / total for name, count in counts.items()}

    def transliterate(self, text: str) -> str:
        """
        Transliterate Devanagari text to Latin script.

        Text without any Devanagari is returned unchanged. Otherwise the
        result is upper-cased, whitespace runs are collapsed to one space and
        the ends are stripped.

        Args:
            text: Input text (may be mixed script)

        Returns:
            Transliterated text in Latin script
        """
        if not self.contains_devanagari(text):
            return text

        # First, try to match known terms. Longest first, so that
        # multi-word entries win over the shorter terms they contain.
        if self.use_known_terms:
            by_length = sorted(
                self.HINDI_TERMS.items(), key=lambda item: -len(item[0])
            )
            for hindi, latin in by_length:
                text = text.replace(hindi, f' {latin} ')

        # Rewrite precomposed nukta letters as base + nukta so the loop below
        # only has to recognise one encoding.
        text = text.translate(self._NUKTA_DECOMPOSE)

        # Then transliterate remaining Devanagari character by character
        pieces = []
        length = len(text)
        i = 0
        while i < length:
            char = text[i]
            if not self._is_devanagari(char):
                pieces.append(char)
            elif char in self.DIGIT_MAP:
                pieces.append(self.DIGIT_MAP[char])
            elif char in self.VOWEL_MAP:
                pieces.append(self.VOWEL_MAP[char])
            elif char in self.CONSONANT_MAP:
                latin = self.CONSONANT_MAP[char]
                if text[i + 1:i + 2] == self._NUKTA:
                    # Consume the nukta together with its consonant; unknown
                    # combinations fall back to the plain consonant sound.
                    latin = self._NUKTA_CONSONANTS.get(char, latin)
                    i += 1
                pieces.append(latin)
                # Add implicit 'a' unless followed by a matra or halant
                following = text[i + 1:i + 2]
                suppressed = bool(following) and (
                    self._DEPENDENT_SIGN_START
                    <= ord(following)
                    <= self._DEPENDENT_SIGN_END
                )
                if not suppressed:
                    pieces.append('a')
            else:
                # Unknown Devanagari character: pass through untouched
                pieces.append(char)
            i += 1

        # Clean up
        output = re.sub(r'\s+', ' ', ''.join(pieces))
        return output.strip().upper()

    def normalize_mixed_script(self, text: str) -> str:
        """
        Handle code-mixed (Hindi + English) addresses.

        Transliterates Hindi portions while preserving English. The result is
        upper-cased with words separated by single spaces.

        Args:
            text: Input text (may be mixed script)

        Returns:
            Normalized upper-case text in Latin script
        """
        # Collapse whitespace first and transliterate the text as a whole
        # rather than word by word, so that multi-word dictionary entries
        # (e.g. the one mapping to 'NEW DELHI') can match.
        collapsed = ' '.join(text.split())
        if not self.contains_devanagari(collapsed):
            return collapsed.upper()
        return self.transliterate(collapsed)
def detect_language(text: str) -> str:
    """
    Simple language detection for address text.

    The decision is based on the share of non-whitespace characters that are
    Devanagari versus ASCII letters, as reported by
    HindiTransliterator.get_script_ratio().

    Args:
        text: Address text to classify

    Returns:
        'hindi', 'english', or 'mixed'. Text with neither Devanagari nor a
        Latin majority (including empty text) is reported as 'english'.
    """
    ratios = HindiTransliterator().get_script_ratio(text)
    devanagari = ratios['devanagari']
    latin = ratios['latin']

    if devanagari > 0.5:
        return 'hindi'
    if latin > 0.5:
        return 'english'
    if devanagari > 0 and latin > 0:
        return 'mixed'
    if devanagari > 0:
        # Devanagari with no Latin letters at all, only diluted below the
        # majority threshold by digits or punctuation (e.g. a house number
        # followed by a Hindi street name): still Hindi, not English.
        return 'hindi'
    return 'english'