"""
Arabic Name Normalizer

Handles spelling variations, title stripping, and stemming for Arabic names.
Optimized for UAE/Saudi royal family name matching.
"""

import re
import unicodedata
from typing import List, Tuple


class ArabicNameNormalizer:
    """
    Normalizes (romanized) Arabic names for consistent matching.

    Handles:
    - Title/honorific stripping (H.H., Sheikh, etc.)
    - Spelling standardization (Mohammed → Muhammad, Nehayan → Nahyan)
    - Particle normalization (ibn → bin, aal → al)
    - Search stemming (aggressive removal of particles)
    """

    # Family names recognised by extract_name_parts() even when they are
    # not preceded by the "al" particle.
    _KNOWN_FAMILY_NAMES = frozenset(
        {'nahyan', 'maktoum', 'qasimi', 'sharqi', 'nuaimi', 'mualla'}
    )

    def __init__(self):
        # Titles and honorifics to strip. The list order is not significant:
        # the regex builder sorts longest-first so that e.g. "Sheikha" is
        # tried before "Sheikh".
        self.titles = [
            'H.R.H.', 'H.H.', 'H.E.',
            'Maj. Gen.', 'Lt. Gen.', 'Brig. Gen.', 'Lt. Col.',
            'Sheikh', 'Sheikha', 'Shaikh', 'Shaykh',
            'Prince', 'Princess',
            'Emir', 'Emira',
            'Dr.', 'Eng.', 'Pilot',
            'Mr.', 'Mrs.', 'Ms.',
            'Sayyid', 'Sayyida',
        ]

        # Anchored, case-insensitive regex matching one leading title.
        # Group 1 is the title text as it appeared in the input.
        self.title_regex = self._build_title_regex(self.titles)

        # Canonical spelling mappings (canonical → list of variants)
        self.spelling_map = {
            # First names
            'muhammad': ['mohammed', 'mohamed', 'muhammed', 'mohammad', 'mohamad'],
            'ahmad': ['ahmed'],
            'abdullah': ['abdulla', 'abdallah'],
            'khalid': ['khaled'],
            'rashid': ['rashed'],
            'said': ['saeed', 'saeid'],
            'mansour': ['mansoor'],
            'tahnoun': ['tahnoon'],
            'hazza': ['hazzaa'],
            'thiab': ['theyab', 'diab', 'dhiab'],
            'shakhbut': ['shakhboot'],
            'lateefa': ['latifa'],
            'maryam': ['mariam', 'maryem'],
            'salim': ['salem'],
            'jabir': ['jaber'],
            'nasir': ['nasser'],
            'fatimah': ['fatema', 'fatima'],
            'nour': ['noor'],
            'hamad': ['hamed'],  # Important variation
            # Family names
            'nahyan': ['nehayan', 'nayhan', 'nahayn'],
            'maktoum': ['maktoom'],
        }

        # Build reverse lookup (variant → canonical)
        self._build_spelling_lookup()

        # Particles (connectors in Arabic names) and their canonical form
        self.particle_normalizations = {
            'ibn': 'bin',
            'ben': 'bin',
            'aal': 'al',
            'el': 'al',
            'ad': 'al',
            'as': 'al',
            'at': 'al',
            'az': 'al',
            'an': 'al',
            'ar': 'al',
        }

        # Particles to remove for aggressive stemming
        self.particles_to_remove = {'bin', 'ibn', 'ben', 'al', 'el', 'aal', 'bint'}

    @staticmethod
    def _build_title_regex(titles: List[str]) -> "re.Pattern":
        """
        Compile a regex that matches a single title at the start of a string.

        Titles are tried longest-first, and a title that ends in a letter or
        digit must be followed by a word boundary. Without both rules the
        alternation would match "Sheikh" inside "Sheikha" (leaving "a ...")
        or "Emir" inside "Emirates".

        Args:
            titles: Title strings to recognise (matched case-insensitively)

        Returns:
            Compiled pattern; group 1 holds the matched title
        """
        alternatives = []
        for title in sorted((t for t in titles if t), key=len, reverse=True):
            pattern = re.escape(title)
            # Titles ending in "." (e.g. "H.H.") are self-delimiting;
            # word-like titles need an explicit boundary.
            if title[-1].isalnum():
                pattern += r'\b'
            alternatives.append(pattern)
        joined = '|'.join(alternatives)
        return re.compile(rf'^({joined})\.?\s*', re.IGNORECASE)

    def _build_spelling_lookup(self) -> None:
        """Build the variant → canonical lookup from ``self.spelling_map``."""
        self._spelling_lookup = {
            variant: canonical
            for canonical, variants in self.spelling_map.items()
            for variant in variants
        }

    def _remove_particles(self, normalized: str) -> str:
        """Drop every token listed in ``self.particles_to_remove``."""
        return ' '.join(
            token for token in normalized.split()
            if token not in self.particles_to_remove
        )

    def clean(self, text: str) -> str:
        """
        Basic cleaning: Unicode normalization, ASCII conversion, whitespace.

        Note: every character that has no ASCII form after NFKD
        decomposition is dropped, so text written in Arabic script
        comes back empty.

        Args:
            text: Raw input text

        Returns:
            Cleaned text ("" for empty/None input)
        """
        if not text:
            return ""

        # Unicode normalization (decompose accents)
        text = unicodedata.normalize('NFKD', text)

        # Convert to ASCII (removes combining accents and non-ASCII chars)
        text = text.encode('ASCII', 'ignore').decode('utf-8')

        # Collapse runs of whitespace and trim the ends
        return ' '.join(text.split())

    def strip_titles(self, text: str) -> str:
        """
        Remove titles and honorifics from the beginning of text.

        Args:
            text: Text potentially starting with one or more titles

        Returns:
            Text with leading titles stripped ("" for empty/None input)
        """
        if not text:
            return ""

        # Repeatedly strip titles (for cases like "H.H. Sheikh")
        while True:
            new_text = self.title_regex.sub('', text).strip()
            if new_text == text:
                break
            text = new_text

        return text

    def normalize(self, text: str) -> str:
        """
        Standard normalization: cleaning + title stripping + spelling
        standardization. Preserves particles (bin, al) for structured
        matching.

        Args:
            text: Raw text

        Returns:
            Normalized text (lowercase, standardized spelling)
        """
        if not text:
            return ""

        # Step 1: Clean
        text = self.clean(text)

        # Step 2: Strip titles (before punctuation removal, since titles
        # such as "H.H." are recognised by their periods)
        text = self.strip_titles(text)

        # Step 3: Lowercase and remove punctuation (except hyphens→spaces)
        text = text.replace('-', ' ')
        text = re.sub(r'[^\w\s]', '', text).lower()

        # Step 4: Normalize particles and spellings, token by token
        normalized = []
        for token in text.split():
            if token in self.particle_normalizations:
                normalized.append(self.particle_normalizations[token])
            elif token in self._spelling_lookup:
                normalized.append(self._spelling_lookup[token])
            else:
                normalized.append(token)

        return ' '.join(normalized)

    def stem(self, text: str) -> str:
        """
        Aggressive stemming: normalization + particle removal.
        Used for broad matching when normalized match fails.

        The input is always passed through normalize(). A casing check is
        not a reliable "already normalized" test: raw lowercase input such
        as "sheikh mohammed" would otherwise keep its title and spelling.

        Args:
            text: Text (can be raw or already normalized)

        Returns:
            Stemmed text (particles removed; "" for empty/None input)
        """
        if not text:
            return ""

        return self._remove_particles(self.normalize(text))

    def get_all_forms(self, text: str) -> Tuple[str, str, str]:
        """
        Get all forms of text for matching.

        Args:
            text: Raw input text

        Returns:
            Tuple of (cleaned, normalized, stemmed)
        """
        cleaned = self.clean(text)
        normalized = self.normalize(text)
        # Already normalized, so only the particles need to be removed.
        stemmed = self._remove_particles(normalized)

        return (cleaned, normalized, stemmed)

    def extract_name_parts(self, name: str) -> dict:
        """
        Extract structured parts from an Arabic name.
        Useful for debugging and structured matching.

        Name parts are lowercased but not spelling-normalized.

        Args:
            name: Full name (e.g., "H.H. Sheikh Mohammed bin Zayed Al Nahyan")

        Returns:
            Dict with original, titles, first_name, patronymic, family_name
        """
        original = name

        # The title regex is anchored at the start, so leading whitespace
        # must be removed first; also tolerate None/empty input.
        name = (name or "").strip()

        # Extract titles
        titles = []
        while True:
            match = self.title_regex.match(name)
            if not match:
                break
            titles.append(match.group(1))
            name = name[match.end():].strip()

        # Normalize for parsing
        tokens = name.lower().split()

        # Find family name (usually after "al" at the end)
        family_name = ""
        if len(tokens) >= 2 and tokens[-2] == 'al':
            family_name = f"al {tokens[-1]}"
            tokens = tokens[:-2]
        elif tokens and tokens[-1] in self._KNOWN_FAMILY_NAMES:
            family_name = tokens[-1]
            tokens = tokens[:-1]

        # First name is the first non-particle token; the remaining
        # non-particle tokens form the patronymic chain.
        name_tokens = [t for t in tokens if t not in self.particles_to_remove]
        first_name = name_tokens[0] if name_tokens else ""
        patronymic = name_tokens[1:]

        return {
            "original": original,
            "titles": titles,
            "first_name": first_name,
            "patronymic": patronymic,  # Father, grandfather names
            "family_name": family_name,
        }


# Singleton instance for convenience (created lazily on first use)
_default_normalizer = None


def get_normalizer() -> ArabicNameNormalizer:
    """Get the default normalizer instance, creating it on first call."""
    global _default_normalizer
    if _default_normalizer is None:
        _default_normalizer = ArabicNameNormalizer()
    return _default_normalizer