|
|
""" |
|
|
Arabic Name Normalizer |
|
|
Handles spelling variations, title stripping, and stemming for Arabic names. |
|
|
Optimized for UAE/Saudi royal family name matching. |
|
|
""" |
|
|
|
|
|
import re |
|
|
import unicodedata |
|
|
from typing import List, Tuple |
|
|
|
|
|
|
|
|
class ArabicNameNormalizer:
    """
    Normalizes romanized Arabic names for consistent matching.

    Handles:
    - Title/honorific stripping (H.H., Sheikh, etc.)
    - Spelling standardization (Mohammed → Muhammad)
    - Particle normalization (ibn → bin, aal → al)
    - Search stemming (aggressive removal of particles)

    Optimized for UAE/Saudi royal family name matching.
    """

    def __init__(self):
        # Titles and honorifics recognized (and stripped) at the start of a name.
        self.titles = [
            'H.R.H.', 'H.H.', 'H.E.',
            'Maj. Gen.', 'Lt. Gen.', 'Brig. Gen.', 'Lt. Col.',
            'Sheikh', 'Sheikha', 'Shaikh', 'Shaykh',
            'Prince', 'Princess', 'Emir', 'Emira',
            'Dr.', 'Eng.', 'Pilot',
            'Mr.', 'Mrs.', 'Ms.',
            'Sayyid', 'Sayyida',
        ]

        # Sort alternatives longest-first and add a (?!\w) guard so a title
        # never matches as a prefix of a longer word. Without this, regex
        # alternation picked the first listed title, and e.g.
        # "Sheikha Fatima" was mangled to "a Fatima" (likewise
        # Prince/Princess, Emir/Emira, Sayyid/Sayyida).
        title_pattern = '|'.join(
            re.escape(t) for t in sorted(self.titles, key=len, reverse=True)
        )
        self.title_regex = re.compile(
            rf'^({title_pattern})\.?(?!\w)\s*',
            re.IGNORECASE
        )

        # canonical spelling → known romanization variants
        self.spelling_map = {
            'muhammad': ['mohammed', 'mohamed', 'muhammed', 'mohammad', 'mohamad'],
            'ahmad': ['ahmed'],
            'abdullah': ['abdulla', 'abdallah'],
            'khalid': ['khaled'],
            'rashid': ['rashed'],
            'said': ['saeed', 'saeid'],
            'mansour': ['mansoor'],
            'tahnoun': ['tahnoon'],
            'hazza': ['hazzaa'],
            'thiab': ['theyab', 'diab', 'dhiab'],
            'shakhbut': ['shakhboot'],
            'lateefa': ['latifa'],
            'maryam': ['mariam', 'maryem'],
            'salim': ['salem'],
            'jabir': ['jaber'],
            'nasir': ['nasser'],
            'fatimah': ['fatema', 'fatima'],
            'nour': ['noor'],
            'hamad': ['hamed'],
            # Dynasty / family names
            'nahyan': ['nehayan', 'nayhan', 'nahayn'],
            'maktoum': ['maktoom'],
        }

        self._build_spelling_lookup()

        # Particle variants folded to a single canonical form. The single
        # consonant forms (ad/as/at/az/an/ar) cover sun-letter assimilated
        # transliterations of the definite article.
        self.particle_normalizations = {
            'ibn': 'bin',
            'ben': 'bin',
            'aal': 'al',
            'el': 'al',
            'ad': 'al',
            'as': 'al',
            'at': 'al',
            'az': 'al',
            'an': 'al',
            'ar': 'al',
        }

        # Particles dropped entirely by stem() for broad matching.
        self.particles_to_remove = {'bin', 'ibn', 'ben', 'al', 'el', 'aal', 'bint'}

    def _build_spelling_lookup(self) -> None:
        """Build the variant → canonical spelling lookup from spelling_map."""
        self._spelling_lookup = {}
        for canonical, variants in self.spelling_map.items():
            for variant in variants:
                self._spelling_lookup[variant] = canonical

    def clean(self, text: str) -> str:
        """
        Basic cleaning: Unicode normalization, ASCII conversion, whitespace.

        Args:
            text: Raw input text

        Returns:
            Cleaned text
        """
        if not text:
            return ""

        # Decompose accented characters so the ASCII pass keeps base letters
        # (e.g. "é" → "e" + combining accent → "e").
        text = unicodedata.normalize('NFKD', text)

        # Drop remaining non-ASCII bytes (combining marks, Arabic script).
        # Decode as ASCII to match the encoding used (the original decoded
        # as UTF-8, which is equivalent for pure-ASCII bytes but misleading).
        text = text.encode('ascii', 'ignore').decode('ascii')

        # Collapse runs of whitespace; split()/join also strips the ends.
        return ' '.join(text.split())

    def strip_titles(self, text: str) -> str:
        """
        Remove titles and honorifics from the beginning of text.

        Strips repeatedly so stacked titles ("H.H. Sheikh ...") are all
        removed.

        Args:
            text: Text potentially starting with title(s)

        Returns:
            Text with leading titles stripped
        """
        while True:
            new_text = self.title_regex.sub('', text).strip()
            if new_text == text:
                break
            text = new_text

        return text

    def normalize(self, text: str) -> str:
        """
        Standard normalization: cleaning + title stripping + spelling
        standardization. Preserves particles (bin, al) for structured
        matching. Idempotent: normalizing already-normalized text is a no-op.

        Args:
            text: Raw text

        Returns:
            Normalized text (lowercase, standardized spelling)
        """
        if not text:
            return ""

        text = self.clean(text)
        text = self.strip_titles(text)

        # Hyphenated compounds ("Al-Nahyan") become separate tokens, then
        # remaining punctuation is dropped before lowercasing.
        text = text.replace('-', ' ')
        text = re.sub(r'[^\w\s]', '', text).lower()

        # Token-wise canonicalization: particles first, then spelling.
        normalized = []
        for token in text.split():
            if token in self.particle_normalizations:
                normalized.append(self.particle_normalizations[token])
            elif token in self._spelling_lookup:
                normalized.append(self._spelling_lookup[token])
            else:
                normalized.append(token)

        return ' '.join(normalized)

    def stem(self, text: str) -> str:
        """
        Aggressive stemming: normalization + particle removal.
        Used for broad matching when normalized match fails.

        Args:
            text: Text (can be raw or already normalized)

        Returns:
            Stemmed text (particles removed)
        """
        # normalize() is idempotent, so it is safe to apply unconditionally.
        # (The previous islower() heuristic skipped normalization for raw
        # all-lowercase input, leaving non-canonical spellings in place.)
        text = self.normalize(text)

        stemmed = [t for t in text.split() if t not in self.particles_to_remove]

        return ' '.join(stemmed)

    def get_all_forms(self, text: str) -> Tuple[str, str, str]:
        """
        Get all forms of text for matching.

        Args:
            text: Raw input text

        Returns:
            Tuple of (cleaned, normalized, stemmed)
        """
        cleaned = self.clean(text)
        normalized = self.normalize(text)
        stemmed = self.stem(normalized)

        return (cleaned, normalized, stemmed)

    def extract_name_parts(self, name: str) -> dict:
        """
        Extract structured parts from an Arabic name.
        Useful for debugging and structured matching.

        Note: operates on the raw lowercased tokens (no spelling or particle
        normalization), so first_name/patronymic keep the input spelling.

        Args:
            name: Full name (e.g., "H.H. Sheikh Mohammed bin Zayed Al Nahyan")

        Returns:
            Dict with original, titles, first_name, patronymic, family_name
        """
        original = name

        # Peel leading titles one at a time, recording each as matched.
        titles = []
        while True:
            match = self.title_regex.match(name)
            if match:
                titles.append(match.group(1))
                name = name[match.end():].strip()
            else:
                break

        name_lower = name.lower()
        tokens = name_lower.split()

        # Family name: trailing "al X", or a bare known dynasty name.
        family_name = ""
        if len(tokens) >= 2 and tokens[-2] == 'al':
            family_name = f"al {tokens[-1]}"
            tokens = tokens[:-2]
        elif tokens and tokens[-1] in ['nahyan', 'maktoum', 'qasimi', 'sharqi', 'nuaimi', 'mualla']:
            family_name = tokens[-1]
            tokens = tokens[:-1]

        # First non-particle token is the given name; the rest of the
        # non-particle tokens form the patronymic chain.
        first_name = ""
        patronymic = []

        for token in tokens:
            if token in self.particles_to_remove:
                continue
            if not first_name:
                first_name = token
            else:
                patronymic.append(token)

        return {
            "original": original,
            "titles": titles,
            "first_name": first_name,
            "patronymic": patronymic,
            "family_name": family_name,
        }
|
|
|
|
|
|
|
|
|
|
|
# Module-level singleton, built lazily by get_normalizer().
_default_normalizer = None


def get_normalizer() -> ArabicNameNormalizer:
    """Return the shared ArabicNameNormalizer, constructing it on first use."""
    global _default_normalizer
    if _default_normalizer is None:
        _default_normalizer = ArabicNameNormalizer()
    return _default_normalizer
|
|
|
|
|
|