uae-kb / ir /normalizer.py
Demon1212122's picture
Initial UAE Knowledge System demo
8124364
"""
Arabic Name Normalizer
Handles spelling variations, title stripping, and stemming for Arabic names.
Optimized for UAE/Saudi royal family name matching.
"""
import re
import unicodedata
from typing import List, Tuple
class ArabicNameNormalizer:
    """
    Normalizes transliterated (Latin-script) Arabic names for consistent matching.

    Handles:
    - Title/honorific stripping (H.H., Sheikh, etc.)
    - Spelling standardization (Mohammed → Muhammad, Nehayan → Nahyan)
    - Particle normalization (ibn → bin, aal → al)
    - Search stemming (aggressive removal of particles)

    NOTE: ``clean`` converts text to ASCII and silently drops characters that
    have no ASCII decomposition, so names written in Arabic script are
    reduced to an empty string.
    """

    # Family names recognized by extract_name_parts when not preceded by "al".
    _KNOWN_FAMILY_NAMES = frozenset(
        {'nahyan', 'maktoum', 'qasimi', 'sharqi', 'nuaimi', 'mualla'}
    )

    def __init__(self):
        # Titles and honorifics to strip. Order here does not matter: the
        # regex builder sorts them longest-first.
        self.titles = [
            'H.R.H.', 'H.H.', 'H.E.',
            'Maj. Gen.', 'Lt. Gen.', 'Brig. Gen.', 'Lt. Col.',
            'Sheikh', 'Sheikha', 'Shaikh', 'Shaykh',
            'Prince', 'Princess', 'Emir', 'Emira',
            'Dr.', 'Eng.', 'Pilot',
            'Mr.', 'Mrs.', 'Ms.',
            'Sayyid', 'Sayyida',
        ]
        # Anchored, case-insensitive regex; group(1) is the matched title.
        self.title_regex = self._compile_title_regex(self.titles)

        # Canonical spelling → list of variant spellings mapped onto it
        self.spelling_map = {
            # First names
            'muhammad': ['mohammed', 'mohamed', 'muhammed', 'mohammad', 'mohamad'],
            'ahmad': ['ahmed'],
            'abdullah': ['abdulla', 'abdallah'],
            'khalid': ['khaled'],
            'rashid': ['rashed'],
            'said': ['saeed', 'saeid'],
            'mansour': ['mansoor'],
            'tahnoun': ['tahnoon'],
            'hazza': ['hazzaa'],
            'thiab': ['theyab', 'diab', 'dhiab'],
            'shakhbut': ['shakhboot'],
            'lateefa': ['latifa'],
            'maryam': ['mariam', 'maryem'],
            'salim': ['salem'],
            'jabir': ['jaber'],
            'nasir': ['nasser'],
            'fatimah': ['fatema', 'fatima'],
            'nour': ['noor'],
            'hamad': ['hamed'],  # Important variation
            # Family names
            'nahyan': ['nehayan', 'nayhan', 'nahayn'],
            'maktoum': ['maktoom'],
        }
        # Build reverse lookup (variant → canonical)
        self._build_spelling_lookup()

        # Particles (connectors in Arabic names) mapped to one standard form
        self.particle_normalizations = {
            'ibn': 'bin',
            'ben': 'bin',
            'aal': 'al',
            'el': 'al',
            'ad': 'al',
            'as': 'al',
            'at': 'al',
            'az': 'al',
            'an': 'al',
            'ar': 'al',
        }
        # Particles to remove for aggressive stemming
        self.particles_to_remove = {'bin', 'ibn', 'ben', 'al', 'el', 'aal', 'bint'}

    @staticmethod
    def _compile_title_regex(titles):
        """Compile a regex matching one leading title plus trailing dot/space.

        Alternatives are tried longest-first so "Sheikha" wins over "Sheikh".
        Titles ending in a letter must not be followed by another word
        character, so "Emirates" or "Princeton" are not truncated. Titles
        ending in "." need no such boundary ("H.H.Sheikh" still matches).
        """
        alternatives = []
        for title in sorted(titles, key=len, reverse=True):
            words = title.split()
            if not words:
                # An empty title would match everywhere; ignore it.
                continue
            # Allow any run of whitespace between the words of a title.
            pattern = r'\s+'.join(re.escape(word) for word in words)
            if title.rstrip()[-1].isalnum():
                pattern += r'(?!\w)'
            alternatives.append(pattern)
        joined = '|'.join(alternatives)
        return re.compile(rf'^({joined})\.?\s*', re.IGNORECASE)

    def _build_spelling_lookup(self) -> None:
        """Build variant → canonical lookup from ``self.spelling_map``."""
        self._spelling_lookup = {
            variant: canonical
            for canonical, variants in self.spelling_map.items()
            for variant in variants
        }

    def _remove_particles(self, normalized: str) -> str:
        """Drop connector particles from already-normalized text."""
        return ' '.join(
            token for token in normalized.split()
            if token not in self.particles_to_remove
        )

    def clean(self, text: str) -> str:
        """
        Basic cleaning: Unicode normalization, ASCII conversion, whitespace.

        Characters that do not decompose to ASCII (including all Arabic-script
        letters) are discarded.

        Args:
            text: Raw input text
        Returns:
            Cleaned text ("" for empty/None input)
        """
        if not text:
            return ""
        # Decompose accented characters so the base letter survives the
        # ASCII conversion below.
        text = unicodedata.normalize('NFKD', text)
        # Convert to ASCII (drops combining marks and non-Latin characters)
        text = text.encode('ascii', 'ignore').decode('ascii')
        # Collapse runs of whitespace and trim both ends
        return ' '.join(text.split())

    def strip_titles(self, text: str) -> str:
        """
        Remove titles and honorifics from beginning of text.

        Args:
            text: Text potentially starting with title
        Returns:
            Text with all leading titles stripped ("" for empty/None input)
        """
        if not text:
            return ""
        text = text.strip()
        # Repeatedly strip titles (for cases like "H.H. Sheikh")
        while True:
            remainder = self.title_regex.sub('', text, count=1).strip()
            if remainder == text:
                return text
            text = remainder

    def normalize(self, text: str) -> str:
        """
        Standard normalization: cleaning + title stripping + spelling standardization.
        Preserves particles (bin, al) for structured matching.

        Args:
            text: Raw text
        Returns:
            Normalized text (lowercase, standardized spelling)
        """
        if not text:
            return ""
        # Step 1: Clean
        text = self.clean(text)
        # Step 2: Strip titles (before punctuation removal, since dotted
        # titles such as "H.H." rely on their dots to match)
        text = self.strip_titles(text)
        # Step 3: Hyphens become spaces, other punctuation is dropped, lowercase
        text = text.replace('-', ' ')
        text = re.sub(r'[^\w\s]', '', text).lower()
        # Step 4: Normalize particles first, then spelling variants
        normalized = []
        for token in text.split():
            if token in self.particle_normalizations:
                normalized.append(self.particle_normalizations[token])
            else:
                normalized.append(self._spelling_lookup.get(token, token))
        return ' '.join(normalized)

    def stem(self, text: str) -> str:
        """
        Aggressive stemming: normalization + particle removal.
        Used for broad matching when normalized match fails.

        The input is always normalized first; a lowercase check cannot tell
        raw lowercase input ("sheikh mohammed ...") from normalized text.

        Args:
            text: Text (can be raw or already normalized)
        Returns:
            Stemmed text (particles removed; "" for empty/None input)
        """
        if not text:
            return ""
        return self._remove_particles(self.normalize(text))

    def get_all_forms(self, text: str) -> Tuple[str, str, str]:
        """
        Get all forms of text for matching.

        Args:
            text: Raw input text
        Returns:
            Tuple of (cleaned, normalized, stemmed)
        """
        cleaned = self.clean(text)
        normalized = self.normalize(text)
        # Already normalized, so only particle removal is needed here.
        stemmed = self._remove_particles(normalized)
        return (cleaned, normalized, stemmed)

    def extract_name_parts(self, name: str) -> dict:
        """
        Extract structured parts from an Arabic name.
        Useful for debugging and structured matching.

        Args:
            name: Full name (e.g., "H.H. Sheikh Mohammed bin Zayed Al Nahyan")
        Returns:
            Dict with keys:
            - "original": the input exactly as given
            - "titles": leading titles as written in the input
            - "first_name": first non-particle token (lowercase)
            - "patronymic": remaining non-particle tokens (lowercase)
            - "family_name": "al <name>" or a known family name, else ""
        """
        original = name
        name = (name or "").strip()

        # Extract titles
        titles = []
        while True:
            match = self.title_regex.match(name)
            if not match:
                break
            titles.append(match.group(1))
            name = name[match.end():].strip()

        # Lowercase and split hyphenated forms ("Al-Nahyan") like normalize()
        tokens = name.lower().replace('-', ' ').split()

        # Find family name (usually after "al" at the end)
        family_name = ""
        if len(tokens) >= 2 and tokens[-2] == 'al':
            family_name = f"al {tokens[-1]}"
            tokens = tokens[:-2]
        elif tokens and tokens[-1] in self._KNOWN_FAMILY_NAMES:
            family_name = tokens[-1]
            tokens = tokens[:-1]

        # First name is first non-particle token; the rest form the patronymic
        name_tokens = [t for t in tokens if t not in self.particles_to_remove]
        first_name = name_tokens[0] if name_tokens else ""
        patronymic = name_tokens[1:]

        return {
            "original": original,
            "titles": titles,
            "first_name": first_name,
            "patronymic": patronymic,  # Father, grandfather names
            "family_name": family_name,
        }
# Shared instance, created lazily on the first get_normalizer() call
_default_normalizer = None


def get_normalizer() -> ArabicNameNormalizer:
    """Return the module-wide normalizer, constructing it on first use."""
    global _default_normalizer
    # Fast path: reuse the instance built by an earlier call.
    if _default_normalizer is not None:
        return _default_normalizer
    _default_normalizer = ArabicNameNormalizer()
    return _default_normalizer