# Spaces: LibrAI / uae-kb (Sleeping) - File size: 8,815 Bytes
"""
Arabic Name Normalizer
Handles spelling variations, title stripping, and stemming for Arabic names.
Optimized for UAE/Saudi royal family name matching.
"""

import re
import unicodedata
from typing import List, Tuple


class ArabicNameNormalizer:
    """
    Normalizes romanized Arabic names for consistent matching.

    Handles:
    - Title/honorific stripping (H.H., Sheikh, etc.)
    - Spelling standardization (Mohammed → Muhammad, Nehayan → Nahyan)
    - Particle normalization (ibn → bin, aal → al)
    - Search stemming (aggressive removal of particles)

    Note:
        clean() discards every character that has no ASCII form after NFKD
        decomposition, so input written in Arabic script normalizes to an
        empty string. Only Latin-script (transliterated) names are matched.
    """

    # Family names recognized at the end of a name even without a leading "al"
    _KNOWN_FAMILY_NAMES = frozenset(
        {'nahyan', 'maktoum', 'qasimi', 'sharqi', 'nuaimi', 'mualla'}
    )

    def __init__(self):
        # Titles and honorifics to strip. List order is not significant:
        # the regex below sorts them so longer titles are tried first.
        self.titles = [
            'H.R.H.', 'H.H.', 'H.E.',
            'Maj. Gen.', 'Lt. Gen.', 'Brig. Gen.', 'Lt. Col.',
            'Sheikh', 'Sheikha', 'Shaikh', 'Shaykh',
            'Prince', 'Princess', 'Emir', 'Emira',
            'Dr.', 'Eng.', 'Pilot',
            'Mr.', 'Mrs.', 'Ms.',
            'Sayyid', 'Sayyida',
        ]

        # Build title regex (case-insensitive).
        # - Longest alternatives first, so "Sheikha" wins over "Sheikh" and
        #   "Princess" over "Prince" (otherwise a stray "a"/"ss" is left).
        # - A title must end at a period or be followed by a non-word
        #   character, so "Emirates" is not treated as "Emir" + "ates".
        ordered_titles = sorted(self.titles, key=len, reverse=True)
        title_pattern = '|'.join(re.escape(t) for t in ordered_titles)
        self.title_regex = re.compile(
            rf'^({title_pattern})\.?(?:(?<=\.)|(?!\w))\s*',
            re.IGNORECASE
        )

        # Canonical spelling mappings (canonical → list of variants)
        self.spelling_map = {
            # First names
            'muhammad': ['mohammed', 'mohamed', 'muhammed', 'mohammad', 'mohamad'],
            'ahmad': ['ahmed'],
            'abdullah': ['abdulla', 'abdallah'],
            'khalid': ['khaled'],
            'rashid': ['rashed'],
            'said': ['saeed', 'saeid'],
            'mansour': ['mansoor'],
            'tahnoun': ['tahnoon'],
            'hazza': ['hazzaa'],
            'thiab': ['theyab', 'diab', 'dhiab'],
            'shakhbut': ['shakhboot'],
            'lateefa': ['latifa'],
            'maryam': ['mariam', 'maryem'],
            'salim': ['salem'],
            'jabir': ['jaber'],
            'nasir': ['nasser'],
            'fatimah': ['fatema', 'fatima'],
            'nour': ['noor'],
            'hamad': ['hamed'],  # Important variation

            # Family names
            'nahyan': ['nehayan', 'nayhan', 'nahayn'],
            'maktoum': ['maktoom'],
        }

        # Build reverse lookup
        self._build_spelling_lookup()

        # Particles (connectors in Arabic names) mapped to a canonical form
        self.particle_normalizations = {
            'ibn': 'bin',
            'ben': 'bin',
            'aal': 'al',
            'el': 'al',
            'ad': 'al',
            'as': 'al',
            'at': 'al',
            'az': 'al',
            'an': 'al',
            'ar': 'al',
        }

        # Particles to remove for aggressive stemming
        self.particles_to_remove = {'bin', 'ibn', 'ben', 'al', 'el', 'aal', 'bint'}

    def _build_spelling_lookup(self) -> None:
        """Build variant → canonical lookup from self.spelling_map."""
        self._spelling_lookup = {
            variant: canonical
            for canonical, variants in self.spelling_map.items()
            for variant in variants
        }

    def _normalize_token(self, token: str) -> str:
        """Map one lowercase token to its canonical particle or spelling."""
        # Particle normalization takes priority over spelling normalization
        if token in self.particle_normalizations:
            return self.particle_normalizations[token]
        return self._spelling_lookup.get(token, token)

    def _drop_particles(self, normalized: str) -> str:
        """Remove connector particles from already-normalized text."""
        return ' '.join(
            token for token in normalized.split()
            if token not in self.particles_to_remove
        )

    def clean(self, text: str) -> str:
        """
        Basic cleaning: Unicode normalization, ASCII conversion, whitespace.

        Args:
            text: Raw input text

        Returns:
            Cleaned text; "" for empty/None input. Characters with no ASCII
            equivalent after NFKD decomposition are dropped.
        """
        if not text:
            return ""

        # Unicode normalization (decompose accents)
        text = unicodedata.normalize('NFKD', text)
        # Convert to ASCII (remove accents and any non-ASCII characters)
        text = text.encode('ASCII', 'ignore').decode('ascii')
        # Collapse all whitespace runs to single spaces and trim the ends
        return ' '.join(text.split())

    def strip_titles(self, text: str) -> str:
        """
        Remove titles and honorifics from beginning of text.

        Args:
            text: Text potentially starting with title

        Returns:
            Text with all leading titles stripped; "" for empty/None input
        """
        if not text:
            return ""

        # Repeatedly strip titles (for cases like "H.H. Sheikh")
        while True:
            new_text = self.title_regex.sub('', text).strip()
            if new_text == text:
                return text
            text = new_text

    def normalize(self, text: str) -> str:
        """
        Standard normalization: cleaning + title stripping + spelling standardization.
        Preserves particles (bin, al) for structured matching.

        Args:
            text: Raw text

        Returns:
            Normalized text (lowercase, standardized spelling)
        """
        if not text:
            return ""

        # Step 1: Clean
        text = self.clean(text)

        # Step 2: Strip titles (before punctuation removal, because the
        # title patterns rely on their periods, e.g. "H.H.")
        text = self.strip_titles(text)

        # Step 3: Lowercase and remove punctuation (except hyphens→spaces)
        text = text.replace('-', ' ')
        text = re.sub(r'[^\w\s]', '', text).lower()

        # Step 4: Normalize particles and spellings token by token
        return ' '.join(self._normalize_token(token) for token in text.split())

    def stem(self, text: str) -> str:
        """
        Aggressive stemming: normalization + particle removal.
        Used for broad matching when normalized match fails.

        Args:
            text: Text (can be raw or already normalized)

        Returns:
            Stemmed text (particles removed); "" for empty/None input
        """
        if not text:
            return ""

        # Always normalize: lowercase raw input (e.g. "mohammed bin zayed")
        # must get the same spelling/particle treatment as capitalized input.
        return self._drop_particles(self.normalize(text))

    def get_all_forms(self, text: str) -> Tuple[str, str, str]:
        """
        Get all forms of text for matching.

        Args:
            text: Raw input text

        Returns:
            Tuple of (cleaned, normalized, stemmed)
        """
        cleaned = self.clean(text)
        normalized = self.normalize(text)
        # The normalized form only needs its particles removed
        stemmed = self._drop_particles(normalized)

        return (cleaned, normalized, stemmed)

    def extract_name_parts(self, name: str) -> dict:
        """
        Extract structured parts from an Arabic name.
        Useful for debugging and structured matching.

        Name parts are lowercased but not spelling-normalized; titles keep
        the casing they had in the input.

        Args:
            name: Full name (e.g., "H.H. Sheikh Mohammed bin Zayed Al Nahyan")

        Returns:
            Dict with original, titles, first_name, patronymic, family_name
        """
        original = name
        name = (name or "").strip()

        # Extract leading titles one at a time
        titles = []
        while True:
            match = self.title_regex.match(name)
            if not match:
                break
            titles.append(match.group(1))
            name = name[match.end():].strip()

        # Normalize for parsing
        tokens = name.lower().split()

        # Find family name (usually after "al" at the end)
        family_name = ""
        if len(tokens) >= 2 and tokens[-2] == 'al':
            family_name = f"al {tokens[-1]}"
            tokens = tokens[:-2]
        elif tokens and tokens[-1].startswith('al-') and len(tokens[-1]) > 3:
            # Hyphenated form, e.g. "Al-Nahyan"
            family_name = f"al {tokens[-1][3:]}"
            tokens = tokens[:-1]
        elif tokens and tokens[-1] in self._KNOWN_FAMILY_NAMES:
            family_name = tokens[-1]
            tokens = tokens[:-1]

        # First non-particle token is the first name; the rest form the
        # patronymic chain
        name_tokens = [t for t in tokens if t not in self.particles_to_remove]
        first_name = name_tokens[0] if name_tokens else ""
        patronymic = name_tokens[1:]

        return {
            "original": original,
            "titles": titles,
            "first_name": first_name,
            "patronymic": patronymic,  # Father, grandfather names
            "family_name": family_name,
        }


# Module-wide shared normalizer, created lazily on first request
_default_normalizer = None

def get_normalizer() -> ArabicNameNormalizer:
    """Return the shared ArabicNameNormalizer, constructing it on first use."""
    global _default_normalizer
    shared = _default_normalizer
    if shared is None:
        # First call: build the instance and remember it for later callers
        shared = _default_normalizer = ArabicNameNormalizer()
    return shared