Spaces:

LibrAI
/

uae-kb

Running

App Files Files Community

uae-kb / ir /normalizer.py

Demon1212122

Initial UAE Knowledge System demo

8124364 about 1 month ago

raw

history blame contribute delete

8.82 kB

	"""
	Arabic Name Normalizer
	Handles spelling variations, title stripping, and stemming for Arabic names.
	Optimized for UAE/Saudi royal family name matching.
	"""

	import re
	import unicodedata
	from typing import List, Tuple


	class ArabicNameNormalizer:
	    """
	    Normalizes romanized Arabic names for consistent matching.

	    Handles:
	    - Title/honorific stripping (H.H., Sheikh, etc.)
	    - Spelling standardization (Mohammed → Muhammad, Nehayan → Nahyan)
	    - Particle normalization (ibn → bin, aal → al)
	    - Search stemming (aggressive removal of particles)
	    """

	    def __init__(self):
	        """Build the title regex, spelling lookup and particle tables."""
	        # Titles and honorifics to strip from the start of a name.
	        self.titles = [
	            'H.R.H.', 'H.H.', 'H.E.',
	            'Maj. Gen.', 'Lt. Gen.', 'Brig. Gen.', 'Lt. Col.',
	            'Sheikh', 'Sheikha', 'Shaikh', 'Shaykh',
	            'Prince', 'Princess', 'Emir', 'Emira',
	            'Dr.', 'Eng.', 'Pilot',
	            'Mr.', 'Mrs.', 'Ms.',
	            'Sayyid', 'Sayyida',
	        ]

	        # Build title regex (case-insensitive).
	        # Alternation is first-match-wins, so longer titles are tried first
	        # ("Sheikha" before "Sheikh"). Titles ending in a letter also get a
	        # word boundary so they never eat the start of a longer word
	        # ("Emir" must not match inside "Emirates").
	        alternatives = []
	        for title in sorted(self.titles, key=len, reverse=True):
	            escaped = re.escape(title)
	            if title[-1].isalnum():
	                escaped += r'\b'
	            alternatives.append(escaped)
	        title_pattern = '|'.join(alternatives)
	        self.title_regex = re.compile(
	            rf'^({title_pattern})\.?\s*',
	            re.IGNORECASE
	        )

	        # Canonical spelling mappings (canonical → list of variants)
	        self.spelling_map = {
	            # First names
	            'muhammad': ['mohammed', 'mohamed', 'muhammed', 'mohammad', 'mohamad'],
	            'ahmad': ['ahmed'],
	            'abdullah': ['abdulla', 'abdallah'],
	            'khalid': ['khaled'],
	            'rashid': ['rashed'],
	            'said': ['saeed', 'saeid'],
	            'mansour': ['mansoor'],
	            'tahnoun': ['tahnoon'],
	            'hazza': ['hazzaa'],
	            'thiab': ['theyab', 'diab', 'dhiab'],
	            'shakhbut': ['shakhboot'],
	            'lateefa': ['latifa'],
	            'maryam': ['mariam', 'maryem'],
	            'salim': ['salem'],
	            'jabir': ['jaber'],
	            'nasir': ['nasser'],
	            'fatimah': ['fatema', 'fatima'],
	            'nour': ['noor'],
	            'hamad': ['hamed'],  # Important variation

	            # Family names
	            'nahyan': ['nehayan', 'nayhan', 'nahayn'],
	            'maktoum': ['maktoom'],
	        }

	        # Build reverse lookup
	        self._build_spelling_lookup()

	        # Particles (connectors in Arabic names) and the form each one is
	        # rewritten to. The two-letter 'a?' entries are presumably
	        # assimilated spellings of the article "al" - TODO confirm.
	        self.particle_normalizations = {
	            'ibn': 'bin',
	            'ben': 'bin',
	            'aal': 'al',
	            'el': 'al',
	            'ad': 'al',
	            'as': 'al',
	            'at': 'al',
	            'az': 'al',
	            'an': 'al',
	            'ar': 'al',
	        }

	        # Particles to remove for aggressive stemming
	        self.particles_to_remove = {'bin', 'ibn', 'ben', 'al', 'el', 'aal', 'bint'}

	    def _build_spelling_lookup(self) -> None:
	        """Build the variant → canonical lookup from ``self.spelling_map``."""
	        self._spelling_lookup = {
	            variant: canonical
	            for canonical, variants in self.spelling_map.items()
	            for variant in variants
	        }

	    def _remove_particles(self, normalized: str) -> str:
	        """Drop every token listed in ``self.particles_to_remove``."""
	        return ' '.join(
	            token for token in normalized.split()
	            if token not in self.particles_to_remove
	        )

	    def clean(self, text: str) -> str:
	        """
	        Basic cleaning: Unicode normalization, ASCII conversion, whitespace.

	        Characters that have no ASCII form after NFKD decomposition
	        (including Arabic-script letters) are dropped.

	        Args:
	            text: Raw input text

	        Returns:
	            Cleaned text ("" for empty or None input)
	        """
	        if not text:
	            return ""

	        # Unicode normalization (decompose accents)
	        text = unicodedata.normalize('NFKD', text)
	        # Convert to ASCII (drops the combining marks produced above)
	        text = text.encode('ASCII', 'ignore').decode('utf-8')
	        # Collapse whitespace runs; split/join also trims both ends
	        return ' '.join(text.split())

	    def strip_titles(self, text: str) -> str:
	        """
	        Remove titles and honorifics from beginning of text.

	        Args:
	            text: Text potentially starting with title

	        Returns:
	            Text with titles stripped
	        """
	        # Repeatedly strip titles (for cases like "H.H. Sheikh")
	        while True:
	            new_text = self.title_regex.sub('', text).strip()
	            if new_text == text:
	                return text
	            text = new_text

	    def normalize(self, text: str) -> str:
	        """
	        Standard normalization: cleaning + title stripping + spelling standardization.
	        Preserves particles (bin, al) for structured matching.

	        Args:
	            text: Raw text

	        Returns:
	            Normalized text (lowercase, standardized spelling)
	        """
	        if not text:
	            return ""

	        # Step 1: Clean
	        text = self.clean(text)

	        # Step 2: Strip titles (before punctuation removal, because the
	        # title patterns rely on their dots)
	        text = self.strip_titles(text)

	        # Step 3: Lowercase and remove punctuation (except hyphens→spaces)
	        text = text.replace('-', ' ')
	        text = re.sub(r'[^\w\s]', '', text).lower()

	        # Step 4: Normalize particles first, then spelling variants
	        normalized = []
	        for token in text.split():
	            if token in self.particle_normalizations:
	                normalized.append(self.particle_normalizations[token])
	            else:
	                normalized.append(self._spelling_lookup.get(token, token))

	        return ' '.join(normalized)

	    def stem(self, text: str) -> str:
	        """
	        Aggressive stemming: normalization + particle removal.
	        Used for broad matching when normalized match fails.

	        The input is always normalized first. Guessing "already
	        normalized" from the letter case would let lowercase raw input
	        keep its titles and spelling variants.

	        Args:
	            text: Text (can be raw or already normalized)

	        Returns:
	            Stemmed text (particles removed)
	        """
	        return self._remove_particles(self.normalize(text))

	    def get_all_forms(self, text: str) -> Tuple[str, str, str]:
	        """
	        Get all forms of text for matching.

	        Args:
	            text: Raw input text

	        Returns:
	            Tuple of (cleaned, normalized, stemmed)
	        """
	        cleaned = self.clean(text)
	        normalized = self.normalize(text)
	        # Already normalized, so only the particle removal is left to do.
	        stemmed = self._remove_particles(normalized)

	        return (cleaned, normalized, stemmed)

	    def extract_name_parts(self, name: str) -> dict:
	        """
	        Extract structured parts from an Arabic name.
	        Useful for debugging and structured matching.

	        Args:
	            name: Full name (e.g., "H.H. Sheikh Mohammed bin Zayed Al Nahyan")

	        Returns:
	            Dict with original, titles, first_name, patronymic, family_name.
	            Name parts are lowercased; titles keep the input's casing.
	        """
	        original = name
	        # Tolerate None and surrounding whitespace; the title regex is
	        # anchored at the start of the string.
	        name = (name or "").strip()

	        # Extract titles
	        titles = []
	        while True:
	            match = self.title_regex.match(name)
	            if not match:
	                break
	            titles.append(match.group(1))
	            name = name[match.end():].strip()

	        # Normalize for parsing
	        tokens = name.lower().split()

	        # Find family name (usually after "al" at the end)
	        known_families = ('nahyan', 'maktoum', 'qasimi', 'sharqi', 'nuaimi', 'mualla')
	        family_name = ""
	        if len(tokens) >= 2 and tokens[-2] == 'al':
	            family_name = f"al {tokens[-1]}"
	            tokens = tokens[:-2]
	        elif tokens and tokens[-1] in known_families:
	            family_name = tokens[-1]
	            tokens = tokens[:-1]

	        # First name is first non-particle token; the rest form the
	        # patronymic chain
	        name_tokens = [t for t in tokens if t not in self.particles_to_remove]
	        first_name = name_tokens[0] if name_tokens else ""
	        patronymic = name_tokens[1:]

	        return {
	            "original": original,
	            "titles": titles,
	            "first_name": first_name,
	            "patronymic": patronymic,  # Father, grandfather names
	            "family_name": family_name,
	        }


	# Module-wide shared normalizer, created lazily on first request
	_default_normalizer = None


	def get_normalizer() -> ArabicNameNormalizer:
	    """Return the shared ArabicNameNormalizer, creating it on first call."""
	    global _default_normalizer
	    # Fast path: the shared instance already exists
	    if _default_normalizer is not None:
	        return _default_normalizer
	    _default_normalizer = ArabicNameNormalizer()
	    return _default_normalizer