"""
Arabic Text Utilities
=====================
Utilities for normalizing and processing Arabic text for function calling
evaluation.
"""

import re
import unicodedata


class ArabicNormalizer:
    """Normalize Arabic text for consistent comparison.

    Each normalization step is controlled by a boolean attribute set in the
    constructor. The attributes are read on every call to ``normalize``, so
    they may be toggled on an existing instance.
    """

    # Arabic diacritics (tashkeel) to remove
    ARABIC_DIACRITICS = re.compile(r'[\u064B-\u065F\u0670]')

    # Alef variants to normalize
    ALEF_VARIANTS = {
        '\u0622': '\u0627',  # آ -> ا
        '\u0623': '\u0627',  # أ -> ا
        '\u0625': '\u0627',  # إ -> ا
        '\u0671': '\u0627',  # ٱ -> ا
    }

    # Ta marbuta to ha
    TA_MARBUTA = '\u0629'
    HA = '\u0647'

    # Arabic-Indic numerals to Western
    ARABIC_INDIC_NUMERALS = {
        '\u0660': '0', '\u0661': '1', '\u0662': '2', '\u0663': '3',
        '\u0664': '4', '\u0665': '5', '\u0666': '6', '\u0667': '7',
        '\u0668': '8', '\u0669': '9',
    }

    # Extended Arabic-Indic numerals (Persian/Urdu)
    EXTENDED_NUMERALS = {
        '\u06F0': '0', '\u06F1': '1', '\u06F2': '2', '\u06F3': '3',
        '\u06F4': '4', '\u06F5': '5', '\u06F6': '6', '\u06F7': '7',
        '\u06F8': '8', '\u06F9': '9',
    }

    # Anything that is neither a word character nor whitespace; used by
    # normalize_for_comparison to drop punctuation.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(
        self,
        remove_diacritics: bool = True,
        normalize_alef: bool = True,
        normalize_ta_marbuta: bool = False,
        normalize_numerals: bool = True,
        lowercase: bool = True,
        strip_whitespace: bool = True
    ):
        """Store which normalization steps ``normalize`` should apply.

        Args:
            remove_diacritics: Strip characters matched by ARABIC_DIACRITICS.
            normalize_alef: Map the ALEF_VARIANTS keys to bare alef.
            normalize_ta_marbuta: Replace ta marbuta with ha.
            normalize_numerals: Convert Arabic-Indic and extended
                Arabic-Indic digits to Western digits.
            lowercase: Lowercase the text (affects Latin characters).
            strip_whitespace: Trim and collapse runs of whitespace to a
                single space.
        """
        self.remove_diacritics = remove_diacritics
        self.normalize_alef = normalize_alef
        self.normalize_ta_marbuta = normalize_ta_marbuta
        self.normalize_numerals = normalize_numerals
        self.lowercase = lowercase
        self.strip_whitespace = strip_whitespace

    def normalize(self, text: str) -> str:
        """Apply all configured normalizations to text.

        Args:
            text: The string to normalize. Falsy input (``""`` or ``None``)
                yields ``""``.

        Returns:
            The NFC-normalized text with every enabled step applied, in the
            order: diacritics, alef, ta marbuta, numerals, lowercase,
            whitespace.
        """
        if not text:
            return ""

        # Unicode normalization
        text = unicodedata.normalize('NFC', text)

        # Remove diacritics
        if self.remove_diacritics:
            text = self.ARABIC_DIACRITICS.sub('', text)

        # Normalize alef variants (single pass instead of chained replaces)
        if self.normalize_alef:
            text = text.translate(str.maketrans(self.ALEF_VARIANTS))

        # Normalize ta marbuta
        if self.normalize_ta_marbuta:
            text = text.replace(self.TA_MARBUTA, self.HA)

        # Normalize numerals: both digit families in one translation pass
        if self.normalize_numerals:
            digit_map = {
                **self.ARABIC_INDIC_NUMERALS,
                **self.EXTENDED_NUMERALS,
            }
            text = text.translate(str.maketrans(digit_map))

        # Lowercase (for Latin characters in function names)
        if self.lowercase:
            text = text.lower()

        # Strip and normalize whitespace
        if self.strip_whitespace:
            text = ' '.join(text.split())

        return text

    def normalize_for_comparison(self, text: str) -> str:
        """Aggressive normalization for fuzzy matching.

        Runs ``normalize``, then removes every character that is neither a
        word character nor whitespace, and collapses whitespace regardless
        of the ``strip_whitespace`` setting.
        """
        text = self.normalize(text)

        # Remove all punctuation
        text = self._PUNCTUATION.sub('', text)

        # Remove extra whitespace
        return ' '.join(text.split())


# Compiled once at import time rather than on every call.
_NUMBER_PATTERN = re.compile(r'\d+(?:\.\d+)?')

# Arabic decimal separator (U+066B) sitting between two digits.
_ARABIC_DECIMAL_SEPARATOR = re.compile(r'(?<=\d)\u066B(?=\d)')

# Arabic, Arabic Supplement, Arabic Extended-A, and the Arabic Presentation
# Forms A/B blocks (excluding the noncharacters U+FDD0-U+FDEF and the BOM
# U+FEFF, which live inside those blocks but are not Arabic characters).
_ARABIC_CHAR_PATTERN = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF'
    r'\uFB50-\uFDCF\uFDF0-\uFDFF\uFE70-\uFEFC]'
)

_WORD_PATTERN = re.compile(r'\w+')

# Elongation character; purely decorative, so dropped before dialect matching.
_TATWEEL = '\u0640'

# Markers of this length or shorter occur inside many unrelated words
# (e.g. 'دي' inside 'جديد'), so they must match a whole word.
_SHORT_MARKER_MAX_LEN = 3

# Checked in order; the first dialect with a matching marker wins.
_DIALECT_MARKERS = (
    ('egyptian', ('ازاي', 'عايز', 'كده', 'ده', 'دي', 'بتاع', 'اوي', 'خالص')),
    ('gulf', ('شلون', 'ابي', 'ابغى', 'وايد', 'زين', 'حيل', 'يالله')),
    ('levantine', ('كيفك', 'شو', 'هيك', 'منيح', 'كتير', 'هلق', 'بدي')),
)

# The markers are spelled without diacritics and with bare alef, so input is
# brought to the same form before matching. Nothing else is altered.
_DIALECT_NORMALIZER = ArabicNormalizer(
    remove_diacritics=True,
    normalize_alef=True,
    normalize_ta_marbuta=False,
    normalize_numerals=False,
    lowercase=False,
    strip_whitespace=False,
)

_NUMBER_NORMALIZER = ArabicNormalizer(normalize_numerals=True)


def extract_arabic_numbers(text: str) -> list:
    """Extract numbers from Arabic text (both Arabic-Indic and Western).

    Digits are converted to Western form first. The Arabic decimal separator
    (U+066B) between two digits is treated like ``.``, so ``٣٫١٤`` yields
    ``'3.14'`` rather than two separate integers.

    Returns:
        A list of number strings (integers or decimals) in order of
        appearance; empty when the text contains none.
    """
    normalized = _NUMBER_NORMALIZER.normalize(text)
    normalized = _ARABIC_DECIMAL_SEPARATOR.sub('.', normalized)
    return _NUMBER_PATTERN.findall(normalized)


def is_arabic_text(text: str) -> bool:
    """Check if text contains Arabic characters.

    Returns True when at least one character falls in the Arabic Unicode
    blocks, including the presentation-form blocks. Empty or ``None`` input
    returns False.
    """
    if not text:
        return False
    return _ARABIC_CHAR_PATTERN.search(text) is not None


def _marker_present(marker: str, text: str, words: set) -> bool:
    """Return True if a dialect marker occurs in the normalized text."""
    if len(marker) <= _SHORT_MARKER_MAX_LEN:
        # Short markers: whole-word match only, to avoid hits inside
        # ordinary words.
        return marker in words
    # Longer markers: substring match, so inflected or prefixed forms
    # (e.g. 'عايزة') are still recognized.
    return marker in text


def detect_dialect(text: str) -> str:
    """
    Simple dialect detection based on common markers.

    The text is stripped of diacritics and tatweel and its alef variants are
    unified before matching, so spellings such as 'إزاي' match the marker
    'ازاي'. Markers of three letters or fewer must appear as a whole word;
    longer markers may appear anywhere in the text. Dialects are tested in
    the order egyptian, gulf, levantine.

    Returns:
        'egyptian', 'gulf', 'levantine', or 'msa' (Modern Standard Arabic)
    """
    normalized = _DIALECT_NORMALIZER.normalize(text).replace(_TATWEEL, '')
    if not normalized:
        return 'msa'

    words = set(_WORD_PATTERN.findall(normalized))

    for dialect, markers in _DIALECT_MARKERS:
        if any(_marker_present(m, normalized, words) for m in markers):
            return dialect

    return 'msa'