# Source: HeshamHaroon (Hugging Face)
# Commit: "Initial release: Arabic Function Calling Leaderboard"
# Revision: 566d03e (verified)
"""
Arabic Text Utilities
=====================
Utilities for normalizing and processing Arabic text for function calling evaluation.
"""
import re
import unicodedata
class ArabicNormalizer:
    """Normalize Arabic text for consistent comparison.

    Every normalization step is controlled by a constructor flag, stored as
    an instance attribute of the same name and consulted on each call to
    :meth:`normalize`.
    """

    # Arabic diacritics (tashkeel) to remove: U+064B-U+065F plus the
    # superscript alef U+0670.
    ARABIC_DIACRITICS = re.compile(r'[\u064B-\u065F\u0670]')

    # Alef variants folded to the bare alef (U+0627).
    ALEF_VARIANTS = {
        '\u0622': '\u0627',  # alef with madda above -> alef
        '\u0623': '\u0627',  # alef with hamza above -> alef
        '\u0625': '\u0627',  # alef with hamza below -> alef
        '\u0671': '\u0627',  # alef wasla -> alef
    }

    # Ta marbuta is rewritten to ha when normalize_ta_marbuta is enabled.
    TA_MARBUTA = '\u0629'
    HA = '\u0647'

    # Tatweel (kashida): a purely typographic elongation character. It counts
    # as a word character for the `re` module, so it has to be stripped
    # explicitly when comparing text.
    TATWEEL = '\u0640'

    # Arabic-Indic numerals to Western digits.
    ARABIC_INDIC_NUMERALS = {
        '\u0660': '0', '\u0661': '1', '\u0662': '2', '\u0663': '3', '\u0664': '4',
        '\u0665': '5', '\u0666': '6', '\u0667': '7', '\u0668': '8', '\u0669': '9',
    }

    # Extended Arabic-Indic numerals (Persian/Urdu) to Western digits.
    EXTENDED_NUMERALS = {
        '\u06F0': '0', '\u06F1': '1', '\u06F2': '2', '\u06F3': '3', '\u06F4': '4',
        '\u06F5': '5', '\u06F6': '6', '\u06F7': '7', '\u06F8': '8', '\u06F9': '9',
    }

    def __init__(
        self,
        remove_diacritics: bool = True,
        normalize_alef: bool = True,
        normalize_ta_marbuta: bool = False,
        normalize_numerals: bool = True,
        lowercase: bool = True,
        strip_whitespace: bool = True
    ) -> None:
        """Configure which normalization steps :meth:`normalize` applies.

        Args:
            remove_diacritics: Delete characters matched by
                ``ARABIC_DIACRITICS``.
            normalize_alef: Replace every key of ``ALEF_VARIANTS`` with the
                bare alef.
            normalize_ta_marbuta: Replace ta marbuta with ha (off by default).
            normalize_numerals: Replace Arabic-Indic and extended
                Arabic-Indic digits with ``0``-``9``.
            lowercase: Apply ``str.lower()`` (affects Latin characters such
                as function names).
            strip_whitespace: Trim the text and collapse every whitespace run
                to a single space.
        """
        self.remove_diacritics = remove_diacritics
        self.normalize_alef = normalize_alef
        self.normalize_ta_marbuta = normalize_ta_marbuta
        self.normalize_numerals = normalize_numerals
        self.lowercase = lowercase
        self.strip_whitespace = strip_whitespace

    def normalize(self, text: str) -> str:
        """Apply all configured normalizations to text.

        Args:
            text: Text to normalize. Any falsy value (``""``, ``None``)
                yields an empty string.

        Returns:
            The text after NFC normalization followed by each enabled step.
        """
        if not text:
            return ""

        # Compose characters first so that e.g. alef + combining hamza is
        # seen as the single precomposed variant by the steps below.
        text = unicodedata.normalize('NFC', text)

        if self.remove_diacritics:
            text = self.ARABIC_DIACRITICS.sub('', text)

        # Gather every enabled one-character substitution and apply them in a
        # single pass. No replacement output is itself a key, so the result
        # is the same as applying the substitutions one after another.
        char_map = {}
        if self.normalize_alef:
            char_map.update(self.ALEF_VARIANTS)
        if self.normalize_ta_marbuta:
            char_map[self.TA_MARBUTA] = self.HA
        if self.normalize_numerals:
            char_map.update(self.ARABIC_INDIC_NUMERALS)
            char_map.update(self.EXTENDED_NUMERALS)
        if char_map:
            text = text.translate(str.maketrans(char_map))

        # Lowercase (for Latin characters in function names).
        if self.lowercase:
            text = text.lower()

        # split() with no argument drops leading/trailing whitespace and
        # treats any whitespace run as one separator.
        if self.strip_whitespace:
            text = ' '.join(text.split())

        return text

    def normalize_for_comparison(self, text: str) -> str:
        """Aggressive normalization for fuzzy matching.

        Runs :meth:`normalize`, then removes tatweel, removes every character
        that is neither a word character nor whitespace, and collapses
        whitespace regardless of the ``strip_whitespace`` flag. Underscores
        are word characters and are therefore kept.

        Args:
            text: Text to normalize; falsy input yields ``""``.

        Returns:
            The normalized text with punctuation and tatweel removed.
        """
        text = self.normalize(text)
        # Tatweel matches \w, so the punctuation filter below would keep it
        # and stretched spellings would never compare equal to the plain word.
        text = text.replace(self.TATWEEL, '')
        # Remove all punctuation.
        text = re.sub(r'[^\w\s]', '', text)
        # Removing punctuation can leave doubled or dangling spaces behind.
        return ' '.join(text.split())
def extract_arabic_numbers(text: str) -> list:
    """Extract numbers from Arabic text (both Arabic-Indic and Western).

    The text is first run through ``ArabicNormalizer`` so that Arabic-Indic
    and extended Arabic-Indic digits become ``0``-``9``. The Arabic decimal
    separator (U+066B) and thousands separator (U+066C) are then folded into
    Western notation when they sit between two digits.

    Args:
        text: Text to scan; falsy input yields an empty list.

    Returns:
        The matched numbers as strings, in order of appearance. A number is
        a run of digits optionally followed by ``.`` and more digits.
    """
    normalizer = ArabicNormalizer(normalize_numerals=True)
    normalized = normalizer.normalize(text)

    # The thousands separator only groups digits; dropping it keeps a number
    # such as 1<U+066C>000 in one piece instead of splitting it in two.
    normalized = re.sub(r'(?<=\d)\u066C(?=\d)', '', normalized)
    # The decimal separator becomes '.', the form the pattern below accepts.
    normalized = re.sub(r'(?<=\d)\u066B(?=\d)', '.', normalized)

    return re.findall(r'\d+(?:\.\d+)?', normalized)
# Arabic-script code points: the Arabic block, Arabic Supplement, Arabic
# Extended-A, and the two Presentation Forms blocks. The noncharacters
# U+FDD0-U+FDEF and the byte-order mark U+FEFF are deliberately left out.
_ARABIC_CHAR_RE = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF'
    r'\uFB50-\uFDCF\uFDF0-\uFDFF\uFE70-\uFEFC]'
)


def is_arabic_text(text: str) -> bool:
    """Check if text contains Arabic characters.

    Args:
        text: Text to inspect; falsy input (``""``, ``None``) is treated as
            containing no Arabic.

    Returns:
        ``True`` if at least one character falls in an Arabic-script range,
        including presentation-form (pre-shaped) letters; ``False`` otherwise.
    """
    if not text:
        return False
    return _ARABIC_CHAR_RE.search(text) is not None
def detect_dialect(text: str) -> str:
    """
    Simple dialect detection based on common markers.

    The marker groups are tried in a fixed order (Egyptian, then Gulf, then
    Levantine) and the first group with any marker present in the text wins.
    Matching is a plain substring test on the text as given, with no
    normalization, so a short marker also matches inside a longer word.

    Args:
        text: Text to classify; falsy input (``""``, ``None``) is reported
            as ``'msa'``.

    Returns: 'egyptian', 'gulf', 'levantine', or 'msa' (Modern Standard Arabic)
    """
    if not text:
        return 'msa'

    # NOTE: the marker literals are written as real Arabic text. A copy of
    # these strings that has been through a wrong-encoding round trip
    # (mojibake) can never match Arabic input, which makes every call fall
    # through to 'msa'.
    dialect_markers = (
        ('egyptian', ('ازاي', 'عايز', 'كده', 'ده', 'دي', 'بتاع', 'اوي', 'خالص')),
        ('gulf', ('شلون', 'ابي', 'ابغى', 'وايد', 'زين', 'حيل', 'يالله')),
        ('levantine', ('كيفك', 'شو', 'هيك', 'منيح', 'كتير', 'هلق', 'بدي')),
    )

    for dialect, markers in dialect_markers:
        if any(marker in text for marker in markers):
            return dialect

    # No dialect marker found.
    return 'msa'