Spaces:

HeshamHaroon
/

Arabic-Function-Calling-Leaderboard

Running

App Files Files Community

Arabic-Function-Calling-Leaderboard / afcl /evaluators /arabic_utils.py

HeshamHaroon

Initial release: Arabic Function Calling Leaderboard

566d03e verified 15 days ago

raw

history blame contribute delete

4.63 kB

	"""
	Arabic Text Utilities
	=====================

	Utilities for normalizing and processing Arabic text for function calling evaluation.
	"""

	import re
	import unicodedata


	class ArabicNormalizer:
	    """Normalize Arabic text for consistent comparison.

	    Every normalization step is switched on or off by a constructor flag;
	    ``normalize`` applies the enabled steps in a fixed order, and
	    ``normalize_for_comparison`` additionally strips tatweel and punctuation.
	    """

	    # Arabic diacritics (tashkeel) to remove
	    ARABIC_DIACRITICS = re.compile(r'[\u064B-\u065F\u0670]')

	    # Tatweel (kashida): a purely typographic elongation character.
	    # It is a letter-class code point, so a "non-word" regex does not remove it.
	    TATWEEL = '\u0640'

	    # Alef variants to normalize
	    ALEF_VARIANTS = {
	        '\u0622': '\u0627',  # آ -> ا
	        '\u0623': '\u0627',  # أ -> ا
	        '\u0625': '\u0627',  # إ -> ا
	        '\u0671': '\u0627',  # ٱ -> ا
	    }

	    # Ta marbuta to ha
	    TA_MARBUTA = '\u0629'
	    HA = '\u0647'

	    # Arabic-Indic numerals to Western
	    ARABIC_INDIC_NUMERALS = {
	        '\u0660': '0', '\u0661': '1', '\u0662': '2', '\u0663': '3', '\u0664': '4',
	        '\u0665': '5', '\u0666': '6', '\u0667': '7', '\u0668': '8', '\u0669': '9',
	    }

	    # Extended Arabic-Indic numerals (Persian/Urdu)
	    EXTENDED_NUMERALS = {
	        '\u06F0': '0', '\u06F1': '1', '\u06F2': '2', '\u06F3': '3', '\u06F4': '4',
	        '\u06F5': '5', '\u06F6': '6', '\u06F7': '7', '\u06F8': '8', '\u06F9': '9',
	    }

	    def __init__(
	        self,
	        remove_diacritics: bool = True,
	        normalize_alef: bool = True,
	        normalize_ta_marbuta: bool = False,
	        normalize_numerals: bool = True,
	        lowercase: bool = True,
	        strip_whitespace: bool = True
	    ):
	        """Store which normalization steps ``normalize`` should apply.

	        Args:
	            remove_diacritics: Delete characters matched by ARABIC_DIACRITICS.
	            normalize_alef: Replace the ALEF_VARIANTS keys with bare alef.
	            normalize_ta_marbuta: Replace ta marbuta with ha (off by default).
	            normalize_numerals: Convert Arabic-Indic and extended
	                Arabic-Indic digits to ASCII digits.
	            lowercase: Lowercase the text (affects Latin characters).
	            strip_whitespace: Trim the ends and collapse whitespace runs
	                into single spaces.
	        """
	        self.remove_diacritics = remove_diacritics
	        self.normalize_alef = normalize_alef
	        self.normalize_ta_marbuta = normalize_ta_marbuta
	        self.normalize_numerals = normalize_numerals
	        self.lowercase = lowercase
	        self.strip_whitespace = strip_whitespace

	    def normalize(self, text: str) -> str:
	        """Apply all configured normalizations to text.

	        Args:
	            text: Input string; any falsy value (``""``, ``None``) yields ``""``.

	        Returns:
	            The normalized string.
	        """
	        if not text:
	            return ""

	        # Unicode normalization (canonical composition) so equivalent
	        # code-point sequences are represented the same way
	        text = unicodedata.normalize('NFC', text)

	        # Remove diacritics
	        if self.remove_diacritics:
	            text = self.ARABIC_DIACRITICS.sub('', text)

	        # Normalize alef variants
	        if self.normalize_alef:
	            for variant, replacement in self.ALEF_VARIANTS.items():
	                text = text.replace(variant, replacement)

	        # Normalize ta marbuta
	        if self.normalize_ta_marbuta:
	            text = text.replace(self.TA_MARBUTA, self.HA)

	        # Normalize numerals
	        if self.normalize_numerals:
	            for arabic, western in self.ARABIC_INDIC_NUMERALS.items():
	                text = text.replace(arabic, western)
	            for persian, western in self.EXTENDED_NUMERALS.items():
	                text = text.replace(persian, western)

	        # Lowercase (for Latin characters in function names)
	        if self.lowercase:
	            text = text.lower()

	        # Strip and normalize whitespace
	        if self.strip_whitespace:
	            text = ' '.join(text.split())

	        return text

	    def normalize_for_comparison(self, text: str) -> str:
	        """Aggressive normalization for fuzzy matching.

	        Runs ``normalize`` and then removes tatweel, every character that is
	        neither a word character nor whitespace, and redundant whitespace.

	        Args:
	            text: Input string; falsy input yields ``""``.

	        Returns:
	            The normalized, punctuation-free string.
	        """
	        text = self.normalize(text)
	        # Tatweel matches \w, so the punctuation filter below would keep it
	        # and an elongated spelling would never equal the plain one.
	        text = text.replace(self.TATWEEL, '')
	        # Remove all punctuation (underscore is a word character and stays)
	        text = re.sub(r'[^\w\s]', '', text)
	        # Remove extra whitespace
	        text = ' '.join(text.split())
	        return text


	def extract_arabic_numbers(text: str) -> list:
	    """Extract numbers from Arabic text (both Arabic-Indic and Western).

	    The text is first passed through ``ArabicNormalizer`` so Arabic-Indic and
	    extended Arabic-Indic digits become ASCII digits. The Arabic decimal
	    separator (U+066B) between digits is then treated as a decimal point and
	    the Arabic thousands separator (U+066C) between digits is dropped, so a
	    number written with them is returned as one value instead of being split.

	    Args:
	        text: Input string; falsy input yields an empty list.

	    Returns:
	        A list of number strings in ASCII digits, e.g. ``['3', '14.5']``.
	    """
	    normalizer = ArabicNormalizer(normalize_numerals=True)
	    normalized = normalizer.normalize(text)
	    # Only rewrite the separators when they sit between two digits, so a
	    # stray separator elsewhere in the text is left untouched.
	    normalized = re.sub(r'(?<=\d)\u066B(?=\d)', '.', normalized)
	    normalized = re.sub(r'(?<=\d)\u066C(?=\d)', '', normalized)
	    return re.findall(r'\d+(?:\.\d+)?', normalized)


	def is_arabic_text(text: str) -> bool:
	    """Check if text contains Arabic characters.

	    A character counts as Arabic when it falls in one of the Unicode Arabic
	    blocks listed in the pattern below, including the presentation-form
	    (ligature) blocks.

	    Args:
	        text: Input string; falsy input (``""``, ``None``) yields ``False``.

	    Returns:
	        ``True`` if at least one Arabic-block character is present.
	    """
	    if not text:
	        return False
	    arabic_pattern = re.compile(
	        r'['
	        r'\u0600-\u06FF'   # Arabic
	        r'\u0750-\u077F'   # Arabic Supplement
	        r'\u0870-\u08FF'   # Arabic Extended-B and Extended-A
	        r'\uFB50-\uFDCF'   # Arabic Presentation Forms-A (before the noncharacters)
	        r'\uFDF0-\uFDFF'   # Arabic Presentation Forms-A (after the noncharacters)
	        r'\uFE70-\uFEFC'   # Arabic Presentation Forms-B (stops before U+FEFF, the BOM)
	        r']'
	    )
	    return arabic_pattern.search(text) is not None


	def detect_dialect(text: str) -> str:
	    """
	    Simple dialect detection based on common markers.

	    Dialects are checked in the order Egyptian, Gulf, Levantine; the first
	    dialect with a matching marker wins. Markers of three letters or fewer
	    must appear as a whole word, because as raw substrings they occur inside
	    unrelated words (e.g. 'دي' inside 'جديد', 'ابي' inside 'حسابي'). Longer
	    markers are matched as substrings so inflected forms such as 'عايزة'
	    still count.

	    Args:
	        text: Input string; falsy input yields 'msa'.

	    Returns: 'egyptian', 'gulf', 'levantine', or 'msa' (Modern Standard Arabic)
	    """
	    if not text:
	        return 'msa'

	    dialect_markers = (
	        # Egyptian markers
	        ('egyptian', ('ازاي', 'عايز', 'كده', 'ده', 'دي', 'بتاع', 'اوي', 'خالص')),
	        # Gulf markers
	        ('gulf', ('شلون', 'ابي', 'ابغى', 'وايد', 'زين', 'حيل', 'يالله')),
	        # Levantine markers
	        ('levantine', ('كيفك', 'شو', 'هيك', 'منيح', 'كتير', 'هلق', 'بدي')),
	    )

	    # Words are runs of word characters; diacritics are kept inside the run
	    # so a partially vocalized word is not split into marker-sized fragments.
	    words = set(re.findall(r'[\w\u064B-\u065F\u0670]+', text))

	    for dialect, markers in dialect_markers:
	        for marker in markers:
	            if len(marker) <= 3:
	                # Short marker: whole-word match only
	                if marker in words:
	                    return dialect
	            elif marker in text:
	                # Long marker: substring match, as before
	                return dialect

	    return 'msa'