Spaces:

HeshamHaroon
/

Arabic-Function-Calling-Leaderboard

Running

File size: 4,631 Bytes

566d03e

"""
Arabic Text Utilities
=====================

Utilities for normalizing and processing Arabic text for function calling evaluation.
"""

import re
import unicodedata


class ArabicNormalizer:
    """Canonicalise Arabic text so that equivalent spellings compare equal.

    Every normalization step is switched on or off by a constructor flag.
    The character tables the steps rely on are exposed as class attributes.
    """

    # Tashkeel marks stripped when ``remove_diacritics`` is set
    ARABIC_DIACRITICS = re.compile(r'[\u064B-\u065F\u0670]')

    # Alef forms (آ, أ, إ, ٱ) that are folded into bare alef (ا)
    ALEF_VARIANTS = dict.fromkeys(
        ('\u0622', '\u0623', '\u0625', '\u0671'),
        '\u0627',
    )

    # Ta marbuta (ة) and the ha (ه) it is optionally replaced with
    TA_MARBUTA = '\u0629'
    HA = '\u0647'

    # Arabic-Indic digits U+0660..U+0669 -> '0'..'9'
    ARABIC_INDIC_NUMERALS = {
        chr(0x0660 + value): str(value) for value in range(10)
    }

    # Extended Arabic-Indic digits (Persian/Urdu) U+06F0..U+06F9 -> '0'..'9'
    EXTENDED_NUMERALS = {
        chr(0x06F0 + value): str(value) for value in range(10)
    }

    def __init__(
        self,
        remove_diacritics: bool = True,
        normalize_alef: bool = True,
        normalize_ta_marbuta: bool = False,
        normalize_numerals: bool = True,
        lowercase: bool = True,
        strip_whitespace: bool = True
    ):
        """Store which normalization steps ``normalize`` should apply.

        Args:
            remove_diacritics: Strip characters matched by ``ARABIC_DIACRITICS``.
            normalize_alef: Replace the keys of ``ALEF_VARIANTS`` with bare alef.
            normalize_ta_marbuta: Replace ``TA_MARBUTA`` with ``HA``.
            normalize_numerals: Convert Arabic-Indic and extended Arabic-Indic
                digits to ASCII digits.
            lowercase: Lowercase the text (affects Latin characters such as
                function names).
            strip_whitespace: Trim the text and collapse whitespace runs into
                single spaces.
        """
        self.remove_diacritics = remove_diacritics
        self.normalize_alef = normalize_alef
        self.normalize_ta_marbuta = normalize_ta_marbuta
        self.normalize_numerals = normalize_numerals
        self.lowercase = lowercase
        self.strip_whitespace = strip_whitespace

    def _substitution_table(self) -> dict:
        """Build the ``str.translate`` table for the currently enabled swaps."""
        pairs = {}
        if self.normalize_alef:
            pairs.update(self.ALEF_VARIANTS)
        if self.normalize_ta_marbuta:
            pairs[self.TA_MARBUTA] = self.HA
        if self.normalize_numerals:
            pairs.update(self.ARABIC_INDIC_NUMERALS)
            pairs.update(self.EXTENDED_NUMERALS)
        return str.maketrans(pairs)

    def normalize(self, text: str) -> str:
        """Return ``text`` with every enabled normalization applied.

        Falsy input (empty string or ``None``) yields an empty string.
        """
        if not text:
            return ""

        # Compose first so that decomposed sequences become single code points
        # before the character-level steps run.
        result = unicodedata.normalize('NFC', text)

        if self.remove_diacritics:
            result = self.ARABIC_DIACRITICS.sub('', result)

        # Alef, ta marbuta and digit replacements are all one-character swaps
        # whose targets are never themselves replaced, so one pass suffices.
        table = self._substitution_table()
        if table:
            result = result.translate(table)

        if self.lowercase:
            result = result.lower()

        if self.strip_whitespace:
            result = ' '.join(result.split())

        return result

    def normalize_for_comparison(self, text: str) -> str:
        """Normalize aggressively for fuzzy matching.

        Runs ``normalize`` and then drops every character that is neither a
        word character nor whitespace, and collapses the remaining whitespace.
        """
        without_punctuation = re.sub(r'[^\w\s]', '', self.normalize(text))
        return ' '.join(without_punctuation.split())


def extract_arabic_numbers(text: str) -> list:
    """Extract numbers from Arabic text (both Arabic-Indic and Western).

    The text is first run through ``ArabicNormalizer`` so that Arabic-Indic
    and extended Arabic-Indic digits become ASCII digits.  A number is a run
    of digits optionally followed by a decimal separator and more digits; the
    separator may be ``.`` or the Arabic decimal separator (U+066B).

    Args:
        text: Text to scan.  Empty or ``None`` input yields an empty list.

    Returns:
        The matched numbers as strings, in order of appearance, always using
        ``.`` as the decimal point (e.g. ``'3.14'``).
    """
    normalized = ArabicNormalizer(normalize_numerals=True).normalize(text)
    # Accept U+066B as well as '.', otherwise a value written as "٣٫١٤" would
    # be split into two separate integers.
    matches = re.findall(r'\d+(?:[.\u066B]\d+)?', normalized)
    return [match.replace('\u066B', '.') for match in matches]


def is_arabic_text(text: str) -> bool:
    """Check if text contains at least one Arabic-script character.

    The following Unicode ranges count as Arabic:

    * U+0600-U+06FF  (Arabic)
    * U+0750-U+077F  (Arabic Supplement)
    * U+08A0-U+08FF  (Arabic Extended-A)
    * U+FB50-U+FDFF  (Arabic Presentation Forms-A)
    * U+FE70-U+FEFC  (Arabic Presentation Forms-B)

    Args:
        text: Text to inspect.  Empty or ``None`` input returns ``False``.

    Returns:
        ``True`` if any character of ``text`` falls in one of the ranges above.
    """
    if not text:
        return False
    # The presentation-form ranges catch ligatures and positional glyphs such
    # as U+FEFB.  The last range stops at U+FEFC on purpose: U+FEFF is the
    # byte-order mark, not an Arabic character.
    return re.search(
        r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFC]',
        text,
    ) is not None


def detect_dialect(text: str) -> str:
    """
    Simple dialect detection based on common marker words.

    Dialects are tried in a fixed order (Egyptian, Gulf, Levantine) and the
    first one with a matching marker wins.  Markers of up to three letters
    must appear as a standalone word, because as bare substrings they also
    occur inside unrelated words (for example 'دي' inside 'جديد').  Longer
    markers may appear anywhere in the text, so inflected forms such as
    'عايزة' are still recognised.

    Args:
        text: Text to classify.  Empty or ``None`` input is treated as
            containing no markers.

    Returns: 'egyptian', 'gulf', 'levantine', or 'msa' (Modern Standard Arabic)
    """
    if not text:
        return 'msa'

    dialect_markers = (
        ('egyptian', ('ازاي', 'عايز', 'كده', 'ده', 'دي', 'بتاع', 'اوي', 'خالص')),
        ('gulf', ('شلون', 'ابي', 'ابغى', 'وايد', 'زين', 'حيل', 'يالله')),
        ('levantine', ('كيفك', 'شو', 'هيك', 'منيح', 'كتير', 'هلق', 'بدي')),
    )

    # Markers this short are too likely to occur inside other words.
    max_whole_word_len = 3

    # A word is a run of word characters plus Arabic diacritics, so attached
    # punctuation (e.g. a trailing '؟') does not block a whole-word match.
    words = set(re.findall(r'[\w\u064B-\u065F\u0670]+', text))

    def has_marker(marker: str) -> bool:
        if len(marker) <= max_whole_word_len:
            return marker in words
        return marker in text

    for dialect, markers in dialect_markers:
        if any(has_marker(marker) for marker in markers):
            return dialect

    return 'msa'