File size: 4,631 Bytes
566d03e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
Arabic Text Utilities
=====================

Utilities for normalizing and processing Arabic text for function calling evaluation.
"""

import re
import unicodedata


class ArabicNormalizer:
    """Normalize Arabic text for consistent comparison.

    Each normalization step is individually toggleable via the
    constructor flags; :meth:`normalize` applies the enabled steps in a
    fixed order: NFC -> remove diacritics -> unify alef variants ->
    ta marbuta -> map numerals to Western digits -> lowercase ->
    collapse whitespace.
    """

    # Arabic diacritics (tashkeel) to remove
    ARABIC_DIACRITICS = re.compile(r'[\u064B-\u065F\u0670]')

    # Alef variants to normalize
    ALEF_VARIANTS = {
        '\u0622': '\u0627',  # آ -> ا
        '\u0623': '\u0627',  # أ -> ا
        '\u0625': '\u0627',  # إ -> ا
        '\u0671': '\u0627',  # ٱ -> ا
    }

    # Ta marbuta to ha
    TA_MARBUTA = '\u0629'
    HA = '\u0647'

    # Arabic-Indic numerals to Western
    ARABIC_INDIC_NUMERALS = {
        '\u0660': '0', '\u0661': '1', '\u0662': '2', '\u0663': '3', '\u0664': '4',
        '\u0665': '5', '\u0666': '6', '\u0667': '7', '\u0668': '8', '\u0669': '9',
    }

    # Extended Arabic-Indic numerals (Persian/Urdu)
    EXTENDED_NUMERALS = {
        '\u06F0': '0', '\u06F1': '1', '\u06F2': '2', '\u06F3': '3', '\u06F4': '4',
        '\u06F5': '5', '\u06F6': '6', '\u06F7': '7', '\u06F8': '8', '\u06F9': '9',
    }

    # Precomputed translation tables: str.translate makes a single
    # C-level pass instead of one chained .replace() per character.
    _ALEF_TABLE = str.maketrans(ALEF_VARIANTS)
    _NUMERAL_TABLE = str.maketrans({**ARABIC_INDIC_NUMERALS, **EXTENDED_NUMERALS})

    # Punctuation stripper used by normalize_for_comparison, compiled once.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(
        self,
        remove_diacritics: bool = True,
        normalize_alef: bool = True,
        normalize_ta_marbuta: bool = False,
        normalize_numerals: bool = True,
        lowercase: bool = True,
        strip_whitespace: bool = True
    ):
        """Configure which normalization steps :meth:`normalize` applies.

        Args:
            remove_diacritics: Strip tashkeel marks (U+064B..U+065F, U+0670).
            normalize_alef: Collapse alef variants (آ أ إ ٱ) to bare alef.
            normalize_ta_marbuta: Map ta marbuta (ة) to ha (ه). Off by
                default because it can conflate distinct words.
            normalize_numerals: Convert Arabic-Indic and extended
                (Persian/Urdu) digits to Western 0-9.
            lowercase: Lowercase text (affects Latin characters, e.g.
                function names embedded in Arabic prompts).
            strip_whitespace: Collapse runs of whitespace to single spaces
                and trim the ends.
        """
        self.remove_diacritics = remove_diacritics
        self.normalize_alef = normalize_alef
        self.normalize_ta_marbuta = normalize_ta_marbuta
        self.normalize_numerals = normalize_numerals
        self.lowercase = lowercase
        self.strip_whitespace = strip_whitespace

    def normalize(self, text: str) -> str:
        """Apply all configured normalizations to *text*.

        Returns:
            The normalized string; ``""`` for empty/falsy input.
        """
        if not text:
            return ""

        # Canonical Unicode composition so equivalent sequences compare equal.
        text = unicodedata.normalize('NFC', text)

        # Remove diacritics (tashkeel)
        if self.remove_diacritics:
            text = self.ARABIC_DIACRITICS.sub('', text)

        # Normalize alef variants in one pass
        if self.normalize_alef:
            text = text.translate(self._ALEF_TABLE)

        # Normalize ta marbuta
        if self.normalize_ta_marbuta:
            text = text.replace(self.TA_MARBUTA, self.HA)

        # Map Arabic-Indic and extended numerals to Western digits in one pass
        if self.normalize_numerals:
            text = text.translate(self._NUMERAL_TABLE)

        # Lowercase (for Latin characters in function names)
        if self.lowercase:
            text = text.lower()

        # Strip and collapse internal whitespace
        if self.strip_whitespace:
            text = ' '.join(text.split())

        return text

    def normalize_for_comparison(self, text: str) -> str:
        """Aggressive normalization for fuzzy matching.

        Applies :meth:`normalize`, then removes all punctuation and
        re-collapses whitespace.
        """
        text = self.normalize(text)
        # Remove all punctuation (anything that is not word char/whitespace)
        text = self._PUNCTUATION.sub('', text)
        # Remove extra whitespace left behind by punctuation removal
        text = ' '.join(text.split())
        return text


def extract_arabic_numbers(text: str) -> list:
    """Extract numbers from Arabic text (both Arabic-Indic and Western).

    The text is first normalized so Arabic-Indic digits become Western,
    then integer and decimal tokens are collected as strings.
    """
    normalized = ArabicNormalizer(normalize_numerals=True).normalize(text)
    number_pattern = r'\d+(?:\.\d+)?'
    return re.findall(number_pattern, normalized)


def is_arabic_text(text: str) -> bool:
    """Check if text contains Arabic characters.

    Covers the main Arabic block plus the Supplement and Extended-A
    ranges; returns True on the first Arabic character found.
    """
    match = re.search(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]+', text)
    return match is not None


def detect_dialect(text: str) -> str:
    """
    Simple dialect detection based on common markers.

    Dialects are checked in a fixed priority order (Egyptian, then Gulf,
    then Levantine); the first dialect with any marker present wins.
    NOTE: matching is plain substring containment, so a marker can match
    inside a longer word (e.g. 'دي' inside 'بدي').

    Returns: 'egyptian', 'gulf', 'levantine', or 'msa' (Modern Standard Arabic)
    """
    # Ordered marker lists; order determines detection priority.
    dialect_markers = [
        ('egyptian', ['ازاي', 'عايز', 'كده', 'ده', 'دي', 'بتاع', 'اوي', 'خالص']),
        ('gulf', ['شلون', 'ابي', 'ابغى', 'وايد', 'زين', 'حيل', 'يالله']),
        ('levantine', ['كيفك', 'شو', 'هيك', 'منيح', 'كتير', 'هلق', 'بدي']),
    ]

    for dialect, markers in dialect_markers:
        if any(marker in text for marker in markers):
            return dialect

    # No dialect marker found: assume Modern Standard Arabic.
    return 'msa'