File size: 4,631 Bytes
566d03e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
"""
Arabic Text Utilities
=====================
Utilities for normalizing and processing Arabic text for function calling evaluation.
"""
import re
import unicodedata
class ArabicNormalizer:
    """Configurable normalizer that makes Arabic strings comparable.

    Each normalization step is controlled by a boolean flag passed to the
    constructor; ``normalize`` applies the enabled steps in a fixed order.
    """

    # Matches Arabic diacritics (tashkeel), which are stripped when enabled.
    ARABIC_DIACRITICS = re.compile(r'[\u064B-\u065F\u0670]')

    # Alef forms that are folded into bare alef (U+0627).
    ALEF_VARIANTS = {
        '\u0622': '\u0627',  # آ -> ا
        '\u0623': '\u0627',  # أ -> ا
        '\u0625': '\u0627',  # إ -> ا
        '\u0671': '\u0627',  # ٱ -> ا
    }

    # Ta marbuta and the ha it is replaced with when that step is enabled.
    TA_MARBUTA = '\u0629'
    HA = '\u0647'

    # Arabic-Indic digits mapped to Western digits.
    ARABIC_INDIC_NUMERALS = {
        '\u0660': '0', '\u0661': '1', '\u0662': '2', '\u0663': '3', '\u0664': '4',
        '\u0665': '5', '\u0666': '6', '\u0667': '7', '\u0668': '8', '\u0669': '9',
    }

    # Extended Arabic-Indic digits (Persian/Urdu) mapped to Western digits.
    EXTENDED_NUMERALS = {
        '\u06F0': '0', '\u06F1': '1', '\u06F2': '2', '\u06F3': '3', '\u06F4': '4',
        '\u06F5': '5', '\u06F6': '6', '\u06F7': '7', '\u06F8': '8', '\u06F9': '9',
    }

    def __init__(
        self,
        remove_diacritics: bool = True,
        normalize_alef: bool = True,
        normalize_ta_marbuta: bool = False,
        normalize_numerals: bool = True,
        lowercase: bool = True,
        strip_whitespace: bool = True
    ):
        """Store which normalization steps ``normalize`` should apply.

        Args:
            remove_diacritics: Strip characters matched by ARABIC_DIACRITICS.
            normalize_alef: Replace every key of ALEF_VARIANTS with bare alef.
            normalize_ta_marbuta: Replace ta marbuta with ha.
            normalize_numerals: Convert Arabic-Indic and extended Arabic-Indic
                digits to Western digits.
            lowercase: Lowercase the text.
            strip_whitespace: Trim the ends and collapse whitespace runs into
                single spaces.
        """
        self.remove_diacritics = remove_diacritics
        self.normalize_alef = normalize_alef
        self.normalize_ta_marbuta = normalize_ta_marbuta
        self.normalize_numerals = normalize_numerals
        self.lowercase = lowercase
        self.strip_whitespace = strip_whitespace

    def normalize(self, text: str) -> str:
        """Return ``text`` with every enabled normalization applied.

        The text is first put into Unicode NFC form. Falsy input (empty
        string or None) yields an empty string.
        """
        if not text:
            return ""

        result = unicodedata.normalize('NFC', text)

        if self.remove_diacritics:
            result = self.ARABIC_DIACRITICS.sub('', result)

        # Gather the enabled character substitutions in application order:
        # alef variants, then ta marbuta, then both digit sets.
        substitutions = []
        if self.normalize_alef:
            substitutions.extend(self.ALEF_VARIANTS.items())
        if self.normalize_ta_marbuta:
            substitutions.append((self.TA_MARBUTA, self.HA))
        if self.normalize_numerals:
            substitutions.extend(self.ARABIC_INDIC_NUMERALS.items())
            substitutions.extend(self.EXTENDED_NUMERALS.items())
        for source_char, target_char in substitutions:
            result = result.replace(source_char, target_char)

        # Lowercasing only affects cased scripts, e.g. Latin function names.
        if self.lowercase:
            result = result.lower()

        if self.strip_whitespace:
            result = ' '.join(result.split())

        return result

    def normalize_for_comparison(self, text: str) -> str:
        """Normalize ``text`` more aggressively, for fuzzy matching.

        Runs ``normalize``, then drops every character that is neither a word
        character nor whitespace, and finally collapses whitespace regardless
        of the ``strip_whitespace`` setting.
        """
        without_punctuation = re.sub(r'[^\w\s]', '', self.normalize(text))
        return ' '.join(without_punctuation.split())
def extract_arabic_numbers(text: str) -> list:
    """Extract the numbers written in ``text`` as Western-digit strings.

    The text is first run through an ``ArabicNormalizer`` with its default
    settings, which converts Arabic-Indic and extended Arabic-Indic digits
    to Western digits. The Arabic decimal separator (U+066B) is then treated
    like '.', so a value such as '٣٫٥' comes back as '3.5' rather than as
    two separate integers.

    Args:
        text: Text to scan. Falsy input yields an empty list.

    Returns:
        A list of strings, each an integer or a decimal with a '.' separator,
        in order of appearance.
    """
    normalizer = ArabicNormalizer(normalize_numerals=True)
    normalized = normalizer.normalize(text)
    # The normalizer maps digits only; map the Arabic decimal separator here
    # so the fractional part stays attached to its integer part.
    normalized = normalized.replace('\u066B', '.')
    return re.findall(r'\d+(?:\.\d+)?', normalized)
def is_arabic_text(text: str) -> bool:
    """Report whether ``text`` contains at least one Arabic-script character.

    Besides the basic Arabic block, the check covers Arabic Supplement,
    Arabic Extended-A/B and the Arabic Presentation Forms blocks, so text
    stored as presentation-form glyphs is recognized too.

    Args:
        text: Text to inspect. Falsy input yields False.

    Returns:
        True if any character falls in one of the Arabic ranges, else False.
    """
    if not text:
        return False
    # Ranges: Arabic, Arabic Supplement, Arabic Extended-B/A,
    # Presentation Forms-A, Presentation Forms-B. The last range stops at
    # U+FEFC so the byte-order mark U+FEFF is not counted as Arabic.
    arabic_pattern = (
        r'[\u0600-\u06FF\u0750-\u077F\u0870-\u08FF\uFB50-\uFDFF\uFE70-\uFEFC]'
    )
    return re.search(arabic_pattern, text) is not None
def detect_dialect(text: str) -> str:
    """
    Guess the Arabic dialect of ``text`` from a small list of marker words.

    Dialects are tried in a fixed order (Egyptian, Gulf, Levantine) and the
    first one with a matching marker wins; text with no marker is reported
    as Modern Standard Arabic.

    Markers of three letters or fewer must appear as a whole word, because
    as raw substrings they also occur inside unrelated words (for example
    'دي' inside 'جديد' or 'حيل' inside 'مستحيل'). Longer markers are still
    matched as substrings so that inflected forms such as 'عايزة' match.

    Args:
        text: Text to classify. No normalization is applied, so markers are
            compared against the text exactly as written. Falsy input is
            reported as 'msa'.

    Returns: 'egyptian', 'gulf', 'levantine', or 'msa' (Modern Standard Arabic)
    """
    if not text:
        return 'msa'

    # Order matters: the first dialect with a hit is returned.
    markers_by_dialect = (
        ('egyptian', ('ازاي', 'عايز', 'كده', 'ده', 'دي', 'بتاع', 'اوي', 'خالص')),
        ('gulf', ('شلون', 'ابي', 'ابغى', 'وايد', 'زين', 'حيل', 'يالله')),
        ('levantine', ('كيفك', 'شو', 'هيك', 'منيح', 'كتير', 'هلق', 'بدي')),
    )

    # Whole words of the text, used for the short markers.
    words = set(re.findall(r'\w+', text))

    def has_marker(marker: str) -> bool:
        # Short markers are too ambiguous to match inside longer words.
        if len(marker) <= 3:
            return marker in words
        return marker in text

    for dialect, markers in markers_by_dialect:
        if any(has_marker(marker) for marker in markers):
            return dialect
    return 'msa'
|