|
|
""" |
|
|
Arabic Text Utilities
=====================

Utilities for normalizing and processing Arabic text for function-calling evaluation.
|
|
""" |
|
|
|
|
|
import re |
|
|
import unicodedata |
|
|
|
|
|
|
|
|
class ArabicNormalizer:
    """Normalize Arabic text for consistent comparison.

    Each normalization step is switched on or off by a boolean flag given
    at construction time. The flags are plain instance attributes and are
    read on every call, so they may be changed after construction.

    Args:
        remove_diacritics: Delete every character matched by
            ``ARABIC_DIACRITICS``.
        normalize_alef: Replace each key of ``ALEF_VARIANTS`` with bare alef.
        normalize_ta_marbuta: Replace ta marbuta (``TA_MARBUTA``) with ha
            (``HA``).
        normalize_numerals: Replace Arabic-Indic and extended Arabic-Indic
            digits with ASCII digits.
        lowercase: Apply ``str.lower()`` to the result.
        strip_whitespace: Collapse every whitespace run to a single space
            and trim both ends.
    """

    # Combining marks U+064B-U+065F plus superscript alef (U+0670).
    ARABIC_DIACRITICS = re.compile(r'[\u064B-\u065F\u0670]')

    # Alef forms that are folded to bare alef (U+0627).
    ALEF_VARIANTS = {
        '\u0622': '\u0627',  # alef with madda above
        '\u0623': '\u0627',  # alef with hamza above
        '\u0625': '\u0627',  # alef with hamza below
        '\u0671': '\u0627',  # alef wasla
    }

    TA_MARBUTA = '\u0629'
    HA = '\u0647'

    # Arabic-Indic digits (U+0660-U+0669) -> ASCII digits.
    ARABIC_INDIC_NUMERALS = {
        '\u0660': '0', '\u0661': '1', '\u0662': '2', '\u0663': '3', '\u0664': '4',
        '\u0665': '5', '\u0666': '6', '\u0667': '7', '\u0668': '8', '\u0669': '9',
    }

    # Extended Arabic-Indic digits (U+06F0-U+06F9) -> ASCII digits.
    EXTENDED_NUMERALS = {
        '\u06F0': '0', '\u06F1': '1', '\u06F2': '2', '\u06F3': '3', '\u06F4': '4',
        '\u06F5': '5', '\u06F6': '6', '\u06F7': '7', '\u06F8': '8', '\u06F9': '9',
    }

    def __init__(
        self,
        remove_diacritics: bool = True,
        normalize_alef: bool = True,
        normalize_ta_marbuta: bool = False,
        normalize_numerals: bool = True,
        lowercase: bool = True,
        strip_whitespace: bool = True
    ):
        self.remove_diacritics = remove_diacritics
        self.normalize_alef = normalize_alef
        self.normalize_ta_marbuta = normalize_ta_marbuta
        self.normalize_numerals = normalize_numerals
        self.lowercase = lowercase
        self.strip_whitespace = strip_whitespace

    def _char_map(self) -> dict:
        """Collect the single-character substitutions enabled by the flags."""
        mapping = {}
        if self.normalize_alef:
            mapping.update(self.ALEF_VARIANTS)
        if self.normalize_ta_marbuta:
            mapping[self.TA_MARBUTA] = self.HA
        if self.normalize_numerals:
            mapping.update(self.ARABIC_INDIC_NUMERALS)
            mapping.update(self.EXTENDED_NUMERALS)
        return mapping

    def normalize(self, text: str) -> str:
        """Apply all configured normalizations to text.

        Steps, in order: NFC composition (always), diacritic removal,
        character folding (alef variants, ta marbuta, numerals),
        lowercasing, whitespace collapsing.

        Args:
            text: The string to normalize.

        Returns:
            The normalized string, or ``""`` when ``text`` is falsy
            (empty string or ``None``).
        """
        if not text:
            return ""

        text = unicodedata.normalize('NFC', text)

        if self.remove_diacritics:
            text = self.ARABIC_DIACRITICS.sub('', text)

        # One translate() pass replaces a chain of per-character replace()
        # calls. The result is the same because no replacement output is
        # itself a key of the mapping.
        char_map = self._char_map()
        if char_map:
            text = text.translate(str.maketrans(char_map))

        if self.lowercase:
            text = text.lower()

        if self.strip_whitespace:
            text = ' '.join(text.split())

        return text

    def normalize_for_comparison(self, text: str) -> str:
        """Aggressive normalization for fuzzy matching.

        Runs :meth:`normalize`, then deletes every character that is
        neither a regex word character nor whitespace, and finally
        collapses whitespace (regardless of ``strip_whitespace``).

        Args:
            text: The string to normalize.

        Returns:
            The normalized, punctuation-free string.
        """
        text = self.normalize(text)
        text = re.sub(r'[^\w\s]', '', text)
        return ' '.join(text.split())
|
|
|
|
|
|
|
|
def extract_arabic_numbers(text: str) -> list:
    """Extract numbers from Arabic text (both Arabic-Indic and Western).

    The text is first run through ``ArabicNormalizer`` so that Arabic-Indic
    and extended Arabic-Indic digits become ASCII digits. The Arabic
    decimal separator (U+066B) is then mapped to ``.`` so that a number
    written with it is returned as one value, the same way ``12.5`` is.

    Args:
        text: The text to scan.

    Returns:
        A list of number strings (integers or decimals with a ``.``), in
        order of appearance. Empty when ``text`` is falsy or has no digits.
    """
    normalizer = ArabicNormalizer(normalize_numerals=True)
    normalized = normalizer.normalize(text)
    # Without this, a number using U+066B would be split into two matches.
    normalized = normalized.replace('\u066B', '.')
    return re.findall(r'\d+(?:\.\d+)?', normalized)
|
|
|
|
|
|
|
|
def is_arabic_text(text: str) -> bool:
    """Check if text contains Arabic characters.

    A character counts as Arabic when it falls in one of the ranges
    U+0600-U+06FF, U+0750-U+077F or U+08A0-U+08FF.

    Args:
        text: The string to inspect.

    Returns:
        ``True`` if at least one character is in those ranges, else
        ``False`` (including for the empty string).
    """
    # A single-character class is enough for an existence check; the re
    # module caches the compiled pattern between calls.
    return re.search(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]', text) is not None
|
|
|
|
|
|
|
|
def detect_dialect(text: str) -> str:
    """Simple dialect detection based on common markers.

    Each dialect has a short list of marker words. The text is checked for
    each marker as a plain substring (no tokenization), and dialects are
    tried in the order Egyptian, Gulf, Levantine; the first dialect with
    any matching marker wins.

    Args:
        text: The text to classify.

    Returns:
        'egyptian', 'gulf', 'levantine', or 'msa' (Modern Standard Arabic)
        when no marker is found.
    """
    # NOTE(review): the marker literals were mis-decoded (they appear to be
    # UTF-8 bytes read with a Thai single-byte codec) and could never match
    # Arabic input. They are restored here from the garbled forms. A few
    # entries are ambiguous in that form and should be confirmed against
    # the upstream list: 'اوي' (egyptian), 'ابغى' and 'يالله' (gulf),
    # 'هلق', 'منيح' and 'هيك' (levantine).
    dialect_markers = (
        ('egyptian', ('ازاي', 'عايز', 'كده', 'ده', 'دي', 'بتاع', 'اوي', 'خالص')),
        ('gulf', ('شلون', 'ابي', 'ابغى', 'وايد', 'زين', 'حيل', 'يالله')),
        ('levantine', ('كيفك', 'شو', 'هلق', 'منيح', 'كتير', 'هيك', 'بدي')),
    )

    for dialect, markers in dialect_markers:
        if any(marker in text for marker in markers):
            return dialect

    return 'msa'
|
|
|