"""
Arabic text preprocessing for XTTS-v2 inference.

Handles:
1. Text cleaning (normalize whitespace, punctuation, special chars)
2. Hamza normalization (correct common hamza placement errors)
3. Tashkeel (optional, off by default; adds diacritics for pronunciation accuracy)
4. Number-to-word conversion (Arabic numerals and percentages)
5. Symbol expansion (Arabic-appropriate replacements)

Usage:
    from scripts.arabic_preprocessor import ArabicPreprocessor
    preprocessor = ArabicPreprocessor()
    clean_text = preprocessor.process("مرحبا بكم في 2026")
"""
|
|
| import re |
| import unicodedata |
| from num2words import num2words |
|
|
| |
# Shared Mishkal vocalizer instance; stays None until _get_vocalizer() builds it.
_vocalizer = None


def _get_vocalizer():
    """Return the shared Mishkal vocalizer, creating it on the first call.

    The ``mishkal`` import lives inside this function, so importing this
    module does not import Mishkal; that only happens when a vocalizer is
    first requested.

    Returns:
        The module-wide ``TashkeelClass`` instance.
    """
    global _vocalizer
    if _vocalizer is not None:
        return _vocalizer

    from mishkal.tashkeel import TashkeelClass

    _vocalizer = TashkeelClass()
    return _vocalizer
|
|
|
|
| |
| |
| |
# Frequent words typed with a bare alef where the first letter should carry
# a hamza (or madda).
_HAMZA_INITIAL_WORDS = {
    "ان": "أن",
    "انا": "أنا",
    "انت": "أنت",
    "انتم": "أنتم",
    "اكثر": "أكثر",
    "اقل": "أقل",
    "اول": "أول",
    "اي": "أي",
    "ايضا": "أيضاً",
    "اذا": "إذا",
    "امام": "أمام",
    "اصبح": "أصبح",
    "اصبحت": "أصبحت",
    "اخرى": "أخرى",
    "اخر": "آخر",
    "اكبر": "أكبر",
    "اكد": "أكد",
    "اعلن": "أعلن",
    "اهم": "أهم",
    "امر": "أمر",
    "اساس": "أساس",
    "اساسي": "أساسي",
    "امن": "أمن",
    "امل": "أمل",
    "اسلام": "إسلام",
    "ادارة": "إدارة",
    "انتاج": "إنتاج",
    "انسان": "إنسان",
    "اعلام": "إعلام",
}

# Particles and pronoun forms corrected to a hamza below the alef.
_HAMZA_BELOW_PARTICLES = {
    "الى": "إلى",
    "اذ": "إذ",
    "اذن": "إذن",
    "انما": "إنما",
    "انه": "إنه",
    "انها": "إنها",
    "انهم": "إنهم",
}

# Words whose alef should be written with a madda.
_MADDA_WORDS = {
    "الان": "الآن",
    "القران": "القرآن",
    "الالات": "الآلات",
    "الالة": "الآلة",
    "اخرون": "آخرون",
}

# Words with a missing hamza in the middle of the word.
_HAMZA_MEDIAL_WORDS = {
    "مسوول": "مسؤول",
    "مسوولية": "مسؤولية",
    "روية": "رؤية",
    "تاثير": "تأثير",
    "تاكد": "تأكد",
    "مساله": "مسألة",
    "سوال": "سؤال",
}

# Whole-word lookup table: common hamza-less spelling -> corrected spelling.
# Merged in the order the groups are listed above.
HAMZA_CORRECTIONS = {
    **_HAMZA_INITIAL_WORDS,
    **_HAMZA_BELOW_PARTICLES,
    **_MADDA_WORDS,
    **_HAMZA_MEDIAL_WORDS,
}
|
|
| |
# Spoken Arabic replacement for each symbol. The spaces inside the values
# keep the inserted word separated from the neighbouring text.
SYMBOL_MAP = {
    # Connectives: padded with a space on both sides.
    "&": " و ",
    "@": " على ",
    # Percent, currencies and degree: leading space only.
    "%": " بالمئة",
    "$": " دولار",
    "£": " جنيه",
    "€": " يورو",
    "°": " درجة",
    # Arithmetic operators: padded with a space on both sides.
    "+": " زائد ",
    "=": " يساوي ",
}
|
|
|
|
class ArabicPreprocessor:
    """Full Arabic text preprocessing pipeline for TTS inference.

    :meth:`process` applies the stages in this order: text cleaning, hamza
    correction, number expansion, symbol expansion, a second cleaning pass,
    and finally optional tashkeel (diacritization).

    Args:
        enable_tashkeel: Default for whether :meth:`process` diacritizes its
            output. It also gates :meth:`add_tashkeel`.
        hamza_corrections: Optional mapping of misspelled word to corrected
            word. Defaults to the module-level ``HAMZA_CORRECTIONS``.
        symbol_map: Optional mapping of symbol to spoken replacement.
            Defaults to the module-level ``SYMBOL_MAP``.
        vocalizer: Optional object exposing ``tashkeel(text)``. Defaults to
            the shared Mishkal vocalizer, which is only loaded when
            diacritization is actually performed.
    """

    # Zero-width space/joiners, directional marks and the BOM.
    _INVISIBLE_RE = re.compile(r"[\u200b\u200c\u200d\u200e\u200f\ufeff]")
    _WHITESPACE_RE = re.compile(r"\s+")
    # A number (optionally with a decimal part) followed by a percent sign.
    _PERCENT_RE = re.compile(r"(\d+(?:\.\d+)?)\s*[%٪]")
    _NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")

    def __init__(self, enable_tashkeel=False, *, hamza_corrections=None,
                 symbol_map=None, vocalizer=None):
        self.enable_tashkeel = enable_tashkeel
        self._hamza_corrections = (
            HAMZA_CORRECTIONS if hamza_corrections is None else hamza_corrections
        )
        self._symbol_map = SYMBOL_MAP if symbol_map is None else symbol_map
        self._vocalizer = vocalizer

        if self._hamza_corrections:
            alternatives = "|".join(
                re.escape(word) for word in self._hamza_corrections
            )
            # \b on both sides restricts corrections to whole words.
            self._hamza_pattern = re.compile(r"\b(" + alternatives + r")\b")
        else:
            # An empty alternation would match the empty string at every
            # word boundary, so skip the pattern entirely.
            self._hamza_pattern = None

    def clean_text(self, text):
        """Basic text cleaning: normalize whitespace, punctuation, unicode.

        Applies NFC normalization, removes invisible formatting characters,
        maps the Arabic percent sign to ``%``, collapses every whitespace
        run to a single space and strips both ends.

        Args:
            text: Input string.

        Returns:
            The cleaned string.
        """
        text = unicodedata.normalize("NFC", text)
        text = self._INVISIBLE_RE.sub("", text)
        # One percent sign form, so later stages only have to handle "%".
        text = text.replace("٪", "%")
        return self._WHITESPACE_RE.sub(" ", text).strip()

    def fix_hamza(self, text):
        """Correct common hamza placement errors in Arabic text.

        Only whole words listed in the correction table are replaced.

        Args:
            text: Input string.

        Returns:
            The string with every listed word replaced by its correction.
        """
        if self._hamza_pattern is None:
            return text
        corrections = self._hamza_corrections
        return self._hamza_pattern.sub(
            lambda match: corrections.get(match.group(0), match.group(0)),
            text,
        )

    @staticmethod
    def _spell_number(num_str):
        """Return ``num_str`` spelled in Arabic words, or None on failure."""
        try:
            value = float(num_str) if "." in num_str else int(num_str)
            return num2words(value, lang="ar")
        except (ValueError, OverflowError):
            return None

    def expand_numbers(self, text):
        """Convert numbers to Arabic words.

        Percentages are handled first so a number and its percent sign are
        spoken as one phrase. A number that cannot be converted is left as
        written.

        Args:
            text: Input string.

        Returns:
            The string with numbers and percentages spelled out.
        """
        def _percent_to_words(match):
            words = self._spell_number(match.group(1))
            if words is None:
                return match.group(0)
            return words + " بالمئة"

        def _number_to_words(match):
            words = self._spell_number(match.group(0))
            return match.group(0) if words is None else words

        text = self._PERCENT_RE.sub(_percent_to_words, text)
        return self._NUMBER_RE.sub(_number_to_words, text)

    def expand_symbols(self, text):
        """Replace symbols with Arabic words.

        Args:
            text: Input string.

        Returns:
            The string with every mapped symbol replaced by its spoken form.
        """
        for symbol, replacement in self._symbol_map.items():
            text = text.replace(symbol, replacement)
        return text

    def _vocalize(self, text):
        """Diacritize ``text`` unconditionally with the configured vocalizer."""
        vocalizer = self._vocalizer
        if vocalizer is None:
            vocalizer = _get_vocalizer()
        return vocalizer.tashkeel(text)

    def add_tashkeel(self, text):
        """Add diacritical marks using Mishkal.

        Returns ``text`` unchanged when the instance was created with
        tashkeel disabled.

        Args:
            text: Input string.

        Returns:
            The diacritized string, or ``text`` itself when disabled.
        """
        if not self.enable_tashkeel:
            return text
        return self._vocalize(text)

    def process(self, text, tashkeel=None):
        """
        Full preprocessing pipeline.

        Args:
            text: Raw Arabic text.
            tashkeel: Override tashkeel setting (True/False/None=use default).

        Returns:
            Processed text ready for XTTS-v2 inference.
        """
        text = self.clean_text(text)
        text = self.fix_hamza(text)
        text = self.expand_numbers(text)
        text = self.expand_symbols(text)
        # Symbol replacements insert padding spaces; clean again to collapse.
        text = self.clean_text(text)

        use_tashkeel = self.enable_tashkeel if tashkeel is None else tashkeel
        if use_tashkeel:
            # Call the unconditional helper: add_tashkeel() re-checks the
            # instance default and would ignore an explicit tashkeel=True.
            text = self._vocalize(text)
        return text
|
|
|
|
| |
if __name__ == "__main__":
    # Demo: run sample sentences through each stage and show what changed.
    preprocessor = ArabicPreprocessor()

    test_texts = [
        "الذكاء الاصطناعي يتطور بسرعة كبيرة، ويدخل في كل مجالات الحياة.",
        "اكثر من 70% من الشركات الكبرى تستخدم الذكاء الاصطناعي اليوم.",
        "الالات اصبحت قادرة على التعلم، واتخاذ قرارات معقدة بمفردها.",
        "ان مستقبل البشرية سيتشكل بناءً على كيفية تعاملنا مع هذه التقنية.",
        "هذا المشروع يكلف 500$ و يحقق نمو 25%",
    ]

    print("=" * 70)
    print("Arabic Preprocessor Test")
    print("=" * 70)

    for sample in test_texts:
        # Run every stage by hand first so intermediate results can be
        # compared; the final line comes from the real pipeline.
        cleaned = preprocessor.clean_text(sample)
        with_hamza = preprocessor.fix_hamza(cleaned)
        with_numbers = preprocessor.expand_numbers(with_hamza)
        with_symbols = preprocessor.expand_symbols(with_numbers)
        final = preprocessor.process(sample)

        # Only stages that actually changed the text are reported.
        report = [f"\nOriginal: {sample}"]
        if with_hamza != cleaned:
            report.append(f"Hamza: {with_hamza}")
        if with_numbers != with_hamza:
            report.append(f"Numbers: {with_numbers}")
        if with_symbols != with_numbers:
            report.append(f"Symbols: {with_symbols}")
        report.append(f"Final: {final}")
        report.append("-" * 70)
        print("\n".join(report))
|
|