File size: 7,595 Bytes
97e3499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
"""
Arabic text preprocessing for XTTS-v2 inference.

Handles (in pipeline order):
  1. Text cleaning (normalize Unicode and whitespace, strip zero-width chars)
  2. Hamza normalization (correct common hamza placement errors)
  3. Number-to-word conversion (integers, decimals, and percentages)
  4. Symbol expansion (Arabic-appropriate replacements)
  5. Tashkeel (optional, off by default: add diacritics for pronunciation accuracy)

Usage:
    from scripts.arabic_preprocessor import ArabicPreprocessor
    preprocessor = ArabicPreprocessor()
    clean_text = preprocessor.process("مرحبا بكم في 2026")
"""

import re
import unicodedata
from num2words import num2words

# Shared Mishkal vocalizer. It is built on first use rather than at import
# time because pulling in mishkal is a heavy import.
_vocalizer = None


def _get_vocalizer():
    """Return the module-wide Mishkal vocalizer, creating it on first call."""
    global _vocalizer
    if _vocalizer is not None:
        return _vocalizer
    # Deferred import: only pay for mishkal when tashkeel is actually used.
    from mishkal.tashkeel import TashkeelClass
    _vocalizer = TashkeelClass()
    return _vocalizer


# --- Hamza correction map ---
# Common words where hamza is frequently dropped or misplaced.
# Keys are the hamza-less spellings; ArabicPreprocessor matches them as whole
# words and substitutes the value.
# Format: incorrect -> correct
HAMZA_CORRECTIONS = {
    # Word-initial bare alef. Most entries in this group take hamza above (أ);
    # the ones that take hamza below (إ) or madda (آ) are marked inline.
    "ان": "أن",
    "انا": "أنا",
    "انت": "أنت",
    "انتم": "أنتم",
    "اكثر": "أكثر",
    "اقل": "أقل",
    "اول": "أول",
    "اي": "أي",
    "ايضا": "أيضاً",  # also adds the final tanween
    "اذا": "إذا",  # hamza below (إ)
    "امام": "أمام",
    "اصبح": "أصبح",
    "اصبحت": "أصبحت",
    "اخرى": "أخرى",
    "اخر": "آخر",  # alef madda (آ)
    "اكبر": "أكبر",
    "اكد": "أكد",
    "اعلن": "أعلن",
    "اهم": "أهم",
    "امر": "أمر",
    "اساس": "أساس",
    "اساسي": "أساسي",
    "امن": "أمن",
    "امل": "أمل",
    "اسلام": "إسلام",  # hamza below (إ)
    "ادارة": "إدارة",  # hamza below (إ)
    "انتاج": "إنتاج",  # hamza below (إ)
    "انسان": "إنسان",  # hamza below (إ)
    "اعلام": "إعلام",  # hamza below (إ)
    # Alef with hamza below (إ)
    "الى": "إلى",
    "اذ": "إذ",
    "اذن": "إذن",
    "انما": "إنما",
    "انه": "إنه",
    "انها": "إنها",
    "انهم": "إنهم",
    # Alef madda (آ)
    "الان": "الآن",
    "القران": "القرآن",
    "الالات": "الآلات",
    "الالة": "الآلة",
    "اخرون": "آخرون",
    # Medial hamza: seated on waw (ؤ) or, where marked, on alef (أ)
    "مسوول": "مسؤول",
    "مسوولية": "مسؤولية",
    "روية": "رؤية",
    "تاثير": "تأثير",  # hamza on alef
    "تاكد": "تأكد",  # hamza on alef
    "مساله": "مسألة",  # hamza on alef; also restores the final ta marbuta (ة)
    "سوال": "سؤال",
}

# --- Symbol expansion ---
# Each symbol is replaced verbatim by the Arabic word on the right.
# Operator-like symbols (&, @, +, =) are padded with a space on both sides;
# unit-like symbols (%, $, £, €, °) carry only a leading space so the word
# attaches to the quantity that precedes it.
SYMBOL_MAP = {
    "&": " و ",
    "@": " على ",
    # In ArabicPreprocessor.process a "%" that follows a number has already
    # been spelled out by expand_numbers; this entry covers any other "%".
    "%": " بالمئة",
    "$": " دولار",
    "£": " جنيه",
    "€": " يورو",
    "°": " درجة",
    "+": " زائد ",
    "=": " يساوي ",
}


class ArabicPreprocessor:
    """Full Arabic text preprocessing pipeline for TTS inference.

    Stages, in the order ``process`` runs them: cleaning, hamza correction,
    number expansion, symbol expansion, a second cleaning pass, and optional
    tashkeel (diacritization).

    Args:
        enable_tashkeel: Default for whether ``process`` and ``add_tashkeel``
            diacritize the text. Disabled unless requested.
    """

    def __init__(self, enable_tashkeel=False):
        self.enable_tashkeel = enable_tashkeel
        # One alternation over every known misspelling, anchored on word
        # boundaries so that only whole words are corrected.
        self._hamza_pattern = re.compile(
            r"\b(" + "|".join(re.escape(k) for k in HAMZA_CORRECTIONS) + r")\b"
        )

    def clean_text(self, text):
        """Basic text cleaning: normalize whitespace, punctuation, unicode.

        Args:
            text: Text to clean.

        Returns:
            The text in NFC form with zero-width/directional marks and BOMs
            removed, the Arabic percent sign mapped to "%", every whitespace
            run collapsed to a single space, and both ends stripped.
        """
        text = unicodedata.normalize("NFC", text)
        # Remove zero-width characters, directional marks and BOMs.
        text = re.sub(r"[\u200b\u200c\u200d\u200e\u200f\ufeff]", "", text)
        # Fold the Arabic percent sign into "%" so later stages see one form.
        # The Arabic comma and semicolon are deliberately left untouched.
        text = text.replace("٪", "%")
        return re.sub(r"\s+", " ", text).strip()

    def fix_hamza(self, text):
        """Correct common hamza placement errors in Arabic text.

        Every whole word that is a key of ``HAMZA_CORRECTIONS`` is replaced by
        its mapped spelling; all other text is returned unchanged.
        """
        def _replace(match):
            word = match.group(0)
            return HAMZA_CORRECTIONS.get(word, word)

        return self._hamza_pattern.sub(_replace, text)

    @staticmethod
    def _spell_number(num_str):
        """Return ``num_str`` spelled out in Arabic, or None if it cannot be."""
        try:
            num = float(num_str) if "." in num_str else int(num_str)
            return num2words(num, lang="ar")
        except (ValueError, OverflowError):
            return None

    def expand_numbers(self, text):
        """Convert numbers to Arabic words.

        Percentages such as "70%" are handled first so the percent sign is
        spelled out together with its number; remaining integers and decimals
        are converted afterwards. A number that cannot be converted is left
        in the text as written.
        """
        def _percent_to_words(match):
            words = self._spell_number(match.group(1))
            if words is None:
                return match.group(0)
            return words + " بالمئة"

        def _number_to_words(match):
            words = self._spell_number(match.group(0))
            return match.group(0) if words is None else words

        # Percentages first (e.g., "70%" -> "سبعون بالمئة")
        text = re.sub(r"(\d+(?:\.\d+)?)\s*[%٪]", _percent_to_words, text)
        # Then standalone numbers
        text = re.sub(r"\d+(?:\.\d+)?", _number_to_words, text)
        return text

    def expand_symbols(self, text):
        """Replace every symbol listed in ``SYMBOL_MAP`` with its Arabic word."""
        for symbol, replacement in SYMBOL_MAP.items():
            text = text.replace(symbol, replacement)
        return text

    def _vocalize(self, text):
        """Diacritize ``text`` with Mishkal, regardless of instance settings."""
        return _get_vocalizer().tashkeel(text)

    def add_tashkeel(self, text):
        """Add diacritical marks using Mishkal.

        Returns ``text`` unchanged when the instance was created with
        tashkeel disabled; the vocalizer is not loaded in that case.
        """
        if not self.enable_tashkeel:
            return text
        return self._vocalize(text)

    def process(self, text, tashkeel=None):
        """
        Full preprocessing pipeline.

        Args:
            text: Raw Arabic text.
            tashkeel: Override tashkeel setting (True/False/None=use default).

        Returns:
            Processed text ready for XTTS-v2 inference.
        """
        text = self.clean_text(text)
        text = self.fix_hamza(text)
        text = self.expand_numbers(text)
        text = self.expand_symbols(text)
        text = self.clean_text(text)  # clean again after expansions

        use_tashkeel = self.enable_tashkeel if tashkeel is None else tashkeel
        if use_tashkeel:
            # Call the vocalizer directly: add_tashkeel() re-checks
            # self.enable_tashkeel, which would silently discard an explicit
            # tashkeel=True override on an instance built with it disabled.
            text = self._vocalize(text)

        return text


# --- CLI for testing ---
# Runs a handful of sample sentences through the pipeline and prints the
# output of every stage that actually changed the text.
if __name__ == "__main__":
    preprocessor = ArabicPreprocessor()

    test_texts = [
        "الذكاء الاصطناعي يتطور بسرعة كبيرة، ويدخل في كل مجالات الحياة.",
        "اكثر من 70% من الشركات الكبرى تستخدم الذكاء الاصطناعي اليوم.",
        "الالات اصبحت قادرة على التعلم، واتخاذ قرارات معقدة بمفردها.",
        "ان مستقبل البشرية سيتشكل بناءً على كيفية تعاملنا مع هذه التقنية.",
        "هذا المشروع يكلف 500$ و يحقق نمو 25%",
    ]

    banner = "=" * 70
    print(banner)
    print("Arabic Preprocessor Test")
    print(banner)

    for text in test_texts:
        # Run the stages one by one so each intermediate result can be shown.
        after_clean = preprocessor.clean_text(text)
        after_hamza = preprocessor.fix_hamza(after_clean)
        after_numbers = preprocessor.expand_numbers(after_hamza)
        after_symbols = preprocessor.expand_symbols(after_numbers)
        final = preprocessor.process(text)

        print(f"\nOriginal:  {text}")
        stages = (
            ("Hamza:     ", after_clean, after_hamza),
            ("Numbers:   ", after_hamza, after_numbers),
            ("Symbols:   ", after_numbers, after_symbols),
        )
        for label, before, after in stages:
            # Only report a stage when it altered its input.
            if after != before:
                print(f"{label}{after}")
        print(f"Final:     {final}")
        print("-" * 70)