# File size: 7,595 Bytes (97e3499)
"""
Arabic text preprocessing for XTTS-v2 inference.

Handles:
    1. Text cleaning (normalize whitespace, punctuation, special chars)
    2. Hamza normalization (correct common hamza placement errors)
    3. Tashkeel (add diacritics for pronunciation accuracy)
    4. Number-to-word conversion (Arabic numerals and percentages)
    5. Symbol expansion (Arabic-appropriate replacements)

Usage:
    from scripts.arabic_preprocessor import ArabicPreprocessor

    preprocessor = ArabicPreprocessor()
    clean_text = preprocessor.process("مرحبا بكم في 2026")
"""
import re
import unicodedata
from num2words import num2words
# Mishkal is a heavy import, so the vocalizer is built on first use and cached.
_vocalizer = None


def _get_vocalizer():
    """Return the module-wide Mishkal vocalizer, creating it on the first call."""
    global _vocalizer
    if _vocalizer is not None:
        return _vocalizer
    # Deferred import: only paid for when tashkeel is actually requested.
    from mishkal.tashkeel import TashkeelClass
    _vocalizer = TashkeelClass()
    return _vocalizer
# --- Hamza correction map ---
# Common words where hamza is frequently dropped or misplaced.
# Format: incorrect -> correct
# ArabicPreprocessor joins the keys into one \b-delimited regex alternation,
# so only whole-word occurrences of a key are replaced.
# The section headers are a rough grouping; entries whose corrected form does
# not match their section header carry an inline note.
HAMZA_CORRECTIONS = {
    # Initial alef, mostly corrected to alef with hamza above (أ)
    "ان": "أن",
    "انا": "أنا",
    "انت": "أنت",
    "انتم": "أنتم",
    "اكثر": "أكثر",
    "اقل": "أقل",
    "اول": "أول",
    "اي": "أي",
    "ايضا": "أيضاً",
    "اذا": "إذا",  # corrected form uses hamza below (إ)
    "امام": "أمام",
    "اصبح": "أصبح",
    "اصبحت": "أصبحت",
    "اخرى": "أخرى",
    "اخر": "آخر",  # corrected form uses alef madda (آ)
    "اكبر": "أكبر",
    "اكد": "أكد",
    "اعلن": "أعلن",
    "اهم": "أهم",
    "امر": "أمر",
    "اساس": "أساس",
    "اساسي": "أساسي",
    "امن": "أمن",
    "امل": "أمل",
    "اسلام": "إسلام",  # this and the next four use hamza below (إ)
    "ادارة": "إدارة",
    "انتاج": "إنتاج",
    "انسان": "إنسان",
    "اعلام": "إعلام",
    # Alef with hamza below (إ)
    "الى": "إلى",
    "اذ": "إذ",
    "اذن": "إذن",
    "انما": "إنما",
    "انه": "إنه",
    "انها": "إنها",
    "انهم": "إنهم",
    # Alef madda (آ)
    "الان": "الآن",
    "القران": "القرآن",
    "الالات": "الآلات",
    "الالة": "الآلة",
    "اخرون": "آخرون",
    # Medial hamza: on waw (ؤ) unless noted otherwise
    "مسوول": "مسؤول",
    "مسوولية": "مسؤولية",
    "روية": "رؤية",
    "تاثير": "تأثير",  # hamza on alef (أ), not on waw
    "تاكد": "تأكد",  # hamza on alef (أ), not on waw
    "مساله": "مسألة",  # hamza on alef (أ); final ه also becomes ة
    "سوال": "سؤال",
}
# --- Symbol expansion ---
# Symbol -> Arabic word(s) spoken in its place. ArabicPreprocessor.expand_symbols
# applies each entry with a plain str.replace, in the order listed here.
# Replacements carry their own padding spaces so the word does not fuse with
# neighbouring text; process() runs clean_text() afterwards, which collapses
# any doubled spaces and trims the ends.
SYMBOL_MAP = {
    # Infix symbols: padded on both sides.
    "&": " و ",
    "@": " على ",
    # Suffix-style symbols: padded on the left only.
    # "%" directly after a number is already consumed by expand_numbers(),
    # which process() runs first; this entry covers any "%" left over.
    "%": " بالمئة",
    "$": " دولار",
    "£": " جنيه",
    "€": " يورو",
    "°": " درجة",
    # Arithmetic operators: padded on both sides.
    "+": " زائد ",
    "=": " يساوي ",
}
class ArabicPreprocessor:
    """Full Arabic text preprocessing pipeline for TTS inference.

    Stages, in the order ``process`` runs them: text cleaning, hamza
    correction, number expansion, symbol expansion, a second cleaning pass,
    and (optionally) tashkeel.
    """

    # Compiled once at class creation and shared by every instance/call.
    _ZERO_WIDTH_RE = re.compile(r"[\u200b\u200c\u200d\u200e\u200f\ufeff]")
    _WHITESPACE_RE = re.compile(r"\s+")
    _PERCENT_RE = re.compile(r"(\d+(?:\.\d+)?)\s*[%٪]")
    _NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")

    def __init__(self, enable_tashkeel=False):
        """Create a preprocessor.

        Args:
            enable_tashkeel: Default for whether diacritics are added; can be
                overridden per call through ``process(..., tashkeel=...)``.
        """
        self.enable_tashkeel = enable_tashkeel
        # One alternation over every known misspelling, anchored with \b so
        # that only whole words are corrected.
        self._hamza_pattern = re.compile(
            r"\b(" + "|".join(re.escape(k) for k in HAMZA_CORRECTIONS) + r")\b"
        )

    def clean_text(self, text):
        """Basic text cleaning: normalize whitespace, punctuation, unicode.

        Applies NFC normalization, strips zero-width / directional marks and
        the BOM, maps the Arabic percent sign to ``%``, collapses every run of
        whitespace to one space and trims the ends. The Arabic comma and
        semicolon are left untouched.

        Args:
            text: Input string.

        Returns:
            The cleaned string.
        """
        text = unicodedata.normalize("NFC", text)
        text = self._ZERO_WIDTH_RE.sub("", text)
        # A single percent sign keeps the later number/symbol stages simple.
        text = text.replace("٪", "%")
        return self._WHITESPACE_RE.sub(" ", text).strip()

    def fix_hamza(self, text):
        """Correct common hamza placement errors in Arabic text.

        Every whole-word occurrence of a ``HAMZA_CORRECTIONS`` key is replaced
        by its mapped spelling; all other text is returned unchanged.
        """
        def _corrected(match):
            word = match.group(0)
            return HAMZA_CORRECTIONS.get(word, word)

        return self._hamza_pattern.sub(_corrected, text)

    @staticmethod
    def _spell_number(num_str):
        """Return *num_str* spelled out in Arabic, or None if conversion fails."""
        try:
            value = float(num_str) if "." in num_str else int(num_str)
            return num2words(value, lang="ar")
        except (ValueError, OverflowError):
            return None

    def expand_numbers(self, text):
        """Convert numbers to Arabic words.

        Percentages (a number followed by ``%`` or ``٪``, optionally separated
        by whitespace) are expanded first so the sign is read together with
        its number; remaining integers and decimals are expanded afterwards.
        A number that cannot be converted is left exactly as written.

        Args:
            text: Input string.

        Returns:
            The string with numeric tokens replaced by words.
        """
        def _percent(match):
            words = self._spell_number(match.group(1))
            return match.group(0) if words is None else words + " بالمئة"

        def _plain(match):
            words = self._spell_number(match.group(0))
            return match.group(0) if words is None else words

        # Percentages first (e.g. "70%" -> "سبعون بالمئة"), then bare numbers.
        text = self._PERCENT_RE.sub(_percent, text)
        return self._NUMBER_RE.sub(_plain, text)

    def expand_symbols(self, text):
        """Replace every ``SYMBOL_MAP`` symbol with its Arabic word(s)."""
        for symbol, replacement in SYMBOL_MAP.items():
            text = text.replace(symbol, replacement)
        return text

    @staticmethod
    def _vocalize(text):
        """Run the shared Mishkal vocalizer on *text* unconditionally."""
        return _get_vocalizer().tashkeel(text)

    def add_tashkeel(self, text):
        """Add diacritical marks using Mishkal.

        Returns *text* unchanged when ``self.enable_tashkeel`` is false.
        """
        if not self.enable_tashkeel:
            return text
        return self._vocalize(text)

    def process(self, text, tashkeel=None):
        """
        Full preprocessing pipeline.

        Args:
            text: Raw Arabic text.
            tashkeel: Override tashkeel setting (True/False/None=use default).

        Returns:
            Processed text ready for XTTS-v2 inference.
        """
        text = self.clean_text(text)
        text = self.fix_hamza(text)
        text = self.expand_numbers(text)
        text = self.expand_symbols(text)
        text = self.clean_text(text)  # clean again after expansions

        use_tashkeel = self.enable_tashkeel if tashkeel is None else tashkeel
        if use_tashkeel:
            # Call the vocalizer directly: add_tashkeel() re-checks
            # self.enable_tashkeel and would therefore silently ignore a
            # per-call tashkeel=True on an instance created with it disabled.
            text = self._vocalize(text)
        return text
# --- CLI for testing ---
if __name__ == "__main__":
    # Manual smoke test: push a few sample sentences through the pipeline and
    # print every intermediate stage that actually changed the text.
    demo = ArabicPreprocessor()
    samples = (
        "الذكاء الاصطناعي يتطور بسرعة كبيرة، ويدخل في كل مجالات الحياة.",
        "اكثر من 70% من الشركات الكبرى تستخدم الذكاء الاصطناعي اليوم.",
        "الالات اصبحت قادرة على التعلم، واتخاذ قرارات معقدة بمفردها.",
        "ان مستقبل البشرية سيتشكل بناءً على كيفية تعاملنا مع هذه التقنية.",
        "هذا المشروع يكلف 500$ و يحقق نمو 25%",
    )

    banner = "=" * 70
    print(banner)
    print("Arabic Preprocessor Test")
    print(banner)

    for sample in samples:
        # Run the stages one by one so each step's effect can be reported.
        after_clean = demo.clean_text(sample)
        after_hamza = demo.fix_hamza(after_clean)
        after_numbers = demo.expand_numbers(after_hamza)
        after_symbols = demo.expand_symbols(after_numbers)
        result = demo.process(sample)

        print(f"\nOriginal: {sample}")
        stage_report = (
            ("Hamza", after_clean, after_hamza),
            ("Numbers", after_hamza, after_numbers),
            ("Symbols", after_numbers, after_symbols),
        )
        for label, before, after in stage_report:
            if after != before:
                print(f"{label}: {after}")
        print(f"Final: {result}")
        print("-" * 70)
|