arabic-tts-xtts-v2 / scripts /arabic_preprocessor.py

Add all scripts with prosody improvements

97e3499 verified 20 days ago

7.6 kB

	"""
	Arabic text preprocessing for XTTS-v2 inference.

	Handles:
	1. Text cleaning (normalize whitespace, punctuation, special chars)
	2. Hamza normalization (correct common hamza placement errors)
	3. Tashkeel (optional, disabled by default: add diacritics for pronunciation accuracy)
	4. Number-to-word conversion (Arabic numerals and percentages)
	5. Symbol expansion (Arabic-appropriate replacements)

	Usage:
	from scripts.arabic_preprocessor import ArabicPreprocessor
	preprocessor = ArabicPreprocessor()
	clean_text = preprocessor.process("مرحبا بكم في 2026")
	"""

	import re
	import unicodedata
	from num2words import num2words

	# Mishkal is a heavy import, so the vocalizer is only built on first use
	# and then cached here for every later call.
	_vocalizer = None


	def _get_vocalizer():
	    """Return the shared Mishkal vocalizer, creating it on the first call."""
	    global _vocalizer
	    if _vocalizer is not None:
	        return _vocalizer

	    # Deferred import: only paid for when tashkeel is actually requested.
	    from mishkal.tashkeel import TashkeelClass

	    _vocalizer = TashkeelClass()
	    return _vocalizer


	# --- Hamza correction map ---
	# Common words where hamza is frequently dropped or misplaced.
	# Format: incorrect -> correct
	#
	# Entries are grouped by the hamza form of the *corrected* spelling.
	# NOTE: several bare spellings are ambiguous in real text (for example
	# "ان" may stand for "أن" or "إن", and "انه" for "أنه" or "إنه"); the map
	# always picks the single form listed here.
	HAMZA_CORRECTIONS = {
	    # Initial alef with hamza above (أ)
	    "ان": "أن",
	    "انا": "أنا",
	    "انت": "أنت",
	    "انتم": "أنتم",
	    "اكثر": "أكثر",
	    "اقل": "أقل",
	    "اول": "أول",
	    "اي": "أي",
	    "ايضا": "أيضاً",
	    "امام": "أمام",
	    "اصبح": "أصبح",
	    "اصبحت": "أصبحت",
	    "اخرى": "أخرى",
	    "اكبر": "أكبر",
	    "اكد": "أكد",
	    "اعلن": "أعلن",
	    "اهم": "أهم",
	    "امر": "أمر",
	    "اساس": "أساس",
	    "اساسي": "أساسي",
	    "امن": "أمن",
	    "امل": "أمل",
	    # Initial alef with hamza below (إ)
	    "اذا": "إذا",
	    "اسلام": "إسلام",
	    "ادارة": "إدارة",
	    "انتاج": "إنتاج",
	    "انسان": "إنسان",
	    "اعلام": "إعلام",
	    "الى": "إلى",
	    "اذ": "إذ",
	    "اذن": "إذن",
	    "انما": "إنما",
	    "انه": "إنه",
	    "انها": "إنها",
	    "انهم": "إنهم",
	    # Alef madda (آ)
	    "اخر": "آخر",
	    "الان": "الآن",
	    "القران": "القرآن",
	    "الالات": "الآلات",
	    "الالة": "الآلة",
	    "اخرون": "آخرون",
	    # Hamza on waw (ؤ)
	    "مسوول": "مسؤول",
	    "مسوولية": "مسؤولية",
	    "روية": "رؤية",
	    "سوال": "سؤال",
	    # Medial alef with hamza above (أ)
	    "تاثير": "تأثير",
	    "تاكد": "تأكد",
	    "مساله": "مسألة",
	    "مسالة": "مسألة",  # same word, written with taa marbuta but no hamza
	}

	# --- Symbol expansion ---
	# Symbol -> spoken Arabic replacement. The surrounding spaces are part of
	# the replacement so the inserted word does not fuse with its neighbours.
	SYMBOL_MAP = {
	    "&": " و ",
	    "@": " على ",
	    "%": " بالمئة",
	    "$": " دولار",
	    "£": " جنيه",
	    "€": " يورو",
	    "°": " درجة",
	    "+": " زائد ",
	    "=": " يساوي ",
	}


	class ArabicPreprocessor:
	    """Arabic text preprocessing pipeline for TTS inference.

	    ``process`` applies the stages in this order: text cleaning, hamza
	    correction, number expansion, symbol expansion, a second cleaning pass
	    and, when requested, tashkeel.

	    Args:
	        enable_tashkeel: Default for whether diacritics are added by
	            ``add_tashkeel`` and ``process``.
	        hamza_corrections: Optional mapping of misspelled word -> corrected
	            word. When omitted, the module-level ``HAMZA_CORRECTIONS`` is
	            used.
	        symbol_map: Optional mapping of symbol -> spoken replacement. When
	            omitted, the module-level ``SYMBOL_MAP`` is used.
	    """

	    # Compiled once for the class; these do not depend on instance state.
	    _ZERO_WIDTH_RE = re.compile(r"[\u200b\u200c\u200d\u200e\u200f\ufeff]")
	    _WHITESPACE_RE = re.compile(r"\s+")
	    _PERCENT_RE = re.compile(r"(\d+(?:\.\d+)?)\s*[%٪]")
	    _NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")

	    def __init__(
	        self,
	        enable_tashkeel=False,
	        *,
	        hamza_corrections=None,
	        symbol_map=None,
	    ):
	        self.enable_tashkeel = enable_tashkeel
	        # Snapshot the maps so the compiled pattern and the lookup table
	        # can never drift apart after construction.
	        self._hamza_corrections = dict(
	            HAMZA_CORRECTIONS if hamza_corrections is None else hamza_corrections
	        )
	        self._symbol_map = dict(SYMBOL_MAP if symbol_map is None else symbol_map)

	        if self._hamza_corrections:
	            # The alternatives must be joined with a bare "|". Joining with
	            # a backslash-escaped pipe makes the regex treat it as a literal
	            # character, so no single word could ever match.
	            alternatives = "|".join(
	                re.escape(word) for word in self._hamza_corrections
	            )
	            self._hamza_pattern = re.compile(r"\b(?:" + alternatives + r")\b")
	        else:
	            # An empty alternation would match the empty string everywhere.
	            self._hamza_pattern = None

	    def clean_text(self, text):
	        """Normalize unicode, zero-width characters, percent sign and whitespace.

	        Args:
	            text: Input string.

	        Returns:
	            The NFC-normalized string with zero-width characters removed,
	            the Arabic percent sign replaced by ``%``, whitespace runs
	            collapsed to one space and both ends stripped. Arabic comma
	            and semicolon are left untouched.
	        """
	        text = unicodedata.normalize("NFC", text)
	        text = self._ZERO_WIDTH_RE.sub("", text)
	        # Use one percent sign so the later stages only deal with "%".
	        text = text.replace("٪", "%")
	        return self._WHITESPACE_RE.sub(" ", text).strip()

	    def fix_hamza(self, text):
	        """Replace whole words found in the hamza correction map.

	        Only complete words (delimited by regex word boundaries) are
	        replaced; words that merely start with a listed spelling are kept.

	        Args:
	            text: Input string.

	        Returns:
	            The string with every listed misspelling corrected.
	        """
	        if self._hamza_pattern is None:
	            return text
	        corrections = self._hamza_corrections
	        # Every match is one of the map's keys, so direct indexing is safe.
	        return self._hamza_pattern.sub(
	            lambda match: corrections[match.group(0)], text
	        )

	    @staticmethod
	    def _spell_number(num_str):
	        """Return the Arabic words for ``num_str``, or None if conversion fails."""
	        try:
	            value = float(num_str) if "." in num_str else int(num_str)
	            return num2words(value, lang="ar")
	        except (ValueError, OverflowError):
	            return None

	    def expand_numbers(self, text):
	        """Convert numbers and percentages to Arabic words.

	        Percentages are handled first so that e.g. "70%" becomes the number
	        in words followed by " بالمئة"; remaining numbers are then spelled
	        out. A number that cannot be converted is left as written.

	        Args:
	            text: Input string.

	        Returns:
	            The string with digits replaced by words.
	        """
	        def _percent_to_words(match):
	            words = self._spell_number(match.group(1))
	            return match.group(0) if words is None else words + " بالمئة"

	        def _number_to_words(match):
	            words = self._spell_number(match.group(0))
	            return match.group(0) if words is None else words

	        text = self._PERCENT_RE.sub(_percent_to_words, text)
	        return self._NUMBER_RE.sub(_number_to_words, text)

	    def expand_symbols(self, text):
	        """Replace each symbol in the symbol map with its Arabic wording.

	        Args:
	            text: Input string.

	        Returns:
	            The string with all mapped symbols replaced.
	        """
	        for symbol, replacement in self._symbol_map.items():
	            text = text.replace(symbol, replacement)
	        return text

	    def _vocalize(self, text):
	        """Run the Mishkal vocalizer on ``text`` unconditionally."""
	        return _get_vocalizer().tashkeel(text)

	    def add_tashkeel(self, text):
	        """Add diacritical marks using Mishkal.

	        Args:
	            text: Input string.

	        Returns:
	            The vocalized string, or ``text`` unchanged when the instance
	            was created with tashkeel disabled.
	        """
	        if not self.enable_tashkeel:
	            return text
	        return self._vocalize(text)

	    def process(self, text, tashkeel=None):
	        """Run the full preprocessing pipeline.

	        Args:
	            text: Raw Arabic text.
	            tashkeel: Override tashkeel setting (True/False/None=use default).

	        Returns:
	            Processed text ready for XTTS-v2 inference.
	        """
	        text = self.clean_text(text)
	        text = self.fix_hamza(text)
	        text = self.expand_numbers(text)
	        text = self.expand_symbols(text)
	        # The expansions insert padded words, so collapse spaces again.
	        text = self.clean_text(text)

	        use_tashkeel = self.enable_tashkeel if tashkeel is None else tashkeel
	        if use_tashkeel:
	            # Call the vocalizer directly: going through add_tashkeel would
	            # re-check the instance flag and discard an explicit
	            # tashkeel=True override.
	            text = self._vocalize(text)

	        return text


	# --- CLI for testing ---
	if __name__ == "__main__":
	    preprocessor = ArabicPreprocessor()

	    test_texts = [
	        "الذكاء الاصطناعي يتطور بسرعة كبيرة، ويدخل في كل مجالات الحياة.",
	        "اكثر من 70% من الشركات الكبرى تستخدم الذكاء الاصطناعي اليوم.",
	        "الالات اصبحت قادرة على التعلم، واتخاذ قرارات معقدة بمفردها.",
	        "ان مستقبل البشرية سيتشكل بناءً على كيفية تعاملنا مع هذه التقنية.",
	        "هذا المشروع يكلف 500$ و يحقق نمو 25%",
	    ]

	    banner = "=" * 70
	    print(banner)
	    print("Arabic Preprocessor Test")
	    print(banner)

	    # Intermediate stages to report, in the order the pipeline runs them.
	    stages = (
	        ("Hamza", preprocessor.fix_hamza),
	        ("Numbers", preprocessor.expand_numbers),
	        ("Symbols", preprocessor.expand_symbols),
	    )

	    for text in test_texts:
	        # Feed each stage the previous stage's output and remember only
	        # the stages that actually changed something.
	        current = preprocessor.clean_text(text)
	        changes = []
	        for label, stage in stages:
	            result = stage(current)
	            if result != current:
	                changes.append(f"{label}: {result}")
	            current = result
	        final = preprocessor.process(text)

	        print(f"\nOriginal: {text}")
	        for line in changes:
	            print(line)
	        print(f"Final: {final}")
	        print("-" * 70)