#!/usr/bin/env python3
"""
================================================================================
Priority 3: Arabic Diacritization + Algerian Preprocessing Pipeline
================================================================================

Algerian Arabic (Darija) preprocessing is critical for TTS quality:
  1. Text is often undiacritized → phonetic ambiguity
  2. Heavy code-switching with French
  3. Numerals need normalization
  4. Mixed Arabic/Latin script usage

This pipeline provides:
  1. Arabic diacritization using Sadeed (SOTA, April 2025)
  2. Numeral normalization (Eastern ٠١٢ and Western 012 → Arabic words)
  3. Basic French/Arabic code-switching handling
  4. Text caching for repeated phrases
  5. Sentence-level chunking for streaming

Dependencies:
    pip install transformers torch pyarabic num2words

Usage:
    python 03_arabic_preprocessing.py \
        --input "مرحبا كيف حالك 123" \
        --diacritize \
        --normalize_numerals

    python 03_arabic_preprocessing.py \
        --input_file text.txt \
        --output_file processed.txt \
        --diacritize \
        --normalize_numerals \
        --chunk_for_streaming
================================================================================
"""

import argparse
import hashlib
import json
import os
import re
import sys
import time
from pathlib import Path
from typing import List, Optional, Tuple

import numpy as np

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
CACHE_DIR = Path.home() / ".cache" / "habibi_tts_preprocess"
try:
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
except OSError:
    # Best effort only: importing this module (e.g. just to call
    # normalize_numerals) must not fail on a read-only home directory.
    # TextCache creates its directory again and reports the error there.
    pass

# Arabic numeral mappings
ARABIC_EASTERN_NUMERALS = "٠١٢٣٤٥٦٧٨٩"
ARABIC_WESTERN_NUMERALS = "0123456789"

# Simple Arabic word numbers (for numeral normalization)
ARABIC_NUMBERS = {
    "0": "صفر",
    "1": "واحد",
    "2": "اثنان",
    "3": "ثلاثة",
    "4": "أربعة",
    "5": "خمسة",
    "6": "ستة",
    "7": "سبعة",
    "8": "ثمانية",
    "9": "تسعة",
    "10": "عشرة",
    "11": "أحد عشر",
    "12": "اثنا عشر",
    "13": "ثلاثة عشر",
    "14": "أربعة عشر",
    "15": "خمسة عشر",
    "16": "ستة عشر",
    "17": "سبعة عشر",
    "18": "ثمانية عشر",
    "19": "تسعة عشر",
    "20": "عشرون",
    "30": "ثلاثون",
    "40": "أربعون",
    "50": "خمسون",
    "60": "ستون",
    "70": "سبعون",
    "80": "ثمانون",
    "90": "تسعون",
    "100": "مائة",
    "1000": "ألف",
    "1000000": "مليون",
}

# French words commonly mixed in Algerian Arabic
FRENCH_COMMON_WORDS = {
    "bonjour": "صباح الخير",
    "merci": "شكرا",
    "s'il vous plait": "من فضلك",
    "excusez-moi": "عذرا",
    "oui": "نعم",
    "non": "لا",
    "bon": "جيد",
    "très": "جدا",
    "beaucoup": "كثيرا",
    "comment": "كيف",
    "ça va": "كيف الحال",
    "au revoir": "مع السلامة",
    "bonsoir": "مساء الخير",
    "bonne nuit": "تصبح على خير",
    "pardon": "عذرا",
    "d'accord": "حسنا",
    "ok": "حسنا",
}


# ---------------------------------------------------------------------------
# Caching
# ---------------------------------------------------------------------------
class TextCache:
    """Simple file-based cache for preprocessed text.

    The whole cache is one JSON object (key → processed text) stored in
    ``<cache_dir>/text_cache.json``. It is loaded eagerly on construction and
    rewritten in full on every ``set``/``clear``.
    """

    def __init__(self, cache_dir: str = str(CACHE_DIR)):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.cache_file = self.cache_dir / "text_cache.json"
        self.cache = {}
        self._load()

    def _load(self):
        """Populate ``self.cache`` from disk; start empty if unreadable."""
        try:
            with open(self.cache_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except FileNotFoundError:
            return
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            # A damaged cache is not fatal: warn and start from scratch.
            print(
                f"[CACHE] Warning: ignoring unreadable cache file ({exc}).",
                file=sys.stderr,
            )
            return
        # Valid JSON that is not an object would break .get()/item assignment.
        if isinstance(loaded, dict):
            self.cache = loaded

    def _save(self):
        """Write the cache to disk atomically."""
        # Write to a sibling temp file and rename it over the real one so an
        # interrupted write can never leave a truncated cache behind.
        tmp_file = self.cache_file.with_name(self.cache_file.name + ".tmp")
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(self.cache, f, ensure_ascii=False, indent=2)
        os.replace(tmp_file, self.cache_file)

    def get(self, key: str) -> Optional[str]:
        """Return the cached value for *key*, or ``None`` if absent."""
        return self.cache.get(key)

    def set(self, key: str, value: str):
        """Store *value* under *key* and persist the cache."""
        self.cache[key] = value
        self._save()

    def clear(self):
        """Drop every entry and persist the empty cache."""
        self.cache = {}
        self._save()


# ---------------------------------------------------------------------------
# Numeral Normalization
# ---------------------------------------------------------------------------
_EASTERN_TO_WESTERN = str.maketrans(ARABIC_EASTERN_NUMERALS, ARABIC_WESTERN_NUMERALS)
_DIGITS_RE = re.compile(r"\d+")
# Joiner kept as a free-standing " و " for compatibility with existing output.
_AND = " و "

# Hundreds use their fused written forms (e.g. ثلاثمائة, not "ثلاثة مائة").
_HUNDREDS = {
    1: ARABIC_NUMBERS["100"],
    2: "مائتان",
    3: "ثلاثمائة",
    4: "أربعمائة",
    5: "خمسمائة",
    6: "ستمائة",
    7: "سبعمائة",
    8: "ثمانمائة",
    9: "تسعمائة",
}


def _below_hundred(n: int) -> str:
    """Return the Arabic words for 1 <= n <= 99."""
    key = str(n)
    if key in ARABIC_NUMBERS:  # 1-20 and the round tens
        return ARABIC_NUMBERS[key]
    ones = n % 10
    tens = n - ones
    # Arabic reads the ones before the tens: 21 → "واحد و عشرون".
    return f"{ARABIC_NUMBERS[str(ones)]}{_AND}{ARABIC_NUMBERS[str(tens)]}"


def _below_thousand(n: int) -> str:
    """Return the Arabic words for 1 <= n <= 999."""
    hundreds, rest = divmod(n, 100)
    parts = []
    if hundreds:
        parts.append(_HUNDREDS[hundreds])
    if rest:
        parts.append(_below_hundred(rest))
    return _AND.join(parts)


def _scaled(count: int, singular: str, dual: str, plural: str) -> str:
    """Return *count* units of a scale word (thousand, million).

    Simplified agreement: 1 → singular alone, 2 → dual alone, 3-10 → count
    plus plural, anything larger → count plus singular.
    """
    if count == 1:
        return singular
    if count == 2:
        return dual
    unit = plural if 3 <= count <= 10 else singular
    return f"{_number_to_arabic_words(count)} {unit}"


def _number_to_arabic_words(num: int) -> str:
    """Return the Arabic words for a non-negative integer."""
    if num == 0:
        return ARABIC_NUMBERS["0"]
    millions, rest = divmod(num, 1_000_000)
    thousands, rest = divmod(rest, 1_000)
    parts = []
    if millions:
        parts.append(_scaled(millions, ARABIC_NUMBERS["1000000"], "مليونان", "ملايين"))
    if thousands:
        parts.append(_scaled(thousands, ARABIC_NUMBERS["1000"], "ألفان", "آلاف"))
    if rest:
        parts.append(_below_thousand(rest))
    return _AND.join(parts)


def normalize_numerals(text: str) -> str:
    """
    Convert numerals (both Eastern Arabic ٠١٢ and Western 012) to Arabic words.

    Every maximal run of digits is replaced by its spelled-out value; all
    other characters (including separators such as ":" or ".") are left
    untouched. Leading zeros are dropped because the run is read as an
    integer. Values of a thousand million and above are spelled as a count
    of millions.

    Args:
        text: Input text, possibly containing digits.

    Returns:
        The text with each digit run replaced by Arabic words.
    """
    # First convert Eastern Arabic numerals to Western
    text = text.translate(_EASTERN_TO_WESTERN)

    def replace_match(match):
        digits = match.group(0)
        try:
            return _number_to_arabic_words(int(digits))
        except ValueError:
            # e.g. digit runs beyond the interpreter's int-parsing limit.
            return digits

    return _DIGITS_RE.sub(replace_match, text)


# ---------------------------------------------------------------------------
# Diacritization (using Sadeed or fallback)
# ---------------------------------------------------------------------------
class ArabicDiacritizer:
    """
    Arabic text diacritization using Sadeed model (Misraj/Sadeed).
    Falls back to returning the text unchanged if the model is not available.
    """

    def __init__(self, model_name: str = "Misraj/Sadeed", device: str = "cpu"):
        self.model_name = model_name
        self.device = device
        self.pipeline = None
        self._load_model()

    @staticmethod
    def _pipeline_device(device: str) -> int:
        """Map a device string to the integer index passed to the pipeline.

        "cuda" → 0, "cuda:N" → N, anything else → -1 (CPU).
        """
        if device.startswith("cuda"):
            index = device.partition(":")[2]
            return int(index) if index.isdigit() else 0
        return -1

    def _load_model(self):
        """Load the diacritization model; leave ``self.pipeline`` None on failure."""
        try:
            from transformers import pipeline

            print(f"[DIACRITIZE] Loading {self.model_name}...")
            # NOTE(review): confirm that "text2text-generation" matches the
            # architecture of the configured checkpoint; if it does not, the
            # load fails and every call silently takes the fallback path.
            self.pipeline = pipeline(
                "text2text-generation",
                model=self.model_name,
                device=self._pipeline_device(self.device),
                torch_dtype="auto",
            )
            print("[DIACRITIZE] Model loaded successfully.")
        except Exception as e:
            # Deliberately broad: a missing package, a failed download or an
            # incompatible model must all degrade to the fallback.
            print(f"[DIACRITIZE] Warning: Could not load model ({e}). Using fallback.")
            self.pipeline = None

    def diacritize(self, text: str) -> str:
        """Add diacritics (tashkeel) to Arabic text.

        Returns the text unchanged when it is blank, when no model is loaded,
        or when inference raises.
        """
        if not self.pipeline or not text.strip():
            return self._fallback_diacritize(text)
        try:
            # max_length caps the generated sequence at 512.
            result = self.pipeline(text, max_length=512, do_sample=False)
            return result[0]["generated_text"]
        except Exception as e:
            print(f"[DIACRITIZE] Error: {e}. Using fallback.")
            return self._fallback_diacritize(text)

    def _fallback_diacritize(self, text: str) -> str:
        """
        Fallback used when the model is unavailable.

        This is a placeholder that returns the text unchanged - real
        diacritization requires a trained model.
        """
        return text


# ---------------------------------------------------------------------------
# Code-switching Handling
# ---------------------------------------------------------------------------
# One alternation over all phrases, longest first so that multi-word entries
# ("bonne nuit") win over their prefixes ("bon"). The look-arounds require the
# match to be a whole word so "non" is not rewritten inside "annonce".
_FRENCH_RE = re.compile(
    r"(?<!\w)(?:"
    + "|".join(
        re.escape(phrase)
        for phrase in sorted(FRENCH_COMMON_WORDS, key=len, reverse=True)
    )
    + r")(?!\w)",
    re.IGNORECASE,
)


def handle_code_switching(text: str, translate_french: bool = False) -> str:
    """
    Handle French/Arabic code-switching in Algerian text.

    If translate_french=True, whole-word occurrences of the phrases in
    FRENCH_COMMON_WORDS are replaced (case-insensitively) by their Arabic
    equivalents. Otherwise the text is returned unchanged.
    """
    if not translate_french:
        return text

    def to_arabic(match):
        found = match.group(0)
        # IGNORECASE can match a few non-ASCII case variants whose folded
        # form is not a key; leave those as they are instead of raising.
        return FRENCH_COMMON_WORDS.get(found.casefold(), found)

    return _FRENCH_RE.sub(to_arabic, text)


# ---------------------------------------------------------------------------
# Sentence Chunking for Streaming
# ---------------------------------------------------------------------------
# ASCII punctuation is a boundary only when followed by whitespace, so "3:30"
# and "3.14" stay intact. Arabic/CJK punctuation is a boundary on its own.
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[;:,.!?])\s+|(?<=[؛،؟。])\s*")


def _utf8_len(text: str) -> int:
    """Return the length of *text* in UTF-8 bytes."""
    return len(text.encode("utf-8"))


def chunk_for_streaming(text: str, max_chars: int = 135) -> List[str]:
    """
    Split text into sentence-level chunks for streaming TTS.

    Sentences are packed greedily, joined by single spaces. A sentence that
    is larger than the limit by itself is broken at whitespace; a single word
    larger than the limit is emitted as its own oversized chunk.

    Args:
        text: Text to split.
        max_chars: Maximum chunk size, measured in UTF-8 bytes rather than
            characters (F5-TTS uses UTF-8 byte length for chunking).

    Returns:
        The non-empty chunks in order; an empty list for blank input.
    """
    pieces = []
    for sentence in _SENTENCE_SPLIT_RE.split(text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if _utf8_len(sentence) > max_chars:
            pieces.extend(sentence.split())
        else:
            pieces.append(sentence)

    chunks = []
    current = ""
    for piece in pieces:
        candidate = f"{current} {piece}" if current else piece
        if _utf8_len(candidate) <= max_chars:
            current = candidate
            continue
        if current:
            chunks.append(current)
        current = piece
    if current:
        chunks.append(current)
    return chunks


# ---------------------------------------------------------------------------
# Main Preprocessing Pipeline
# ---------------------------------------------------------------------------
class AlgerianTTSPipeline:
    """Complete preprocessing pipeline for Algerian Arabic TTS."""

    def __init__(
        self,
        diacritize: bool = True,
        normalize_numerals: bool = True,
        handle_code_switch: bool = True,
        cache_enabled: bool = True,
        device: str = "cpu",
    ):
        self.diacritize = diacritize
        self.normalize_numerals = normalize_numerals
        self.handle_code_switch = handle_code_switch
        self.cache = TextCache() if cache_enabled else None
        self.diacritizer = ArabicDiacritizer(device=device) if diacritize else None

    def _cache_key(self, text: str) -> str:
        """Return a key covering the text and every setting that shapes the output."""
        model = None
        if (
            self.diacritize
            and self.diacritizer is not None
            and self.diacritizer.pipeline is not None
        ):
            model = self.diacritizer.model_name
        # Hashing the text alone would hand back results computed under
        # different flags (e.g. an undiacritized result to a diacritizing run).
        payload = json.dumps(
            [text, bool(self.normalize_numerals), bool(self.handle_code_switch), model],
            ensure_ascii=False,
        )
        return hashlib.sha256(payload.encode("utf-8")).hexdigest()

    def preprocess(self, text: str) -> str:
        """Run full preprocessing pipeline on text.

        Steps, each gated by its constructor flag: numeral normalization,
        French phrase translation, diacritization. Results are cached when
        caching is enabled.
        """
        cache_key = None
        if self.cache is not None:
            cache_key = self._cache_key(text)
            cached = self.cache.get(cache_key)
            # `is not None` so a legitimately empty result is still a hit.
            if cached is not None:
                return cached

        result = text

        # Step 1: Normalize numerals
        if self.normalize_numerals:
            result = normalize_numerals(result)

        # Step 2: Handle code-switching
        if self.handle_code_switch:
            result = handle_code_switching(result, translate_french=True)

        # Step 3: Diacritize
        if self.diacritize and self.diacritizer is not None:
            result = self.diacritizer.diacritize(result)

        # Cache result
        if self.cache is not None:
            self.cache.set(cache_key, result)

        return result

    def preprocess_streaming(self, text: str, max_chars: int = 135) -> List[str]:
        """Preprocess and chunk text for streaming TTS."""
        processed = self.preprocess(text)
        return chunk_for_streaming(processed, max_chars=max_chars)


def main():
    """Command-line entry point: preprocess a string, a file, or the demo text."""
    parser = argparse.ArgumentParser(description="Algerian Arabic TTS Preprocessing Pipeline")
    parser.add_argument("--input", help="Input text string")
    parser.add_argument("--input_file", help="Input text file")
    parser.add_argument("--output_file", help="Output file for processed text")
    parser.add_argument("--diacritize", action="store_true", help="Add diacritics")
    parser.add_argument("--normalize_numerals", action="store_true", help="Convert numerals to words")
    parser.add_argument("--handle_code_switch", action="store_true", help="Handle French/Arabic mixing")
    parser.add_argument("--chunk_for_streaming", action="store_true", help="Split into streaming chunks")
    parser.add_argument("--max_chars", type=int, default=135, help="Max UTF-8 bytes per chunk")
    parser.add_argument("--device", default="cpu", help="Device for diacritization model")
    parser.add_argument("--clear_cache", action="store_true", help="Clear text cache")
    args = parser.parse_args()

    if args.clear_cache:
        cache = TextCache()
        cache.clear()
        print("[CACHE] Cleared.")
        return

    # Get input text. Compare against None so an explicit empty --input is
    # honoured instead of silently falling through to the demo text.
    if args.input is not None:
        text = args.input
    elif args.input_file:
        with open(args.input_file, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        # Demo text
        text = "مرحبا، كيف حالك اليوم؟ أنا بخير شكرا. الساعة 3:30 والطقس جميل."
        print(f"[DEMO] Using demo text: {text}")

    # Initialize pipeline
    pipeline = AlgerianTTSPipeline(
        diacritize=args.diacritize,
        normalize_numerals=args.normalize_numerals,
        handle_code_switch=args.handle_code_switch,
        device=args.device,
    )

    # Process
    t0 = time.time()
    if args.chunk_for_streaming:
        result = pipeline.preprocess_streaming(text, max_chars=args.max_chars)
        print(f"\n[RESULT] Processed into {len(result)} chunks:")
        for i, chunk in enumerate(result):
            print(f"  Chunk {i+1}: {chunk}")
    else:
        result = pipeline.preprocess(text)
        print("\n[RESULT] Processed text:")
        print(f"  Input:  {text}")
        print(f"  Output: {result}")
    t1 = time.time()
    print(f"\n[TIME] Processing took {t1-t0:.3f}s")

    # Save output
    if args.output_file:
        with open(args.output_file, "w", encoding="utf-8") as f:
            if isinstance(result, list):
                for chunk in result:
                    f.write(chunk + "\n")
            else:
                f.write(result)
        print(f"[SAVE] Saved to {args.output_file}")


if __name__ == "__main__":
    main()