| |
| """ |
| ================================================================================ |
| Priority 3: Arabic Diacritization + Algerian Preprocessing Pipeline |
| ================================================================================ |
| |
| Algerian Arabic (Darija) preprocessing is critical for TTS quality: |
| 1. Text is often undiacritized → phonetic ambiguity |
| 2. Heavy code-switching with French |
| 3. Numerals need normalization |
| 4. Mixed Arabic/Latin script usage |
| |
| This pipeline provides: |
| 1. Arabic diacritization using Sadeed (SOTA, April 2025) |
| 2. Numeral normalization (Eastern ٠١٢ and Western 012 → Arabic words) |
| 3. Basic French/Arabic code-switching handling |
| 4. Text caching for repeated phrases |
| 5. Sentence-level chunking for streaming |
| |
| Dependencies: |
| pip install transformers torch pyarabic num2words |
| |
| Usage: |
| python 03_arabic_preprocessing.py \ |
| --input "مرحبا كيف حالك 123" \ |
| --diacritize \ |
| --normalize_numerals |
| |
| python 03_arabic_preprocessing.py \ |
| --input_file text.txt \ |
| --output_file processed.txt \ |
| --diacritize \ |
| --normalize_numerals \ |
| --chunk_for_streaming |
| |
| ================================================================================ |
| """ |
|
|
| import argparse |
| import hashlib |
| import json |
| import os |
| import re |
| import sys |
| import time |
| from pathlib import Path |
| from typing import List, Optional, Tuple |
|
|
| import numpy as np |
|
|
| |
| |
| |
|
|
# Per-user directory that holds the on-disk preprocessing cache.
CACHE_DIR = Path.home().joinpath(".cache", "habibi_tts_preprocess")
# Created (with any missing parents) at import time; an existing directory is fine.
CACHE_DIR.mkdir(exist_ok=True, parents=True)
|
|
| |
# Digit alphabets in matching order: position i of one string corresponds to
# position i of the other, so they can be fed straight to str.maketrans().
ARABIC_EASTERN_NUMERALS = "٠١٢٣٤٥٦٧٨٩"
ARABIC_WESTERN_NUMERALS = "0123456789"

# Spelled-out Arabic number words, keyed by the decimal string of the value.
ARABIC_NUMBERS = {
    # Units and ten
    "0": "صفر",
    "1": "واحد",
    "2": "اثنان",
    "3": "ثلاثة",
    "4": "أربعة",
    "5": "خمسة",
    "6": "ستة",
    "7": "سبعة",
    "8": "ثمانية",
    "9": "تسعة",
    "10": "عشرة",
    # Eleven to nineteen
    "11": "أحد عشر",
    "12": "اثنا عشر",
    "13": "ثلاثة عشر",
    "14": "أربعة عشر",
    "15": "خمسة عشر",
    "16": "ستة عشر",
    "17": "سبعة عشر",
    "18": "ثمانية عشر",
    "19": "تسعة عشر",
    # Round tens
    "20": "عشرون",
    "30": "ثلاثون",
    "40": "أربعون",
    "50": "خمسون",
    "60": "ستون",
    "70": "سبعون",
    "80": "ثمانون",
    "90": "تسعون",
    # Scale words
    "100": "مائة",
    "1000": "ألف",
    "1000000": "مليون",
}
|
|
| |
# Lower-case French words/phrases mapped to the Arabic text that replaces them
# when French translation is enabled in the code-switching step.
FRENCH_COMMON_WORDS = {
    "bonjour": "صباح الخير",
    "merci": "شكرا",
    "s'il vous plait": "من فضلك",
    "excusez-moi": "عذرا",
    "oui": "نعم",
    "non": "لا",
    "bon": "جيد",
    "très": "جدا",
    "beaucoup": "كثيرا",
    "comment": "كيف",
    "ça va": "كيف الحال",
    "au revoir": "مع السلامة",
    "bonsoir": "مساء الخير",
    "bonne nuit": "تصبح على خير",
    "pardon": "عذرا",
    "d'accord": "حسنا",
    "ok": "حسنا",
}
|
|
|
|
| |
| |
| |
|
|
class TextCache:
    """
    Simple file-based cache for preprocessed text.

    Entries live in an in-memory dict that is mirrored to
    ``<cache_dir>/text_cache.json``; the whole file is rewritten on every
    ``set()`` / ``clear()``.
    """

    def __init__(self, cache_dir: str = str(CACHE_DIR)):
        """Create the cache directory if needed and load any existing entries."""
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.cache_file = self.cache_dir / "text_cache.json"
        self.cache = {}
        self._load()

    def _load(self):
        """
        Populate ``self.cache`` from disk.

        The cache is best-effort: a missing, unreadable, corrupt or
        wrongly-shaped file leaves the cache empty instead of raising.
        """
        self.cache = {}
        if not self.cache_file.exists():
            return
        try:
            with open(self.cache_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            print(f"[CACHE] Warning: ignoring unreadable cache file ({e}).")
            return
        # Valid JSON that is not an object (e.g. a list) would break get()/set().
        if isinstance(loaded, dict):
            self.cache = loaded

    def _save(self):
        """Write the cache to disk without ever leaving a half-written file."""
        # Write next to the target, then swap it in, so an interrupted write
        # cannot truncate the existing cache file.
        tmp_file = self.cache_file.with_name(self.cache_file.name + ".tmp")
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(self.cache, f, ensure_ascii=False, indent=2)
        os.replace(tmp_file, self.cache_file)

    def get(self, key: str) -> Optional[str]:
        """Return the cached value for ``key``, or ``None`` if absent."""
        return self.cache.get(key)

    def set(self, key: str, value: str):
        """Store ``value`` under ``key`` and persist the cache."""
        self.cache[key] = value
        self._save()

    def clear(self):
        """Drop every entry and persist the now-empty cache."""
        self.cache = {}
        self._save()
|
|
|
|
| |
| |
| |
|
|
|
|
# Compound spellings for whole hundreds (ARABIC_NUMBERS only carries "100").
_ARABIC_HUNDREDS = {
    1: "مائة",
    2: "مائتان",
    3: "ثلاثمائة",
    4: "أربعمائة",
    5: "خمسمائة",
    6: "ستمائة",
    7: "سبعمائة",
    8: "ثمانمائة",
    9: "تسعمائة",
}

# (scale, singular, dual, plural), largest scale first.
_ARABIC_SCALES = (
    (1_000_000, "مليون", "مليونان", "ملايين"),
    (1_000, "ألف", "ألفان", "آلاف"),
)


def _number_to_arabic_words(num: int) -> str:
    """Spell out a non-negative integer in Arabic words."""
    # The table is keyed by decimal strings, so look up str(num), not num.
    exact = ARABIC_NUMBERS.get(str(num))
    if exact is not None:
        return exact

    parts = []
    remaining = num
    for scale, singular, dual, plural in _ARABIC_SCALES:
        count, remaining = divmod(remaining, scale)
        if count == 0:
            continue
        if count == 1:
            parts.append(singular)
        elif count == 2:
            parts.append(dual)
        else:
            # The counted noun is plural after counts ending in 3..10 and
            # singular otherwise (11 thousand, 100 thousand, ...).
            noun = plural if 3 <= count % 100 <= 10 else singular
            parts.append(f"{_number_to_arabic_words(count)} {noun}")

    hundreds, remaining = divmod(remaining, 100)
    if hundreds:
        parts.append(_ARABIC_HUNDREDS[hundreds])

    if remaining:
        exact = ARABIC_NUMBERS.get(str(remaining))
        if exact is not None:
            parts.append(exact)
        else:
            # Only 21..99 with a non-zero unit reach here; Arabic reads the
            # unit before the ten ("five and twenty").
            tens, ones = divmod(remaining, 10)
            parts.append(ARABIC_NUMBERS[str(ones)])
            parts.append(ARABIC_NUMBERS[str(tens * 10)])

    return " و ".join(parts)


def normalize_numerals(text: str) -> str:
    """
    Convert numerals (both Eastern Arabic ٠١٢ and Western 012) to Arabic words.

    Every maximal run of digits is read as one non-negative integer and
    replaced by its spelled-out form; groups are joined with " و ". Leading
    zeros are dropped ("007" is read as 7). Separators such as ":" or "."
    are left untouched, so "3:30" becomes two independent numbers.

    Args:
        text: Input text, possibly containing digits.

    Returns:
        The text with each digit run replaced by Arabic number words.
    """
    # Fold Eastern Arabic digits onto ASCII digits so one code path handles both.
    text = text.translate(
        str.maketrans(ARABIC_EASTERN_NUMERALS, ARABIC_WESTERN_NUMERALS)
    )

    def replace_match(match):
        digits = match.group(0)
        try:
            return _number_to_arabic_words(int(digits))
        except ValueError:
            # int() can refuse extremely long digit runs; keep them verbatim.
            return digits

    return re.sub(r"\d+", replace_match, text)
|
|
|
|
| |
| |
| |
|
|
|
|
class ArabicDiacritizer:
    """
    Arabic text diacritization using the Sadeed model (Misraj/Sadeed).

    The model is loaded once at construction through a ``transformers``
    pipeline. If loading or inference fails, a warning is printed and the
    text is returned unchanged (see ``_fallback_diacritize``).
    """

    def __init__(self, model_name: str = "Misraj/Sadeed", device: str = "cpu"):
        """
        Args:
            model_name: Model identifier passed to ``transformers.pipeline``.
            device: ``"cpu"``, ``"cuda"`` or ``"cuda:<index>"``; anything
                that is not a CUDA device runs on CPU.
        """
        self.model_name = model_name
        self.device = device
        self.pipeline = None
        self._load_model()

    @staticmethod
    def _device_index(device: str) -> int:
        """Map a device string to the integer index the pipeline expects.

        ``"cuda"`` -> 0, ``"cuda:N"`` -> N, everything else -> -1 (CPU).
        """
        kind, _, index = device.partition(":")
        if kind != "cuda":
            return -1
        return int(index) if index.isdecimal() else 0

    def _load_model(self):
        """Load the diacritization model; leave ``self.pipeline`` as None on failure."""
        try:
            # Imported lazily so the module still works without transformers.
            from transformers import pipeline
            print(f"[DIACRITIZE] Loading {self.model_name}...")
            # NOTE(review): confirm this checkpoint is a seq2seq model; a
            # decoder-only checkpoint would need a different pipeline task.
            self.pipeline = pipeline(
                "text2text-generation",
                model=self.model_name,
                device=self._device_index(self.device),
                torch_dtype="auto",
            )
            print("[DIACRITIZE] Model loaded successfully.")
        except Exception as e:
            # Deliberately broad: any import/download/load problem degrades
            # to the fallback instead of aborting preprocessing.
            print(f"[DIACRITIZE] Warning: Could not load model ({e}). Using fallback.")
            self.pipeline = None

    def diacritize(self, text: str) -> str:
        """
        Add diacritics (tashkeel) to Arabic text.

        Returns the model output, or the fallback result when no model is
        loaded, the text is blank, or inference raises.
        """
        # Blank input has nothing to diacritize; skip the model call.
        if self.pipeline is None or not text.strip():
            return self._fallback_diacritize(text)

        try:
            result = self.pipeline(text, max_length=512, do_sample=False)
            return result[0]["generated_text"]
        except Exception as e:
            print(f"[DIACRITIZE] Error: {e}. Using fallback.")
            return self._fallback_diacritize(text)

    def _fallback_diacritize(self, text: str) -> str:
        """
        Fallback used when the model is unavailable.

        Currently a placeholder: no rules are implemented and the text is
        returned unchanged.
        """
        return text
|
|
|
|
| |
| |
| |
|
|
|
|
def handle_code_switching(
    text: str,
    translate_french: bool = False,
    lexicon: Optional[dict] = None,
) -> str:
    """
    Handle French/Arabic code-switching in Algerian text.

    Args:
        text: Input text, possibly mixing Arabic and French.
        translate_french: When False the text is returned unchanged. When
            True, known French words/phrases are replaced by their Arabic
            equivalents.
        lexicon: Mapping of French word/phrase -> Arabic replacement.
            Defaults to ``FRENCH_COMMON_WORDS``.

    Returns:
        The text with whole-word, case-insensitive matches replaced.
    """
    if not translate_french:
        return text
    if lexicon is None:
        lexicon = FRENCH_COMMON_WORDS
    if not lexicon:
        return text

    lookup = {french.lower(): arabic for french, arabic in lexicon.items()}
    # Longest entries first so "bonsoir" wins over its prefix "bon"; a single
    # pass also means already-replaced text is never rescanned.
    alternatives = "|".join(
        re.escape(phrase) for phrase in sorted(lookup, key=len, reverse=True)
    )
    # The lookarounds restrict matches to whole words, so "non" does not
    # fire inside "canon".
    pattern = re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)", re.IGNORECASE)

    def translate(match):
        found = match.group(0)
        return lookup.get(found.lower(), found)

    return pattern.sub(translate, text)
|
|
|
|
| |
| |
| |
|
|
|
|
# Sentence breaks: ASCII punctuation followed by whitespace (the whitespace is
# consumed), or directly after Arabic semicolon (U+061B), Arabic comma
# (U+060C), Arabic question mark (U+061F) or ideographic full stop (U+3002),
# where nothing is consumed.
_SENTENCE_BREAK = re.compile(
    r"(?<=[;:,.!?])\s+|(?<=[\u061B\u060C\u061F\u3002])"
)


def chunk_for_streaming(text: str, max_chars: int = 135) -> List[str]:
    """
    Split text into sentence-level chunks for streaming TTS.

    Sentences are packed greedily into chunks. Length is measured in UTF-8
    bytes, not code points, so an Arabic letter counts as 2 towards
    ``max_chars``. A single sentence longer than ``max_chars`` is not split
    further and becomes its own oversized chunk.

    Args:
        text: Text to split.
        max_chars: Budget per chunk, in UTF-8 bytes.

    Returns:
        Non-empty, whitespace-trimmed chunks in original order.
    """
    chunks: List[str] = []
    current_chunk = ""

    for sentence in _SENTENCE_BREAK.split(text):
        if not sentence.strip():
            continue

        # A piece ending in an ASCII character either lost its trailing
        # whitespace to the split or is the final piece, so put a separator
        # back. Pieces ending in non-ASCII punctuation kept their whitespace
        # at the start of the next piece.
        piece = sentence + " " if sentence[-1].isascii() else sentence

        used = len(current_chunk.encode("utf-8"))
        if used + len(sentence.encode("utf-8")) <= max_chars:
            current_chunk += piece
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = piece

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
|
|
|
|
| |
| |
| |
|
|
|
|
class AlgerianTTSPipeline:
    """
    Complete preprocessing pipeline for Algerian Arabic TTS.

    Steps, in order: numeral normalization, French code-switch handling,
    diacritization. Each step can be switched off; results are optionally
    cached on disk.
    """

    def __init__(
        self,
        diacritize: bool = True,
        normalize_numerals: bool = True,
        handle_code_switch: bool = True,
        cache_enabled: bool = True,
        device: str = "cpu",
    ):
        """
        Args:
            diacritize: Run the diacritization model (loaded here if True).
            normalize_numerals: Spell digits out as Arabic words.
            handle_code_switch: Translate known French words to Arabic.
            cache_enabled: Cache results in a ``TextCache``.
            device: Device string forwarded to ``ArabicDiacritizer``.
        """
        self.diacritize = diacritize
        self.normalize_numerals = normalize_numerals
        self.handle_code_switch = handle_code_switch
        self.cache = TextCache() if cache_enabled else None
        self.diacritizer = ArabicDiacritizer(device=device) if diacritize else None

    def _cache_key(self, text: str) -> str:
        """Build a cache key covering both the text and the enabled steps."""
        # Without the settings in the key, text cached by a pipeline with a
        # step disabled would be served to one that has it enabled.
        settings = "".join(
            "1" if flag else "0"
            for flag in (
                self.diacritize,
                self.normalize_numerals,
                self.handle_code_switch,
            )
        )
        # md5 is used only as a fast fingerprint, not for security.
        return hashlib.md5(f"{settings}|{text}".encode("utf-8")).hexdigest()

    def preprocess(self, text: str) -> str:
        """
        Run the full preprocessing pipeline on text.

        Returns the cached result when one exists for this text and these
        settings; otherwise processes the text and stores the result.
        """
        cache_key = None
        if self.cache is not None:
            cache_key = self._cache_key(text)
            cached = self.cache.get(cache_key)
            # An empty string is a valid cached result, so test against None.
            if cached is not None:
                return cached

        result = text

        # The attributes are plain flags; the calls below resolve to the
        # module-level functions of the same name.
        if self.normalize_numerals:
            result = normalize_numerals(result)

        if self.handle_code_switch:
            result = handle_code_switching(result, translate_french=True)

        if self.diacritize and self.diacritizer:
            result = self.diacritizer.diacritize(result)

        if self.cache is not None:
            self.cache.set(cache_key, result)

        return result

    def preprocess_streaming(self, text: str, max_chars: int = 135) -> List[str]:
        """Preprocess and chunk text for streaming TTS."""
        processed = self.preprocess(text)
        return chunk_for_streaming(processed, max_chars=max_chars)
|
|
|
|
def main():
    """
    Command-line entry point.

    Parses the arguments, reads the text from ``--input``, ``--input_file``
    or a built-in demo sentence, runs the preprocessing pipeline (optionally
    chunked for streaming), prints the result and timing, and optionally
    writes the result to ``--output_file``.
    """
    parser = argparse.ArgumentParser(description="Algerian Arabic TTS Preprocessing Pipeline")
    parser.add_argument("--input", help="Input text string")
    parser.add_argument("--input_file", help="Input text file")
    parser.add_argument("--output_file", help="Output file for processed text")
    parser.add_argument("--diacritize", action="store_true", help="Add diacritics")
    parser.add_argument("--normalize_numerals", action="store_true", help="Convert numerals to words")
    parser.add_argument("--handle_code_switch", action="store_true", help="Handle French/Arabic mixing")
    parser.add_argument("--chunk_for_streaming", action="store_true", help="Split into streaming chunks")
    parser.add_argument("--max_chars", type=int, default=135, help="Max chars per chunk")
    parser.add_argument("--device", default="cpu", help="Device for diacritization model")
    parser.add_argument("--clear_cache", action="store_true", help="Clear text cache")
    args = parser.parse_args()

    # --clear_cache is a standalone action: wipe the cache and exit.
    if args.clear_cache:
        cache = TextCache()
        cache.clear()
        print("[CACHE] Cleared.")
        return

    # An explicitly supplied --input wins, even when it is the empty string.
    if args.input is not None:
        text = args.input
    elif args.input_file:
        with open(args.input_file, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        text = "مرحبا، كيف حالك اليوم؟ أنا بخير شكرا. الساعة 3:30 والطقس جميل."
        print(f"[DEMO] Using demo text: {text}")

    pipeline = AlgerianTTSPipeline(
        diacritize=args.diacritize,
        normalize_numerals=args.normalize_numerals,
        handle_code_switch=args.handle_code_switch,
        device=args.device,
    )

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    t0 = time.perf_counter()
    if args.chunk_for_streaming:
        result = pipeline.preprocess_streaming(text, max_chars=args.max_chars)
        print(f"\n[RESULT] Processed into {len(result)} chunks:")
        for i, chunk in enumerate(result, start=1):
            print(f" Chunk {i}: {chunk}")
    else:
        result = pipeline.preprocess(text)
        print("\n[RESULT] Processed text:")
        print(f" Input: {text}")
        print(f" Output: {result}")

    t1 = time.perf_counter()
    print(f"\n[TIME] Processing took {t1-t0:.3f}s")

    if args.output_file:
        with open(args.output_file, "w", encoding="utf-8") as f:
            # Streaming mode yields a list of chunks: write one per line.
            if isinstance(result, list):
                for chunk in result:
                    f.write(chunk + "\n")
            else:
                f.write(result)
        print(f"[SAVE] Saved to {args.output_file}")


if __name__ == "__main__":
    main()
|
|