#!/usr/bin/env python3
"""
TajweedSST - Step 1: Tajweed Rule Parser

Generates two parallel text streams and a Rule Map:
- Visual Stream: Standard Uthmani text
- Phonetic Stream: Pronounced text for MFA
- Tajweed Map: Tags for physics validation

Tajweed Rules Implemented:
- Idgham (Assimilation)
- Iqlab (Conversion)
- Ikhfa (Concealment)
- Qalqalah (Bounce)
- Ghunnah (Nasalization)
- Madd (Elongation)
- Tafkheem/Tarqeeq (Heavy/Light)
"""

import re
from dataclasses import dataclass, field
from typing import List, Dict, Tuple, Optional
from enum import Enum


class TajweedType(Enum):
    """Tajweed rule that can be attached to a single letter."""

    NONE = "None"
    QALQALAH_SUGHRA = "Qalqalah_Sughra"
    QALQALAH_KUBRA = "Qalqalah_Kubra"
    GHUNNAH = "Ghunnah"
    IDGHAM_FULL = "Idgham_Full"
    IDGHAM_PARTIAL = "Idgham_Partial"
    IQLAB = "Iqlab"
    IKHFA = "Ikhfa"
    MADD_ASLI = "Madd_Asli"
    MADD_WAJIB = "Madd_Wajib"
    MADD_LAZIM = "Madd_Lazim"
    TAFKHEEM = "Tafkheem"
    TARQEEQ = "Tarqeeq"
    SILENT = "Silent"


class PhysicsCheck(Enum):
    """Acoustic check requested for a tagged letter."""

    CHECK_RMS_BOUNCE = "Check_RMS_Bounce"
    CHECK_DURATION = "Check_Duration"
    CHECK_GHUNNAH = "Check_Ghunnah"
    CHECK_FORMANT_F2 = "Check_Formant_F2"
    NONE = "None"


@dataclass
class LetterTag:
    """Tag for a single Arabic letter with Tajweed info"""

    char_visual: str
    char_phonetic: str
    position: int
    tajweed_type: TajweedType = TajweedType.NONE
    physics_check: PhysicsCheck = PhysicsCheck.NONE
    is_silent: bool = False
    madd_count: int = 0  # 0=none, 2=asli, 4=wajib, 6=lazim
    # Diacritics attached to the letter in the visual text.  Kept so the
    # cross-word pass can tell a Nun Sakinah / Tanween from a vowelled letter.
    # Defaults to "" so existing positional/keyword constructions still work.
    harakat: str = ""


@dataclass
class WordTags:
    """Tajweed tags for a complete word"""

    word_text: str
    letters: List[LetterTag] = field(default_factory=list)
    phonetic_stream: str = ""


class TajweedParser:
    """Parses Uthmani Quran text and generates Tajweed rule tags"""

    # Qalqalah letters: ق ط ب ج د
    QALQALAH_LETTERS = set('قطبجد')

    # Heavy letters (Tafkheem): خ ص ض غ ط ق ظ
    TAFKHEEM_LETTERS = set('خصضغطقظ')

    # Idgham letters after Nun Sakinah: ي ر م ل و ن
    IDGHAM_LETTERS = set('يرملون')
    IDGHAM_WITH_GHUNNAH = set('ينمو')  # With Ghunnah
    IDGHAM_WITHOUT_GHUNNAH = set('رل')  # Without Ghunnah

    # Ikhfa letters (15 letters)
    IKHFA_LETTERS = set('تثجدذزسشصضطظفقك')

    # Letters that take Ghunnah when doubled (Shadda)
    GHUNNAH_LETTERS = frozenset('نم')

    # Hamza in its stand-alone and seated forms, plus the combining hamza
    # marks that some Uthmani encodings place on a carrier letter.
    HAMZA_LETTERS = frozenset('ءأإؤئ')
    HAMZA_MARKS = frozenset('\u0654\u0655')

    # Harakat (vowel marks)
    FATHA = '\u064E'
    DAMMA = '\u064F'
    KASRA = '\u0650'
    SUKUN = '\u0652'
    SHADDA = '\u0651'
    TANWEEN_FATH = '\u064B'
    TANWEEN_DAMM = '\u064C'
    TANWEEN_KASR = '\u064D'

    TANWEEN_MARKS = frozenset((TANWEEN_FATH, TANWEEN_DAMM, TANWEEN_KASR))
    # Marks that make a letter "moving" (i.e. not sakin).
    VOWEL_MARKS = frozenset((FATHA, DAMMA, KASRA, SHADDA)) | TANWEEN_MARKS

    # Every combining mark that is attached to the preceding base letter.
    # Built once here instead of on every _split_letters() call.
    HARAKAT_CHARS = frozenset((
        FATHA, DAMMA, KASRA, SUKUN, SHADDA,
        TANWEEN_FATH, TANWEEN_DAMM, TANWEEN_KASR,
        '\u0653', '\u0654', '\u0655', '\u0656',
        '\u0657', '\u0658', '\u065C', '\u0670',
    ))

    # Madd letters
    MADD_ALIF = 'ا'
    MADD_WAW = 'و'
    MADD_YA = 'ي'

    # Short vowel that must precede each madd letter for it to elongate.
    MADD_VOWEL = {MADD_ALIF: FATHA, MADD_WAW: DAMMA, MADD_YA: KASRA}

    # Phonetic mapping (simplified Buckwalter-like)
    PHONETIC_MAP = {
        'ا': 'ā', 'ب': 'b', 'ت': 't', 'ث': 'ṯ', 'ج': 'j', 'ح': 'ḥ',
        'خ': 'ḫ', 'د': 'd', 'ذ': 'ḏ', 'ر': 'r', 'ز': 'z', 'س': 's',
        'ش': 'š', 'ص': 'ṣ', 'ض': 'ḍ', 'ط': 'ṭ', 'ظ': 'ẓ', 'ع': 'ʿ',
        'غ': 'ġ', 'ف': 'f', 'ق': 'q', 'ك': 'k', 'ل': 'l', 'م': 'm',
        'ن': 'n', 'ه': 'h', 'و': 'w', 'ي': 'y', 'ء': 'ʾ', 'ة': 'h',
        'ى': 'ā', 'ئ': 'ʾ', 'ؤ': 'ʾ', 'أ': 'ʾa', 'إ': 'ʾi', 'آ': 'ʾā'
    }

    def __init__(self):
        self.debug = False

    def parse_text(self, text: str) -> List[WordTags]:
        """Parse Uthmani text and return tagged words.

        The text is split on whitespace, every word is tagged on its own and
        then a second pass applies the rules that depend on the first letter
        of the following word.
        """
        result = [self._parse_word(word) for word in text.strip().split()]

        # Cross-word analysis (Nun Sakinah / Tanween rules across words)
        self._analyze_cross_word_rules(result)

        return result

    def _parse_word(self, word: str) -> WordTags:
        """Parse a single word and generate letter tags"""
        word_tags = WordTags(word_text=word)

        # Extract base letters and diacritics
        letters_with_harakat = self._split_letters(word)

        for idx, (letter, harakat) in enumerate(letters_with_harakat):
            tag = self._analyze_letter(
                letter=letter,
                harakat=harakat,
                position=idx,
                context=(letters_with_harakat, idx),
                word=word
            )
            word_tags.letters.append(tag)

        # Generate phonetic stream
        word_tags.phonetic_stream = self._generate_phonetic_stream(word_tags.letters)

        return word_tags

    def _split_letters(self, word: str) -> List[Tuple[str, str]]:
        """Split word into (letter, harakat) pairs.

        Every character that is not in HARAKAT_CHARS starts a new pair; the
        marks that follow it are concatenated, in order, into the second
        element.  Marks that appear before any base letter are dropped.
        """
        result: List[Tuple[str, str]] = []
        for char in word:
            if char not in self.HARAKAT_CHARS:
                result.append((char, ""))
            elif result:
                base, marks = result[-1]
                result[-1] = (base, marks + char)
            # else: stray leading mark with no carrier - ignored
        return result

    def _is_vowelled(self, harakat: str) -> bool:
        """Return True if the marks contain a vowel, tanween or shadda."""
        return any(mark in self.VOWEL_MARKS for mark in harakat)

    def _has_hamza_mark(self, harakat: str) -> bool:
        """Return True if the marks contain a combining hamza."""
        return any(mark in self.HAMZA_MARKS for mark in harakat)

    def _apply_nun_rule(self, tag: LetterTag, next_letter: str, *,
                        is_nun: bool, across_words: bool) -> bool:
        """Apply the Nun Sakinah / Tanween rule selected by next_letter.

        Args:
            tag: Letter carrying the nun sound (the Nun itself, or the letter
                bearing the Tanween).  Mutated in place.
            next_letter: Base letter that follows the nun sound.
            is_nun: True when tag is a written Nun, so Iqlab replaces its
                phoneme by 'm'.  For Tanween the base letter keeps its own
                phoneme.
            across_words: True when next_letter belongs to the next word.
                Idgham only happens across a word boundary; inside one word
                (e.g. دُنْيَا) the Nun is pronounced clearly.

        Returns:
            True if a rule was applied, False if tag was left untouched.
        """
        if next_letter == 'ب':
            # Iqlab: Nun + Ba → Mim + Ba
            tag.tajweed_type = TajweedType.IQLAB
            if is_nun:
                tag.char_phonetic = 'm'  # Pronounced as Mim
            tag.physics_check = PhysicsCheck.CHECK_GHUNNAH
            return True

        if across_words and next_letter in self.IDGHAM_LETTERS:
            if next_letter in self.IDGHAM_WITH_GHUNNAH:
                tag.tajweed_type = TajweedType.IDGHAM_PARTIAL
            else:
                tag.tajweed_type = TajweedType.IDGHAM_FULL
            tag.physics_check = PhysicsCheck.CHECK_DURATION
            return True

        if next_letter in self.IKHFA_LETTERS:
            tag.tajweed_type = TajweedType.IKHFA
            tag.physics_check = PhysicsCheck.CHECK_GHUNNAH
            return True

        return False

    def _apply_madd_rule(self, tag: LetterTag, letter: str, harakat: str,
                         letters_list: List, idx: int) -> None:
        """Tag a madd letter (ا و ي) with its elongation type, if any.

        A madd only exists when the letter itself is unvowelled and the
        preceding letter carries the matching short vowel.  The type is then
        chosen from what follows inside the word:
        sukun/shadda → Lazim (6), hamza → Wajib (4), anything else or
        end of word → Asli (2).
        """
        if idx == 0:
            return
        # A vowelled waw/ya (e.g. the waw of هُوَ) is a consonant, and a
        # carrier with a hamza mark is a hamza - neither is a madd letter.
        if self._is_vowelled(harakat) or self._has_hamza_mark(harakat):
            return

        _, prev_harakat = letters_list[idx - 1]
        if self.MADD_VOWEL[letter] not in prev_harakat:
            return

        if idx == len(letters_list) - 1:
            tag.tajweed_type = TajweedType.MADD_ASLI
            tag.madd_count = 2
        else:
            next_letter, next_harakat = letters_list[idx + 1]
            if self.SHADDA in next_harakat or self.SUKUN in next_harakat:
                tag.tajweed_type = TajweedType.MADD_LAZIM
                tag.madd_count = 6
            elif (next_letter in self.HAMZA_LETTERS
                    or self._has_hamza_mark(next_harakat)):
                tag.tajweed_type = TajweedType.MADD_WAJIB
                tag.madd_count = 4
            else:
                tag.tajweed_type = TajweedType.MADD_ASLI
                tag.madd_count = 2
        tag.physics_check = PhysicsCheck.CHECK_DURATION

    def _analyze_letter(self, letter: str, harakat: str, position: int,
                        context: Tuple[List, int], word: str) -> LetterTag:
        """Analyze a single letter and assign Tajweed rules.

        Args:
            letter: Base letter.
            harakat: Diacritics attached to the letter.
            position: Index stored on the returned tag.
            context: (list of (letter, harakat) pairs of the word, index of
                this letter in that list).
            word: The full word (unused by the rules, kept for callers).

        Returns:
            A LetterTag.  Rules 1-3 are mutually exclusive; the later rules
            run afterwards and may overwrite the tag.
        """
        letters_list, idx = context
        is_last = idx == len(letters_list) - 1
        has_sukun = self.SUKUN in harakat
        has_shadda = self.SHADDA in harakat

        tag = LetterTag(
            char_visual=letter,
            char_phonetic=self.PHONETIC_MAP.get(letter, letter),
            position=position,
            harakat=harakat
        )

        # Rule 1: Qalqalah (ق ط ب ج د with Sukun, or at the end of the word)
        if letter in self.QALQALAH_LETTERS and (has_sukun or is_last):
            if is_last:
                tag.tajweed_type = TajweedType.QALQALAH_KUBRA
            else:
                tag.tajweed_type = TajweedType.QALQALAH_SUGHRA
            tag.physics_check = PhysicsCheck.CHECK_RMS_BOUNCE

        # Rule 2: Tafkheem (Heavy letters)
        elif letter in self.TAFKHEEM_LETTERS:
            tag.tajweed_type = TajweedType.TAFKHEEM
            tag.physics_check = PhysicsCheck.CHECK_FORMANT_F2

        # Rule 3: Madd (Elongation) - check preceding vowel
        elif letter in self.MADD_VOWEL:
            self._apply_madd_rule(tag, letter, harakat, letters_list, idx)

        # Rule 4: Ghunnah (Nun/Meem with Shadda)
        if letter in self.GHUNNAH_LETTERS and has_shadda:
            tag.tajweed_type = TajweedType.GHUNNAH
            tag.physics_check = PhysicsCheck.CHECK_GHUNNAH

        # Rule 5: Nun Sakinah / Tanween followed by a letter of the same word
        if not is_last:
            next_letter, _ = letters_list[idx + 1]
            if letter == 'ن' and has_sukun:
                self._apply_nun_rule(tag, next_letter,
                                     is_nun=True, across_words=False)
            if any(mark in self.TANWEEN_MARKS for mark in harakat):
                self._apply_nun_rule(tag, next_letter,
                                     is_nun=False, across_words=False)

        # Silent letters (Alif after Waw al-Jama'a, etc.)
        if letter == 'ا' and not harakat and idx > 0:
            prev_letter, prev_harakat = letters_list[idx - 1]
            if prev_letter == 'و' and (self.DAMMA in prev_harakat
                                       or self.SUKUN in prev_harakat):
                tag.is_silent = True
                tag.tajweed_type = TajweedType.SILENT
                tag.char_phonetic = ''

        return tag

    def _find_final_nun_sound(
            self, letters: List[LetterTag]) -> Optional[Tuple[LetterTag, bool]]:
        """Locate the word-final Nun Sakinah or Tanween, if the word has one.

        Returns:
            (tag, is_nun) where tag is the letter carrying the nun sound and
            is_nun tells whether it is a written Nun, or None when the word
            does not end in a nun sound.
        """
        last = letters[-1]

        if any(mark in self.TANWEEN_MARKS for mark in last.harakat):
            return last, False

        # Only an unvowelled, still-untagged Nun counts; a Nun with a vowel
        # (كَانَ) is pronounced normally whatever follows.
        if (last.char_visual == 'ن'
                and last.tajweed_type == TajweedType.NONE
                and not self._is_vowelled(last.harakat)):
            return last, True

        # Tanween Fath is usually written on the letter before a bare
        # trailing alif / alif maqsura (كِتَابًا, هُدًى).
        if (len(letters) > 1
                and last.char_visual in ('ا', 'ى')
                and not last.harakat
                and self.TANWEEN_FATH in letters[-2].harakat):
            return letters[-2], False

        return None

    def _analyze_cross_word_rules(self, words: List[WordTags]) -> None:
        """Analyze Tajweed rules that span word boundaries.

        For every word that ends in a Nun Sakinah or Tanween, the first
        letter of the next word selects Iqlab, Idgham or Ikhfa.  A Tanween
        rule replaces whatever tag the per-word pass had put on the letter,
        as the in-word Tanween rule does.  Tags are mutated in place and the
        word's phonetic stream is rebuilt so that it reflects an Iqlab
        substitution.
        """
        for current_word, next_word in zip(words, words[1:]):
            if not current_word.letters or not next_word.letters:
                continue

            found = self._find_final_nun_sound(current_word.letters)
            if found is None:
                continue

            tag, is_nun = found
            first_of_next = next_word.letters[0]
            if self._apply_nun_rule(tag, first_of_next.char_visual,
                                    is_nun=is_nun, across_words=True):
                # char_phonetic may have changed after the stream was built.
                current_word.phonetic_stream = self._generate_phonetic_stream(
                    current_word.letters)

    def _generate_phonetic_stream(self, letters: List[LetterTag]) -> str:
        """Generate phonetic transcription for MFA.

        Silent letters and letters with an empty phoneme are skipped; the
        remaining phonemes are joined with single spaces.
        """
        return ' '.join(
            tag.char_phonetic
            for tag in letters
            if not tag.is_silent and tag.char_phonetic
        )


def main():
    """Test the Tajweed parser"""
    parser = TajweedParser()

    # Test with Surah Al-Ikhlas
    test_text = "قُلْ هُوَ اللَّهُ أَحَدٌ"

    print("=" * 50)
    print("TajweedSST Parser Test")
    print("=" * 50)
    print(f"Input: {test_text}")
    print()

    words = parser.parse_text(test_text)

    for word in words:
        print(f"Word: {word.word_text}")
        print(f" Phonetic: {word.phonetic_stream}")
        for letter in word.letters:
            if letter.tajweed_type != TajweedType.NONE:
                print(f" [{letter.char_visual}] → {letter.tajweed_type.value} ({letter.physics_check.value})")
        print()


if __name__ == "__main__":
    main()