"""
Phonemization logic and verse metadata loading.
"""

import os
import sys
import json
import re
import traceback
import yaml
from functools import lru_cache
from pathlib import Path

# Get config after ensuring sys.path is set
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SURAH_INFO_PATH, DEFAULT_STOPS, PHONEMIZER_RESULT_CACHE_SIZE

# Phonemizer resources path (for HF Spaces compatibility)
# Resources are now inside the core package: phonemizer/core/resources/
PHONEMIZER_RESOURCES_DIR = Path(__file__).parent.parent.parent / "phonemizer" / "core" / "resources"
RECITATION_DATA_DIR = Path(__file__).parent.parent / "data"

# Module-level cache for the phonemizer instance and the surah metadata
_phonemizer_cache = {
    "phonemizer": None,
    "surah_info": None,
    "loaded": False,
    "error": None,
}

# Cache of recent phonemizer results, keyed by "<verse_ref>:<sorted stops>".
# A plain dict keeps insertion order, so the first key is always the least
# recently used entry (hits are re-inserted at the end).
_result_cache = {}

# Matches the rule tags (anything between '<' and '>') embedded in word text
_RULE_TAG_RE = re.compile(r'<[^>]*>')


def get_cached_phonemizer_result(verse_ref: str, stops: list = None):
    """
    Get cached phonemizer result or compute and cache it.

    Args:
        verse_ref: Verse reference (e.g., "1:1", "2:255")
        stops: List of stops (default: ["compulsory_stop"])

    Returns:
        PhonemizeResult or None if error
    """
    if stops is None:
        stops = ["compulsory_stop"]

    # Create cache key from verse_ref and sorted stops
    stops_key = ",".join(sorted(stops))
    cache_key = f"{verse_ref}:{stops_key}"

    if cache_key in _result_cache:
        # Re-insert the entry so it becomes the most recently used one
        cached = _result_cache.pop(cache_key)
        _result_cache[cache_key] = cached
        return cached

    phonemizer, _error = load_phonemizer()
    if not phonemizer:
        return None

    try:
        result = phonemizer.phonemize(ref=verse_ref, stops=stops)
    except Exception as e:
        print(f"[PHONEMIZER] Error phonemizing {verse_ref}: {e}")
        return None

    # A non-positive cache size disables caching; otherwise evict the least
    # recently used entries until there is room for the new one.
    if PHONEMIZER_RESULT_CACHE_SIZE > 0:
        while len(_result_cache) >= PHONEMIZER_RESULT_CACHE_SIZE:
            del _result_cache[next(iter(_result_cache))]
        _result_cache[cache_key] = result

    return result


def clear_phonemizer_result_cache():
    """Clear the phonemizer result cache."""
    _result_cache.clear()


def set_ikhfaa_shafawi_phoneme_setting(value: str):
    """
    Update ikhfaa shafawi phoneme in phonemizer registry and clear cache.

    Called when user changes the "Iqlab/Ikhfaa Shafawi Sound" radio button.

    Args:
        value: "meem ghunnah" or "ikhfaa"
    """
    from core.phoneme_registry import set_phoneme_override, clear_phoneme_overrides

    clear_phoneme_overrides()

    if value == "meem ghunnah":
        set_phoneme_override("iqlab", "phoneme", "m̃")
        set_phoneme_override("ikhfaa", "shafawi_phoneme", "m̃")
    else:  # value == "ikhfaa"
        # Must explicitly override iqlab since YAML default is m̃ (meem ghunnah)
        # ikhfaa shafawi's YAML default is already ŋ, so no override needed
        set_phoneme_override("iqlab", "phoneme", "ŋ")

    # Clear result cache so next phonemization uses new setting
    clear_phonemizer_result_cache()


def _record_phonemizer_failure(error_msg):
    """
    Remember a failed load attempt, print it with the active traceback,
    and return the (None, error_msg) pair used by load_phonemizer.

    Must be called from inside an ``except`` block so that
    traceback.format_exc() can see the exception being handled.
    """
    _phonemizer_cache["error"] = error_msg
    # Mark as loaded so the failing load is not retried on every call
    _phonemizer_cache["loaded"] = True
    print(f"✗ {error_msg}")
    print(traceback.format_exc())
    return None, error_msg


def load_phonemizer():
    """
    Load the Phonemizer instance with caching.

    The outcome of the first attempt (success or failure) is remembered
    in the module-level cache and returned on every later call.

    Returns:
        Tuple of (phonemizer, error_message): the Phonemizer instance and
        None on success, or None and an error string on failure
    """
    if _phonemizer_cache["loaded"]:
        return _phonemizer_cache["phonemizer"], _phonemizer_cache["error"]

    try:
        # Import from the installed phonemizer package
        from core.phonemizer import Phonemizer

        # Use local resources only if all three files are present,
        # otherwise let the Phonemizer fall back to its own defaults
        resource_paths = {
            "db_path": PHONEMIZER_RESOURCES_DIR / "Quran.json",
            "map_path": PHONEMIZER_RESOURCES_DIR / "base_phonemes.yaml",
            "special_words_path": PHONEMIZER_RESOURCES_DIR / "special_words.yaml",
        }
        if all(path.exists() for path in resource_paths.values()):
            phonemizer = Phonemizer(**resource_paths)
        else:
            phonemizer = Phonemizer()
    except ImportError as e:
        return _record_phonemizer_failure(
            f"Failed to import phonemizer. Make sure quranic-phonemizer is installed: {str(e)}"
        )
    except Exception as e:
        return _record_phonemizer_failure(f"Failed to load phonemizer: {str(e)}")

    _phonemizer_cache["phonemizer"] = phonemizer
    _phonemizer_cache["loaded"] = True
    _phonemizer_cache["error"] = None
    print("✓ Phonemizer loaded successfully")
    return phonemizer, None


def load_surah_info():
    """
    Load surah information from JSON file.

    The parsed data is cached after the first successful load.

    Returns:
        Dictionary with surah information or None if not available
    """
    if _phonemizer_cache["surah_info"] is not None:
        return _phonemizer_cache["surah_info"]

    try:
        with open(SURAH_INFO_PATH, 'r', encoding='utf-8') as f:
            surah_info = json.load(f)

        _phonemizer_cache["surah_info"] = surah_info
        print(f"✓ Loaded surah info with {len(surah_info)} chapters")
        return surah_info
    except Exception as e:
        # Best effort: callers treat None as "metadata not available"
        print(f"✗ Failed to load surah info: {str(e)}")
        return None


def get_chapter_list():
    """
    Get list of chapter numbers with names.

    Returns:
        List of tuples (chapter_number, chapter_name_en, chapter_name_ar),
        sorted by chapter number; empty if surah info is unavailable
    """
    surah_info = load_surah_info()
    if not surah_info:
        return []

    chapters = []
    for chapter_num_str, chapter_data in surah_info.items():
        chapter_num = int(chapter_num_str)
        name_en = chapter_data.get("name_en", f"Chapter {chapter_num}")
        name_ar = chapter_data.get("name_ar", "")
        chapters.append((chapter_num, name_en, name_ar))

    return sorted(chapters, key=lambda chapter: chapter[0])


def get_verses_for_chapter(chapter_num):
    """
    Get list of verse numbers for a given chapter.

    Args:
        chapter_num: Chapter number (int)

    Returns:
        Sorted list of verse numbers; empty if the chapter is unknown
    """
    surah_info = load_surah_info()
    if not surah_info:
        return []

    # Surah info is keyed by the chapter number as a string
    chapter_data = surah_info.get(str(chapter_num))
    if not chapter_data:
        return []

    return sorted(v["verse"] for v in chapter_data.get("verses", []))


def phonemize_verse(verse_ref, stops=None):
    """
    Phonemize a verse reference and return text and phonemes.

    Args:
        verse_ref: Verse reference (e.g., "1:1", "2:255", "1:1-1:7")
        stops: List of stop types to include; "compulsory_stop" is always
            added if missing, and DEFAULT_STOPS is used when None

    Returns:
        Tuple of (arabic_text_html, arabic_text_clean, phonemes_string, success, error_message)
    """
    try:
        # Use default stops if not provided
        if stops is None:
            stops = DEFAULT_STOPS.copy()
        elif "compulsory_stop" not in stops:
            # Build a new list so the caller's list is not mutated
            stops = stops + ["compulsory_stop"]

        # Use cached phonemizer result
        result = get_cached_phonemizer_result(verse_ref, stops)
        if result is None:
            return "", "", "", False, "Failed to phonemize verse"

        phonemes_str = result.phonemes_str(phoneme_sep=" ", word_sep="")

        # Build canonical structure using frozen models
        # This ensures extensions (dagger alef) appear before other symbols (tatweel)
        from recitation_analysis.result_builder import get_result_builder
        builder = get_result_builder()

        # Main verse display: apply tanween transforms only, NOT word transforms
        # (stopping/starting/Allah). Word transforms are applied in the
        # Error Analysis, Ghunnah Analysis, and Madd Analysis tabs.
        recitation_result = builder.build_from_phonemizer_result(
            result, verse_ref, apply_word_transforms=False
        )

        # Render text with correct symbol ordering and tanween substitution
        # 1. HTML version for display (with verse markers)
        arabic_text_html = _render_verse_text(recitation_result.canonical_words, clean_text=False)

        # 2. Clean version for processing (no markers, just words)
        arabic_text_clean = _render_verse_text(recitation_result.canonical_words, clean_text=True)

        return arabic_text_html, arabic_text_clean, phonemes_str, True, None

    except Exception as e:
        error_msg = f"Phonemization error: {str(e)}"
        return "", "", "", False, error_msg


def _render_verse_text(words, clean_text=False) -> str:
    """
    Render a tuple of WordData as plain text with correct symbol ordering.

    This ensures proper ordering of extensions and other symbols, and applies
    tanween substitutions for idgham/ikhfaa/iqlab contexts.

    Args:
        words: Tuple of WordData to render
        clean_text: If True, omit verse markers

    Returns:
        Arabic text string with correct symbol ordering
    """
    from recitation_analysis.text_display.rendering import (
        substitute_open_tanween,
        substitute_iqlab_tanween,
    )
    from recitation_analysis.ui.verse_markers import format_verse_marker
    from recitation_analysis.text_display.special_word_builder import get_display_swap

    # Shaddah character (frozen model uses bool, not object)
    SHADDAH_CHAR = '\u0651'

    word_texts = []
    prev_verse_num = None

    for word in words:
        parts = []

        # Leading symbols (rub el hizb, etc.) - strings, followed by a space
        parts.extend(word.leading_symbols)
        if word.leading_symbols:
            parts.append(' ')

        # Special words (e.g., الم، يس، حم) have no letters - use text directly
        if not word.letters and word.text:
            parts.append(word.text)

        # Render each letter with proper symbol ordering
        for letter in word.letters:
            # Base letter (apply display swap for special words)
            letter_char = get_display_swap(word.location, letter.char) or letter.char
            parts.append(letter_char)

            # Check for iqlab tanween substitution
            # In frozen model: diacritic_char is the character, diacritic is the name
            diac_char = letter.diacritic_char
            iqlab_base_diac, iqlab_meem = substitute_iqlab_tanween(diac_char, letter.letter_rules)
            is_iqlab_tanween = iqlab_meem is not None

            # Check for iqlab noon (noon sakinah before baa)
            # In frozen model: diacritic is the name string (or None)
            is_iqlab_noon = (
                letter.letter_rules
                and 'iqlab_noon' in letter.letter_rules
                and (letter.diacritic is None or letter.diacritic == 'SUKUN')
            )

            # Diacritic/Shaddah (proper stacking order)
            # In frozen model: shaddah is bool, not object
            if letter.shaddah:
                parts.append(SHADDAH_CHAR)
            if letter.diacritic_char:
                if is_iqlab_tanween:
                    # Iqlab tanween: use base diacritic instead of tanween
                    parts.append(iqlab_base_diac)
                else:
                    # Apply open tanween substitution for idgham/ikhfaa
                    diac = substitute_open_tanween(letter.diacritic_char, letter.letter_rules)
                    parts.append(diac)

            # Extensions (dagger alef, maddah, etc.) - strings,
            # with display swap applied for special words
            for ext in letter.extensions:
                ext_char = get_display_swap(word.location, ext) or ext
                parts.append(ext_char)

            # Other symbols (tatweel, etc.) - strings
            parts.extend(letter.other_symbols)

            # Iqlab small meem (after other symbols)
            if is_iqlab_tanween and iqlab_meem:
                parts.append(iqlab_meem)
            elif is_iqlab_noon:
                # For iqlab noon: add mini meem above after the noon
                parts.append('\u06E2')  # MINI_MEEM_ABOVE

        # Trailing symbols (stop signs) - strings
        parts.extend(word.trailing_symbols)

        word_text = ''.join(parts)

        # Extract verse number from word location (format: "surah:verse:word")
        location_parts = word.location.split(':')
        if len(location_parts) >= 2:
            current_verse_num = int(location_parts[1])
        else:
            current_verse_num = None

        # Add verse marker when verse changes (for verse ranges)
        if not clean_text and prev_verse_num is not None and current_verse_num != prev_verse_num:
            word_texts.append(format_verse_marker(prev_verse_num))

        word_texts.append(word_text)
        prev_verse_num = current_verse_num

    # Add final verse marker
    if not clean_text and prev_verse_num is not None:
        word_texts.append(format_verse_marker(prev_verse_num))

    return ' '.join(word_texts) + ' '


def _apply_open_tanween_to_text(result, text: str) -> str:
    """
    Apply open tanween substitution to Arabic text based on phonemizer rules.

    Substitutes standard tanween characters with open tanween for letters
    that have idgham/ikhfaa/iqlab rules.

    Words are located in the text in mapping order with a forward-moving
    cursor, so a substitution is applied to the occurrence that belongs to
    the word carrying the rule rather than to an identical earlier word.
    Words that cannot be found from the cursor onwards are skipped.

    Args:
        result: PhonemizeResult from phonemizer
        text: Arabic text from result.text()

    Returns:
        Text with open tanween substituted where appropriate; the original
        text unchanged if anything fails
    """
    # Map diacritic names to (char, open_char) pairs for idgham/ikhfaa
    TANWEEN_NAME_TO_OPEN = {
        'FATHATAN': ('\u064B', '\u08F0'),  # FATHATAN -> open fathatan (DigitalKhatt V2)
        'DAMMATAN': ('\u064C', '\u08F1'),  # DAMMATAN -> open dammatan (DigitalKhatt V2)
        'KASRATAN': ('\u064D', '\u08F2'),  # KASRATAN -> open kasratan (DigitalKhatt V2)
    }

    # Map diacritic names to (normal_char, base_diac, small_meem) for iqlab
    TANWEEN_NAME_TO_IQLAB = {
        'FATHATAN': ('\u064B', '\u064E', '\u06E2'),  # -> FATHA + mini meem above
        'DAMMATAN': ('\u064C', '\u064F', '\u06E2'),  # -> DAMMA + mini meem above
        'KASRATAN': ('\u064D', '\u0650', '\u06ED'),  # -> KASRA + mini meem below
    }

    try:
        from utils.phoneme_map import OPEN_TANWEEN_RULES

        # Get mapping to check letter rules
        mapping = result.get_mapping()

        result_text = text
        cursor = 0  # everything before this index has already been matched

        for word in mapping.words:
            # Collect (old_char, new_chars) substitutions for this word
            subs_for_word = []
            for letter in word.letter_mappings:
                if not letter.letter_rules:
                    continue

                diac_name = letter.diacritic
                if not diac_name or diac_name not in TANWEEN_NAME_TO_OPEN:
                    continue

                # Check for iqlab first (special treatment)
                if 'iqlab_tanween' in letter.letter_rules:
                    normal_char, base_diac, small_meem = TANWEEN_NAME_TO_IQLAB[diac_name]
                    subs_for_word.append((normal_char, base_diac + small_meem))
                    continue

                # Check for open tanween rules (idgham/ikhfaa)
                if any(rule in OPEN_TANWEEN_RULES for rule in letter.letter_rules):
                    normal_char, open_char = TANWEEN_NAME_TO_OPEN[diac_name]
                    subs_for_word.append((normal_char, open_char))

            # Strip rule tags from word text for matching
            clean_word = _RULE_TAG_RE.sub('', word.text or '')
            if not clean_word:
                continue

            # Locate the word in the not-yet-consumed part of the text.
            # Words without substitutions are located too, purely so the
            # cursor moves past them.
            word_idx = result_text.find(clean_word, cursor)
            if word_idx == -1:
                continue
            word_end = word_idx + len(clean_word)

            if not subs_for_word:
                cursor = word_end
                continue

            # Extract the word, apply substitutions, replace
            word_chars = list(result_text[word_idx:word_end])
            for old_char, new_chars in subs_for_word:
                for i, c in enumerate(word_chars):
                    if c == old_char:
                        word_chars[i] = new_chars
                        break  # Only replace first occurrence in this word

            replaced = ''.join(word_chars)
            result_text = result_text[:word_idx] + replaced + result_text[word_end:]
            # Iqlab substitutions lengthen the word, so advance by the new length
            cursor = word_idx + len(replaced)

        return result_text

    except Exception:
        # If anything fails, return original text
        traceback.print_exc()
        return text


def format_verse_reference(from_chapter, from_verse, to_verse):
    """
    Format verse selection into a reference string for the phonemizer.

    Args:
        from_chapter: Chapter number or None
        from_verse: Starting verse number or None
        to_verse: Ending verse number or None (ignored when from_verse is
            not set)

    Returns:
        Formatted reference string ("2", "2:5" or "2:5-2:9"),
        or None if no chapter is selected
    """
    if not from_chapter:
        return None

    # If only from_chapter is selected, return just the chapter
    if not from_verse:
        return str(from_chapter)

    start_ref = f"{from_chapter}:{from_verse}"

    # No end verse, or end equal to start: single verse
    if not to_verse or to_verse == from_verse:
        return start_ref

    # Return range within same chapter
    end_ref = f"{from_chapter}:{to_verse}"
    return f"{start_ref}-{end_ref}"


def match_text_to_verse(transcribed_text: str, verse_ref: str, stops: list = None):
    """
    Match transcribed text to a verse reference using the phonemizer.

    This is used for text matching in segmented mode - takes ASR output
    and finds which portion of the canonical text it matches.

    Args:
        transcribed_text: Arabic text from ASR transcription
        verse_ref: Verse reference to match against (e.g., "1:2" or "1:2-1:7")
        stops: List of stop types (default: ["compulsory_stop"])

    Returns:
        Tuple of (matched_text, phonemes, match_score, matched_ref)
        - matched_text: The canonical text portion that was matched
        - phonemes: Phoneme string for the matched portion
        - match_score: Confidence score (0-1) of the match
        - matched_ref: The specific verse reference matched (e.g., "1:2:1-1:2:4")
        On any failure returns ("", "", 0.0, verse_ref).
    """
    if stops is None:
        stops = ["compulsory_stop"]

    phonemizer, _error = load_phonemizer()
    if phonemizer is None:
        return "", "", 0.0, verse_ref

    try:
        result = phonemizer.phonemize(
            ref_text=transcribed_text,
            ref=verse_ref,
            stops=stops
        )

        text = result.text()
        phonemes = result.phonemes_str(phoneme_sep=" ", word_sep="", verse_sep="")
        match_score = result.match_score
        matched_ref = result.ref

        return text, phonemes, match_score, matched_ref

    except Exception as e:
        print(f"[PHONEMIZER] Text matching error: {e}")
        return "", "", 0.0, verse_ref


def get_total_words_for_verse_range(verse_ref: str) -> int:
    """
    Get the total number of words for a verse reference from surah_info.json.

    Args:
        verse_ref: Verse reference like "1:2" or "1:2-1:5"

    Returns:
        Total number of words across the verse range; 0 if surah info is
        unavailable or the reference cannot be parsed
    """
    surah_info = load_surah_info()
    if not surah_info:
        return 0

    try:
        # Parse verse reference
        if '-' in verse_ref:
            # Range: "1:2-1:5"
            start_part, end_part = verse_ref.split('-')
            start_surah, start_verse = map(int, start_part.split(':'))
            end_surah, end_verse = map(int, end_part.split(':'))
        else:
            # Single verse: "1:2"
            start_surah, start_verse = map(int, verse_ref.split(':'))
            end_surah, end_verse = start_surah, start_verse

        total_words = 0

        # Walk every surah in the range. The lower verse bound only applies
        # in the first surah and the upper bound only in the last one; for a
        # single-surah reference both apply to the same surah.
        for surah_num in range(start_surah, end_surah + 1):
            surah_data = surah_info.get(str(surah_num))
            if not surah_data or "verses" not in surah_data:
                continue

            for verse_data in surah_data["verses"]:
                verse_num = verse_data["verse"]
                if surah_num == start_surah and verse_num < start_verse:
                    continue
                if surah_num == end_surah and verse_num > end_verse:
                    continue
                total_words += verse_data.get("num_words", 0)

        return total_words

    except Exception as e:
        print(f"[PHONEMIZER] Error getting word count for {verse_ref}: {e}")
        return 0