Spaces:
Running
on
Zero
Running
on
Zero
| """ | |
| Phonemization logic and verse metadata loading. | |
| """ | |
| import os | |
| import sys | |
| import json | |
| import yaml | |
| from pathlib import Path | |
| # Get config after ensuring sys.path is set | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from config import SURAH_INFO_PATH, DEFAULT_STOPS, PHONEMIZER_RESULT_CACHE_SIZE | |
| # Phonemizer resources path (for HF Spaces compatibility) | |
| # Resources are now inside the core package: phonemizer/core/resources/ | |
| PHONEMIZER_RESOURCES_DIR = Path(__file__).parent.parent.parent / "phonemizer" / "core" / "resources" | |
| RECITATION_DATA_DIR = Path(__file__).parent.parent / "data" | |
# Module-level cache
# Shared mutable state for this module: one phonemizer instance, the parsed
# surah metadata, and the outcome of the first load attempt.
_phonemizer_cache = {
    "phonemizer": None,   # cached Phonemizer instance (set by load_phonemizer)
    "surah_info": None,   # cached surah metadata dict (set by load_surah_info)
    "loaded": False,      # True once a load attempt finished (success OR failure)
    "error": None         # last load error message, if any
}
# LRU cache for phonemizer results (verse_ref -> result)
# NOTE(review): lru_cache is imported here but never used in this module;
# the result cache below is a hand-rolled bounded dict instead.
from functools import lru_cache
# Simple cache for recent phonemizer results
# Keyed by "verse_ref:sorted,stops"; bounded by PHONEMIZER_RESULT_CACHE_SIZE.
_result_cache = {}
def get_cached_phonemizer_result(verse_ref: str, stops: list = None):
    """
    Get cached phonemizer result or compute and cache it.

    The cache is a bounded insertion-ordered dict used as a true LRU:
    a cache hit moves the entry to the end, so eviction (when full)
    removes the least-recently-used entry rather than merely the oldest
    inserted one.

    Args:
        verse_ref: Verse reference (e.g., "1:1", "2:255")
        stops: List of stops (default: ["compulsory_stop"])

    Returns:
        PhonemizeResult or None if error
    """
    if stops is None:
        stops = ["compulsory_stop"]
    # Create cache key from verse_ref and sorted stops (stop order is irrelevant)
    stops_key = ",".join(sorted(stops))
    cache_key = f"{verse_ref}:{stops_key}"
    if cache_key in _result_cache:
        # Refresh recency: re-insert at the end so this entry is now the
        # most-recently-used (dicts preserve insertion order).
        result = _result_cache.pop(cache_key)
        _result_cache[cache_key] = result
        return result
    phonemizer, error = load_phonemizer()
    if not phonemizer:
        return None
    try:
        result = phonemizer.phonemize(ref=verse_ref, stops=stops)
        # Evict the least-recently-used entry (front of the dict) if full
        if len(_result_cache) >= PHONEMIZER_RESULT_CACHE_SIZE:
            oldest_key = next(iter(_result_cache))
            del _result_cache[oldest_key]
        _result_cache[cache_key] = result
        return result
    except Exception as e:
        print(f"[PHONEMIZER] Error phonemizing {verse_ref}: {e}")
        return None
def clear_phonemizer_result_cache():
    """Clear the phonemizer result cache.

    Called after settings changes (e.g. the ikhfaa shafawi phoneme option)
    so subsequent phonemizations are recomputed with the new configuration.
    """
    _result_cache.clear()
def set_ikhfaa_shafawi_phoneme_setting(value: str):
    """
    Update ikhfaa shafawi phoneme in phonemizer registry and clear cache.

    Called when user changes the "Iqlab/Ikhfaa Shafawi Sound" radio button.

    Args:
        value: "meem ghunnah" or "ikhfaa"
    """
    from core.phoneme_registry import set_phoneme_override, clear_phoneme_overrides

    # Per-setting override table. Any value other than "meem ghunnah" is
    # treated as "ikhfaa": the iqlab YAML default is m̃ (meem ghunnah), so it
    # must be explicitly overridden to ŋ; ikhfaa shafawi's YAML default is
    # already ŋ and needs no override in that case.
    override_table = {
        "meem ghunnah": (
            ("iqlab", "phoneme", "m̃"),
            ("ikhfaa", "shafawi_phoneme", "m̃"),
        ),
    }
    default_overrides = (("iqlab", "phoneme", "ŋ"),)

    # Reset to a clean slate, then apply only the overrides for this value.
    clear_phoneme_overrides()
    for rule, attr, phoneme in override_table.get(value, default_overrides):
        set_phoneme_override(rule, attr, phoneme)
    # Drop cached results so the next phonemization reflects the new setting.
    clear_phonemizer_result_cache()
def load_phonemizer():
    """
    Load the Phonemizer instance with caching.

    Returns:
        Phonemizer instance or None if not available
    """
    # Memoized: the first attempt's outcome (success or failure) is reused.
    if _phonemizer_cache["loaded"]:
        return _phonemizer_cache["phonemizer"], _phonemizer_cache["error"]
    try:
        # Import from the installed phonemizer package
        from core.phonemizer import Phonemizer

        # Prefer the bundled resources, but only when every required file
        # is present; otherwise fall back to the package defaults.
        kwargs = {}
        if PHONEMIZER_RESOURCES_DIR.exists():
            candidate_paths = {
                "db_path": PHONEMIZER_RESOURCES_DIR / "Quran.json",
                "map_path": PHONEMIZER_RESOURCES_DIR / "base_phonemes.yaml",
                "special_words_path": PHONEMIZER_RESOURCES_DIR / "special_words.yaml",
            }
            if all(path.exists() for path in candidate_paths.values()):
                kwargs = candidate_paths
        phonemizer = Phonemizer(**kwargs)
        _phonemizer_cache["phonemizer"] = phonemizer
        _phonemizer_cache["loaded"] = True
        _phonemizer_cache["error"] = None
        print("✓ Phonemizer loaded successfully")
        return phonemizer, None
    except Exception as e:
        import traceback
        # Import failures get a more actionable message than other errors.
        if isinstance(e, ImportError):
            error_msg = f"Failed to import phonemizer. Make sure quranic-phonemizer is installed: {str(e)}"
        else:
            error_msg = f"Failed to load phonemizer: {str(e)}"
        _phonemizer_cache["error"] = error_msg
        _phonemizer_cache["loaded"] = True
        print(f"✗ {error_msg}")
        print(traceback.format_exc())
        return None, error_msg
def load_surah_info():
    """
    Load surah information from JSON file.

    Returns:
        Dictionary with surah information or None if not available
    """
    # Serve from the module-level cache once loaded.
    cached = _phonemizer_cache["surah_info"]
    if cached is not None:
        return cached
    try:
        with open(SURAH_INFO_PATH, 'r', encoding='utf-8') as fh:
            surah_info = json.load(fh)
        _phonemizer_cache["surah_info"] = surah_info
        print(f"✓ Loaded surah info with {len(surah_info)} chapters")
        return surah_info
    except Exception as e:
        print(f"✗ Failed to load surah info: {str(e)}")
        return None
def get_chapter_list():
    """
    Get list of chapter numbers with names.

    Returns:
        List of tuples (chapter_number, chapter_name_en, chapter_name_ar)
    """
    surah_info = load_surah_info()
    if not surah_info:
        return []
    # Keys are chapter numbers as strings; fall back to a generated English
    # name and an empty Arabic name when the metadata omits them.
    chapters = [
        (
            int(num_str),
            data.get("name_en", f"Chapter {int(num_str)}"),
            data.get("name_ar", ""),
        )
        for num_str, data in surah_info.items()
    ]
    chapters.sort(key=lambda entry: entry[0])
    return chapters
def get_verses_for_chapter(chapter_num):
    """
    Get list of verse numbers for a given chapter.

    Args:
        chapter_num: Chapter number (int)

    Returns:
        List of verse numbers (sorted ascending); empty if the chapter or
        the surah metadata is unavailable.
    """
    surah_info = load_surah_info()
    if not surah_info:
        return []
    # Metadata keys are string chapter numbers.
    chapter_data = surah_info.get(str(chapter_num))
    if not chapter_data:
        return []
    return sorted(entry["verse"] for entry in chapter_data.get("verses", []))
def phonemize_verse(verse_ref, stops=None):
    """
    Phonemize a verse reference and return text and phonemes.

    Args:
        verse_ref: Verse reference (e.g., "1:1", "2:255", "1:1-1:7")
        stops: List of stop types to include

    Returns:
        Tuple of (arabic_text_html, arabic_text_clean, phonemes_string, success, error_message)
    """
    try:
        # Normalize stops: use defaults when absent, and always make sure
        # the compulsory stop is included.
        if stops is None:
            effective_stops = DEFAULT_STOPS.copy()
        elif "compulsory_stop" in stops:
            effective_stops = stops
        else:
            effective_stops = stops + ["compulsory_stop"]
        # Reuse cached phonemizer output when available.
        phonemize_result = get_cached_phonemizer_result(verse_ref, effective_stops)
        if phonemize_result is None:
            return "", "", "", False, "Failed to phonemize verse"
        phonemes_str = phonemize_result.phonemes_str(phoneme_sep=" ", word_sep="")
        # Build canonical structure using frozen models.
        # This ensures extensions (dagger alef) appear before other symbols (tatweel).
        from recitation_analysis.result_builder import get_result_builder
        # Main verse display: apply tanween transforms only, NOT word transforms
        # (stopping/starting/Allah). Word transforms are applied in the
        # Error Analysis, Ghunnah Analysis, and Madd Analysis tabs.
        recitation_result = get_result_builder().build_from_phonemizer_result(
            phonemize_result, verse_ref, apply_word_transforms=False
        )
        # Render text with correct symbol ordering and tanween substitution:
        # one HTML version for display (with verse markers) and one clean
        # version for downstream processing (no markers).
        html_text = _render_verse_text(recitation_result.canonical_words, clean_text=False)
        clean_verse_text = _render_verse_text(recitation_result.canonical_words, clean_text=True)
        return html_text, clean_verse_text, phonemes_str, True, None
    except Exception as e:
        return "", "", "", False, f"Phonemization error: {str(e)}"
def _render_verse_text(words, clean_text=False) -> str:
    """
    Render a tuple of WordData as plain text with correct symbol ordering.

    This ensures proper ordering of extensions and other symbols, and applies
    tanween substitutions for idgham/ikhfaa/iqlab contexts.

    Per-letter emission order: base char, shaddah, diacritic (possibly
    substituted), extensions, other symbols, then any iqlab small meem.

    Args:
        words: Tuple of WordData to render
        clean_text: If True, omit verse markers

    Returns:
        Arabic text string with correct symbol ordering. Words (and markers)
        are space-joined; note the result carries a trailing space.
    """
    from recitation_analysis.text_display.rendering import (
        substitute_open_tanween,
        substitute_iqlab_tanween,
    )
    from recitation_analysis.ui.verse_markers import format_verse_marker
    from recitation_analysis.text_display.special_word_builder import get_display_swap
    # Shaddah character (frozen model uses bool, not object)
    SHADDAH_CHAR = '\u0651'  # ّ
    word_texts = []
    prev_verse_num = None
    for word in words:
        parts = []
        # Leading symbols (rub el hizb, etc.) - now strings directly
        for sym in word.leading_symbols:
            parts.append(sym)
        # Separate leading symbols from the word body with a space.
        if word.leading_symbols:
            parts.append(' ')
        # Special words (e.g., الم، يس، حم) have no letters - use text directly
        if not word.letters and word.text:
            parts.append(word.text)
        # Render each letter with proper symbol ordering
        for letter in word.letters:
            # Base letter (apply display swap for special words)
            # get_display_swap returning a falsy value falls back to the raw char.
            letter_char = get_display_swap(word.location, letter.char) or letter.char
            parts.append(letter_char)
            # Check for iqlab tanween substitution
            # In frozen model: diacritic_char is the character, diacritic is the name
            diac_char = letter.diacritic_char
            iqlab_base_diac, iqlab_meem = substitute_iqlab_tanween(diac_char, letter.letter_rules)
            # A non-None small meem signals the tanween-iqlab case.
            is_iqlab_tanween = iqlab_meem is not None
            # Check for iqlab noon (noon sakinah before baa)
            # In frozen model: diacritic is the name string (or None)
            is_iqlab_noon = (
                letter.letter_rules and
                'iqlab_noon' in letter.letter_rules and
                (letter.diacritic is None or letter.diacritic == 'SUKUN')
            )
            # Diacritic/Shaddah (proper stacking order)
            # In frozen model: shaddah is bool, not object
            if letter.shaddah:
                parts.append(SHADDAH_CHAR)
            if letter.diacritic_char:
                if is_iqlab_tanween:
                    # Iqlab tanween: use base diacritic instead of tanween
                    parts.append(iqlab_base_diac)
                else:
                    # Apply open tanween substitution for idgham/ikhfaa
                    diac = substitute_open_tanween(letter.diacritic_char, letter.letter_rules)
                    parts.append(diac)
            # Extensions (dagger alef, maddah, etc.) - now strings directly
            for ext in letter.extensions:
                # Apply display swap for special words
                ext_char = get_display_swap(word.location, ext) or ext
                parts.append(ext_char)
            # Other symbols (tatweel, etc.) - now strings directly
            for sym in letter.other_symbols:
                parts.append(sym)
            # Iqlab small meem (after other symbols)
            if is_iqlab_tanween and iqlab_meem:
                parts.append(iqlab_meem)
            elif is_iqlab_noon:
                # For iqlab noon: add mini meem above after the noon
                parts.append('\u06E2')  # MINI_MEEM_ABOVE
        # Trailing symbols (stop signs) - now strings directly
        for sym in word.trailing_symbols:
            parts.append(sym)
        word_text = ''.join(parts)
        # Extract verse number from word location (format: "surah:verse:word")
        location_parts = word.location.split(':')
        if len(location_parts) >= 2:
            current_verse_num = int(location_parts[1])
        else:
            current_verse_num = None
        # Add verse marker when verse changes (for verse ranges)
        # The marker is emitted for the PREVIOUS verse, before this word.
        if not clean_text and prev_verse_num is not None and current_verse_num != prev_verse_num:
            word_texts.append(format_verse_marker(prev_verse_num))
        word_texts.append(word_text)
        prev_verse_num = current_verse_num
    # Add final verse marker
    if not clean_text and prev_verse_num is not None:
        word_texts.append(format_verse_marker(prev_verse_num))
    return ' '.join(word_texts) + ' '
def _apply_open_tanween_to_text(result, text: str) -> str:
    """
    Apply open tanween substitution to Arabic text based on phonemizer rules.

    Substitutes standard tanween characters with open tanween for letters
    that have idgham/ikhfaa/iqlab rules. Best-effort: if anything fails,
    the original text is returned unchanged.

    Args:
        result: PhonemizeResult from phonemizer
        text: Arabic text from result.text()

    Returns:
        Text with open tanween substituted where appropriate
    """
    try:
        # Hoisted out of the per-word loop below (was re-imported each iteration).
        import re
        from utils.phoneme_map import OPEN_TANWEEN_RULES
        # Map diacritic names to (char, open_char) pairs for idgham/ikhfaa
        TANWEEN_NAME_TO_OPEN = {
            'FATHATAN': ('\u064B', '\u08F0'),  # FATHATAN -> open fathatan (DigitalKhatt V2)
            'DAMMATAN': ('\u064C', '\u08F1'),  # DAMMATAN -> open dammatan (DigitalKhatt V2)
            'KASRATAN': ('\u064D', '\u08F2'),  # KASRATAN -> open kasratan (DigitalKhatt V2)
        }
        # Map diacritic names to (normal_char, base_diac, small_meem) for iqlab
        TANWEEN_NAME_TO_IQLAB = {
            'FATHATAN': ('\u064B', '\u064E', '\u06E2'),  # -> FATHA + mini meem above
            'DAMMATAN': ('\u064C', '\u064F', '\u06E2'),  # -> DAMMA + mini meem above
            'KASRATAN': ('\u064D', '\u0650', '\u06ED'),  # -> KASRA + mini meem below
        }
        # Get mapping to check letter rules
        mapping = result.get_mapping()
        # Build list of (word_text, substitutions) to apply
        # Each substitution is (old_char, new_chars) for that word
        word_substitutions = []
        for word in mapping.words:
            subs_for_word = []
            for letter in word.letter_mappings:
                if not letter.letter_rules:
                    continue
                diac_name = letter.diacritic
                if not diac_name or diac_name not in TANWEEN_NAME_TO_OPEN:
                    continue
                # Check for iqlab first (special treatment)
                if 'iqlab_tanween' in letter.letter_rules:
                    normal_char, base_diac, small_meem = TANWEEN_NAME_TO_IQLAB[diac_name]
                    subs_for_word.append((normal_char, base_diac + small_meem))
                    continue
                # Check for open tanween rules (idgham/ikhfaa)
                has_open_rule = any(rule in OPEN_TANWEEN_RULES for rule in letter.letter_rules)
                if has_open_rule:
                    normal_char, open_char = TANWEEN_NAME_TO_OPEN[diac_name]
                    subs_for_word.append((normal_char, open_char))
            if subs_for_word:
                word_substitutions.append((word.text, subs_for_word))
        # Apply substitutions word by word in the text
        result_text = text
        for word_text, subs in word_substitutions:
            # Find this word in the remaining text and apply substitutions
            # Strip rule tags from word text for matching
            clean_word = re.sub(r'<[^>]*>', '', word_text)
            # Find the word position
            word_idx = result_text.find(clean_word)
            if word_idx == -1:
                # Try without some diacritics for fuzzy match
                continue
            # Extract the word, apply substitutions, replace
            word_end = word_idx + len(clean_word)
            word_chars = list(result_text[word_idx:word_end])
            for old_char, new_chars in subs:
                for i, c in enumerate(word_chars):
                    if c == old_char:
                        word_chars[i] = new_chars
                        break  # Only replace first occurrence in this word
            result_text = result_text[:word_idx] + ''.join(word_chars) + result_text[word_end:]
        return result_text
    except Exception as e:
        # If anything fails, return original text
        import traceback
        traceback.print_exc()
        return text
def format_verse_reference(from_chapter, from_verse, to_verse):
    """
    Format verse selection into a reference string for the phonemizer.

    Args:
        from_chapter: Chapter number or None
        from_verse: Starting verse number or None
        to_verse: Ending verse number or None

    Returns:
        Formatted reference string or None if invalid
    """
    # A chapter is mandatory; nothing to format without it.
    if not from_chapter:
        return None
    # Chapter only: reference the whole chapter.
    if not from_verse:
        return str(from_chapter)
    start_ref = f"{from_chapter}:{from_verse}"
    # No distinct end verse -> single-verse reference.
    if not to_verse or to_verse == from_verse:
        return start_ref
    # Range within the same chapter.
    return f"{start_ref}-{from_chapter}:{to_verse}"
def match_text_to_verse(transcribed_text: str, verse_ref: str, stops: list = None):
    """
    Match transcribed text to a verse reference using the phonemizer.

    This is used for text matching in segmented mode - takes ASR output
    and finds which portion of the canonical text it matches.

    Args:
        transcribed_text: Arabic text from ASR transcription
        verse_ref: Verse reference to match against (e.g., "1:2" or "1:2-1:7")
        stops: List of stop types (default: ["compulsory_stop"])

    Returns:
        Tuple of (matched_text, phonemes, match_score, matched_ref)
        - matched_text: The canonical text portion that was matched
        - phonemes: Phoneme string for the matched portion
        - match_score: Confidence score (0-1) of the match
        - matched_ref: The specific verse reference matched (e.g., "1:2:1-1:2:4")
    """
    effective_stops = ["compulsory_stop"] if stops is None else stops
    phonemizer, error = load_phonemizer()
    if phonemizer is None:
        # Phonemizer unavailable: fall back to an empty match on the input ref.
        return "", "", 0.0, verse_ref
    try:
        match_result = phonemizer.phonemize(
            ref_text=transcribed_text,
            ref=verse_ref,
            stops=effective_stops,
        )
        return (
            match_result.text(),
            match_result.phonemes_str(phoneme_sep=" ", word_sep="", verse_sep=""),
            match_result.match_score,
            match_result.ref,
        )
    except Exception as e:
        print(f"[PHONEMIZER] Text matching error: {e}")
        return "", "", 0.0, verse_ref
def get_total_words_for_verse_range(verse_ref: str) -> int:
    """
    Get the total number of words for a verse reference from surah_info.json.

    Args:
        verse_ref: Verse reference like "1:2" or "1:2-1:5"

    Returns:
        Total number of words across the verse range (0 on parse/lookup error)
    """
    surah_info = load_surah_info()
    if not surah_info:
        return 0
    try:
        # Parse "s:v" or "s:v-s:v" into inclusive (surah, verse) endpoints.
        if '-' in verse_ref:
            start_part, end_part = verse_ref.split('-')
        else:
            start_part = end_part = verse_ref
        start_surah, start_verse = (int(p) for p in start_part.split(':'))
        end_surah, end_verse = (int(p) for p in end_part.split(':'))

        total_words = 0
        # Walk every surah in the (possibly multi-surah) range; for each,
        # clamp the verse window: the first surah starts at start_verse,
        # the last ends at end_verse, middle surahs are taken in full.
        for surah_num in range(start_surah, end_surah + 1):
            surah_data = surah_info.get(str(surah_num))
            if not surah_data or "verses" not in surah_data:
                continue
            lower = start_verse if surah_num == start_surah else 1
            upper = end_verse if surah_num == end_surah else float('inf')
            for verse_data in surah_data["verses"]:
                if lower <= verse_data["verse"] <= upper:
                    total_words += verse_data.get("num_words", 0)
        return total_words
    except Exception as e:
        print(f"[PHONEMIZER] Error getting word count for {verse_ref}: {e}")
        return 0