Tajweed-AI / utils/phonemizer_utils.py
hetchyy's picture
Add ghunnah/madd durations
c6974e2
"""
Phonemization logic and verse metadata loading.
"""
import os
import sys
import json
import yaml
from pathlib import Path
# Get config after ensuring sys.path is set
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SURAH_INFO_PATH, DEFAULT_STOPS, PHONEMIZER_RESULT_CACHE_SIZE
# Phonemizer resources path (for HF Spaces compatibility)
# Resources are now inside the core package: phonemizer/core/resources/
PHONEMIZER_RESOURCES_DIR = Path(__file__).parent.parent.parent / "phonemizer" / "core" / "resources"
RECITATION_DATA_DIR = Path(__file__).parent.parent / "data"
# Module-level cache
_phonemizer_cache = {
"phonemizer": None,
"surah_info": None,
"loaded": False,
"error": None
}
# LRU cache for phonemizer results (verse_ref -> result)
from functools import lru_cache
# Simple cache for recent phonemizer results
_result_cache = {}
def get_cached_phonemizer_result(verse_ref: str, stops: list = None):
    """
    Get a cached phonemizer result or compute and cache it.

    The cache is a true LRU: a hit re-inserts the entry so it becomes the
    most recently used (Python dicts preserve insertion order, so the first
    key is always the least recently used). Previously hits did not refresh
    recency, which made eviction FIFO despite the "LRU" intent.

    Args:
        verse_ref: Verse reference (e.g., "1:1", "2:255")
        stops: List of stops (default: ["compulsory_stop"])

    Returns:
        PhonemizeResult, or None if the phonemizer is unavailable or fails
    """
    if stops is None:
        stops = ["compulsory_stop"]
    # Cache key from verse_ref and sorted stops, so stop ORDER does not
    # create duplicate cache entries.
    stops_key = ",".join(sorted(stops))
    cache_key = f"{verse_ref}:{stops_key}"
    if cache_key in _result_cache:
        # Pop and re-insert on hit so eviction order is LRU, not FIFO.
        result = _result_cache.pop(cache_key)
        _result_cache[cache_key] = result
        return result
    phonemizer, error = load_phonemizer()
    if not phonemizer:
        return None
    try:
        result = phonemizer.phonemize(ref=verse_ref, stops=stops)
        # Evict the least-recently-used entry (first key) if cache is full.
        if len(_result_cache) >= PHONEMIZER_RESULT_CACHE_SIZE:
            oldest_key = next(iter(_result_cache))
            del _result_cache[oldest_key]
        _result_cache[cache_key] = result
        return result
    except Exception as e:
        print(f"[PHONEMIZER] Error phonemizing {verse_ref}: {e}")
        return None
def clear_phonemizer_result_cache():
    """Clear the phonemizer result cache (cache key -> PhonemizeResult).

    Called when a setting changes (e.g. the ikhfaa shafawi phoneme) so the
    next phonemization recomputes with the new configuration.
    """
    _result_cache.clear()
def set_ikhfaa_shafawi_phoneme_setting(value: str):
    """
    Update the ikhfaa shafawi phoneme in the phonemizer registry and
    invalidate cached results.

    Called when the user changes the "Iqlab/Ikhfaa Shafawi Sound" radio button.

    Args:
        value: "meem ghunnah" or "ikhfaa"
    """
    from core.phoneme_registry import set_phoneme_override, clear_phoneme_overrides

    # Start from a clean slate, then apply the overrides for the selection.
    clear_phoneme_overrides()

    # "meem ghunnah" overrides both iqlab and ikhfaa shafawi to m̃.
    # Any other value ("ikhfaa") must still override iqlab explicitly,
    # because the YAML default for iqlab is m̃ (meem ghunnah) while ikhfaa
    # shafawi already defaults to ŋ.
    overrides = {
        "meem ghunnah": (
            ("iqlab", "phoneme", "m̃"),
            ("ikhfaa", "shafawi_phoneme", "m̃"),
        ),
    }.get(value, (("iqlab", "phoneme", "ŋ"),))
    for rule_name, key, phoneme in overrides:
        set_phoneme_override(rule_name, key, phoneme)

    # Drop cached results so the next phonemization uses the new setting.
    clear_phonemizer_result_cache()
def load_phonemizer():
    """
    Load the Phonemizer instance with caching.

    Both success and failure are memoized in _phonemizer_cache ("loaded"
    is set either way), so the expensive import/construction is attempted
    at most once per process.

    Returns:
        Tuple of (Phonemizer instance or None, error message or None)
    """
    if _phonemizer_cache["loaded"]:
        return _phonemizer_cache["phonemizer"], _phonemizer_cache["error"]

    def _fail(error_msg):
        # Record the failure so subsequent calls return it without retrying,
        # and print the traceback for diagnostics.
        import traceback
        _phonemizer_cache["error"] = error_msg
        _phonemizer_cache["loaded"] = True
        print(f"✗ {error_msg}")
        print(traceback.format_exc())
        return None, error_msg

    try:
        # Import from the installed phonemizer package
        from core.phonemizer import Phonemizer
    except ImportError as e:
        return _fail(
            f"Failed to import phonemizer. Make sure quranic-phonemizer is installed: {str(e)}"
        )
    try:
        # Use the bundled resources only when all three files are present;
        # otherwise fall back to the package defaults.
        db_path = PHONEMIZER_RESOURCES_DIR / "Quran.json"
        map_path = PHONEMIZER_RESOURCES_DIR / "base_phonemes.yaml"
        special_words_path = PHONEMIZER_RESOURCES_DIR / "special_words.yaml"
        if all(p.exists() for p in (db_path, map_path, special_words_path)):
            phonemizer = Phonemizer(
                db_path=db_path,
                map_path=map_path,
                special_words_path=special_words_path,
            )
        else:
            phonemizer = Phonemizer()
        _phonemizer_cache["phonemizer"] = phonemizer
        _phonemizer_cache["loaded"] = True
        _phonemizer_cache["error"] = None
        print("✓ Phonemizer loaded successfully")
        return phonemizer, None
    except Exception as e:
        return _fail(f"Failed to load phonemizer: {str(e)}")
def load_surah_info():
    """
    Load surah information from the JSON file, caching the parsed result.

    Returns:
        Dictionary with surah information, or None if loading fails
    """
    cached = _phonemizer_cache["surah_info"]
    if cached is not None:
        return cached
    try:
        with open(SURAH_INFO_PATH, 'r', encoding='utf-8') as f:
            surah_info = json.load(f)
    except Exception as e:
        # Best-effort loader: log and signal failure with None.
        print(f"✗ Failed to load surah info: {str(e)}")
        return None
    _phonemizer_cache["surah_info"] = surah_info
    print(f"✓ Loaded surah info with {len(surah_info)} chapters")
    return surah_info
def get_chapter_list():
    """
    Get the list of chapters with their English and Arabic names.

    Returns:
        List of tuples (chapter_number, chapter_name_en, chapter_name_ar),
        sorted by chapter number; empty list if surah info is unavailable
    """
    surah_info = load_surah_info()
    if not surah_info:
        return []
    # Keys in surah_info are string chapter numbers; convert for sorting.
    chapters = [
        (
            int(num_str),
            data.get("name_en", f"Chapter {int(num_str)}"),
            data.get("name_ar", ""),
        )
        for num_str, data in surah_info.items()
    ]
    chapters.sort(key=lambda entry: entry[0])
    return chapters
def get_verses_for_chapter(chapter_num):
    """
    Get the sorted list of verse numbers for a given chapter.

    Args:
        chapter_num: Chapter number (int)

    Returns:
        List of verse numbers; empty list if the chapter or surah info
        is unavailable
    """
    surah_info = load_surah_info()
    if not surah_info:
        return []
    # surah_info is keyed by string chapter numbers.
    chapter_data = surah_info.get(str(chapter_num))
    if not chapter_data:
        return []
    return sorted(entry["verse"] for entry in chapter_data.get("verses", []))
def phonemize_verse(verse_ref, stops=None):
    """
    Phonemize a verse reference and return its text and phonemes.

    Args:
        verse_ref: Verse reference (e.g., "1:1", "2:255", "1:1-1:7")
        stops: List of stop types to include

    Returns:
        Tuple of (arabic_text_html, arabic_text_clean, phonemes_string,
        success, error_message)
    """
    try:
        # Normalize stops: use defaults when absent, and always make sure
        # compulsory_stop is included.
        if stops is None:
            effective_stops = DEFAULT_STOPS.copy()
        elif "compulsory_stop" not in stops:
            effective_stops = stops + ["compulsory_stop"]
        else:
            effective_stops = stops

        result = get_cached_phonemizer_result(verse_ref, effective_stops)
        if result is None:
            return "", "", "", False, "Failed to phonemize verse"

        phonemes_str = result.phonemes_str(phoneme_sep=" ", word_sep="")

        # Build the canonical structure using frozen models, ensuring that
        # extensions (dagger alef) appear before other symbols (tatweel).
        from recitation_analysis.result_builder import get_result_builder

        # Main verse display applies tanween transforms only, NOT word
        # transforms (stopping/starting/Allah); those are applied in the
        # Error Analysis, Ghunnah Analysis, and Madd Analysis tabs.
        recitation_result = get_result_builder().build_from_phonemizer_result(
            result, verse_ref, apply_word_transforms=False
        )

        words = recitation_result.canonical_words
        # Render twice: an HTML version for display (with verse markers)
        # and a clean version for processing (no markers, just words).
        arabic_text_html = _render_verse_text(words, clean_text=False)
        arabic_text_clean = _render_verse_text(words, clean_text=True)
        return arabic_text_html, arabic_text_clean, phonemes_str, True, None
    except Exception as e:
        return "", "", "", False, f"Phonemization error: {str(e)}"
def _render_verse_text(words, clean_text=False) -> str:
    """
    Render a tuple of WordData as plain text with correct symbol ordering.

    This ensures proper ordering of extensions and other symbols, and applies
    tanween substitutions for idgham/ikhfaa/iqlab contexts.

    Args:
        words: Tuple of WordData to render
        clean_text: If True, omit verse markers

    Returns:
        Arabic text string with correct symbol ordering (space-joined words,
        with a trailing space)
    """
    from recitation_analysis.text_display.rendering import (
        substitute_open_tanween,
        substitute_iqlab_tanween,
    )
    from recitation_analysis.ui.verse_markers import format_verse_marker
    from recitation_analysis.text_display.special_word_builder import get_display_swap
    # Shaddah character (the frozen model stores shaddah as a bool, not an object)
    SHADDAH_CHAR = '\u0651'  # ّ
    word_texts = []
    prev_verse_num = None
    for word in words:
        parts = []
        # Leading symbols (rub el hizb, etc.) - now strings directly
        for sym in word.leading_symbols:
            parts.append(sym)
        if word.leading_symbols:
            parts.append(' ')
        # Special words (e.g., الم، يس، حم) have no letters - use text directly
        if not word.letters and word.text:
            parts.append(word.text)
        # Render each letter with proper symbol ordering:
        # base char -> shaddah -> diacritic -> extensions -> other symbols
        # -> iqlab small meem.
        for letter in word.letters:
            # Base letter (apply display swap for special words)
            letter_char = get_display_swap(word.location, letter.char) or letter.char
            parts.append(letter_char)
            # Check for iqlab tanween substitution
            # In frozen model: diacritic_char is the character, diacritic is the name
            diac_char = letter.diacritic_char
            iqlab_base_diac, iqlab_meem = substitute_iqlab_tanween(diac_char, letter.letter_rules)
            is_iqlab_tanween = iqlab_meem is not None
            # Check for iqlab noon (noon sakinah before baa)
            # In frozen model: diacritic is the name string (or None)
            is_iqlab_noon = (
                letter.letter_rules and
                'iqlab_noon' in letter.letter_rules and
                (letter.diacritic is None or letter.diacritic == 'SUKUN')
            )
            # Diacritic/Shaddah (proper stacking order: shaddah first)
            # In frozen model: shaddah is bool, not object
            if letter.shaddah:
                parts.append(SHADDAH_CHAR)
            if letter.diacritic_char:
                if is_iqlab_tanween:
                    # Iqlab tanween: use base diacritic instead of tanween
                    parts.append(iqlab_base_diac)
                else:
                    # Apply open tanween substitution for idgham/ikhfaa
                    diac = substitute_open_tanween(letter.diacritic_char, letter.letter_rules)
                    parts.append(diac)
            # Extensions (dagger alef, maddah, etc.) - now strings directly
            for ext in letter.extensions:
                # Apply display swap for special words
                ext_char = get_display_swap(word.location, ext) or ext
                parts.append(ext_char)
            # Other symbols (tatweel, etc.) - now strings directly
            for sym in letter.other_symbols:
                parts.append(sym)
            # Iqlab small meem (placed after other symbols)
            if is_iqlab_tanween and iqlab_meem:
                parts.append(iqlab_meem)
            elif is_iqlab_noon:
                # For iqlab noon: add mini meem above after the noon
                parts.append('\u06E2')  # MINI_MEEM_ABOVE
        # Trailing symbols (stop signs) - now strings directly
        for sym in word.trailing_symbols:
            parts.append(sym)
        word_text = ''.join(parts)
        # Extract verse number from word location (format: "surah:verse:word")
        location_parts = word.location.split(':')
        if len(location_parts) >= 2:
            current_verse_num = int(location_parts[1])
        else:
            current_verse_num = None
        # Add verse marker when verse changes (for verse ranges)
        if not clean_text and prev_verse_num is not None and current_verse_num != prev_verse_num:
            word_texts.append(format_verse_marker(prev_verse_num))
        word_texts.append(word_text)
        prev_verse_num = current_verse_num
    # Add final verse marker
    if not clean_text and prev_verse_num is not None:
        word_texts.append(format_verse_marker(prev_verse_num))
    return ' '.join(word_texts) + ' '
def _apply_open_tanween_to_text(result, text: str) -> str:
    """
    Apply open tanween substitution to Arabic text based on phonemizer rules.

    Substitutes standard tanween characters with open tanween for letters
    that have idgham/ikhfaa/iqlab rules.

    Args:
        result: PhonemizeResult from phonemizer
        text: Arabic text from result.text()

    Returns:
        Text with open tanween substituted where appropriate; the original
        text is returned unchanged if anything fails.
    """
    import re
    try:
        from utils.phoneme_map import OPEN_TANWEEN_RULES
        # Map diacritic names to (char, open_char) pairs for idgham/ikhfaa
        TANWEEN_NAME_TO_OPEN = {
            'FATHATAN': ('\u064B', '\u08F0'),  # FATHATAN -> open fathatan (DigitalKhatt V2)
            'DAMMATAN': ('\u064C', '\u08F1'),  # DAMMATAN -> open dammatan (DigitalKhatt V2)
            'KASRATAN': ('\u064D', '\u08F2'),  # KASRATAN -> open kasratan (DigitalKhatt V2)
        }
        # Map diacritic names to (normal_char, base_diac, small_meem) for iqlab
        TANWEEN_NAME_TO_IQLAB = {
            'FATHATAN': ('\u064B', '\u064E', '\u06E2'),  # -> FATHA + mini meem above
            'DAMMATAN': ('\u064C', '\u064F', '\u06E2'),  # -> DAMMA + mini meem above
            'KASRATAN': ('\u064D', '\u0650', '\u06ED'),  # -> KASRA + mini meem below
        }
        # Pattern for stripping rule tags (e.g. "<idgham>") from word text.
        # Fix: previously `import re` ran inside the per-word loop; the
        # pattern is now compiled once and reused.
        tag_pattern = re.compile(r'<[^>]*>')
        # Get mapping to check letter rules
        mapping = result.get_mapping()
        # Build list of (word_text, substitutions) to apply.
        # Each substitution is (old_char, new_chars) for that word.
        word_substitutions = []
        for word in mapping.words:
            subs_for_word = []
            for letter in word.letter_mappings:
                if not letter.letter_rules:
                    continue
                diac_name = letter.diacritic
                if not diac_name or diac_name not in TANWEEN_NAME_TO_OPEN:
                    continue
                # Check for iqlab first (special treatment)
                if 'iqlab_tanween' in letter.letter_rules:
                    normal_char, base_diac, small_meem = TANWEEN_NAME_TO_IQLAB[diac_name]
                    subs_for_word.append((normal_char, base_diac + small_meem))
                    continue
                # Check for open tanween rules (idgham/ikhfaa)
                has_open_rule = any(rule in OPEN_TANWEEN_RULES for rule in letter.letter_rules)
                if has_open_rule:
                    normal_char, open_char = TANWEEN_NAME_TO_OPEN[diac_name]
                    subs_for_word.append((normal_char, open_char))
            if subs_for_word:
                word_substitutions.append((word.text, subs_for_word))
        # Apply substitutions word by word in the text
        result_text = text
        for word_text, subs in word_substitutions:
            # Strip rule tags from the word text so it matches the raw text.
            clean_word = tag_pattern.sub('', word_text)
            # Find the word position in the remaining text
            word_idx = result_text.find(clean_word)
            if word_idx == -1:
                # Word not found verbatim; skip rather than risk a wrong edit.
                continue
            # Extract the word, apply substitutions, replace in place
            word_end = word_idx + len(clean_word)
            word_chars = list(result_text[word_idx:word_end])
            for old_char, new_chars in subs:
                for i, c in enumerate(word_chars):
                    if c == old_char:
                        word_chars[i] = new_chars
                        break  # Only replace first occurrence in this word
            result_text = result_text[:word_idx] + ''.join(word_chars) + result_text[word_end:]
        return result_text
    except Exception as e:
        # If anything fails, return original text (display fallback).
        import traceback
        traceback.print_exc()
        return text
def format_verse_reference(from_chapter, from_verse, to_verse):
    """
    Format a verse selection into a reference string for the phonemizer.

    Args:
        from_chapter: Chapter number or None
        from_verse: Starting verse number or None
        to_verse: Ending verse number or None

    Returns:
        Formatted reference string ("1", "1:2", or "1:2-1:5"),
        or None if no chapter is selected
    """
    if not from_chapter:
        # No chapter selected: nothing to reference.
        return None
    if not from_verse:
        # Chapter only: reference the whole chapter.
        return str(from_chapter)
    start_ref = f"{from_chapter}:{from_verse}"
    if not to_verse or to_verse == from_verse:
        # Single verse (no end verse, or end equals start).
        return start_ref
    # Range within the same chapter.
    return f"{start_ref}-{from_chapter}:{to_verse}"
def match_text_to_verse(transcribed_text: str, verse_ref: str, stops: list = None):
    """
    Match transcribed text to a verse reference using the phonemizer.

    Used for text matching in segmented mode: takes ASR output and finds
    which portion of the canonical text it matches.

    Args:
        transcribed_text: Arabic text from ASR transcription
        verse_ref: Verse reference to match against (e.g., "1:2" or "1:2-1:7")
        stops: List of stop types (default: ["compulsory_stop"])

    Returns:
        Tuple of (matched_text, phonemes, match_score, matched_ref):
        - matched_text: Canonical text portion that was matched
        - phonemes: Phoneme string for the matched portion
        - match_score: Confidence score (0-1) of the match
        - matched_ref: Specific verse reference matched (e.g., "1:2:1-1:2:4")
    """
    effective_stops = ["compulsory_stop"] if stops is None else stops
    phonemizer, _error = load_phonemizer()
    if phonemizer is None:
        # Phonemizer unavailable: empty, zero-confidence match.
        return "", "", 0.0, verse_ref
    try:
        match = phonemizer.phonemize(
            ref_text=transcribed_text,
            ref=verse_ref,
            stops=effective_stops
        )
        return (
            match.text(),
            match.phonemes_str(phoneme_sep=" ", word_sep="", verse_sep=""),
            match.match_score,
            match.ref,
        )
    except Exception as e:
        print(f"[PHONEMIZER] Text matching error: {e}")
        return "", "", 0.0, verse_ref
def get_total_words_for_verse_range(verse_ref: str) -> int:
    """
    Get the total number of words for a verse reference from surah_info.json.

    Args:
        verse_ref: Verse reference like "1:2" or "1:2-1:5"

    Returns:
        Total number of words across the verse range (0 on failure)
    """
    surah_info = load_surah_info()
    if not surah_info:
        return 0
    try:
        # Parse the reference into an inclusive (surah, verse) span.
        if '-' in verse_ref:
            # Range form: "1:2-1:5"
            start_part, end_part = verse_ref.split('-')
            start_surah, start_verse = map(int, start_part.split(':'))
            end_surah, end_verse = map(int, end_part.split(':'))
        else:
            # Single-verse form: "1:2"
            start_surah, start_verse = map(int, verse_ref.split(':'))
            end_surah, end_verse = start_surah, start_verse

        total = 0
        # Walk every surah in the span; only the first surah has a lower
        # verse bound and only the last has an upper bound, so middle
        # surahs (multi-surah ranges are rare but possible) count fully.
        for surah_num in range(start_surah, end_surah + 1):
            surah_data = surah_info.get(str(surah_num))
            if not surah_data or "verses" not in surah_data:
                continue
            lower = start_verse if surah_num == start_surah else None
            upper = end_verse if surah_num == end_surah else None
            for verse_data in surah_data["verses"]:
                verse_num = verse_data["verse"]
                if lower is not None and verse_num < lower:
                    continue
                if upper is not None and verse_num > upper:
                    continue
                total += verse_data.get("num_words", 0)
        return total
    except Exception as e:
        print(f"[PHONEMIZER] Error getting word count for {verse_ref}: {e}")
        return 0