Tajweed-AI / utils/phonemizer_utils.py
hetchyy's picture
Add ghunnah/madd durations
c6974e2
"""
Phonemization logic and verse metadata loading.
"""
import os
import sys
import json
import yaml
from pathlib import Path
# Get config after ensuring sys.path is set
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SURAH_INFO_PATH, DEFAULT_STOPS, PHONEMIZER_RESULT_CACHE_SIZE
# Phonemizer resources path (for HF Spaces compatibility)
# Resources are now inside the core package: phonemizer/core/resources/
PHONEMIZER_RESOURCES_DIR = Path(__file__).parent.parent.parent / "phonemizer" / "core" / "resources"
RECITATION_DATA_DIR = Path(__file__).parent.parent / "data"
# Module-level cache
_phonemizer_cache = {
"phonemizer": None,
"surah_info": None,
"loaded": False,
"error": None
}
# LRU cache for phonemizer results (verse_ref -> result)
from functools import lru_cache
# Simple cache for recent phonemizer results
_result_cache = {}
def get_cached_phonemizer_result(verse_ref: str, stops: list = None):
    """
    Get a cached phonemizer result or compute and cache it.

    The cache is a true LRU: a hit re-inserts the entry so it becomes the
    most recently used (Python dicts preserve insertion order, so the first
    key is always the least recently used). Previously hits did not refresh
    recency, which made eviction FIFO despite the "LRU" intent.

    Args:
        verse_ref: Verse reference (e.g., "1:1", "2:255")
        stops: List of stops (default: ["compulsory_stop"])

    Returns:
        PhonemizeResult, or None if the phonemizer is unavailable or fails
    """
    if stops is None:
        stops = ["compulsory_stop"]
    # Cache key from verse_ref and sorted stops, so stop ORDER does not
    # create duplicate cache entries.
    stops_key = ",".join(sorted(stops))
    cache_key = f"{verse_ref}:{stops_key}"
    if cache_key in _result_cache:
        # Pop and re-insert on hit so eviction order is LRU, not FIFO.
        result = _result_cache.pop(cache_key)
        _result_cache[cache_key] = result
        return result
    phonemizer, error = load_phonemizer()
    if not phonemizer:
        return None
    try:
        result = phonemizer.phonemize(ref=verse_ref, stops=stops)
        # Evict the least-recently-used entry (first key) if cache is full.
        if len(_result_cache) >= PHONEMIZER_RESULT_CACHE_SIZE:
            oldest_key = next(iter(_result_cache))
            del _result_cache[oldest_key]
        _result_cache[cache_key] = result
        return result
    except Exception as e:
        print(f"[PHONEMIZER] Error phonemizing {verse_ref}: {e}")
        return None
def clear_phonemizer_result_cache():
    """Clear the phonemizer result cache (cache key -> PhonemizeResult).

    Called when a setting changes (e.g. the ikhfaa shafawi phoneme) so the
    next phonemization recomputes with the new configuration.
    """
    _result_cache.clear()
def set_ikhfaa_shafawi_phoneme_setting(value: str):
    """
    Update the ikhfaa shafawi phoneme in the phonemizer registry and
    invalidate cached results.

    Called when the user changes the "Iqlab/Ikhfaa Shafawi Sound" radio button.

    Args:
        value: "meem ghunnah" or "ikhfaa"
    """
    from core.phoneme_registry import set_phoneme_override, clear_phoneme_overrides

    # Start from a clean slate, then apply the overrides for the selection.
    clear_phoneme_overrides()

    # "meem ghunnah" overrides both iqlab and ikhfaa shafawi to m̃.
    # Any other value ("ikhfaa") must still override iqlab explicitly,
    # because the YAML default for iqlab is m̃ (meem ghunnah) while ikhfaa
    # shafawi already defaults to ŋ.
    overrides = {
        "meem ghunnah": (
            ("iqlab", "phoneme", "m̃"),
            ("ikhfaa", "shafawi_phoneme", "m̃"),
        ),
    }.get(value, (("iqlab", "phoneme", "ŋ"),))
    for rule_name, key, phoneme in overrides:
        set_phoneme_override(rule_name, key, phoneme)

    # Drop cached results so the next phonemization uses the new setting.
    clear_phonemizer_result_cache()
def load_phonemizer():
    """
    Load the Phonemizer instance with caching.

    Both success and failure are memoized in _phonemizer_cache ("loaded"
    is set either way), so the expensive import/construction is attempted
    at most once per process.

    Returns:
        Tuple of (Phonemizer instance or None, error message or None)
    """
    if _phonemizer_cache["loaded"]:
        return _phonemizer_cache["phonemizer"], _phonemizer_cache["error"]

    def _fail(error_msg):
        # Record the failure so subsequent calls return it without retrying,
        # and print the traceback for diagnostics.
        import traceback
        _phonemizer_cache["error"] = error_msg
        _phonemizer_cache["loaded"] = True
        print(f"✗ {error_msg}")
        print(traceback.format_exc())
        return None, error_msg

    try:
        # Import from the installed phonemizer package
        from core.phonemizer import Phonemizer
    except ImportError as e:
        return _fail(
            f"Failed to import phonemizer. Make sure quranic-phonemizer is installed: {str(e)}"
        )
    try:
        # Use the bundled resources only when all three files are present;
        # otherwise fall back to the package defaults.
        db_path = PHONEMIZER_RESOURCES_DIR / "Quran.json"
        map_path = PHONEMIZER_RESOURCES_DIR / "base_phonemes.yaml"
        special_words_path = PHONEMIZER_RESOURCES_DIR / "special_words.yaml"
        if all(p.exists() for p in (db_path, map_path, special_words_path)):
            phonemizer = Phonemizer(
                db_path=db_path,
                map_path=map_path,
                special_words_path=special_words_path,
            )
        else:
            phonemizer = Phonemizer()
        _phonemizer_cache["phonemizer"] = phonemizer
        _phonemizer_cache["loaded"] = True
        _phonemizer_cache["error"] = None
        print("✓ Phonemizer loaded successfully")
        return phonemizer, None
    except Exception as e:
        return _fail(f"Failed to load phonemizer: {str(e)}")
def load_surah_info():
    """
    Load surah information from the JSON file, caching the parsed result.

    Returns:
        Dictionary with surah information, or None if loading fails
    """
    cached = _phonemizer_cache["surah_info"]
    if cached is not None:
        return cached
    try:
        with open(SURAH_INFO_PATH, 'r', encoding='utf-8') as f:
            surah_info = json.load(f)
    except Exception as e:
        # Best-effort loader: log and signal failure with None.
        print(f"✗ Failed to load surah info: {str(e)}")
        return None
    _phonemizer_cache["surah_info"] = surah_info
    print(f"✓ Loaded surah info with {len(surah_info)} chapters")
    return surah_info
def get_chapter_list():
    """
    Get the list of chapters with their English and Arabic names.

    Returns:
        List of tuples (chapter_number, chapter_name_en, chapter_name_ar),
        sorted by chapter number; empty list if surah info is unavailable
    """
    surah_info = load_surah_info()
    if not surah_info:
        return []
    # Keys in surah_info are string chapter numbers; convert for sorting.
    chapters = [
        (
            int(num_str),
            data.get("name_en", f"Chapter {int(num_str)}"),
            data.get("name_ar", ""),
        )
        for num_str, data in surah_info.items()
    ]
    chapters.sort(key=lambda entry: entry[0])
    return chapters
def get_verses_for_chapter(chapter_num):
    """
    Get the sorted list of verse numbers for a given chapter.

    Args:
        chapter_num: Chapter number (int)

    Returns:
        List of verse numbers; empty list if the chapter or surah info
        is unavailable
    """
    surah_info = load_surah_info()
    if not surah_info:
        return []
    # surah_info is keyed by string chapter numbers.
    chapter_data = surah_info.get(str(chapter_num))
    if not chapter_data:
        return []
    return sorted(entry["verse"] for entry in chapter_data.get("verses", []))
def phonemize_verse(verse_ref, stops=None):
    """
    Phonemize a verse reference and return its text and phonemes.

    Args:
        verse_ref: Verse reference (e.g., "1:1", "2:255", "1:1-1:7")
        stops: List of stop types to include

    Returns:
        Tuple of (arabic_text_html, arabic_text_clean, phonemes_string,
        success, error_message)
    """
    try:
        # Normalize stops: use defaults when absent, and always make sure
        # compulsory_stop is included.
        if stops is None:
            effective_stops = DEFAULT_STOPS.copy()
        elif "compulsory_stop" not in stops:
            effective_stops = stops + ["compulsory_stop"]
        else:
            effective_stops = stops

        result = get_cached_phonemizer_result(verse_ref, effective_stops)
        if result is None:
            return "", "", "", False, "Failed to phonemize verse"

        phonemes_str = result.phonemes_str(phoneme_sep=" ", word_sep="")

        # Build the canonical structure using frozen models, ensuring that
        # extensions (dagger alef) appear before other symbols (tatweel).
        from recitation_analysis.result_builder import get_result_builder

        # Main verse display applies tanween transforms only, NOT word
        # transforms (stopping/starting/Allah); those are applied in the
        # Error Analysis, Ghunnah Analysis, and Madd Analysis tabs.
        recitation_result = get_result_builder().build_from_phonemizer_result(
            result, verse_ref, apply_word_transforms=False
        )

        words = recitation_result.canonical_words
        # Render twice: an HTML version for display (with verse markers)
        # and a clean version for processing (no markers, just words).
        arabic_text_html = _render_verse_text(words, clean_text=False)
        arabic_text_clean = _render_verse_text(words, clean_text=True)
        return arabic_text_html, arabic_text_clean, phonemes_str, True, None
    except Exception as e:
        return "", "", "", False, f"Phonemization error: {str(e)}"
def _render_verse_text(words, clean_text=False) -> str:
    """
    Render a tuple of WordData as plain text with correct symbol ordering.

    This ensures proper ordering of extensions and other symbols, and applies
    tanween substitutions for idgham/ikhfaa/iqlab contexts.

    Args:
        words: Tuple of WordData to render
        clean_text: If True, omit verse markers

    Returns:
        Arabic text string with correct symbol ordering (space-joined words,
        with a trailing space)
    """
    from recitation_analysis.text_display.rendering import (
        substitute_open_tanween,
        substitute_iqlab_tanween,
    )
    from recitation_analysis.ui.verse_markers import format_verse_marker
    from recitation_analysis.text_display.special_word_builder import get_display_swap
    # Shaddah character (the frozen model stores shaddah as a bool, not an object)
    SHADDAH_CHAR = '\u0651'  # ّ
    word_texts = []
    prev_verse_num = None
    for word in words:
        parts = []
        # Leading symbols (rub el hizb, etc.) - now strings directly
        for sym in word.leading_symbols:
            parts.append(sym)
        if word.leading_symbols:
            parts.append(' ')
        # Special words (e.g., الم، يس، حم) have no letters - use text directly
        if not word.letters and word.text:
            parts.append(word.text)
        # Render each letter with proper symbol ordering:
        # base char -> shaddah -> diacritic -> extensions -> other symbols
        # -> iqlab small meem.
        for letter in word.letters:
            # Base letter (apply display swap for special words)
            letter_char = get_display_swap(word.location, letter.char) or letter.char
            parts.append(letter_char)
            # Check for iqlab tanween substitution
            # In frozen model: diacritic_char is the character, diacritic is the name
            diac_char = letter.diacritic_char
            iqlab_base_diac, iqlab_meem = substitute_iqlab_tanween(diac_char, letter.letter_rules)
            is_iqlab_tanween = iqlab_meem is not None
            # Check for iqlab noon (noon sakinah before baa)
            # In frozen model: diacritic is the name string (or None)
            is_iqlab_noon = (
                letter.letter_rules and
                'iqlab_noon' in letter.letter_rules and
                (letter.diacritic is None or letter.diacritic == 'SUKUN')
            )
            # Diacritic/Shaddah (proper stacking order: shaddah first)
            # In frozen model: shaddah is bool, not object
            if letter.shaddah:
                parts.append(SHADDAH_CHAR)
            if letter.diacritic_char:
                if is_iqlab_tanween:
                    # Iqlab tanween: use base diacritic instead of tanween
                    parts.append(iqlab_base_diac)
                else:
                    # Apply open tanween substitution for idgham/ikhfaa
                    diac = substitute_open_tanween(letter.diacritic_char, letter.letter_rules)
                    parts.append(diac)
            # Extensions (dagger alef, maddah, etc.) - now strings directly
            for ext in letter.extensions:
                # Apply display swap for special words
                ext_char = get_display_swap(word.location, ext) or ext
                parts.append(ext_char)
            # Other symbols (tatweel, etc.) - now strings directly
            for sym in letter.other_symbols:
                parts.append(sym)
            # Iqlab small meem (placed after other symbols)
            if is_iqlab_tanween and iqlab_meem:
                parts.append(iqlab_meem)
            elif is_iqlab_noon:
                # For iqlab noon: add mini meem above after the noon
                parts.append('\u06E2')  # MINI_MEEM_ABOVE
        # Trailing symbols (stop signs) - now strings directly
        for sym in word.trailing_symbols:
            parts.append(sym)
        word_text = ''.join(parts)
        # Extract verse number from word location (format: "surah:verse:word")
        location_parts = word.location.split(':')
        if len(location_parts) >= 2:
            current_verse_num = int(location_parts[1])
        else:
            current_verse_num = None
        # Add verse marker when verse changes (for verse ranges)
        if not clean_text and prev_verse_num is not None and current_verse_num != prev_verse_num:
            word_texts.append(format_verse_marker(prev_verse_num))
        word_texts.append(word_text)
        prev_verse_num = current_verse_num
    # Add final verse marker
    if not clean_text and prev_verse_num is not None:
        word_texts.append(format_verse_marker(prev_verse_num))
    return ' '.join(word_texts) + ' '
def _apply_open_tanween_to_text(result, text: str) -> str:
    """
    Apply open tanween substitution to Arabic text based on phonemizer rules.

    Substitutes standard tanween characters with open tanween for letters
    that have idgham/ikhfaa/iqlab rules.

    Args:
        result: PhonemizeResult from phonemizer
        text: Arabic text from result.text()

    Returns:
        Text with open tanween substituted where appropriate; the original
        text is returned unchanged if anything fails.
    """
    import re
    try:
        from utils.phoneme_map import OPEN_TANWEEN_RULES
        # Map diacritic names to (char, open_char) pairs for idgham/ikhfaa
        TANWEEN_NAME_TO_OPEN = {
            'FATHATAN': ('\u064B', '\u08F0'),  # FATHATAN -> open fathatan (DigitalKhatt V2)
            'DAMMATAN': ('\u064C', '\u08F1'),  # DAMMATAN -> open dammatan (DigitalKhatt V2)
            'KASRATAN': ('\u064D', '\u08F2'),  # KASRATAN -> open kasratan (DigitalKhatt V2)
        }
        # Map diacritic names to (normal_char, base_diac, small_meem) for iqlab
        TANWEEN_NAME_TO_IQLAB = {
            'FATHATAN': ('\u064B', '\u064E', '\u06E2'),  # -> FATHA + mini meem above
            'DAMMATAN': ('\u064C', '\u064F', '\u06E2'),  # -> DAMMA + mini meem above
            'KASRATAN': ('\u064D', '\u0650', '\u06ED'),  # -> KASRA + mini meem below
        }
        # Pattern for stripping rule tags (e.g. "<idgham>") from word text.
        # Fix: previously `import re` ran inside the per-word loop; the
        # pattern is now compiled once and reused.
        tag_pattern = re.compile(r'<[^>]*>')
        # Get mapping to check letter rules
        mapping = result.get_mapping()
        # Build list of (word_text, substitutions) to apply.
        # Each substitution is (old_char, new_chars) for that word.
        word_substitutions = []
        for word in mapping.words:
            subs_for_word = []
            for letter in word.letter_mappings:
                if not letter.letter_rules:
                    continue
                diac_name = letter.diacritic
                if not diac_name or diac_name not in TANWEEN_NAME_TO_OPEN:
                    continue
                # Check for iqlab first (special treatment)
                if 'iqlab_tanween' in letter.letter_rules:
                    normal_char, base_diac, small_meem = TANWEEN_NAME_TO_IQLAB[diac_name]
                    subs_for_word.append((normal_char, base_diac + small_meem))
                    continue
                # Check for open tanween rules (idgham/ikhfaa)
                has_open_rule = any(rule in OPEN_TANWEEN_RULES for rule in letter.letter_rules)
                if has_open_rule:
                    normal_char, open_char = TANWEEN_NAME_TO_OPEN[diac_name]
                    subs_for_word.append((normal_char, open_char))
            if subs_for_word:
                word_substitutions.append((word.text, subs_for_word))
        # Apply substitutions word by word in the text
        result_text = text
        for word_text, subs in word_substitutions:
            # Strip rule tags from the word text so it matches the raw text.
            clean_word = tag_pattern.sub('', word_text)
            # Find the word position in the remaining text
            word_idx = result_text.find(clean_word)
            if word_idx == -1:
                # Word not found verbatim; skip rather than risk a wrong edit.
                continue
            # Extract the word, apply substitutions, replace in place
            word_end = word_idx + len(clean_word)
            word_chars = list(result_text[word_idx:word_end])
            for old_char, new_chars in subs:
                for i, c in enumerate(word_chars):
                    if c == old_char:
                        word_chars[i] = new_chars
                        break  # Only replace first occurrence in this word
            result_text = result_text[:word_idx] + ''.join(word_chars) + result_text[word_end:]
        return result_text
    except Exception as e:
        # If anything fails, return original text (display fallback).
        import traceback
        traceback.print_exc()
        return text
def format_verse_reference(from_chapter, from_verse, to_verse):
    """
    Format a verse selection into a reference string for the phonemizer.

    Args:
        from_chapter: Chapter number or None
        from_verse: Starting verse number or None
        to_verse: Ending verse number or None

    Returns:
        Formatted reference string ("1", "1:2", or "1:2-1:5"),
        or None if no chapter is selected
    """
    if not from_chapter:
        # No chapter selected: nothing to reference.
        return None
    if not from_verse:
        # Chapter only: reference the whole chapter.
        return str(from_chapter)
    start_ref = f"{from_chapter}:{from_verse}"
    if not to_verse or to_verse == from_verse:
        # Single verse (no end verse, or end equals start).
        return start_ref
    # Range within the same chapter.
    return f"{start_ref}-{from_chapter}:{to_verse}"
def match_text_to_verse(transcribed_text: str, verse_ref: str, stops: list = None):
    """
    Match transcribed text to a verse reference using the phonemizer.

    Used for text matching in segmented mode: takes ASR output and finds
    which portion of the canonical text it matches.

    Args:
        transcribed_text: Arabic text from ASR transcription
        verse_ref: Verse reference to match against (e.g., "1:2" or "1:2-1:7")
        stops: List of stop types (default: ["compulsory_stop"])

    Returns:
        Tuple of (matched_text, phonemes, match_score, matched_ref):
        - matched_text: Canonical text portion that was matched
        - phonemes: Phoneme string for the matched portion
        - match_score: Confidence score (0-1) of the match
        - matched_ref: Specific verse reference matched (e.g., "1:2:1-1:2:4")
    """
    effective_stops = ["compulsory_stop"] if stops is None else stops
    phonemizer, _error = load_phonemizer()
    if phonemizer is None:
        # Phonemizer unavailable: empty, zero-confidence match.
        return "", "", 0.0, verse_ref
    try:
        match = phonemizer.phonemize(
            ref_text=transcribed_text,
            ref=verse_ref,
            stops=effective_stops
        )
        return (
            match.text(),
            match.phonemes_str(phoneme_sep=" ", word_sep="", verse_sep=""),
            match.match_score,
            match.ref,
        )
    except Exception as e:
        print(f"[PHONEMIZER] Text matching error: {e}")
        return "", "", 0.0, verse_ref
def get_total_words_for_verse_range(verse_ref: str) -> int:
    """
    Get the total number of words for a verse reference from surah_info.json.

    Args:
        verse_ref: Verse reference like "1:2" or "1:2-1:5"

    Returns:
        Total number of words across the verse range (0 on failure)
    """
    surah_info = load_surah_info()
    if not surah_info:
        return 0
    try:
        # Parse the reference into an inclusive (surah, verse) span.
        if '-' in verse_ref:
            # Range form: "1:2-1:5"
            start_part, end_part = verse_ref.split('-')
            start_surah, start_verse = map(int, start_part.split(':'))
            end_surah, end_verse = map(int, end_part.split(':'))
        else:
            # Single-verse form: "1:2"
            start_surah, start_verse = map(int, verse_ref.split(':'))
            end_surah, end_verse = start_surah, start_verse

        total = 0
        # Walk every surah in the span; only the first surah has a lower
        # verse bound and only the last has an upper bound, so middle
        # surahs (multi-surah ranges are rare but possible) count fully.
        for surah_num in range(start_surah, end_surah + 1):
            surah_data = surah_info.get(str(surah_num))
            if not surah_data or "verses" not in surah_data:
                continue
            lower = start_verse if surah_num == start_surah else None
            upper = end_verse if surah_num == end_surah else None
            for verse_data in surah_data["verses"]:
                verse_num = verse_data["verse"]
                if lower is not None and verse_num < lower:
                    continue
                if upper is not None and verse_num > upper:
                    continue
                total += verse_data.get("num_words", 0)
        return total
    except Exception as e:
        print(f"[PHONEMIZER] Error getting word count for {verse_ref}: {e}")
        return 0