# Tajweed-AI / utils/phoneme_map.py
# Upstream commit fcc17af ("Add ghunnah/madd durations").
"""
Phoneme map utilities for loading mappings from phoneme_map.yaml.
Provides a single source of truth for phoneme-to-character mappings.
"""
from pathlib import Path
from functools import lru_cache
import yaml
# Path to the phoneme map YAML file, resolved relative to this module:
# utils/phoneme_map.py -> project root -> data/phoneme_map.yaml.
PHONEME_MAP_PATH = Path(__file__).parent.parent / "data" / "phoneme_map.yaml"
@lru_cache(maxsize=1)
def _load_phoneme_map() -> dict:
    """Parse phoneme_map.yaml once; lru_cache memoizes the parsed dict."""
    raw_text = PHONEME_MAP_PATH.read_text(encoding='utf-8')
    return yaml.safe_load(raw_text)
def get_phoneme_to_char() -> dict:
    """
    Build the phoneme -> Arabic character mapping.

    Merges consonants, geminated consonants, heavy variants, long vowels,
    and tajweed phonemes. Short vowels are diacritics, not letters, so
    they map to None.

    Returns:
        Dict[str, Optional[str]]: phoneme -> Arabic character (or None)
    """
    data = _load_phoneme_map()
    mapping = {}
    # Plain consonants map directly to their letter.
    mapping.update(data.get('consonants', {}))
    # Geminated entries carry a shaddah in the YAML; strip it so the
    # mapping holds the bare base letter.
    for phoneme, geminated_char in data.get('geminated', {}).items():
        mapping[phoneme] = geminated_char.replace('ّ', '')
    # Heavy consonants may also carry a shaddah; strip it too.
    for phoneme, heavy_char in data.get('heavy_consonants', {}).items():
        mapping[phoneme] = heavy_char.replace('ّ', '')
    # Long vowels map to their carrier letter. Structured entries look
    # like {short: "a", letter: "ا"}; legacy entries are a bare string.
    for phoneme, entry in data.get('long_vowels', {}).items():
        mapping[phoneme] = entry.get('letter', '') if isinstance(entry, dict) else entry
    # Short vowels are diacritics, not characters.
    mapping.update(dict.fromkeys(data.get('short_vowels', {}), None))
    # Tajweed phonemes: empty YAML values mean "no character".
    for phoneme, tajweed_char in data.get('tajweed_phonemes', {}).items():
        mapping[phoneme] = tajweed_char.replace('ّ', '') if tajweed_char else None
    return mapping
def get_vowel_to_diacritic() -> dict:
    """
    Map short vowel phonemes to their diacritic info.

    Vowels whose diacritic name is missing from the diacritics table
    are silently skipped.

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (diacritic_name, diacritic_char)
    """
    data = _load_phoneme_map()
    diacritics = data.get('diacritics', {})
    return {
        phoneme: (name, diacritics[name])
        for phoneme, name in data.get('short_vowels', {}).items()
        if name in diacritics
    }
@lru_cache(maxsize=1)
def get_short_vowel_set() -> frozenset:
    """
    Return the set of short vowel phonemes.

    Cached; frozenset keeps the result immutable and hashable. Prefer
    this over re-deriving the keys from get_vowel_to_diacritic().

    Returns:
        frozenset[str]: short vowel phonemes (e.g., {'a', 'u', 'i', 'aˤ'})
    """
    # Iterating the dict yields its keys, which is all we need.
    return frozenset(get_vowel_to_diacritic())
@lru_cache(maxsize=1)
def get_long_vowel_mappings() -> dict:
    """
    Map long vowel phonemes to their (short vowel, carrier letter) parts.

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (short_vowel_phoneme, vowel_letter)
    Example:
        {'a:': ('a', 'ا'), 'u:': ('u', 'و'), 'i:': ('i', 'ي')}
    """
    data = _load_phoneme_map()
    result = {}
    for phoneme, entry in data.get('long_vowels', {}).items():
        # Default short form: the phoneme with the length mark removed.
        fallback_short = phoneme.replace(':', '')
        if isinstance(entry, dict):
            # Structured YAML entry: {short: "a", letter: "ا"}
            result[phoneme] = (entry.get('short', fallback_short), entry.get('letter', ''))
        else:
            # Legacy flat entry: "a:": "ا"
            result[phoneme] = (fallback_short, entry)
    return result
def get_diacritic_chars() -> dict:
    """
    Return the diacritic-name -> Unicode-character table from the YAML.

    Returns:
        Dict[str, str]: diacritic_name -> Unicode character
    """
    return _load_phoneme_map().get('diacritics', {})
@lru_cache(maxsize=1)
def get_geminated_set() -> frozenset:
    """
    Return the set of all geminated (doubled) phonemes.

    Cached for efficient repeated membership tests.

    Returns:
        frozenset[str]: geminated phonemes (e.g., {'bb', 'll', 'sˤsˤ', ...})
    """
    # Iterating the dict yields its keys.
    return frozenset(_load_phoneme_map().get('geminated', {}))
def is_geminated_phoneme(phoneme: str) -> bool:
    """Return True when *phoneme* is a geminated (doubled) consonant."""
    geminated = get_geminated_set()
    return phoneme in geminated
@lru_cache(maxsize=1)
def get_consonant_set() -> frozenset:
    """
    Return the set of all single consonant phonemes.

    Returns:
        frozenset[str]: consonant phonemes (e.g., {'b', 't', 'rˤ', ...})
    """
    return frozenset(_load_phoneme_map().get('consonants', {}))
def is_consonant_phoneme(phoneme: str) -> bool:
    """Return True when *phoneme* is a single consonant (not geminated, not a vowel)."""
    consonants = get_consonant_set()
    return phoneme in consonants
@lru_cache(maxsize=1)
def get_geminated_to_base_map() -> dict:
    """
    Map geminated phonemes to their single-consonant base forms.

    Each geminated entry's Arabic character (shaddah stripped) is looked
    up in the consonant and heavy-consonant tables to recover the base
    phoneme; unmatched characters fall back to halving the phoneme string.

    Returns:
        Dict[str, str]: geminated -> base (e.g., {'bb': 'b', 'sˤsˤ': 'sˤ'})
    """
    data = _load_phoneme_map()
    # Reverse index: Arabic character -> phoneme. Plain consonants take
    # precedence over heavy variants for the same character.
    char_index = {char: ph for ph, char in data.get('consonants', {}).items()}
    for heavy_phoneme, heavy_char in data.get('heavy_consonants', {}).items():
        char_index.setdefault(heavy_char.replace('ّ', ''), heavy_phoneme)
    base_map = {}
    for gem_phoneme, gem_char in data.get('geminated', {}).items():
        stripped = gem_char.replace('ّ', '')
        if stripped in char_index:
            base_map[gem_phoneme] = char_index[stripped]
        else:
            # Fallback: a geminated phoneme is its base repeated, so the
            # first half of the string is the base form.
            base_map[gem_phoneme] = gem_phoneme[:len(gem_phoneme) // 2]
    return base_map
def get_base_phoneme(phoneme: str) -> str:
    """
    Return the base (non-geminated) form of a phoneme.

    Geminated phonemes collapse to their single consonant form; anything
    else is returned unchanged.

    Args:
        phoneme: Any phoneme (geminated or single)
    Returns:
        Base phoneme (e.g., 'bb' -> 'b', 'sˤsˤ' -> 'sˤ', 't' -> 't')
    """
    return get_geminated_to_base_map().get(phoneme, phoneme)
def get_tanween_mappings() -> dict:
    """
    Map short vowel phonemes to their tanween diacritic info.

    Entries whose diacritic name is missing from the diacritics table
    are silently skipped.

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (diacritic_name, diacritic_char)
    """
    data = _load_phoneme_map()
    diacritics = data.get('diacritics', {})
    return {
        phoneme: (name, diacritics[name])
        for phoneme, name in data.get('tanween', {}).items()
        if name in diacritics
    }
# =============================================================================
# LONG VOWEL UTILITIES
# =============================================================================
@lru_cache(maxsize=1)
def get_long_vowel_set() -> frozenset:
    """
    Return the set of long vowel phonemes.

    Returns:
        frozenset[str]: long vowel phonemes (e.g., {'a:', 'u:', 'i:', 'aˤ:'})
    """
    # Iterating the mapping yields its keys.
    return frozenset(get_long_vowel_mappings())
def is_long_vowel(phoneme: str) -> bool:
    """Return True when *phoneme* is a long vowel."""
    long_vowels = get_long_vowel_set()
    return phoneme in long_vowels
def get_short_from_long(long_vowel: str) -> str:
    """
    Return the short vowel component of a long vowel.

    Unknown long vowels fall back to stripping the length mark.

    Args:
        long_vowel: Long vowel phoneme (e.g., 'a:', 'aˤ:')
    Returns:
        Short vowel phoneme (e.g., 'a', 'aˤ')
    """
    entry = get_long_vowel_mappings().get(long_vowel)
    if entry is not None:
        return entry[0]
    return long_vowel.replace(':', '')
def normalize_fatha_variants(vowel: str) -> str:
    """
    Normalize fatha variants for equivalence checks.

    Strips the pharyngealization mark so aˤ compares equal to a, and
    aˤ: compares equal to a:.

    Args:
        vowel: Vowel phoneme (short or long)
    Returns:
        Normalized vowel with the ˤ mark removed
    """
    pharyngeal_mark = 'ˤ'
    return ''.join(ch for ch in vowel if ch != pharyngeal_mark)
@lru_cache(maxsize=1)
def get_vowel_extension_chars() -> frozenset:
    """
    Return the set of vowel extension characters (mini graphemes).

    These small marks sit above/below a consonant to indicate a long
    vowel instead of a full alef/waw/yaa letter.

    Returns:
        frozenset of Unicode characters for vowel extensions
    """
    extensions = _load_phoneme_map().get('vowel_extensions', {})
    return frozenset(extensions.values())
@lru_cache(maxsize=1)
def get_vowel_carrier_chars() -> frozenset:
    """
    Return the set of vowel carrier letter characters.

    These full letters (alef, waw, yaa, alef maksura) ARE the vowel
    grapheme, as opposed to consonants hosting a vowel extension mark.
    Used to decide full-letter vs mini-extension handling.

    Returns:
        frozenset of Unicode characters for vowel carrier letters
    """
    carriers = _load_phoneme_map().get('vowel_carrier_letters', {})
    return frozenset(carriers.values())
@lru_cache(maxsize=1)
def get_short_vowels() -> frozenset:
    """
    Return the set of short vowel phonemes.

    Derived from the short_vowels dict keys (a, u, i, aˤ). Unlike
    get_short_vowel_set(), this does not require a diacritics entry.

    Returns:
        frozenset of short vowel phoneme strings
    """
    return frozenset(_load_phoneme_map().get('short_vowels', {}))
@lru_cache(maxsize=1)
def get_ghunnah_phoneme_set() -> frozenset:
    """
    Return the set of ghunnah (nasalized) tajweed phonemes.

    - ŋ  = ikhfaa nasal (hidden noon)
    - ñ  = idgham noon (merged noon with shaddah)
    - m̃  = idgham/iqlab meem (merged meem with shaddah)
    - j̃  = idgham with ya (merged noon into ya)
    - w̃  = idgham with waw (merged noon into waw)

    Returns:
        frozenset[str]: ghunnah phonemes
    """
    ghunnah_phonemes = ('ŋ', 'ñ', 'm̃', 'j̃', 'w̃')
    return frozenset(ghunnah_phonemes)
# =============================================================================
# CENTRALIZED DIACRITIC AND TANWEEN MAPPINGS
# Single source of truth for all builders and renderers
# =============================================================================
# Diacritic names to Unicode characters (standard Arabic harakat).
# Used by: canonical_builder, result_builder, word_builder, common.py
DIACRITIC_NAME_TO_CHAR = {
    "FATHA": "\u064E",      # َ
    "DAMMA": "\u064F",      # ُ
    "KASRA": "\u0650",      # ِ
    "SUKUN": "\u0652",      # ْ
    "FATHATAN": "\u064B",   # ً
    "DAMMATAN": "\u064C",   # ٌ
    "KASRATAN": "\u064D",   # ٍ
    "SHADDA": "\u0651",     # ّ
}
# Reverse mapping: char to name (values above are unique, so no collisions).
DIACRITIC_CHAR_TO_NAME = {v: k for k, v in DIACRITIC_NAME_TO_CHAR.items()}
# Open tanween characters (DigitalKhatt V2 font).
OPEN_FATHATAN = '\u08F0'
OPEN_DAMMATAN = '\u08F1'
OPEN_KASRATAN = '\u08F2'
# Small meem characters for iqlab.
MINI_MEEM_ABOVE = '\u06E2'  # ۢ - for fathatan/dammatan
MINI_MEEM_BELOW = '\u06ED'  # ۭ - for kasratan
# Rules that trigger open tanween rendering.
# Used by: result_builder, common.py
OPEN_TANWEEN_RULES = frozenset({
    'idgham_ghunnah_tanween',
    'idgham_bila_ghunnah_tanween',
    'ikhfaa_tanween',
})
# Tanween to open tanween mapping (BY NAME).
# For use in builders that work with diacritic names.
# Maps: tanween_name -> (open_char, open_name)
TANWEEN_TO_OPEN_BY_NAME = {
    'FATHATAN': (OPEN_FATHATAN, 'OPEN_FATHATAN'),
    'DAMMATAN': (OPEN_DAMMATAN, 'OPEN_DAMMATAN'),
    'KASRATAN': (OPEN_KASRATAN, 'OPEN_KASRATAN'),
}
# Tanween to open tanween mapping (BY CHAR).
# For use in renderers that work with Unicode characters.
# Maps: tanween_char -> open_char
TANWEEN_TO_OPEN_BY_CHAR = {
    DIACRITIC_NAME_TO_CHAR['FATHATAN']: OPEN_FATHATAN,
    DIACRITIC_NAME_TO_CHAR['DAMMATAN']: OPEN_DAMMATAN,
    DIACRITIC_NAME_TO_CHAR['KASRATAN']: OPEN_KASRATAN,
}
# Reverse: open tanween to standard tanween.
OPEN_TO_TANWEEN_CHAR = {v: k for k, v in TANWEEN_TO_OPEN_BY_CHAR.items()}
# Iqlab tanween mapping (BY NAME).
# For use in builders that work with diacritic names.
# Maps: tanween_name -> (base_name, base_char, mini_meem)
TANWEEN_TO_IQLAB_BY_NAME = {
    'FATHATAN': ('FATHA', DIACRITIC_NAME_TO_CHAR['FATHA'], MINI_MEEM_ABOVE),
    'DAMMATAN': ('DAMMA', DIACRITIC_NAME_TO_CHAR['DAMMA'], MINI_MEEM_ABOVE),
    'KASRATAN': ('KASRA', DIACRITIC_NAME_TO_CHAR['KASRA'], MINI_MEEM_BELOW),
}
# Iqlab tanween mapping (BY CHAR).
# For use in renderers that work with Unicode characters.
# Maps: tanween_char -> (base_char, mini_meem)
TANWEEN_TO_IQLAB_BY_CHAR = {
    DIACRITIC_NAME_TO_CHAR['FATHATAN']: (DIACRITIC_NAME_TO_CHAR['FATHA'], MINI_MEEM_ABOVE),
    DIACRITIC_NAME_TO_CHAR['DAMMATAN']: (DIACRITIC_NAME_TO_CHAR['DAMMA'], MINI_MEEM_ABOVE),
    DIACRITIC_NAME_TO_CHAR['KASRATAN']: (DIACRITIC_NAME_TO_CHAR['KASRA'], MINI_MEEM_BELOW),
}
def get_diacritic_char_by_name(name: str) -> str:
    """
    Get diacritic Unicode character by name.

    Single source of truth for diacritic name -> char lookup.
    Resolution order: the hardcoded DIACRITIC_NAME_TO_CHAR map, then the
    open tanween names, then YAML-defined diacritics.

    Args:
        name: Diacritic name (e.g., 'FATHA', 'SUKUN', 'OPEN_FATHATAN')
    Returns:
        Unicode character, or empty string if not found
    """
    # Standard harakat live in the hardcoded map.
    if name in DIACRITIC_NAME_TO_CHAR:
        return DIACRITIC_NAME_TO_CHAR[name]
    # Open tanween variants are separate module constants; the original
    # comment claimed they were in DIACRITIC_NAME_TO_CHAR, but they are not.
    open_tanween_by_name = {
        'OPEN_FATHATAN': OPEN_FATHATAN,
        'OPEN_DAMMATAN': OPEN_DAMMATAN,
        'OPEN_KASRATAN': OPEN_KASRATAN,
    }
    if name in open_tanween_by_name:
        return open_tanween_by_name[name]
    # Fall back to YAML-defined diacritics ('' when unknown).
    return get_diacritic_chars().get(name, '')