# Tajweed-AI / utils/phoneme_map.py
# Upstream commit fcc17af ("Add ghunnah/madd durations").
"""
Phoneme map utilities for loading mappings from phoneme_map.yaml.
Provides a single source of truth for phoneme-to-character mappings.
"""
from pathlib import Path
from functools import lru_cache
import yaml
# Path to the phoneme map YAML file, resolved relative to this module:
# utils/phoneme_map.py -> project root -> data/phoneme_map.yaml.
PHONEME_MAP_PATH = Path(__file__).parent.parent / "data" / "phoneme_map.yaml"
@lru_cache(maxsize=1)
def _load_phoneme_map() -> dict:
    """Parse phoneme_map.yaml once; lru_cache memoizes the parsed dict."""
    raw_text = PHONEME_MAP_PATH.read_text(encoding='utf-8')
    return yaml.safe_load(raw_text)
def get_phoneme_to_char() -> dict:
    """
    Build the phoneme -> Arabic character mapping.

    Merges consonants, geminated consonants, heavy variants, long vowels,
    and tajweed phonemes. Short vowels are diacritics, not letters, so
    they map to None.

    Returns:
        Dict[str, Optional[str]]: phoneme -> Arabic character (or None)
    """
    data = _load_phoneme_map()
    mapping = {}
    # Plain consonants map directly to their letter.
    mapping.update(data.get('consonants', {}))
    # Geminated entries carry a shaddah in the YAML; strip it so the
    # mapping holds the bare base letter.
    for phoneme, geminated_char in data.get('geminated', {}).items():
        mapping[phoneme] = geminated_char.replace('ّ', '')
    # Heavy consonants may also carry a shaddah; strip it too.
    for phoneme, heavy_char in data.get('heavy_consonants', {}).items():
        mapping[phoneme] = heavy_char.replace('ّ', '')
    # Long vowels map to their carrier letter. Structured entries look
    # like {short: "a", letter: "ا"}; legacy entries are a bare string.
    for phoneme, entry in data.get('long_vowels', {}).items():
        mapping[phoneme] = entry.get('letter', '') if isinstance(entry, dict) else entry
    # Short vowels are diacritics, not characters.
    mapping.update(dict.fromkeys(data.get('short_vowels', {}), None))
    # Tajweed phonemes: empty YAML values mean "no character".
    for phoneme, tajweed_char in data.get('tajweed_phonemes', {}).items():
        mapping[phoneme] = tajweed_char.replace('ّ', '') if tajweed_char else None
    return mapping
def get_vowel_to_diacritic() -> dict:
    """
    Map short vowel phonemes to their diacritic info.

    Vowels whose diacritic name is missing from the diacritics table
    are silently skipped.

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (diacritic_name, diacritic_char)
    """
    data = _load_phoneme_map()
    diacritics = data.get('diacritics', {})
    return {
        phoneme: (name, diacritics[name])
        for phoneme, name in data.get('short_vowels', {}).items()
        if name in diacritics
    }
@lru_cache(maxsize=1)
def get_short_vowel_set() -> frozenset:
    """
    Return the set of short vowel phonemes.

    Cached; frozenset keeps the result immutable and hashable. Prefer
    this over re-deriving the keys from get_vowel_to_diacritic().

    Returns:
        frozenset[str]: short vowel phonemes (e.g., {'a', 'u', 'i', 'aˤ'})
    """
    # Iterating the dict yields its keys, which is all we need.
    return frozenset(get_vowel_to_diacritic())
@lru_cache(maxsize=1)
def get_long_vowel_mappings() -> dict:
    """
    Map long vowel phonemes to their (short vowel, carrier letter) parts.

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (short_vowel_phoneme, vowel_letter)
    Example:
        {'a:': ('a', 'ا'), 'u:': ('u', 'و'), 'i:': ('i', 'ي')}
    """
    data = _load_phoneme_map()
    result = {}
    for phoneme, entry in data.get('long_vowels', {}).items():
        # Default short form: the phoneme with the length mark removed.
        fallback_short = phoneme.replace(':', '')
        if isinstance(entry, dict):
            # Structured YAML entry: {short: "a", letter: "ا"}
            result[phoneme] = (entry.get('short', fallback_short), entry.get('letter', ''))
        else:
            # Legacy flat entry: "a:": "ا"
            result[phoneme] = (fallback_short, entry)
    return result
def get_diacritic_chars() -> dict:
    """
    Return the diacritic-name -> Unicode-character table from the YAML.

    Returns:
        Dict[str, str]: diacritic_name -> Unicode character
    """
    return _load_phoneme_map().get('diacritics', {})
@lru_cache(maxsize=1)
def get_geminated_set() -> frozenset:
    """
    Return the set of all geminated (doubled) phonemes.

    Cached for efficient repeated membership tests.

    Returns:
        frozenset[str]: geminated phonemes (e.g., {'bb', 'll', 'sˤsˤ', ...})
    """
    # Iterating the dict yields its keys.
    return frozenset(_load_phoneme_map().get('geminated', {}))
def is_geminated_phoneme(phoneme: str) -> bool:
    """Return True when *phoneme* is a geminated (doubled) consonant."""
    geminated = get_geminated_set()
    return phoneme in geminated
@lru_cache(maxsize=1)
def get_consonant_set() -> frozenset:
    """
    Return the set of all single consonant phonemes.

    Returns:
        frozenset[str]: consonant phonemes (e.g., {'b', 't', 'rˤ', ...})
    """
    return frozenset(_load_phoneme_map().get('consonants', {}))
def is_consonant_phoneme(phoneme: str) -> bool:
    """Return True when *phoneme* is a single consonant (not geminated, not a vowel)."""
    consonants = get_consonant_set()
    return phoneme in consonants
@lru_cache(maxsize=1)
def get_geminated_to_base_map() -> dict:
    """
    Map geminated phonemes to their single-consonant base forms.

    Each geminated entry's Arabic character (shaddah stripped) is looked
    up in the consonant and heavy-consonant tables to recover the base
    phoneme; unmatched characters fall back to halving the phoneme string.

    Returns:
        Dict[str, str]: geminated -> base (e.g., {'bb': 'b', 'sˤsˤ': 'sˤ'})
    """
    data = _load_phoneme_map()
    # Reverse index: Arabic character -> phoneme. Plain consonants take
    # precedence over heavy variants for the same character.
    char_index = {char: ph for ph, char in data.get('consonants', {}).items()}
    for heavy_phoneme, heavy_char in data.get('heavy_consonants', {}).items():
        char_index.setdefault(heavy_char.replace('ّ', ''), heavy_phoneme)
    base_map = {}
    for gem_phoneme, gem_char in data.get('geminated', {}).items():
        stripped = gem_char.replace('ّ', '')
        if stripped in char_index:
            base_map[gem_phoneme] = char_index[stripped]
        else:
            # Fallback: a geminated phoneme is its base repeated, so the
            # first half of the string is the base form.
            base_map[gem_phoneme] = gem_phoneme[:len(gem_phoneme) // 2]
    return base_map
def get_base_phoneme(phoneme: str) -> str:
    """
    Return the base (non-geminated) form of a phoneme.

    Geminated phonemes collapse to their single consonant form; anything
    else is returned unchanged.

    Args:
        phoneme: Any phoneme (geminated or single)
    Returns:
        Base phoneme (e.g., 'bb' -> 'b', 'sˤsˤ' -> 'sˤ', 't' -> 't')
    """
    return get_geminated_to_base_map().get(phoneme, phoneme)
def get_tanween_mappings() -> dict:
    """
    Map short vowel phonemes to their tanween diacritic info.

    Entries whose diacritic name is missing from the diacritics table
    are silently skipped.

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (diacritic_name, diacritic_char)
    """
    data = _load_phoneme_map()
    diacritics = data.get('diacritics', {})
    return {
        phoneme: (name, diacritics[name])
        for phoneme, name in data.get('tanween', {}).items()
        if name in diacritics
    }
# =============================================================================
# LONG VOWEL UTILITIES
# =============================================================================
@lru_cache(maxsize=1)
def get_long_vowel_set() -> frozenset:
    """
    Return the set of long vowel phonemes.

    Returns:
        frozenset[str]: long vowel phonemes (e.g., {'a:', 'u:', 'i:', 'aˤ:'})
    """
    # Iterating the mapping yields its keys.
    return frozenset(get_long_vowel_mappings())
def is_long_vowel(phoneme: str) -> bool:
    """Return True when *phoneme* is a long vowel."""
    long_vowels = get_long_vowel_set()
    return phoneme in long_vowels
def get_short_from_long(long_vowel: str) -> str:
    """
    Return the short vowel component of a long vowel.

    Unknown long vowels fall back to stripping the length mark.

    Args:
        long_vowel: Long vowel phoneme (e.g., 'a:', 'aˤ:')
    Returns:
        Short vowel phoneme (e.g., 'a', 'aˤ')
    """
    entry = get_long_vowel_mappings().get(long_vowel)
    if entry is not None:
        return entry[0]
    return long_vowel.replace(':', '')
def normalize_fatha_variants(vowel: str) -> str:
    """
    Normalize fatha variants for equivalence checks.

    Strips the pharyngealization mark so aˤ compares equal to a, and
    aˤ: compares equal to a:.

    Args:
        vowel: Vowel phoneme (short or long)
    Returns:
        Normalized vowel with the ˤ mark removed
    """
    pharyngeal_mark = 'ˤ'
    return ''.join(ch for ch in vowel if ch != pharyngeal_mark)
@lru_cache(maxsize=1)
def get_vowel_extension_chars() -> frozenset:
    """
    Return the set of vowel extension characters (mini graphemes).

    These small marks sit above/below a consonant to indicate a long
    vowel instead of a full alef/waw/yaa letter.

    Returns:
        frozenset of Unicode characters for vowel extensions
    """
    extensions = _load_phoneme_map().get('vowel_extensions', {})
    return frozenset(extensions.values())
@lru_cache(maxsize=1)
def get_vowel_carrier_chars() -> frozenset:
    """
    Return the set of vowel carrier letter characters.

    These full letters (alef, waw, yaa, alef maksura) ARE the vowel
    grapheme, as opposed to consonants hosting a vowel extension mark.
    Used to decide full-letter vs mini-extension handling.

    Returns:
        frozenset of Unicode characters for vowel carrier letters
    """
    carriers = _load_phoneme_map().get('vowel_carrier_letters', {})
    return frozenset(carriers.values())
@lru_cache(maxsize=1)
def get_short_vowels() -> frozenset:
    """
    Return the set of short vowel phonemes.

    Derived from the short_vowels dict keys (a, u, i, aˤ). Unlike
    get_short_vowel_set(), this does not require a diacritics entry.

    Returns:
        frozenset of short vowel phoneme strings
    """
    return frozenset(_load_phoneme_map().get('short_vowels', {}))
@lru_cache(maxsize=1)
def get_ghunnah_phoneme_set() -> frozenset:
    """
    Return the set of ghunnah (nasalized) tajweed phonemes.

    - ŋ  = ikhfaa nasal (hidden noon)
    - ñ  = idgham noon (merged noon with shaddah)
    - m̃  = idgham/iqlab meem (merged meem with shaddah)
    - j̃  = idgham with ya (merged noon into ya)
    - w̃  = idgham with waw (merged noon into waw)

    Returns:
        frozenset[str]: ghunnah phonemes
    """
    ghunnah_phonemes = ('ŋ', 'ñ', 'm̃', 'j̃', 'w̃')
    return frozenset(ghunnah_phonemes)
# =============================================================================
# CENTRALIZED DIACRITIC AND TANWEEN MAPPINGS
# Single source of truth for all builders and renderers
# =============================================================================
# Diacritic names to Unicode characters (standard Arabic harakat).
# Used by: canonical_builder, result_builder, word_builder, common.py
DIACRITIC_NAME_TO_CHAR = {
    "FATHA": "\u064E",      # َ
    "DAMMA": "\u064F",      # ُ
    "KASRA": "\u0650",      # ِ
    "SUKUN": "\u0652",      # ْ
    "FATHATAN": "\u064B",   # ً
    "DAMMATAN": "\u064C",   # ٌ
    "KASRATAN": "\u064D",   # ٍ
    "SHADDA": "\u0651",     # ّ
}
# Reverse mapping: char to name (values above are unique, so no collisions).
DIACRITIC_CHAR_TO_NAME = {v: k for k, v in DIACRITIC_NAME_TO_CHAR.items()}
# Open tanween characters (DigitalKhatt V2 font).
OPEN_FATHATAN = '\u08F0'
OPEN_DAMMATAN = '\u08F1'
OPEN_KASRATAN = '\u08F2'
# Small meem characters for iqlab.
MINI_MEEM_ABOVE = '\u06E2'  # ۢ - for fathatan/dammatan
MINI_MEEM_BELOW = '\u06ED'  # ۭ - for kasratan
# Rules that trigger open tanween rendering.
# Used by: result_builder, common.py
OPEN_TANWEEN_RULES = frozenset({
    'idgham_ghunnah_tanween',
    'idgham_bila_ghunnah_tanween',
    'ikhfaa_tanween',
})
# Tanween to open tanween mapping (BY NAME).
# For use in builders that work with diacritic names.
# Maps: tanween_name -> (open_char, open_name)
TANWEEN_TO_OPEN_BY_NAME = {
    'FATHATAN': (OPEN_FATHATAN, 'OPEN_FATHATAN'),
    'DAMMATAN': (OPEN_DAMMATAN, 'OPEN_DAMMATAN'),
    'KASRATAN': (OPEN_KASRATAN, 'OPEN_KASRATAN'),
}
# Tanween to open tanween mapping (BY CHAR).
# For use in renderers that work with Unicode characters.
# Maps: tanween_char -> open_char
TANWEEN_TO_OPEN_BY_CHAR = {
    DIACRITIC_NAME_TO_CHAR['FATHATAN']: OPEN_FATHATAN,
    DIACRITIC_NAME_TO_CHAR['DAMMATAN']: OPEN_DAMMATAN,
    DIACRITIC_NAME_TO_CHAR['KASRATAN']: OPEN_KASRATAN,
}
# Reverse: open tanween to standard tanween.
OPEN_TO_TANWEEN_CHAR = {v: k for k, v in TANWEEN_TO_OPEN_BY_CHAR.items()}
# Iqlab tanween mapping (BY NAME).
# For use in builders that work with diacritic names.
# Maps: tanween_name -> (base_name, base_char, mini_meem)
TANWEEN_TO_IQLAB_BY_NAME = {
    'FATHATAN': ('FATHA', DIACRITIC_NAME_TO_CHAR['FATHA'], MINI_MEEM_ABOVE),
    'DAMMATAN': ('DAMMA', DIACRITIC_NAME_TO_CHAR['DAMMA'], MINI_MEEM_ABOVE),
    'KASRATAN': ('KASRA', DIACRITIC_NAME_TO_CHAR['KASRA'], MINI_MEEM_BELOW),
}
# Iqlab tanween mapping (BY CHAR).
# For use in renderers that work with Unicode characters.
# Maps: tanween_char -> (base_char, mini_meem)
TANWEEN_TO_IQLAB_BY_CHAR = {
    DIACRITIC_NAME_TO_CHAR['FATHATAN']: (DIACRITIC_NAME_TO_CHAR['FATHA'], MINI_MEEM_ABOVE),
    DIACRITIC_NAME_TO_CHAR['DAMMATAN']: (DIACRITIC_NAME_TO_CHAR['DAMMA'], MINI_MEEM_ABOVE),
    DIACRITIC_NAME_TO_CHAR['KASRATAN']: (DIACRITIC_NAME_TO_CHAR['KASRA'], MINI_MEEM_BELOW),
}
def get_diacritic_char_by_name(name: str) -> str:
    """
    Get diacritic Unicode character by name.

    Single source of truth for diacritic name -> char lookup.
    Resolution order: the hardcoded DIACRITIC_NAME_TO_CHAR map, then the
    open tanween names, then YAML-defined diacritics.

    Args:
        name: Diacritic name (e.g., 'FATHA', 'SUKUN', 'OPEN_FATHATAN')
    Returns:
        Unicode character, or empty string if not found
    """
    # Standard harakat live in the hardcoded map.
    if name in DIACRITIC_NAME_TO_CHAR:
        return DIACRITIC_NAME_TO_CHAR[name]
    # Open tanween variants are separate module constants; the original
    # comment claimed they were in DIACRITIC_NAME_TO_CHAR, but they are not.
    open_tanween_by_name = {
        'OPEN_FATHATAN': OPEN_FATHATAN,
        'OPEN_DAMMATAN': OPEN_DAMMATAN,
        'OPEN_KASRATAN': OPEN_KASRATAN,
    }
    if name in open_tanween_by_name:
        return open_tanween_by_name[name]
    # Fall back to YAML-defined diacritics ('' when unknown).
    return get_diacritic_chars().get(name, '')