"""
Phoneme map utilities for loading mappings from phoneme_map.yaml.

Provides a single source of truth for phoneme-to-character mappings.
"""

from functools import lru_cache
from pathlib import Path

import yaml

# Path to the phoneme map file
PHONEME_MAP_PATH = Path(__file__).parent.parent / "data" / "phoneme_map.yaml"


@lru_cache(maxsize=1)
def _load_phoneme_map() -> dict:
    """
    Load the phoneme map from the YAML file at PHONEME_MAP_PATH.

    The parsed document is cached, so the file is read at most once per
    process (until ``_load_phoneme_map.cache_clear()`` is called).

    Returns:
        dict: Parsed YAML document; an empty dict when the file is empty.
    """
    with open(PHONEME_MAP_PATH, 'r', encoding='utf-8') as f:
        # safe_load() yields None for an empty document; normalise to {} so
        # the accessors below can always call .get() on the result.
        return yaml.safe_load(f) or {}


def _section(name: str) -> dict:
    """
    Return one top-level section of the phoneme map.

    A section that is missing, or present but empty/null in the YAML, is
    returned as an empty dict so callers can iterate it unconditionally.
    """
    return _load_phoneme_map().get(name) or {}


def _strip_shaddah(text: str) -> str:
    """Return text with every shaddah mark (ّ) removed."""
    return text.replace('ّ', '')


def get_phoneme_to_char() -> dict:
    """
    Get a mapping of phonemes to Arabic characters.

    Combines consonants, geminated, heavy variants, long vowels, and tajweed
    phonemes. Short vowels map to None (they are diacritics, not characters).
    Sections are merged in that order, so a phoneme appearing in a later
    section overrides the value from an earlier one.

    Returns:
        Dict[str, Optional[str]]: phoneme -> Arabic character (or None for vowels)
    """
    mapping = {}

    # Consonants are taken as-is
    mapping.update(_section('consonants'))

    # Geminated consonants (base char without shaddah, for consistency)
    for phoneme, char_with_shaddah in _section('geminated').items():
        mapping[phoneme] = _strip_shaddah(char_with_shaddah)

    # Heavy consonants (shaddah removed if present)
    for phoneme, char in _section('heavy_consonants').items():
        mapping[phoneme] = _strip_shaddah(char)

    # Long vowels (map to their carrier letter)
    for phoneme, info in _section('long_vowels').items():
        if isinstance(info, dict):
            # Structured format: {short: "a", letter: "ا"}
            mapping[phoneme] = info.get('letter', '')
        else:
            # Legacy flat format: phoneme: char
            mapping[phoneme] = info

    # Short vowels map to None (they're diacritics)
    for phoneme in _section('short_vowels'):
        mapping[phoneme] = None

    # Tajweed phonemes; empty mappings become None
    for phoneme, char in _section('tajweed_phonemes').items():
        mapping[phoneme] = _strip_shaddah(char) if char else None

    return mapping


def get_vowel_to_diacritic() -> dict:
    """
    Get a mapping of short vowel phonemes to diacritic info.

    Short vowels whose diacritic name is not listed in the ``diacritics``
    section are omitted.

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (diacritic_name, diacritic_char)
    """
    diacritics = _section('diacritics')
    return {
        phoneme: (diacritic_name, diacritics[diacritic_name])
        for phoneme, diacritic_name in _section('short_vowels').items()
        if diacritic_name in diacritics
    }


@lru_cache(maxsize=1)
def get_short_vowel_set() -> frozenset:
    """
    Get the set of short vowel phonemes.

    Cached and returns frozenset for immutability and hashability.
    Use this instead of re-deriving short vowels from get_vowel_to_diacritic().

    Returns:
        frozenset[str]: Set of short vowel phonemes (e.g., {'a', 'u', 'i', 'aˤ'})
    """
    return frozenset(get_vowel_to_diacritic().keys())


@lru_cache(maxsize=1)
def get_long_vowel_mappings() -> dict:
    """
    Get a mapping of long vowel phonemes to their components.

    The result is cached and the same dict object is handed to every caller,
    so it must be treated as read-only.

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (short_vowel_phoneme, vowel_letter)
        Example: {'a:': ('a', 'ا'), 'u:': ('u', 'و'), 'i:': ('i', 'ي')}
    """
    mapping = {}
    for phoneme, info in _section('long_vowels').items():
        if isinstance(info, dict):
            # New structured format: {short: "a", letter: "ا"}
            short_vowel = info.get('short', phoneme.replace(':', ''))
            letter = info.get('letter', '')
        else:
            # Legacy flat format: "a:": "ا"
            short_vowel = phoneme.replace(':', '')
            letter = info
        mapping[phoneme] = (short_vowel, letter)
    return mapping


def get_diacritic_chars() -> dict:
    """
    Get a mapping of diacritic names to Unicode characters.

    Returns:
        Dict[str, str]: diacritic_name -> Unicode character. A fresh copy is
        returned so callers cannot mutate the cached phoneme map.
    """
    return dict(_section('diacritics'))


@lru_cache(maxsize=1)
def get_geminated_set() -> frozenset:
    """
    Get the set of all geminated phonemes.

    Cached for efficient repeated lookups.

    Returns:
        frozenset[str]: Set of geminated phonemes (e.g., {'bb', 'll', 'sˤsˤ', ...})
    """
    return frozenset(_section('geminated'))


def is_geminated_phoneme(phoneme: str) -> bool:
    """Check if a phoneme is geminated (doubled consonant)."""
    return phoneme in get_geminated_set()


@lru_cache(maxsize=1)
def get_consonant_set() -> frozenset:
    """
    Get the set of all single consonant phonemes.

    Returns:
        frozenset[str]: Set of consonant phonemes (e.g., {'b', 't', 'rˤ', ...})
    """
    return frozenset(_section('consonants'))


def is_consonant_phoneme(phoneme: str) -> bool:
    """Check if a phoneme is a single consonant (not geminated, not vowel)."""
    return phoneme in get_consonant_set()


@lru_cache(maxsize=1)
def get_geminated_to_base_map() -> dict:
    """
    Get a mapping from geminated phonemes to their base forms.

    Built from the geminated dict in phoneme_map.yaml. Each geminated phoneme
    is resolved through its Arabic character (shaddah removed) back to the
    consonant phoneme that uses the same character; plain consonants take
    precedence over heavy consonants for a shared character.

    The result is cached and the same dict object is handed to every caller,
    so it must be treated as read-only.

    Returns:
        Dict[str, str]: geminated -> base (e.g., {'bb': 'b', 'sˤsˤ': 'sˤ'})
    """
    # Build reverse map: Arabic char -> phoneme
    char_to_phoneme = {}
    for phoneme, char in _section('consonants').items():
        char_to_phoneme[char] = phoneme
    for phoneme, char in _section('heavy_consonants').items():
        # Heavy variants only fill characters no plain consonant claimed
        char_to_phoneme.setdefault(_strip_shaddah(char), phoneme)

    # Build geminated -> base map
    mapping = {}
    for gem_phoneme, char_with_shaddah in _section('geminated').items():
        base_char = _strip_shaddah(char_with_shaddah)
        if base_char in char_to_phoneme:
            mapping[gem_phoneme] = char_to_phoneme[base_char]
        else:
            # Fallback: split phoneme in half
            mid = len(gem_phoneme) // 2
            mapping[gem_phoneme] = gem_phoneme[:mid]
    return mapping


def get_base_phoneme(phoneme: str) -> str:
    """
    Get the base (non-geminated) form of a phoneme.

    If phoneme is geminated, returns the single consonant form.
    If phoneme is already single, returns it unchanged.

    Args:
        phoneme: Any phoneme (geminated or single)

    Returns:
        Base phoneme (e.g., 'bb' -> 'b', 'sˤsˤ' -> 'sˤ', 't' -> 't')
    """
    return get_geminated_to_base_map().get(phoneme, phoneme)


def get_tanween_mappings() -> dict:
    """
    Get a mapping of short vowel phonemes to tanween diacritic info.

    Entries whose diacritic name is not listed in the ``diacritics`` section
    are omitted.

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (diacritic_name, diacritic_char)
    """
    diacritics = _section('diacritics')
    return {
        phoneme: (diacritic_name, diacritics[diacritic_name])
        for phoneme, diacritic_name in _section('tanween').items()
        if diacritic_name in diacritics
    }


# =============================================================================
# LONG VOWEL UTILITIES
# =============================================================================

@lru_cache(maxsize=1)
def get_long_vowel_set() -> frozenset:
    """
    Get the set of long vowel phonemes.

    Returns:
        frozenset[str]: Set of long vowel phonemes (e.g., {'a:', 'u:', 'i:', 'aˤ:'})
    """
    return frozenset(get_long_vowel_mappings().keys())


def is_long_vowel(phoneme: str) -> bool:
    """Check if phoneme is a long vowel."""
    return phoneme in get_long_vowel_set()


def get_short_from_long(long_vowel: str) -> str:
    """
    Get short vowel component from long vowel.

    Args:
        long_vowel: Long vowel phoneme (e.g., 'a:', 'aˤ:')

    Returns:
        Short vowel phoneme (e.g., 'a', 'aˤ'). For a phoneme not present in
        the long vowel mappings, the input with every ':' removed.
    """
    mapping = get_long_vowel_mappings()
    if long_vowel in mapping:
        return mapping[long_vowel][0]
    return long_vowel.replace(':', '')


def normalize_fatha_variants(vowel: str) -> str:
    """
    Normalize fatha variants for equivalence checking.

    Treats aˤ as equivalent to a, and aˤ: as equivalent to a:.

    Args:
        vowel: Vowel phoneme (short or long)

    Returns:
        Normalized vowel (ˤ removed)
    """
    return vowel.replace('ˤ', '')


@lru_cache(maxsize=1)
def get_vowel_extension_chars() -> frozenset:
    """
    Get the set of vowel extension characters (mini graphemes for long vowels).

    These are small marks that sit above/below consonant letters to indicate
    a long vowel, rather than using a full alef/waw/yaa letter.

    Returns:
        frozenset of Unicode characters for vowel extensions
    """
    return frozenset(_section('vowel_extensions').values())


@lru_cache(maxsize=1)
def get_vowel_carrier_chars() -> frozenset:
    """
    Get the set of vowel carrier letter characters.

    These are full letters (alef, waw, yaa, alef maksura) that ARE the vowel
    grapheme, not just consonants hosting a vowel extension.

    Used to determine if a letter should be treated as full letter case
    vs mini extension case.

    Returns:
        frozenset of Unicode characters for vowel carrier letters
    """
    return frozenset(_section('vowel_carrier_letters').values())


@lru_cache(maxsize=1)
def get_short_vowels() -> frozenset:
    """
    Get the set of short vowel phonemes.

    Derived from short_vowels dict keys (a, u, i, aˤ).

    Returns:
        frozenset of short vowel phoneme strings
    """
    return frozenset(_section('short_vowels'))


@lru_cache(maxsize=1)
def get_ghunnah_phoneme_set() -> frozenset:
    """
    Get the set of ghunnah (nasalized) phonemes.

    These are special tajweed phonemes that represent nasalization:
    - ŋ = ikhfaa nasal (hidden noon)
    - ñ = idgham noon (merged noon with shaddah)
    - m̃ = idgham/iqlab meem (merged meem with shaddah)
    - j̃ = idgham with ya (merged noon into ya)
    - w̃ = idgham with waw (merged noon into waw)

    The set is hard-coded here; it is not read from phoneme_map.yaml.

    Returns:
        frozenset[str]: Set of ghunnah phonemes
    """
    return frozenset({'ŋ', 'ñ', 'm̃', 'j̃', 'w̃'})


# =============================================================================
# CENTRALIZED DIACRITIC AND TANWEEN MAPPINGS
# Single source of truth for all builders and renderers
# =============================================================================

# Diacritic names to Unicode characters
# Used by: canonical_builder, result_builder, word_builder, common.py
DIACRITIC_NAME_TO_CHAR = {
    "FATHA": "\u064E",     # َ
    "DAMMA": "\u064F",     # ُ
    "KASRA": "\u0650",     # ِ
    "SUKUN": "\u0652",     # ْ
    "FATHATAN": "\u064B",  # ً
    "DAMMATAN": "\u064C",  # ٌ
    "KASRATAN": "\u064D",  # ٍ
    "SHADDA": "\u0651",    # ّ
}

# Reverse mapping: char to name
DIACRITIC_CHAR_TO_NAME = {v: k for k, v in DIACRITIC_NAME_TO_CHAR.items()}

# Open tanween characters (DigitalKhatt V2 font)
OPEN_FATHATAN = '\u08F0'
OPEN_DAMMATAN = '\u08F1'
OPEN_KASRATAN = '\u08F2'

# Small meem characters for iqlab
MINI_MEEM_ABOVE = '\u06E2'  # ۢ - for fathatan/dammatan
MINI_MEEM_BELOW = '\u06ED'  # ۭ - for kasratan

# Rules that trigger open tanween rendering
# Used by: result_builder, common.py
OPEN_TANWEEN_RULES = frozenset({
    'idgham_ghunnah_tanween',
    'idgham_bila_ghunnah_tanween',
    'ikhfaa_tanween',
})

# Tanween to open tanween mapping (BY NAME)
# For use in builders that work with diacritic names
# Maps: tanween_name -> (open_char, open_name)
TANWEEN_TO_OPEN_BY_NAME = {
    'FATHATAN': (OPEN_FATHATAN, 'OPEN_FATHATAN'),
    'DAMMATAN': (OPEN_DAMMATAN, 'OPEN_DAMMATAN'),
    'KASRATAN': (OPEN_KASRATAN, 'OPEN_KASRATAN'),
}

# Tanween to open tanween mapping (BY CHAR)
# For use in renderers that work with Unicode characters
# Maps: tanween_char -> open_char
TANWEEN_TO_OPEN_BY_CHAR = {
    DIACRITIC_NAME_TO_CHAR[tanween_name]: open_char
    for tanween_name, open_char in (
        ('FATHATAN', OPEN_FATHATAN),
        ('DAMMATAN', OPEN_DAMMATAN),
        ('KASRATAN', OPEN_KASRATAN),
    )
}

# Inverse lookup: open tanween char -> standard tanween char
OPEN_TO_TANWEEN_CHAR = {
    open_char: tanween_char
    for tanween_char, open_char in TANWEEN_TO_OPEN_BY_CHAR.items()
}

# Iqlab tanween mapping, keyed by diacritic NAME (for builders)
# Shape: tanween_name -> (base_name, base_char, mini_meem)
TANWEEN_TO_IQLAB_BY_NAME = {
    tanween_name: (base_name, DIACRITIC_NAME_TO_CHAR[base_name], mini_meem)
    for tanween_name, base_name, mini_meem in (
        ('FATHATAN', 'FATHA', MINI_MEEM_ABOVE),
        ('DAMMATAN', 'DAMMA', MINI_MEEM_ABOVE),
        ('KASRATAN', 'KASRA', MINI_MEEM_BELOW),
    )
}

# Iqlab tanween mapping, keyed by Unicode CHAR (for renderers)
# Shape: tanween_char -> (base_char, mini_meem)
# Derived from the by-name table so the two cannot drift apart.
TANWEEN_TO_IQLAB_BY_CHAR = {
    DIACRITIC_NAME_TO_CHAR[tanween_name]: (base_char, mini_meem)
    for tanween_name, (_base_name, base_char, mini_meem)
    in TANWEEN_TO_IQLAB_BY_NAME.items()
}


def get_diacritic_char_by_name(name: str) -> str:
    """
    Resolve a diacritic name to its Unicode character.

    Lookup order:
      1. the hard-coded DIACRITIC_NAME_TO_CHAR table,
      2. the three open tanween names (OPEN_FATHATAN / OPEN_DAMMATAN /
         OPEN_KASRATAN), which are not part of that table,
      3. the diacritics returned by get_diacritic_chars().

    Args:
        name: Diacritic name (e.g., 'FATHA', 'SUKUN', 'FATHATAN')

    Returns:
        Unicode character, or empty string if the name is unknown everywhere
    """
    # Built per call so the current module-level OPEN_* values are used.
    open_tanween_by_name = {
        'OPEN_FATHATAN': OPEN_FATHATAN,
        'OPEN_DAMMATAN': OPEN_DAMMATAN,
        'OPEN_KASRATAN': OPEN_KASRATAN,
    }

    for table in (DIACRITIC_NAME_TO_CHAR, open_tanween_by_name):
        if name in table:
            return table[name]

    # Last resort: diacritics from get_diacritic_chars()
    return get_diacritic_chars().get(name, '')