Spaces:
Running
on
Zero
Running
on
Zero
| """ | |
| Phoneme map utilities for loading mappings from phoneme_map.yaml. | |
| Provides a single source of truth for phoneme-to-character mappings. | |
| """ | |
| from pathlib import Path | |
| from functools import lru_cache | |
| import yaml | |
# Location of the phoneme map: <directory two levels above this module>/data/phoneme_map.yaml
PHONEME_MAP_PATH = Path(__file__).parent.parent.joinpath("data", "phoneme_map.yaml")
@lru_cache(maxsize=1)
def _load_phoneme_map() -> dict:
    """
    Load and parse the phoneme map YAML file at PHONEME_MAP_PATH.

    The parsed result is cached, so the file is opened and parsed only on the
    first call; later calls return the very same object. Callers must treat
    the returned structure as read-only, because any mutation would be seen
    by every subsequent caller. Use ``_load_phoneme_map.cache_clear()`` to
    force a reload.

    Returns:
        dict: Parsed YAML content, as produced by ``yaml.safe_load``.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open(PHONEME_MAP_PATH, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
def get_phoneme_to_char() -> dict:
    """
    Build the phoneme -> Arabic character lookup from the phoneme map.

    Sections are merged in a fixed order (consonants, geminated, heavy
    consonants, long vowels, short vowels, tajweed phonemes), so a phoneme
    listed in more than one section ends up with the value from the last one.

    Returns:
        Dict[str, Optional[str]]: phoneme -> Arabic character. Short vowels
        (they are diacritics, not characters) and tajweed phonemes with an
        empty mapping map to None.
    """
    shaddah = 'ّ'
    phoneme_map = _load_phoneme_map()
    result = {}

    # Plain consonants are copied over unchanged.
    result.update(phoneme_map.get('consonants', {}).items())

    # Geminated and heavy consonants are stored without the shaddah mark so
    # that every entry holds a bare base character.
    for section in ('geminated', 'heavy_consonants'):
        for phoneme, char in phoneme_map.get(section, {}).items():
            result[phoneme] = char.replace(shaddah, '')

    # Long vowels map to their carrier letter. Entries are either structured
    # ({short: ..., letter: ...}) or the legacy flat "phoneme: char" form.
    for phoneme, entry in phoneme_map.get('long_vowels', {}).items():
        if isinstance(entry, dict):
            result[phoneme] = entry.get('letter', '')
        else:
            result[phoneme] = entry

    # Short vowels have no character of their own.
    result.update(dict.fromkeys(phoneme_map.get('short_vowels', {}).keys()))

    # Tajweed phonemes: empty mappings become None, others lose any shaddah.
    for phoneme, char in phoneme_map.get('tajweed_phonemes', {}).items():
        result[phoneme] = char.replace(shaddah, '') if char else None

    return result
def get_vowel_to_diacritic() -> dict:
    """
    Map each short vowel phoneme to its diacritic name and character.

    Short vowels whose diacritic name is not defined in the 'diacritics'
    section of the phoneme map are left out.

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (diacritic_name, diacritic_char)
    """
    phoneme_map = _load_phoneme_map()
    chars_by_name = phoneme_map.get('diacritics', {})
    return {
        phoneme: (name, chars_by_name[name])
        for phoneme, name in phoneme_map.get('short_vowels', {}).items()
        if name in chars_by_name
    }
@lru_cache(maxsize=1)
def get_short_vowel_set() -> frozenset:
    """
    Get the set of short vowel phonemes.

    The set is derived from the keys of get_vowel_to_diacritic(), so it only
    contains short vowels whose diacritic name is defined in the phoneme map.
    The result is computed once and cached; it is a frozenset, so sharing the
    cached object between callers is safe.

    Use this instead of re-deriving short vowels from get_vowel_to_diacritic().

    Returns:
        frozenset[str]: Set of short vowel phonemes (e.g., {'a', 'u', 'i', 'aˤ'})
    """
    return frozenset(get_vowel_to_diacritic().keys())
def get_long_vowel_mappings() -> dict:
    """
    Break every long vowel phoneme down into its short vowel and letter.

    Two YAML shapes are accepted for each entry:
      - structured: {short: "a", letter: "ا"}; a missing 'short' falls back to
        the phoneme with ':' removed, a missing 'letter' to an empty string
      - legacy flat: "a:": "ا"; the short vowel is the phoneme with ':' removed

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (short_vowel_phoneme, vowel_letter)

    Example:
        {'a:': ('a', 'ا'), 'u:': ('u', 'و'), 'i:': ('i', 'ي')}
    """
    result = {}
    for phoneme, entry in _load_phoneme_map().get('long_vowels', {}).items():
        without_length_mark = phoneme.replace(':', '')
        if isinstance(entry, dict):
            components = (
                entry.get('short', without_length_mark),
                entry.get('letter', ''),
            )
        else:
            components = (without_length_mark, entry)
        result[phoneme] = components
    return result
def get_diacritic_chars() -> dict:
    """
    Return the 'diacritics' section of the phoneme map.

    Returns:
        Dict[str, str]: diacritic_name -> Unicode character (an empty dict
        when the section is missing)
    """
    phoneme_map = _load_phoneme_map()
    diacritics = phoneme_map.get('diacritics', {})
    return diacritics
@lru_cache(maxsize=1)
def get_geminated_set() -> frozenset:
    """
    Get the set of all geminated phonemes.

    Built from the keys of the 'geminated' section of the phoneme map. The
    result is computed once and cached for efficient repeated lookups; it is
    a frozenset, so sharing the cached object between callers is safe.

    Returns:
        frozenset[str]: Set of geminated phonemes (e.g., {'bb', 'll', 'sˤsˤ', ...})
    """
    data = _load_phoneme_map()
    return frozenset(data.get('geminated', {}).keys())
def is_geminated_phoneme(phoneme: str) -> bool:
    """Return True when `phoneme` is one of the geminated (doubled consonant) phonemes."""
    geminated = get_geminated_set()
    return phoneme in geminated
def get_consonant_set() -> frozenset:
    """
    Collect the single consonant phonemes defined in the phoneme map.

    Returns:
        frozenset[str]: Keys of the 'consonants' section (e.g., {'b', 't', 'rˤ', ...})
    """
    consonants = _load_phoneme_map().get('consonants', {})
    return frozenset(consonants.keys())
def is_consonant_phoneme(phoneme: str) -> bool:
    """Return True when `phoneme` is a single consonant (neither geminated nor a vowel)."""
    consonants = get_consonant_set()
    return phoneme in consonants
def get_geminated_to_base_map() -> dict:
    """
    Map every geminated phoneme to its single-consonant form.

    The link is made through the Arabic character: the geminated entry's
    character (shaddah removed) is looked up in a character -> phoneme table
    built from the 'consonants' and 'heavy_consonants' sections. Plain
    consonants take priority; a heavy consonant is used only for characters
    no plain consonant claims. When the character is unknown, the first half
    of the geminated phoneme string is used instead.

    Returns:
        Dict[str, str]: geminated -> base (e.g., {'bb': 'b', 'sˤsˤ': 'sˤ'})
    """
    shaddah = 'ّ'
    phoneme_map = _load_phoneme_map()
    geminated = phoneme_map.get('geminated', {})
    consonants = phoneme_map.get('consonants', {})
    heavy = phoneme_map.get('heavy_consonants', {})

    # Reverse lookup: Arabic character -> phoneme.
    phoneme_by_char = {char: phoneme for phoneme, char in consonants.items()}
    for phoneme, char in heavy.items():
        # setdefault keeps any existing (plain consonant or earlier heavy) entry.
        phoneme_by_char.setdefault(char.replace(shaddah, ''), phoneme)

    result = {}
    for doubled, char_with_shaddah in geminated.items():
        bare_char = char_with_shaddah.replace(shaddah, '')
        if bare_char not in phoneme_by_char:
            # Fallback: a doubled phoneme is its base written twice.
            result[doubled] = doubled[:len(doubled) // 2]
            continue
        result[doubled] = phoneme_by_char[bare_char]
    return result
def get_base_phoneme(phoneme: str) -> str:
    """
    Reduce a phoneme to its non-geminated form.

    Args:
        phoneme: Any phoneme (geminated or single)

    Returns:
        The single consonant form for a geminated phoneme, otherwise the
        phoneme itself (e.g., 'bb' -> 'b', 'sˤsˤ' -> 'sˤ', 't' -> 't')
    """
    return get_geminated_to_base_map().get(phoneme, phoneme)
def get_tanween_mappings() -> dict:
    """
    Map each phoneme in the 'tanween' section to its diacritic name and character.

    Entries whose diacritic name is not defined in the 'diacritics' section
    of the phoneme map are left out.

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (diacritic_name, diacritic_char)
    """
    phoneme_map = _load_phoneme_map()
    chars_by_name = phoneme_map.get('diacritics', {})
    return {
        phoneme: (name, chars_by_name[name])
        for phoneme, name in phoneme_map.get('tanween', {}).items()
        if name in chars_by_name
    }
| # ============================================================================= | |
| # LONG VOWEL UTILITIES | |
| # ============================================================================= | |
def get_long_vowel_set() -> frozenset:
    """
    Collect the long vowel phonemes known to get_long_vowel_mappings().

    Returns:
        frozenset[str]: Set of long vowel phonemes (e.g., {'a:', 'u:', 'i:', 'aˤ:'})
    """
    long_vowels = get_long_vowel_mappings()
    return frozenset(long_vowels.keys())
def is_long_vowel(phoneme: str) -> bool:
    """Return True when `phoneme` is one of the long vowel phonemes."""
    long_vowels = get_long_vowel_set()
    return phoneme in long_vowels
def get_short_from_long(long_vowel: str) -> str:
    """
    Extract the short vowel component of a long vowel.

    Args:
        long_vowel: Long vowel phoneme (e.g., 'a:', 'aˤ:')

    Returns:
        The short vowel phoneme recorded for it in the long vowel mappings
        (e.g., 'a', 'aˤ'); for a phoneme not listed there, the input with
        every ':' removed.
    """
    components = get_long_vowel_mappings().get(long_vowel)
    if components is None:
        # Not a known long vowel: just drop the length marker.
        return long_vowel.replace(':', '')
    short_vowel, _letter = components
    return short_vowel
def normalize_fatha_variants(vowel: str) -> str:
    """
    Strip the ˤ marker so fatha variants compare as equal.

    After normalization aˤ equals a, and aˤ: equals a:.

    Args:
        vowel: Vowel phoneme (short or long)

    Returns:
        The vowel with every ˤ removed
    """
    pieces = vowel.split('ˤ')
    return ''.join(pieces)
def get_vowel_extension_chars() -> frozenset:
    """
    Collect the vowel extension characters (mini graphemes for long vowels).

    These are the small marks placed above/below a consonant letter to signal
    a long vowel, as opposed to a full alef/waw/yaa letter. They are the
    values of the 'vowel_extensions' section of the phoneme map.

    Returns:
        frozenset of Unicode characters for vowel extensions
    """
    extensions = _load_phoneme_map().get('vowel_extensions', {})
    return frozenset(extensions.values())
def get_vowel_carrier_chars() -> frozenset:
    """
    Collect the vowel carrier letter characters.

    Carrier letters (alef, waw, yaa, alef maksura) are full letters that ARE
    the vowel grapheme, rather than consonants hosting a vowel extension.
    The set is used to decide between the full-letter case and the mini
    extension case. They are the values of the 'vowel_carrier_letters'
    section of the phoneme map.

    Returns:
        frozenset of Unicode characters for vowel carrier letters
    """
    carriers = _load_phoneme_map().get('vowel_carrier_letters', {})
    return frozenset(carriers.values())
def get_short_vowels() -> frozenset:
    """
    Collect the short vowel phonemes.

    Taken directly from the keys of the 'short_vowels' section of the phoneme
    map (a, u, i, aˤ).

    Returns:
        frozenset of short vowel phoneme strings
    """
    short_vowels = _load_phoneme_map().get('short_vowels', {})
    return frozenset(short_vowels.keys())
def get_ghunnah_phoneme_set() -> frozenset:
    """
    Return the fixed set of ghunnah (nasalized) phonemes.

    These special tajweed phonemes represent nasalization:
        - ŋ = ikhfaa nasal (hidden noon)
        - ñ = idgham noon (merged noon with shaddah)
        - m̃ = idgham/iqlab meem (merged meem with shaddah)
        - j̃ = idgham with ya (merged noon into ya)
        - w̃ = idgham with waw (merged noon into waw)

    Returns:
        frozenset[str]: Set of ghunnah phonemes
    """
    ghunnah_phonemes = ('ŋ', 'ñ', 'm̃', 'j̃', 'w̃')
    return frozenset(ghunnah_phonemes)
# =============================================================================
# CENTRALIZED DIACRITIC AND TANWEEN MAPPINGS
# Single source of truth for all builders and renderers
# =============================================================================

# Diacritic name -> Unicode character.
# Used by: canonical_builder, result_builder, word_builder, common.py
DIACRITIC_NAME_TO_CHAR = dict(
    FATHA="\u064E",
    DAMMA="\u064F",
    KASRA="\u0650",
    SUKUN="\u0652",
    FATHATAN="\u064B",
    DAMMATAN="\u064C",
    KASRATAN="\u064D",
    SHADDA="\u0651",
)

# Unicode character -> diacritic name (inverse of the table above).
DIACRITIC_CHAR_TO_NAME = {
    char: name for name, char in DIACRITIC_NAME_TO_CHAR.items()
}

# Open tanween characters (DigitalKhatt V2 font).
OPEN_FATHATAN = '\u08F0'
OPEN_DAMMATAN = '\u08F1'
OPEN_KASRATAN = '\u08F2'

# Small meem characters for iqlab.
MINI_MEEM_ABOVE = '\u06E2'  # paired with fathatan/dammatan
MINI_MEEM_BELOW = '\u06ED'  # paired with kasratan

# Rules that trigger open tanween rendering.
# Used by: result_builder, common.py
OPEN_TANWEEN_RULES = frozenset((
    'idgham_ghunnah_tanween',
    'idgham_bila_ghunnah_tanween',
    'ikhfaa_tanween',
))

# Tanween -> open tanween, keyed BY NAME, for builders that work with
# diacritic names. Maps: tanween_name -> (open_char, open_name)
TANWEEN_TO_OPEN_BY_NAME = {
    'FATHATAN': (OPEN_FATHATAN, 'OPEN_FATHATAN'),
    'DAMMATAN': (OPEN_DAMMATAN, 'OPEN_DAMMATAN'),
    'KASRATAN': (OPEN_KASRATAN, 'OPEN_KASRATAN'),
}

# Tanween -> open tanween, keyed BY CHAR, for renderers that work with
# Unicode characters. Maps: tanween_char -> open_char
TANWEEN_TO_OPEN_BY_CHAR = {
    DIACRITIC_NAME_TO_CHAR[tanween_name]: open_char
    for tanween_name, (open_char, _open_name) in TANWEEN_TO_OPEN_BY_NAME.items()
}

# Open tanween char -> standard tanween char (inverse of the table above).
OPEN_TO_TANWEEN_CHAR = {
    open_char: tanween_char
    for tanween_char, open_char in TANWEEN_TO_OPEN_BY_CHAR.items()
}

# Iqlab tanween, keyed BY NAME, for builders that work with diacritic names.
# Maps: tanween_name -> (base_name, base_char, mini_meem)
TANWEEN_TO_IQLAB_BY_NAME = {
    'FATHATAN': ('FATHA', DIACRITIC_NAME_TO_CHAR['FATHA'], MINI_MEEM_ABOVE),
    'DAMMATAN': ('DAMMA', DIACRITIC_NAME_TO_CHAR['DAMMA'], MINI_MEEM_ABOVE),
    'KASRATAN': ('KASRA', DIACRITIC_NAME_TO_CHAR['KASRA'], MINI_MEEM_BELOW),
}

# Iqlab tanween, keyed BY CHAR, for renderers that work with Unicode
# characters. Maps: tanween_char -> (base_char, mini_meem)
TANWEEN_TO_IQLAB_BY_CHAR = {
    DIACRITIC_NAME_TO_CHAR[tanween_name]: (base_char, mini_meem)
    for tanween_name, (_base_name, base_char, mini_meem)
    in TANWEEN_TO_IQLAB_BY_NAME.items()
}
def get_diacritic_char_by_name(name: str) -> str:
    """
    Resolve a diacritic name to its Unicode character.

    Lookup order:
      1. the hardcoded DIACRITIC_NAME_TO_CHAR table
      2. the open tanween names ('OPEN_FATHATAN', 'OPEN_DAMMATAN', 'OPEN_KASRATAN')
      3. the diacritics defined in the YAML phoneme map (get_diacritic_chars())

    Args:
        name: Diacritic name (e.g., 'FATHA', 'SUKUN', 'FATHATAN')

    Returns:
        Unicode character, or empty string if not found
    """
    # 1. Hardcoded standard diacritics.
    if name in DIACRITIC_NAME_TO_CHAR:
        return DIACRITIC_NAME_TO_CHAR[name]

    # 2. Open tanween names, which are not part of the hardcoded table.
    open_tanween = (
        ('OPEN_FATHATAN', OPEN_FATHATAN),
        ('OPEN_DAMMATAN', OPEN_DAMMATAN),
        ('OPEN_KASRATAN', OPEN_KASRATAN),
    )
    for open_name, open_char in open_tanween:
        if name == open_name:
            return open_char

    # 3. Anything else must come from the YAML-defined diacritics.
    yaml_diacritics = get_diacritic_chars()
    return yaml_diacritics.get(name, '')