"""
Phoneme map utilities for loading mappings from phoneme_map.yaml.

Provides a single source of truth for phoneme-to-character mappings.
"""

from pathlib import Path
from functools import lru_cache
import yaml

# Location of the phoneme map: data/phoneme_map.yaml, resolved relative to
# the parent of the directory that contains this file.
PHONEME_MAP_PATH = Path(__file__).parent.parent.joinpath("data", "phoneme_map.yaml")


@lru_cache(maxsize=1)
def _load_phoneme_map() -> dict:
    """
    Load and parse the phoneme map YAML file.

    The result is cached by ``lru_cache``, so the file is read once and the
    same object is handed to every caller; treat it as read-only.

    Returns:
        dict: The parsed YAML document. An empty dict is returned when the
        file contains no document, so callers can always use ``.get()``.

    Raises:
        OSError: If PHONEME_MAP_PATH cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open(PHONEME_MAP_PATH, 'r', encoding='utf-8') as f:
        loaded = yaml.safe_load(f)
    # safe_load() yields None for an empty document; every caller does
    # data.get(...), so normalise that case to an empty mapping.
    return {} if loaded is None else loaded


def get_phoneme_to_char() -> dict:
    """
    Build the phoneme -> Arabic character lookup table.

    Sections of the phoneme map are merged in this order, a later section
    overriding an earlier one when the same phoneme appears twice:
    consonants, geminated, heavy_consonants, long_vowels, short_vowels,
    tajweed_phonemes. Missing sections are treated as empty.

    Returns:
        Dict[str, Optional[str]]: phoneme -> Arabic character. The shaddah
        mark is stripped from geminated, heavy and tajweed entries. Long
        vowels map to their carrier letter. Short vowels, and tajweed
        phonemes with an empty mapping, map to None.
    """
    shaddah = 'ّ'
    phoneme_map = _load_phoneme_map()
    table = {}

    # Plain consonants are copied through unchanged.
    table.update(phoneme_map.get('consonants', {}).items())

    # Geminated and heavy consonants: keep only the base letter.
    table.update(
        (phoneme, glyph.replace(shaddah, ''))
        for phoneme, glyph in phoneme_map.get('geminated', {}).items()
    )
    table.update(
        (phoneme, glyph.replace(shaddah, ''))
        for phoneme, glyph in phoneme_map.get('heavy_consonants', {}).items()
    )

    # Long vowels: structured entries ({short: ..., letter: ...}) contribute
    # their 'letter'; legacy flat entries are already the letter itself.
    for phoneme, entry in phoneme_map.get('long_vowels', {}).items():
        table[phoneme] = entry.get('letter', '') if isinstance(entry, dict) else entry

    # Short vowels are diacritics, so they have no character of their own.
    table.update(dict.fromkeys(phoneme_map.get('short_vowels', {}).keys()))

    # Tajweed phonemes: an empty mapping becomes None.
    for phoneme, glyph in phoneme_map.get('tajweed_phonemes', {}).items():
        table[phoneme] = glyph.replace(shaddah, '') if glyph else None

    return table


def get_vowel_to_diacritic() -> dict:
    """
    Map each short vowel phoneme to its diacritic name and character.

    Short vowels whose diacritic name has no entry in the 'diacritics'
    section are left out of the result.

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (diacritic_name, diacritic_char)
    """
    phoneme_map = _load_phoneme_map()
    chars_by_name = phoneme_map.get('diacritics', {})
    return {
        vowel: (name, chars_by_name[name])
        for vowel, name in phoneme_map.get('short_vowels', {}).items()
        if name in chars_by_name
    }


@lru_cache(maxsize=1)
def get_short_vowel_set() -> frozenset:
    """
    Return the short vowel phonemes as an immutable, hashable set.

    The set is the key set of get_vowel_to_diacritic(), so it only contains
    short vowels whose diacritic name resolves to a character. The result
    is cached after the first call.

    Returns:
        frozenset[str]: Short vowel phonemes (e.g., {'a', 'u', 'i', 'aˤ'})
    """
    vowel_table = get_vowel_to_diacritic()
    # Iterating a dict yields its keys.
    return frozenset(vowel_table)


@lru_cache(maxsize=1)
def get_long_vowel_mappings() -> dict:
    """
    Split every long vowel phoneme into its short vowel and carrier letter.

    Two YAML shapes are accepted for an entry of the 'long_vowels' section:
    a structured dict ({short: "a", letter: "ا"}) or a legacy flat value
    that is the letter itself. When the short vowel is not given, it is
    derived by removing ':' from the long vowel phoneme.

    The result is cached; callers all receive the same dict object.

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (short_vowel_phoneme, vowel_letter)

    Example:
        {'a:': ('a', 'ا'), 'u:': ('u', 'و'), 'i:': ('i', 'ي')}
    """
    components = {}
    for long_vowel, entry in _load_phoneme_map().get('long_vowels', {}).items():
        derived_short = long_vowel.replace(':', '')
        if isinstance(entry, dict):
            # Structured entry: explicit fields, with fallbacks.
            pair = (entry.get('short', derived_short), entry.get('letter', ''))
        else:
            # Legacy flat entry: the value is the letter.
            pair = (derived_short, entry)
        components[long_vowel] = pair
    return components


def get_diacritic_chars() -> dict:
    """
    Return the 'diacritics' section of the phoneme map.

    The returned dict is the object held inside the cached phoneme map, not
    a copy, so it should be treated as read-only. An empty dict is returned
    when the section is missing.

    Returns:
        Dict[str, str]: diacritic_name -> Unicode character
    """
    return _load_phoneme_map().get('diacritics', {})


@lru_cache(maxsize=1)
def get_geminated_set() -> frozenset:
    """
    Return every geminated phoneme listed in the phoneme map.

    The set is built from the keys of the 'geminated' section and cached,
    so repeated membership checks do not rebuild it.

    Returns:
        frozenset[str]: Geminated phonemes (e.g., {'bb', 'll', 'sˤsˤ', ...})
    """
    geminated_section = _load_phoneme_map().get('geminated', {})
    return frozenset(geminated_section.keys())


def is_geminated_phoneme(phoneme: str) -> bool:
    """Return True when `phoneme` is one of the geminated (doubled) consonants."""
    geminated = get_geminated_set()
    return phoneme in geminated


@lru_cache(maxsize=1)
def get_consonant_set() -> frozenset:
    """
    Return every single consonant phoneme listed in the phoneme map.

    Built from the keys of the 'consonants' section and cached.

    Returns:
        frozenset[str]: Consonant phonemes (e.g., {'b', 't', 'rˤ', ...})
    """
    consonant_section = _load_phoneme_map().get('consonants', {})
    return frozenset(consonant_section.keys())


def is_consonant_phoneme(phoneme: str) -> bool:
    """Return True when `phoneme` is a key of the 'consonants' section (single consonant)."""
    consonants = get_consonant_set()
    return phoneme in consonants


@lru_cache(maxsize=1)
def get_geminated_to_base_map() -> dict:
    """
    Map each geminated phoneme to its single-consonant phoneme.

    A geminated entry is resolved through its Arabic letter: the shaddah is
    stripped and the letter is looked up among the consonants, then among
    the heavy consonants. A heavy consonant never replaces a plain
    consonant that uses the same letter. When the letter is unknown, the
    first half of the geminated phoneme string is used instead.

    The result is cached; callers all receive the same dict object.

    Returns:
        Dict[str, str]: geminated -> base (e.g., {'bb': 'b', 'sˤsˤ': 'sˤ'})
    """
    shaddah = 'ّ'
    phoneme_map = _load_phoneme_map()
    geminated = phoneme_map.get('geminated', {})
    consonants = phoneme_map.get('consonants', {})
    heavy = phoneme_map.get('heavy_consonants', {})

    # Reverse lookup: Arabic letter -> phoneme. Among plain consonants the
    # last one listed for a letter wins.
    phoneme_by_letter = {letter: phoneme for phoneme, letter in consonants.items()}
    # Heavy consonants only fill letters that are still unclaimed.
    for phoneme, glyph in heavy.items():
        phoneme_by_letter.setdefault(glyph.replace(shaddah, ''), phoneme)

    base_by_geminated = {}
    for doubled, glyph in geminated.items():
        letter = glyph.replace(shaddah, '')
        base_by_geminated[doubled] = (
            phoneme_by_letter[letter]
            if letter in phoneme_by_letter
            # Fallback: a doubled phoneme is its base written twice.
            else doubled[:len(doubled) // 2]
        )
    return base_by_geminated


def get_base_phoneme(phoneme: str) -> str:
    """
    Reduce a phoneme to its non-geminated form.

    Args:
        phoneme: Any phoneme (geminated or single)

    Returns:
        The single consonant for a geminated phoneme; any other phoneme is
        returned unchanged (e.g., 'bb' -> 'b', 'sˤsˤ' -> 'sˤ', 't' -> 't')
    """
    # Phonemes missing from the geminated map fall back to themselves.
    return get_geminated_to_base_map().get(phoneme, phoneme)


def get_tanween_mappings() -> dict:
    """
    Map each phoneme of the 'tanween' section to its diacritic name and character.

    Entries whose diacritic name has no entry in the 'diacritics' section
    are left out of the result.

    Returns:
        Dict[str, Tuple[str, str]]: phoneme -> (diacritic_name, diacritic_char)
    """
    phoneme_map = _load_phoneme_map()
    chars_by_name = phoneme_map.get('diacritics', {})
    return {
        phoneme: (name, chars_by_name[name])
        for phoneme, name in phoneme_map.get('tanween', {}).items()
        if name in chars_by_name
    }


# =============================================================================
# LONG VOWEL UTILITIES
# =============================================================================

@lru_cache(maxsize=1)
def get_long_vowel_set() -> frozenset:
    """
    Return the long vowel phonemes as an immutable set.

    The set is the key set of get_long_vowel_mappings() and is cached.

    Returns:
        frozenset[str]: Long vowel phonemes (e.g., {'a:', 'u:', 'i:', 'aˤ:'})
    """
    long_vowel_table = get_long_vowel_mappings()
    # Iterating a dict yields its keys.
    return frozenset(long_vowel_table)


def is_long_vowel(phoneme: str) -> bool:
    """Return True when `phoneme` is one of the long vowels in the phoneme map."""
    long_vowels = get_long_vowel_set()
    return phoneme in long_vowels


def get_short_from_long(long_vowel: str) -> str:
    """
    Return the short vowel that a long vowel is built on.

    Args:
        long_vowel: Long vowel phoneme (e.g., 'a:', 'aˤ:')

    Returns:
        Short vowel phoneme (e.g., 'a', 'aˤ'). For a phoneme that is not in
        the long vowel mappings, the input with every ':' removed.
    """
    try:
        short_vowel, _letter = get_long_vowel_mappings()[long_vowel]
    except KeyError:
        # Unknown long vowel: derive the short form from the notation.
        return long_vowel.replace(':', '')
    return short_vowel


def normalize_fatha_variants(vowel: str) -> str:
    """
    Collapse the fatha variants so they compare equal.

    After normalization aˤ equals a, and aˤ: equals a:.

    Args:
        vowel: Vowel phoneme (short or long)

    Returns:
        The vowel with every ˤ removed
    """
    # Splitting on the marker and re-joining drops each occurrence of it.
    pieces = vowel.split('ˤ')
    return ''.join(pieces)


@lru_cache(maxsize=1)
def get_vowel_extension_chars() -> frozenset:
    """
    Return the vowel extension characters (mini graphemes for long vowels).

    These are the small marks placed above or below a consonant letter to
    signal a long vowel when no full alef/waw/yaa letter is written. The
    set is built from the values of the 'vowel_extensions' section and
    cached.

    Returns:
        frozenset of Unicode characters for vowel extensions
    """
    extension_section = _load_phoneme_map().get('vowel_extensions', {})
    return frozenset(extension_section.values())


@lru_cache(maxsize=1)
def get_vowel_carrier_chars() -> frozenset:
    """
    Return the vowel carrier letter characters.

    Carrier letters (alef, waw, yaa, alef maksura) are full letters that ARE
    the vowel grapheme, as opposed to consonants that merely host a vowel
    extension mark. They are used to tell the full-letter case from the
    mini-extension case. The set is built from the values of the
    'vowel_carrier_letters' section and cached.

    Returns:
        frozenset of Unicode characters for vowel carrier letters
    """
    carrier_section = _load_phoneme_map().get('vowel_carrier_letters', {})
    return frozenset(carrier_section.values())


@lru_cache(maxsize=1)
def get_short_vowels() -> frozenset:
    """
    Return the short vowel phonemes declared in the phoneme map.

    Taken directly from the keys of the 'short_vowels' section (a, u, i, aˤ),
    without checking that each vowel's diacritic is defined. Cached.

    Returns:
        frozenset of short vowel phoneme strings
    """
    short_vowel_section = _load_phoneme_map().get('short_vowels', {})
    return frozenset(short_vowel_section.keys())


@lru_cache(maxsize=1)
def get_ghunnah_phoneme_set() -> frozenset:
    """
    Return the ghunnah (nasalized) phonemes.

    The set is hard-coded here rather than read from the phoneme map. It
    holds the special tajweed phonemes that represent nasalization:
    - ŋ = ikhfaa nasal (hidden noon)
    - ñ = idgham noon (merged noon with shaddah)
    - m̃ = idgham/iqlab meem (merged meem with shaddah)
    - j̃ = idgham with ya (merged noon into ya)
    - w̃ = idgham with waw (merged noon into waw)

    Returns:
        frozenset[str]: Set of ghunnah phonemes
    """
    ghunnah_phonemes = ('ŋ', 'ñ', 'm̃', 'j̃', 'w̃')
    return frozenset(ghunnah_phonemes)


# =============================================================================
# CENTRALIZED DIACRITIC AND TANWEEN MAPPINGS
# Single source of truth for all builders and renderers
# =============================================================================

# Diacritic name -> Unicode character.
# Used by: canonical_builder, result_builder, word_builder, common.py
DIACRITIC_NAME_TO_CHAR = dict(
    FATHA="\u064E",       # U+064E ARABIC FATHA
    DAMMA="\u064F",       # U+064F ARABIC DAMMA
    KASRA="\u0650",       # U+0650 ARABIC KASRA
    SUKUN="\u0652",       # U+0652 ARABIC SUKUN
    FATHATAN="\u064B",    # U+064B ARABIC FATHATAN
    DAMMATAN="\u064C",    # U+064C ARABIC DAMMATAN
    KASRATAN="\u064D",    # U+064D ARABIC KASRATAN
    SHADDA="\u0651",      # U+0651 ARABIC SHADDA
)

# Inverse lookup: Unicode character -> diacritic name.
DIACRITIC_CHAR_TO_NAME = {
    char: name for name, char in DIACRITIC_NAME_TO_CHAR.items()
}


# Open tanween characters (DigitalKhatt V2 font):
# U+08F0 / U+08F1 / U+08F2 = open fathatan / dammatan / kasratan.
OPEN_FATHATAN, OPEN_DAMMATAN, OPEN_KASRATAN = '\u08F0', '\u08F1', '\u08F2'

# Small meem marks used for iqlab: U+06E2 sits above the letter (used with
# fathatan/dammatan), U+06ED sits below it (used with kasratan).
MINI_MEEM_ABOVE, MINI_MEEM_BELOW = '\u06E2', '\u06ED'


# Rule names that trigger open tanween rendering.
# Used by: result_builder, common.py
OPEN_TANWEEN_RULES = frozenset((
    'idgham_ghunnah_tanween',
    'idgham_bila_ghunnah_tanween',
    'ikhfaa_tanween',
))


# Standard tanween -> open tanween, keyed BY NAME.
# For builders that work with diacritic names.
# Maps: tanween_name -> (open_char, open_name)
TANWEEN_TO_OPEN_BY_NAME = dict(
    FATHATAN=(OPEN_FATHATAN, 'OPEN_FATHATAN'),
    DAMMATAN=(OPEN_DAMMATAN, 'OPEN_DAMMATAN'),
    KASRATAN=(OPEN_KASRATAN, 'OPEN_KASRATAN'),
)

# Standard tanween -> open tanween, keyed BY CHAR.
# For renderers that work with Unicode characters.
# Maps: tanween_char -> open_char
TANWEEN_TO_OPEN_BY_CHAR = {
    DIACRITIC_NAME_TO_CHAR[tanween_name]: open_char
    for tanween_name, (open_char, _open_name) in TANWEEN_TO_OPEN_BY_NAME.items()
}

# Inverse lookup: open tanween char -> standard tanween char.
OPEN_TO_TANWEEN_CHAR = {
    open_char: tanween_char
    for tanween_char, open_char in TANWEEN_TO_OPEN_BY_CHAR.items()
}


# Iqlab rendering of tanween, keyed BY NAME.
# For builders that work with diacritic names.
# Maps: tanween_name -> (base_name, base_char, mini_meem)
TANWEEN_TO_IQLAB_BY_NAME = {
    tanween_name: (base_name, DIACRITIC_NAME_TO_CHAR[base_name], mini_meem)
    for tanween_name, base_name, mini_meem in (
        ('FATHATAN', 'FATHA', MINI_MEEM_ABOVE),
        ('DAMMATAN', 'DAMMA', MINI_MEEM_ABOVE),
        ('KASRATAN', 'KASRA', MINI_MEEM_BELOW),
    )
}

# Iqlab rendering of tanween, keyed BY CHAR.
# For renderers that work with Unicode characters.
# Maps: tanween_char -> (base_char, mini_meem)
TANWEEN_TO_IQLAB_BY_CHAR = {
    DIACRITIC_NAME_TO_CHAR[tanween_name]: (base_char, mini_meem)
    for tanween_name, (_base_name, base_char, mini_meem)
    in TANWEEN_TO_IQLAB_BY_NAME.items()
}


def get_diacritic_char_by_name(name: str) -> str:
    """
    Resolve a diacritic name to its Unicode character.

    Lookup order:
      1. DIACRITIC_NAME_TO_CHAR (the hard-coded standard diacritics)
      2. the open tanween names ('OPEN_FATHATAN', 'OPEN_DAMMATAN',
         'OPEN_KASRATAN'), which are not part of that map
      3. the YAML-defined diacritics from get_diacritic_chars()

    Args:
        name: Diacritic name (e.g., 'FATHA', 'SUKUN', 'FATHATAN')

    Returns:
        Unicode character, or empty string if not found
    """
    open_tanween_by_name = {
        'OPEN_FATHATAN': OPEN_FATHATAN,
        'OPEN_DAMMATAN': OPEN_DAMMATAN,
        'OPEN_KASRATAN': OPEN_KASRATAN,
    }
    # Hard-coded tables take precedence over the YAML file.
    for table in (DIACRITIC_NAME_TO_CHAR, open_tanween_by_name):
        if name in table:
            return table[name]
    return get_diacritic_chars().get(name, '')