"""
Phonemization logic and verse metadata loading.
"""
import os
import sys
import json
import yaml
from pathlib import Path

# Get config after ensuring sys.path is set
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SURAH_INFO_PATH, DEFAULT_STOPS, PHONEMIZER_RESULT_CACHE_SIZE

# Phonemizer resources path (for HF Spaces compatibility).
# Resolved three directory levels above this file, then
# phonemizer/core/resources/ (resources live inside the core package).
PHONEMIZER_RESOURCES_DIR = Path(__file__).parent.parent.parent / "phonemizer" / "core" / "resources"
# "data" directory located two directory levels above this file.
RECITATION_DATA_DIR = Path(__file__).parent.parent / "data"

# Module-level cache shared by load_phonemizer() and load_surah_info():
#   "phonemizer": the Phonemizer instance once loaded, else None
#   "surah_info": parsed contents of SURAH_INFO_PATH once loaded, else None
#   "loaded":     True after the first phonemizer load attempt (success or failure)
#   "error":      error message of a failed phonemizer load, else None
_phonemizer_cache = {
    "phonemizer": None,
    "surah_info": None,
    "loaded": False,
    "error": None
}

# NOTE(review): lru_cache is imported here but not referenced by the code
# visible in this module; phonemizer results are memoized in the plain dict
# below instead. Confirm nothing else relies on this import before removing it.
from functools import lru_cache

# Bounded cache of recent phonemizer results, filled by
# get_cached_phonemizer_result(). Keys are "<verse_ref>:<sorted stops joined
# by ','>" strings; values are the objects returned by phonemizer.phonemize().
_result_cache = {}


def get_cached_phonemizer_result(verse_ref: str, stops: list = None):
    """
    Get cached phonemizer result or compute and cache it.

    Results are memoized in the module-level ``_result_cache`` under a key
    built from the verse reference and the sorted stop names. The cache holds
    at most ``PHONEMIZER_RESULT_CACHE_SIZE`` entries; when it is full the
    entry that was inserted first is dropped. A non-positive cache size
    disables caching instead of breaking phonemization.

    Args:
        verse_ref: Verse reference (e.g., "1:1", "2:255")
        stops: List of stops (default: ["compulsory_stop"])

    Returns:
        PhonemizeResult, or None if the phonemizer could not be loaded or
        phonemization raised an error
    """
    if stops is None:
        stops = ["compulsory_stop"]

    # Sorting makes the key independent of the order the stops were passed in
    stops_key = ",".join(sorted(stops))
    cache_key = f"{verse_ref}:{stops_key}"

    if cache_key in _result_cache:
        return _result_cache[cache_key]

    phonemizer, _ = load_phonemizer()
    if not phonemizer:
        return None

    try:
        result = phonemizer.phonemize(ref=verse_ref, stops=stops)

        # Only cache when a positive capacity is configured. Without this
        # guard a capacity of 0 tried to evict from an empty dict, and the
        # resulting StopIteration was reported as a phonemization failure.
        if PHONEMIZER_RESULT_CACHE_SIZE > 0:
            # Dicts iterate in insertion order, so the first key is the oldest.
            # The loop also trims the cache if the capacity was lowered.
            while len(_result_cache) >= PHONEMIZER_RESULT_CACHE_SIZE:
                del _result_cache[next(iter(_result_cache))]
            _result_cache[cache_key] = result

        return result
    except Exception as e:
        # Best effort: callers treat None as "could not phonemize"
        print(f"[PHONEMIZER] Error phonemizing {verse_ref}: {e}")
        return None


def clear_phonemizer_result_cache() -> None:
    """Drop every memoized phonemizer result held in ``_result_cache``."""
    _result_cache.clear()


def set_ikhfaa_shafawi_phoneme_setting(value: str):
    """
    Switch the phoneme used for iqlab / ikhfaa shafawi and reset cached results.

    Existing overrides in the phoneme registry are cleared first, then the
    overrides matching ``value`` are installed. Intended (per the original
    note) for the "Iqlab/Ikhfaa Shafawi Sound" radio button.

    Args:
        value: "meem ghunnah" or "ikhfaa"; any value other than
            "meem ghunnah" is handled as "ikhfaa"
    """
    from core.phoneme_registry import set_phoneme_override, clear_phoneme_overrides

    clear_phoneme_overrides()

    if value == "meem ghunnah":
        overrides = (
            ("iqlab", "phoneme", "m̃"),
            ("ikhfaa", "shafawi_phoneme", "m̃"),
        )
    else:
        # Iqlab needs an explicit override because its YAML default is m̃
        # (meem ghunnah); ikhfaa shafawi already defaults to ŋ in the YAML.
        overrides = (("iqlab", "phoneme", "ŋ"),)

    for override in overrides:
        set_phoneme_override(*override)

    # Previously cached results were produced with the old setting
    clear_phonemizer_result_cache()


def load_phonemizer():
    """
    Return the shared Phonemizer instance, creating it on the first call.

    The outcome of the first attempt, success or failure, is recorded in
    ``_phonemizer_cache`` and replayed on every later call, so a failed load
    is not retried.

    Returns:
        Tuple ``(phonemizer, error)``: the Phonemizer instance and None on
        success, or None and an error message string on failure
    """
    cache = _phonemizer_cache
    if cache["loaded"]:
        return cache["phonemizer"], cache["error"]

    try:
        # Import from the installed phonemizer package
        from core.phonemizer import Phonemizer

        # Bundled resource files are only considered when the directory exists
        resource_paths = None
        if PHONEMIZER_RESOURCES_DIR.exists():
            resource_paths = {
                "db_path": PHONEMIZER_RESOURCES_DIR / "Quran.json",
                "map_path": PHONEMIZER_RESOURCES_DIR / "base_phonemes.yaml",
                "special_words_path": PHONEMIZER_RESOURCES_DIR / "special_words.yaml",
            }

        # All three files must be present; otherwise use the package defaults
        if resource_paths and all(path.exists() for path in resource_paths.values()):
            instance = Phonemizer(**resource_paths)
        else:
            instance = Phonemizer()

        cache["phonemizer"] = instance
        cache["loaded"] = True
        cache["error"] = None
        print("✓ Phonemizer loaded successfully")
        return instance, None
    except Exception as exc:
        import traceback

        # Import failures get a hint about the missing package
        if isinstance(exc, ImportError):
            failure = f"Failed to import phonemizer. Make sure quranic-phonemizer is installed: {str(exc)}"
        else:
            failure = f"Failed to load phonemizer: {str(exc)}"

        cache["error"] = failure
        cache["loaded"] = True
        print(f"✗ {failure}")
        print(traceback.format_exc())
        return None, failure


def load_surah_info():
    """
    Read the surah metadata JSON at ``SURAH_INFO_PATH``, memoizing the result.

    A successful load is stored in ``_phonemizer_cache["surah_info"]`` and
    returned directly on later calls. Failures are printed and not cached,
    so the next call tries again.

    Returns:
        The parsed JSON content, or None if it could not be loaded
    """
    cached = _phonemizer_cache["surah_info"]
    if cached is not None:
        return cached

    try:
        with open(SURAH_INFO_PATH, mode='r', encoding='utf-8') as handle:
            chapters = json.load(handle)
        _phonemizer_cache["surah_info"] = chapters
        print(f"✓ Loaded surah info with {len(chapters)} chapters")
        return chapters
    except Exception as exc:
        print(f"✗ Failed to load surah info: {str(exc)}")
        return None


def get_chapter_list():
    """
    List all chapters found in the surah info, ordered by chapter number.

    Returns:
        List of tuples (chapter_number, chapter_name_en, chapter_name_ar);
        empty if the surah info is unavailable. A missing English name falls
        back to "Chapter <n>", a missing Arabic name to "".
    """
    info = load_surah_info()
    if not info:
        return []

    def as_entry(key, data):
        # Keys of the surah info are chapter numbers stored as strings
        number = int(key)
        return number, data.get("name_en", f"Chapter {number}"), data.get("name_ar", "")

    entries = [as_entry(key, data) for key, data in info.items()]
    entries.sort(key=lambda entry: entry[0])
    return entries


def get_verses_for_chapter(chapter_num):
    """
    Return the sorted verse numbers recorded for one chapter.

    Args:
        chapter_num: Chapter number (converted with str() for the lookup)

    Returns:
        Sorted list of verse numbers; empty if the surah info is unavailable
        or has no data for the chapter
    """
    info = load_surah_info()
    chapter = info.get(str(chapter_num)) if info else None
    if not chapter:
        return []

    return sorted(entry["verse"] for entry in chapter.get("verses", []))


def phonemize_verse(verse_ref, stops=None):
    """
    Phonemize a verse reference and produce its display text and phonemes.

    Args:
        verse_ref: Verse reference (e.g., "1:1", "2:255", "1:1-1:7")
        stops: List of stop types to include. None selects a copy of
            DEFAULT_STOPS; otherwise "compulsory_stop" is appended when it
            is missing (the caller's list is not modified).

    Returns:
        Tuple of (arabic_text_html, arabic_text_clean, phonemes_string, success, error_message).
        On failure the three strings are empty, success is False and
        error_message describes the problem; on success error_message is None.
    """
    try:
        if stops is None:
            effective_stops = DEFAULT_STOPS.copy()
        elif "compulsory_stop" in stops:
            effective_stops = stops
        else:
            effective_stops = stops + ["compulsory_stop"]

        # Served from the result cache when this ref/stops pair was seen before
        phonemized = get_cached_phonemizer_result(verse_ref, effective_stops)
        if phonemized is None:
            return "", "", "", False, "Failed to phonemize verse"
        phonemes = phonemized.phonemes_str(phoneme_sep=" ", word_sep="")

        # The canonical (frozen-model) structure is what gets rendered, so
        # extensions such as dagger alef come out before symbols like tatweel.
        from recitation_analysis.result_builder import get_result_builder

        # Word transforms (stopping/starting/Allah) are switched off for the
        # main verse display; per the original note they are applied in the
        # Error, Ghunnah and Madd analysis tabs instead.
        recitation = get_result_builder().build_from_phonemizer_result(
            phonemized, verse_ref, apply_word_transforms=False
        )

        # First the HTML variant (with verse markers), then the clean one
        html_text, plain_text = (
            _render_verse_text(recitation.canonical_words, clean_text=flag)
            for flag in (False, True)
        )

        return html_text, plain_text, phonemes, True, None
    except Exception as exc:
        return "", "", "", False, f"Phonemization error: {str(exc)}"


def _render_verse_text(words, clean_text: bool = False) -> str:
    """
    Render a tuple of WordData as plain text with correct symbol ordering.

    For every word the pieces are concatenated in a fixed order: leading
    symbols (followed by a space), the letters, then trailing symbols. For
    every letter the order is: base character, shaddah, diacritic,
    extensions, other symbols, and finally the iqlab small meem if one
    applies. Tanween diacritics are passed through the iqlab / open-tanween
    substitution helpers before being emitted.

    Words are joined with single spaces. Unless ``clean_text`` is set, a
    verse marker is inserted after the last word of each verse (detected via
    the verse component of ``word.location``), including the final verse.

    Args:
        words: Tuple of WordData to render
        clean_text: If True, omit verse markers

    Returns:
        Arabic text string with correct symbol ordering; the string always
        ends with a single trailing space

    Raises:
        ValueError: If the verse component of a word location is not an integer
    """
    from recitation_analysis.text_display.rendering import (
        substitute_open_tanween,
        substitute_iqlab_tanween,
    )
    from recitation_analysis.ui.verse_markers import format_verse_marker
    from recitation_analysis.text_display.special_word_builder import get_display_swap

    # Shaddah character (frozen model stores shaddah as a bool, not an object)
    SHADDAH_CHAR = '\u0651'  # ARABIC SHADDA

    word_texts = []
    # Verse number of the previously rendered word; None before the first word
    prev_verse_num = None

    for word in words:
        parts = []

        # Leading symbols (rub el hizb, etc.) - plain strings
        for sym in word.leading_symbols:
            parts.append(sym)
        # Separate the leading symbols from the word itself
        if word.leading_symbols:
            parts.append(' ')

        # Special words (e.g., الم، يس، حم) have no letters - use text directly
        if not word.letters and word.text:
            parts.append(word.text)

        # Render each letter with proper symbol ordering
        for letter in word.letters:
            # Base letter; get_display_swap is used as an optional override
            # (a falsy result keeps the original character)
            letter_char = get_display_swap(word.location, letter.char) or letter.char
            parts.append(letter_char)

            # Check for iqlab tanween substitution.
            # In the frozen model diacritic_char is the character and
            # diacritic is the name. The helper returns a pair; a non-None
            # second element marks an iqlab tanween.
            diac_char = letter.diacritic_char
            iqlab_base_diac, iqlab_meem = substitute_iqlab_tanween(diac_char, letter.letter_rules)
            is_iqlab_tanween = iqlab_meem is not None

            # Check for iqlab noon (noon sakinah before baa): the letter has
            # the 'iqlab_noon' rule and carries no diacritic or a sukun.
            # In the frozen model diacritic is the name string (or None).
            is_iqlab_noon = (
                letter.letter_rules and
                'iqlab_noon' in letter.letter_rules and
                (letter.diacritic is None or letter.diacritic == 'SUKUN')
            )

            # Shaddah goes before the diacritic (proper stacking order)
            if letter.shaddah:
                parts.append(SHADDAH_CHAR)

            if letter.diacritic_char:
                if is_iqlab_tanween:
                    # Iqlab tanween: use base diacritic instead of tanween
                    parts.append(iqlab_base_diac)
                else:
                    # Apply open tanween substitution for idgham/ikhfaa
                    diac = substitute_open_tanween(letter.diacritic_char, letter.letter_rules)
                    parts.append(diac)

            # Extensions (dagger alef, maddah, etc.) - plain strings
            for ext in letter.extensions:
                # Same optional display swap as for the base letter
                ext_char = get_display_swap(word.location, ext) or ext
                parts.append(ext_char)

            # Other symbols (tatweel, etc.) - plain strings
            for sym in letter.other_symbols:
                parts.append(sym)

            # Iqlab small meem comes last, after the other symbols
            if is_iqlab_tanween and iqlab_meem:
                parts.append(iqlab_meem)
            elif is_iqlab_noon:
                # For iqlab noon: add mini meem above after the noon
                parts.append('\u06E2')  # MINI_MEEM_ABOVE

        # Trailing symbols (stop signs) - plain strings
        for sym in word.trailing_symbols:
            parts.append(sym)

        word_text = ''.join(parts)

        # Extract verse number from word location (format: "surah:verse:word")
        location_parts = word.location.split(':')
        if len(location_parts) >= 2:
            current_verse_num = int(location_parts[1])
        else:
            # Location without a verse component: no verse number available
            current_verse_num = None

        # Close the previous verse with its marker when the verse changes
        # (relevant for verse ranges)
        if not clean_text and prev_verse_num is not None and current_verse_num != prev_verse_num:
            word_texts.append(format_verse_marker(prev_verse_num))

        word_texts.append(word_text)
        prev_verse_num = current_verse_num

    # Marker for the last verse, which no later verse change would emit
    if not clean_text and prev_verse_num is not None:
        word_texts.append(format_verse_marker(prev_verse_num))

    # Note the unconditional trailing space
    return ' '.join(word_texts) + ' '


def _apply_open_tanween_to_text(result, text: str) -> str:
    """
    Apply open tanween substitution to Arabic text based on phonemizer rules.
    
    Substitutes standard tanween characters with open tanween for letters
    that have idgham/ikhfaa/iqlab rules.
    
    Args:
        result: PhonemizeResult from phonemizer
        text: Arabic text from result.text()
        
    Returns:
        Text with open tanween substituted where appropriate
    """
    try:
        from utils.phoneme_map import OPEN_TANWEEN_RULES
        
        # Map diacritic names to (char, open_char) pairs for idgham/ikhfaa
        TANWEEN_NAME_TO_OPEN = {
            'FATHATAN': ('\u064B', '\u08F0'),  # FATHATAN -> open fathatan (DigitalKhatt V2)
            'DAMMATAN': ('\u064C', '\u08F1'),  # DAMMATAN -> open dammatan (DigitalKhatt V2)
            'KASRATAN': ('\u064D', '\u08F2'),  # KASRATAN -> open kasratan (DigitalKhatt V2)
        }
        
        # Map diacritic names to (normal_char, base_diac, small_meem) for iqlab
        TANWEEN_NAME_TO_IQLAB = {
            'FATHATAN': ('\u064B', '\u064E', '\u06E2'),  # -> FATHA + mini meem above
            'DAMMATAN': ('\u064C', '\u064F', '\u06E2'),  # -> DAMMA + mini meem above
            'KASRATAN': ('\u064D', '\u0650', '\u06ED'),  # -> KASRA + mini meem below
        }
        
        # Get mapping to check letter rules
        mapping = result.get_mapping()
        
        # Build list of (word_text, substitutions) to apply
        # Each substitution is (old_char, new_chars) for that word
        word_substitutions = []
        
        for word in mapping.words:
            subs_for_word = []
            for letter in word.letter_mappings:
                if not letter.letter_rules:
                    continue
                
                diac_name = letter.diacritic
                if not diac_name or diac_name not in TANWEEN_NAME_TO_OPEN:
                    continue
                
                # Check for iqlab first (special treatment)
                if 'iqlab_tanween' in letter.letter_rules:
                    normal_char, base_diac, small_meem = TANWEEN_NAME_TO_IQLAB[diac_name]
                    subs_for_word.append((normal_char, base_diac + small_meem))
                    continue
                    
                # Check for open tanween rules (idgham/ikhfaa)
                has_open_rule = any(rule in OPEN_TANWEEN_RULES for rule in letter.letter_rules)
                if has_open_rule:
                    normal_char, open_char = TANWEEN_NAME_TO_OPEN[diac_name]
                    subs_for_word.append((normal_char, open_char))
            
            if subs_for_word:
                word_substitutions.append((word.text, subs_for_word))
        
        # Apply substitutions word by word in the text
        result_text = text
        for word_text, subs in word_substitutions:
            # Find this word in the remaining text and apply substitutions
            # Strip rule tags from word text for matching
            import re
            clean_word = re.sub(r'<[^>]*>', '', word_text)
            
            # Find the word position
            word_idx = result_text.find(clean_word)
            if word_idx == -1:
                # Try without some diacritics for fuzzy match
                continue
            
            # Extract the word, apply substitutions, replace
            word_end = word_idx + len(clean_word)
            word_chars = list(result_text[word_idx:word_end])
            
            for old_char, new_chars in subs:
                for i, c in enumerate(word_chars):
                    if c == old_char:
                        word_chars[i] = new_chars
                        break  # Only replace first occurrence in this word
            
            result_text = result_text[:word_idx] + ''.join(word_chars) + result_text[word_end:]
        
        return result_text
    except Exception as e:
        # If anything fails, return original text
        import traceback
        traceback.print_exc()
        return text


def format_verse_reference(from_chapter, from_verse, to_verse):
    """
    Build the phonemizer reference string for a chapter/verse selection.

    Args:
        from_chapter: Chapter number or None
        from_verse: Starting verse number or None
        to_verse: Ending verse number or None

    Returns:
        "<chapter>" when only a chapter is given, "<chapter>:<verse>" for a
        single verse (no end verse, or end equal to start),
        "<chapter>:<from>-<chapter>:<to>" for a range within the chapter,
        or None when no chapter is selected
    """
    # Nothing can be formatted without a chapter
    if not from_chapter:
        return None

    # Chapter only
    if not from_verse:
        return str(from_chapter)

    first = f"{from_chapter}:{from_verse}"

    # Missing or identical end verse collapses to a single verse
    if not to_verse or to_verse == from_verse:
        return first

    # Range within the same chapter
    return f"{first}-{from_chapter}:{to_verse}"


def match_text_to_verse(transcribed_text: str, verse_ref: str, stops: list = None):
    """
    Align transcribed text with a verse reference via the phonemizer.

    Used for text matching in segmented mode: the ASR output is handed to the
    phonemizer together with the reference, and the phonemizer reports which
    portion of the canonical text it matched.

    Args:
        transcribed_text: Arabic text from ASR transcription
        verse_ref: Verse reference to match against (e.g., "1:2" or "1:2-1:7")
        stops: List of stop types (default: ["compulsory_stop"])

    Returns:
        Tuple of (matched_text, phonemes, match_score, matched_ref)
        - matched_text: The canonical text portion that was matched
        - phonemes: Phoneme string for the matched portion
        - match_score: Match score reported by the phonemizer
        - matched_ref: The reference reported for the match (e.g., "1:2:1-1:2:4")
        If the phonemizer is unavailable or matching fails, the result is
        ("", "", 0.0, verse_ref).
    """
    active_stops = ["compulsory_stop"] if stops is None else stops
    no_match = ("", "", 0.0, verse_ref)

    phonemizer, _ = load_phonemizer()
    if phonemizer is None:
        return no_match

    try:
        matched = phonemizer.phonemize(
            ref_text=transcribed_text,
            ref=verse_ref,
            stops=active_stops
        )
        return (
            matched.text(),
            matched.phonemes_str(phoneme_sep=" ", word_sep="", verse_sep=""),
            matched.match_score,
            matched.ref,
        )
    except Exception as exc:
        print(f"[PHONEMIZER] Text matching error: {exc}")
        return no_match


def get_total_words_for_verse_range(verse_ref: str) -> int:
    """
    Get the total number of words for a verse reference from surah_info.json.

    Supported reference shapes:
        - "1:2"      single verse
        - "1:2-1:5"  verse range, possibly spanning several surahs ("1:6-2:3")
        - "2"        whole chapter (the chapter-only form that
                     format_verse_reference produces)

    Args:
        verse_ref: Verse reference like "1:2", "1:2-1:5" or "2"

    Returns:
        Total number of words across the referenced verses; 0 if the surah
        info is unavailable or the reference cannot be parsed
    """
    surah_info = load_surah_info()
    if not surah_info:
        return 0

    try:
        unbounded = float("inf")
        ref = str(verse_ref).strip()

        if '-' in ref:
            # Range: "1:2-1:5"
            start_part, end_part = ref.split('-')
            start_surah, start_verse = map(int, start_part.split(':'))
            end_surah, end_verse = map(int, end_part.split(':'))
        elif ':' in ref:
            # Single verse: "1:2"
            start_surah, start_verse = map(int, ref.split(':'))
            end_surah, end_verse = start_surah, start_verse
        else:
            # Chapter only: "2" covers every verse of that chapter
            start_surah = end_surah = int(ref)
            start_verse, end_verse = -unbounded, unbounded

        total_words = 0
        for surah_num in range(start_surah, end_surah + 1):
            surah_data = surah_info.get(str(surah_num))
            if not surah_data or "verses" not in surah_data:
                continue

            # The verse bounds only constrain the first and last surah of the
            # range; surahs in between contribute all of their verses.
            first_verse = start_verse if surah_num == start_surah else -unbounded
            last_verse = end_verse if surah_num == end_surah else unbounded

            total_words += sum(
                verse_data.get("num_words", 0)
                for verse_data in surah_data["verses"]
                if first_verse <= verse_data["verse"] <= last_verse
            )

        return total_words

    except Exception as e:
        # Best effort: malformed references or data yield 0 instead of raising
        print(f"[PHONEMIZER] Error getting word count for {verse_ref}: {e}")
        return 0