import subprocess
import logging
import string
from pathlib import Path
from collections import OrderedDict
from nltk.tokenize import TweetTokenizer
from typing import List, Dict, Optional
import re

# Constants
SUPPORTED_LANGUAGES = {'eu', 'es'}
SUPPORTED_SYMBOLS = {'sampa', 'ipa'}
SAMPA_TO_IPA = OrderedDict([
    ("p", "p"), ("b", "b"), ("t", "t"), ("c", "c"), ("d", "d"),
    ("k", "k"), ("g", "ɡ"), ("tS", "tʃ"), ("ts", "ts"), ("ts`", "tʂ"),
    ("gj", "ɟ"), ("jj", "ʝ"), ("f", "f"), ("B", "β"), ("T", "θ"),
    ("D", "ð"), ("s", "s"), ("s`", "ʂ"), ("S", "ʃ"), ("x", "x"),
    ("G", "ɣ"), ("m", "m"), ("n", "n"), ("J", "ɲ"), ("l", "l"),
    ("L", "ʎ"), ("r", "ɾ"), ("rr", "r"), ("j", "j"), ("w", "w"),
    ("i", "i"), ("'i", "'i"), ("e", "e"), ("'e", "'e"), ("a", "a"),
    ("'a", "'a"), ("o", "o"), ("'o", "'o"), ("u", "u"), ("'u", "'u"),
    ("y", "y"), ("Z", "ʒ"), ("h", "h"), ("ph", "pʰ"), ("kh", "kʰ"),
    ("th", "tʰ")
])

MULTICHAR_TO_SINGLECHAR = {
    "tʃ": "C",
    "ts": "V",
    "tʂ": "P",
    "'i": "I",
    "'e": "E",
    "'a": "A",
    "'o": "O",
    "'u": "U",
    "pʰ": "H",
    "kʰ": "K",
    "tʰ": "T"
}

class PhonemizerError(Exception):
    """Custom exception for Phonemizer errors."""
    pass

class Phonemizer:
    def __init__(self, language: str = "eu", symbol: str = "sampa", 
                path_modulo1y2: str = "modulo1y2/modulo1y2", 
                path_dicts: str = "dict") -> None:
        """Initialize the Phonemizer with the given language and symbol."""
        if language not in SUPPORTED_LANGUAGES:
            raise PhonemizerError(f"Unsupported language: {language}")
        if symbol not in SUPPORTED_SYMBOLS:
            raise PhonemizerError(f"Unsupported symbol type: {symbol}")

        self.language = language
        self.symbol = symbol
        self.path_modulo1y2 = Path(path_modulo1y2)
        self.path_dicts = Path(path_dicts)
        self.logger = logging.getLogger(__name__)
        
        # Initialize SAMPA to IPA dictionary
        self._sampa_to_ipa_dict = SAMPA_TO_IPA
        
        # Initialize word splitter regex
        self._word_splitter = re.compile(r'\w+|[^\w\s]', re.UNICODE)
        
        self._validate_paths()

    def normalize(self, text: str) -> str:
        """Normalize the given text using an external command."""
        try:
            command = self._build_normalization_command()
            process = subprocess.Popen(
                command,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                encoding='ISO-8859-15',
                shell=True
            )
            stdout, stderr = process.communicate(input=text)
            
            if process.returncode != 0:
                # Filter out the SetDur warning from the error message
                filtered_stderr = '\n'.join(line for line in stderr.split('\n') 
                                          if 'Warning: argument not used SetDur' not in line)
                if filtered_stderr.strip():  # Only raise error if there are other errors
                    error_msg = f"Normalization failed: {filtered_stderr}"
                    self.logger.error(error_msg)
                    raise PhonemizerError(error_msg)
            
            return stdout.strip()
            
        except Exception as e:
            error_msg = f"Error during normalization: {str(e)}"
            self.logger.error(error_msg)
            return text

    def getPhonemes(self, text: str, use_single_char: bool = False) -> str:
        """Extract phonemes from the given text.
        
        Args:
            text (str): The input text to convert to phonemes
            use_single_char (bool): If True, converts multi-character IPA phonemes to single characters
                                and joins them without spaces. If False, keeps phonemes separated by spaces.
                                Only applies when symbol="ipa". Defaults to False.
        
        Returns:
            str: The phoneme sequence with words separated by " | "
        """
        try:
            # Pre-process text to handle dots consistently
            # Replace multiple dots with a single dot to avoid issues with ellipsis
            text = re.sub(r'\.{2,}', '.', text)
            
            command = self._build_phoneme_extraction_command()
            process = subprocess.Popen(
                command,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                encoding='ISO-8859-15',
                shell=True
            )
            
            stdout, stderr = process.communicate(input=text)
            
            if process.returncode != 0:
                error_msg = f"Phoneme extraction failed: {stderr}"
                self.logger.error(error_msg)
                raise PhonemizerError(error_msg)
            
            # Handle newlines in the raw phonemes output
            # Replace newlines with underscores, similar to how other punctuation is handled
            stdout = stdout.replace('\n', ' | _ | ')
            
            # Get words and punctuation from normalized text
            words = self._word_splitter.findall(text)
            
            # Split into words and handle each separately
            word_phonemes = stdout.split(" | ")
            result_phonemes = []
            
            # Clean and prepare phoneme sequences
            cleaned_phonemes = []
            for phoneme_seq in word_phonemes:
                if not phoneme_seq.strip():
                    continue
                # Remove underscores and clean up the sequence
                if phoneme_seq.strip() == "_":
                    continue
                cleaned_phonemes.append(phoneme_seq.strip())
            
            # Count non-punctuation words and punctuation marks separately
            non_punct_words = [w for w in words if w not in string.punctuation]
            punct_marks = [w for w in words if w in string.punctuation]
            
            # Ensure we have enough phonemes for all non-punctuation words
            if len(cleaned_phonemes) < len(non_punct_words):
                # If not, duplicate the last phoneme
                while len(cleaned_phonemes) < len(non_punct_words):
                    if cleaned_phonemes:
                        cleaned_phonemes.append(cleaned_phonemes[-1])
                    else:
                        # If no phonemes at all, add a placeholder
                        cleaned_phonemes.append("a")
            
            # Process words and phonemes together
            phoneme_idx = 0
            word_idx = 0
            
            while word_idx < len(words):
                word = words[word_idx]
                
                if word in string.punctuation:
                    # Add punctuation mark directly
                    result_phonemes.append(word)
                    word_idx += 1
                else:
                    # Process regular word
                    if phoneme_idx < len(cleaned_phonemes):
                        phonemes = cleaned_phonemes[phoneme_idx].split()
                        if self.symbol == "sampa":
                            # For SAMPA, join and remove hyphens
                            processed_phonemes = " ".join(p for p in phonemes if p != "-")
                        else:
                            # For IPA, convert and remove hyphens
                            ipa_phonemes = [self._sampa_to_ipa_dict.get(p, p) for p in phonemes if p != "-"]
                            processed_phonemes = " ".join(ipa_phonemes)
                            if use_single_char:
                                processed_phonemes = self._transform_multichar_phonemes(processed_phonemes)
                                # Join phonemes when use_single_char is True
                                processed_phonemes = processed_phonemes.replace(" ", "")
                        
                        result_phonemes.append(processed_phonemes)
                        phoneme_idx += 1
                        word_idx += 1
                    else:
                        # If we run out of phonemes but still have words, skip the word
                        word_idx += 1
            
            # If we have more phonemes than words, add them as is
            while phoneme_idx < len(cleaned_phonemes):
                phonemes = cleaned_phonemes[phoneme_idx].split()
                if self.symbol == "sampa":
                    processed_phonemes = " ".join(p for p in phonemes if p != "-")
                else:
                    ipa_phonemes = [self._sampa_to_ipa_dict.get(p, p) for p in phonemes if p != "-"]
                    processed_phonemes = " ".join(ipa_phonemes)
                    if use_single_char:
                        processed_phonemes = self._transform_multichar_phonemes(processed_phonemes)
                        # Join phonemes when use_single_char is True
                        processed_phonemes = processed_phonemes.replace(" ", "")
                
                result_phonemes.append(processed_phonemes)
                phoneme_idx += 1
            
            return " | ".join(result_phonemes)
            
        except Exception as e:
            error_msg = f"Error in phoneme extraction: {str(e)}"
            self.logger.error(error_msg)
            return ""

    def _build_normalization_command(self) -> str:
        """Build the command string for normalization."""
        modulo_path = self._get_file_path() / self.path_modulo1y2
        dict_path = self._get_file_path() / self.path_dicts
        dict_file = f"{self.language}_dicc"
        return f'{modulo_path} -TxtMode=Word -Lang={self.language} -HDic={dict_path/dict_file}'

    def _build_phoneme_extraction_command(self) -> str:
        """Build the command string for phoneme extraction."""
        modulo_path = self._get_file_path() / self.path_modulo1y2
        dict_path = self._get_file_path() / self.path_dicts
        dict_file = f"{self.language}_dicc"
        return f'{modulo_path} -Lang={self.language} -HDic={dict_path/dict_file}'

    def _get_file_path(self) -> Path:
        return Path(__file__).parent

    def _validate_paths(self) -> None:
        """Validate paths with enhanced error reporting."""
        try:
            if not self.path_modulo1y2.exists():
                raise PhonemizerError(f"Modulo1y2 executable not found at: {self.path_modulo1y2}")
            if not self.path_dicts.exists():
                raise PhonemizerError(f"Dictionary directory not found at: {self.path_dicts}")
            
            # Check for both possible dictionary files
            dict_file = self.path_dicts / f"{self.language}_dicc"
            if not dict_file.exists():
                # Try with .dic extension as fallback
                dict_file_alt = self.path_dicts / f"{self.language}_dicc.dic"
                if not dict_file_alt.exists():
                    raise PhonemizerError(f"Dictionary file not found at either {dict_file} or {dict_file_alt}")
                
        except Exception as e:
            self.logger.error(f"Path validation error: {str(e)}")
            raise 

    def _transform_multichar_phonemes(self, phoneme_sequence: str) -> str:
        """
        Transform multicharacter IPA phonemes to single characters using the MULTICHAR_TO_SINGLECHAR mapping.
        
        Args:
            phoneme_sequence (str): A string containing phonemes separated by spaces
            
        Returns:
            str: The transformed phoneme sequence with multicharacter phonemes replaced by single characters
        """
        # Split the sequence into individual phonemes
        phonemes = phoneme_sequence.split()
        transformed_phonemes = []
        
        for phoneme in phonemes:
            # Check if the phoneme exists in our mapping
            if phoneme in MULTICHAR_TO_SINGLECHAR:
                transformed_phonemes.append(MULTICHAR_TO_SINGLECHAR[phoneme])
            else:
                transformed_phonemes.append(phoneme)
        
        return " ".join(transformed_phonemes)