arrandi committed on
Commit
fdc4749
·
verified ·
1 Parent(s): bd694d7

Add phonemizer folder

Browse files
.gitattributes CHANGED
@@ -3,3 +3,6 @@
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
  *.ckpt filter=lfs diff=lfs merge=lfs -text
5
  *.t7 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
  *.ckpt filter=lfs diff=lfs merge=lfs -text
5
  *.t7 filter=lfs diff=lfs merge=lfs -text
6
+ phonemizer/dict/es_dicc.dic filter=lfs diff=lfs merge=lfs -text
7
+ phonemizer/dict/eu_dicc.dic filter=lfs diff=lfs merge=lfs -text
8
+ phonemizer/modulo1y2/modulo1y2 filter=lfs diff=lfs merge=lfs -text
phonemizer/dict/es_dicc.dic ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3880d688565dcfc4c1a239cb94c6cc0466b603cbf86fbf8a20ca411d64cb3c03
3
+ size 141770
phonemizer/dict/eu_dicc.dic ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a4c6553965ac7c7937b599d3e8a3d8d94df48a0bdef943a84c63f4b261172f8
3
+ size 865575
phonemizer/eu_phonemizer.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import logging
3
+ import string
4
+ from pathlib import Path
5
+ from collections import OrderedDict
6
+ from nltk.tokenize import TweetTokenizer
7
+ from typing import List, Dict, Optional
8
+ import re
9
+
10
+ # Constants
11
+ SUPPORTED_LANGUAGES = {'eu', 'es'}
12
+ SUPPORTED_SYMBOLS = {'sampa', 'ipa'}
13
+ SAMPA_TO_IPA = OrderedDict([
14
+ ("p", "p"), ("b", "b"), ("t", "t"), ("c", "c"), ("d", "d"),
15
+ ("k", "k"), ("g", "ɡ"), ("tS", "tʃ"), ("ts", "ts"), ("ts`", "tʂ"),
16
+ ("gj", "ɟ"), ("jj", "ʝ"), ("f", "f"), ("B", "β"), ("T", "θ"),
17
+ ("D", "ð"), ("s", "s"), ("s`", "ʂ"), ("S", "ʃ"), ("x", "x"),
18
+ ("G", "ɣ"), ("m", "m"), ("n", "n"), ("J", "ɲ"), ("l", "l"),
19
+ ("L", "ʎ"), ("r", "ɾ"), ("rr", "r"), ("j", "j"), ("w", "w"),
20
+ ("i", "i"), ("'i", "'i"), ("e", "e"), ("'e", "'e"), ("a", "a"),
21
+ ("'a", "'a"), ("o", "o"), ("'o", "'o"), ("u", "u"), ("'u", "'u"),
22
+ ("y", "y"), ("Z", "ʒ"), ("h", "h"), ("ph", "pʰ"), ("kh", "kʰ"),
23
+ ("th", "tʰ")
24
+ ])
25
+
26
+ MULTICHAR_TO_SINGLECHAR = {
27
+ "tʃ": "C",
28
+ "ts": "V",
29
+ "tʂ": "P",
30
+ "'i": "I",
31
+ "'e": "E",
32
+ "'a": "A",
33
+ "'o": "O",
34
+ "'u": "U",
35
+ "pʰ": "H",
36
+ "kʰ": "K",
37
+ "tʰ": "T"
38
+ }
39
+
40
+ class PhonemizerError(Exception):
41
+ """Custom exception for Phonemizer errors."""
42
+ pass
43
+
44
+ class Phonemizer:
45
+ def __init__(self, language: str = "eu", symbol: str = "sampa",
46
+ path_modulo1y2: str = "modulo1y2/modulo1y2",
47
+ path_dicts: str = "dict") -> None:
48
+ """Initialize the Phonemizer with the given language and symbol."""
49
+ if language not in SUPPORTED_LANGUAGES:
50
+ raise PhonemizerError(f"Unsupported language: {language}")
51
+ if symbol not in SUPPORTED_SYMBOLS:
52
+ raise PhonemizerError(f"Unsupported symbol type: {symbol}")
53
+
54
+ self.language = language
55
+ self.symbol = symbol
56
+ self.path_modulo1y2 = Path(path_modulo1y2)
57
+ self.path_dicts = Path(path_dicts)
58
+ self.logger = logging.getLogger(__name__)
59
+
60
+ # Initialize SAMPA to IPA dictionary
61
+ self._sampa_to_ipa_dict = SAMPA_TO_IPA
62
+
63
+ # Initialize word splitter regex
64
+ self._word_splitter = re.compile(r'\w+|[^\w\s]', re.UNICODE)
65
+
66
+ self._validate_paths()
67
+
68
+ def normalize(self, text: str) -> str:
69
+ """Normalize the given text using an external command."""
70
+ try:
71
+ command = self._build_normalization_command()
72
+ process = subprocess.Popen(
73
+ command,
74
+ stdin=subprocess.PIPE,
75
+ stdout=subprocess.PIPE,
76
+ stderr=subprocess.PIPE,
77
+ text=True,
78
+ encoding='ISO-8859-15',
79
+ shell=True
80
+ )
81
+ stdout, stderr = process.communicate(input=text)
82
+
83
+ if process.returncode != 0:
84
+ # Filter out the SetDur warning from the error message
85
+ filtered_stderr = '\n'.join(line for line in stderr.split('\n')
86
+ if 'Warning: argument not used SetDur' not in line)
87
+ if filtered_stderr.strip(): # Only raise error if there are other errors
88
+ error_msg = f"Normalization failed: {filtered_stderr}"
89
+ self.logger.error(error_msg)
90
+ raise PhonemizerError(error_msg)
91
+
92
+ return stdout.strip()
93
+
94
+ except Exception as e:
95
+ error_msg = f"Error during normalization: {str(e)}"
96
+ self.logger.error(error_msg)
97
+ return text
98
+
99
+ def getPhonemes(self, text: str, separate_phonemes: bool = False, use_single_char: bool = False) -> str:
100
+ """Extract phonemes from the given text.
101
+
102
+ Args:
103
+ text (str): The input text to convert to phonemes
104
+ separate_phonemes (bool): If True, keeps spaces between phonemes. If False, produces compact phoneme strings.
105
+ Defaults to False.
106
+ use_single_char (bool): When `symbol` is "ipa" and True, collapse multichar IPA sequences
107
+ into mapped single characters (uses `_transform_multichar_phonemes`).
108
+ Defaults to False.
109
+
110
+ Returns:
111
+ str: The phoneme sequence with words separated by " | "
112
+ """
113
+ try:
114
+ # Pre-process text to handle dots consistently
115
+ # Replace multiple dots with a single dot to avoid issues with ellipsis
116
+ text = re.sub(r'\.{2,}', '.', text)
117
+
118
+ # Process input line-by-line so we preserve original newlines
119
+ lines = text.split('\n')
120
+ per_line_outputs = []
121
+ for line in lines:
122
+ # If the input line is empty, preserve empty line
123
+ if not line.strip():
124
+ per_line_outputs.append('')
125
+ continue
126
+
127
+ command = self._build_phoneme_extraction_command()
128
+ proc = subprocess.Popen(
129
+ command,
130
+ stdin=subprocess.PIPE,
131
+ stdout=subprocess.PIPE,
132
+ stderr=subprocess.PIPE,
133
+ text=True,
134
+ encoding='ISO-8859-15',
135
+ shell=True
136
+ )
137
+ stdout, stderr = proc.communicate(input=line)
138
+ if proc.returncode != 0:
139
+ error_msg = f"Phoneme extraction failed: {stderr}"
140
+ self.logger.error(error_msg)
141
+ raise PhonemizerError(error_msg)
142
+
143
+ # Replace any internal newlines in tool output with sentinel (shouldn't normally occur for single line)
144
+ stdout_line = stdout.replace('\n', ' | _ | ')
145
+
146
+ # Split into words and handle each separately for this line
147
+ word_phonemes = stdout_line.split(" | ")
148
+ result_phonemes = []
149
+ cleaned_phonemes = []
150
+ for phoneme_seq in word_phonemes:
151
+ if not phoneme_seq.strip():
152
+ continue
153
+ if phoneme_seq.strip() == "_":
154
+ continue
155
+ cleaned_phonemes.append(phoneme_seq.strip())
156
+ # Tokenize the original line into words/punctuation
157
+ words = self._word_splitter.findall(line)
158
+
159
+ # Count non-punctuation words
160
+ non_punct_words = [w for w in words if w not in string.punctuation]
161
+
162
+ # Ensure we have enough phonemes for all non-punctuation words
163
+ if len(cleaned_phonemes) < len(non_punct_words):
164
+ while len(cleaned_phonemes) < len(non_punct_words):
165
+ if cleaned_phonemes:
166
+ cleaned_phonemes.append(cleaned_phonemes[-1])
167
+ else:
168
+ cleaned_phonemes.append("a")
169
+
170
+ # Process words and phonemes together for this line
171
+ phoneme_idx = 0
172
+ word_idx = 0
173
+ line_result = []
174
+
175
+ while word_idx < len(words):
176
+ word = words[word_idx]
177
+
178
+ if word in string.punctuation:
179
+ line_result.append(word)
180
+ word_idx += 1
181
+ continue
182
+
183
+ # Regular word processing
184
+ if phoneme_idx < len(cleaned_phonemes):
185
+ phonemes = cleaned_phonemes[phoneme_idx].split()
186
+ if self.symbol == "sampa":
187
+ if separate_phonemes:
188
+ processed_phonemes = " ".join(p for p in phonemes if p != "-")
189
+ else:
190
+ processed_phonemes = "".join(p for p in phonemes if p != "-")
191
+ else:
192
+ ipa_phonemes = [self._sampa_to_ipa_dict.get(p, p) for p in phonemes if p != "-"]
193
+ if separate_phonemes:
194
+ processed_phonemes = " ".join(ipa_phonemes)
195
+ else:
196
+ # Start with spaced IPA tokens to allow matching multichar tokens
197
+ processed_phonemes = " ".join(ipa_phonemes)
198
+ if use_single_char:
199
+ processed_phonemes = self._transform_multichar_phonemes(processed_phonemes)
200
+ # Remove spaces for compact form
201
+ processed_phonemes = processed_phonemes.replace(" ", "")
202
+
203
+ line_result.append(processed_phonemes)
204
+ phoneme_idx += 1
205
+ word_idx += 1
206
+ else:
207
+ # No phoneme left for this word: skip it
208
+ word_idx += 1
209
+
210
+ # If there are leftover phonemes, append them
211
+ while phoneme_idx < len(cleaned_phonemes):
212
+ phonemes = cleaned_phonemes[phoneme_idx].split()
213
+ if self.symbol == "sampa":
214
+ processed_phonemes = " ".join(p for p in phonemes if p != "-")
215
+ else:
216
+ ipa_phonemes = [self._sampa_to_ipa_dict.get(p, p) for p in phonemes if p != "-"]
217
+ if separate_phonemes:
218
+ processed_phonemes = " ".join(ipa_phonemes)
219
+ else:
220
+ processed_phonemes = " ".join(ipa_phonemes)
221
+ if use_single_char:
222
+ processed_phonemes = self._transform_multichar_phonemes(processed_phonemes)
223
+ processed_phonemes = processed_phonemes.replace(" ", "")
224
+
225
+ line_result.append(processed_phonemes)
226
+ phoneme_idx += 1
227
+
228
+ # Format final output for this line using spacing rules
229
+ out_parts = []
230
+ # Keep a parallel map to the original words so we can decide sentence splits
231
+ orig_map = []
232
+ for idx, token in enumerate(line_result):
233
+ is_punct = token in string.punctuation
234
+ if not is_punct:
235
+ normalized = re.sub(r"\s+", " ", token.strip())
236
+ out_parts.append(normalized)
237
+ # Map this output token to the corresponding original word (if available)
238
+ if idx < len(words):
239
+ orig_map.append(words[idx])
240
+ else:
241
+ orig_map.append(None)
242
+ else:
243
+ out_parts.append(token)
244
+ if idx < len(words):
245
+ orig_map.append(words[idx])
246
+ else:
247
+ orig_map.append(None)
248
+
249
+ final_line = ""
250
+ for i, tok in enumerate(out_parts):
251
+ if i == 0:
252
+ final_line += tok
253
+ continue
254
+
255
+ prev = out_parts[i-1]
256
+
257
+ if tok in string.punctuation:
258
+ final_line = final_line.rstrip(' ')
259
+ final_line += (' ' if separate_phonemes else ' ') + tok
260
+ # Preserve input line boundaries: do NOT insert newlines mid-line.
261
+ # Always add the standard separator after punctuation.
262
+ if i < len(out_parts) - 1:
263
+ final_line += (' ' if separate_phonemes else ' ')
264
+ else:
265
+ if prev in string.punctuation:
266
+ final_line += tok
267
+ else:
268
+ sep = ' ' if separate_phonemes else ' '
269
+ final_line += sep + tok
270
+
271
+ # If a sentence-ending punctuation is followed by a capital letter,
272
+ # split into separate lines (keeps numeric periods like "1980. urtean" intact).
273
+ # This turns "... ? Ni ..." into two lines at the sentence boundary.
274
+ split_line = re.sub(r"(?<=[\?\!\.])\s+(?=[A-ZÁÉÍÓÚÜÑ])", "\n", final_line)
275
+ per_line_outputs.append(split_line)
276
+
277
+ return "\n".join(per_line_outputs)
278
+
279
+ except Exception as e:
280
+ error_msg = f"Error in phoneme extraction: {str(e)}"
281
+ self.logger.error(error_msg)
282
+ return ""
283
+
284
+ def _build_normalization_command(self) -> str:
285
+ """Build the command string for normalization."""
286
+ modulo_path = self._get_file_path() / self.path_modulo1y2
287
+ dict_path = self._get_file_path() / self.path_dicts
288
+ dict_file = f"{self.language}_dicc"
289
+ return f'{modulo_path} -TxtMode=Word -Lang={self.language} -HDic={dict_path/dict_file}'
290
+
291
+ def _build_phoneme_extraction_command(self) -> str:
292
+ """Build the command string for phoneme extraction."""
293
+ modulo_path = self._get_file_path() / self.path_modulo1y2
294
+ dict_path = self._get_file_path() / self.path_dicts
295
+ dict_file = f"{self.language}_dicc"
296
+ return f'{modulo_path} -Lang={self.language} -HDic={dict_path/dict_file}'
297
+
298
+ def _get_file_path(self) -> Path:
299
+ return Path(__file__).parent
300
+
301
+ def _validate_paths(self) -> None:
302
+ """Validate paths with enhanced error reporting."""
303
+ try:
304
+ if not self.path_modulo1y2.exists():
305
+ raise PhonemizerError(f"Modulo1y2 executable not found at: {self.path_modulo1y2}")
306
+ if not self.path_dicts.exists():
307
+ raise PhonemizerError(f"Dictionary directory not found at: {self.path_dicts}")
308
+
309
+ # Check for both possible dictionary files
310
+ dict_file = self.path_dicts / f"{self.language}_dicc"
311
+ if not dict_file.exists():
312
+ # Try with .dic extension as fallback
313
+ dict_file_alt = self.path_dicts / f"{self.language}_dicc.dic"
314
+ if not dict_file_alt.exists():
315
+ raise PhonemizerError(f"Dictionary file not found at either {dict_file} or {dict_file_alt}")
316
+
317
+ except Exception as e:
318
+ self.logger.error(f"Path validation error: {str(e)}")
319
+ raise
320
+
321
+ def _transform_multichar_phonemes(self, phoneme_sequence: str) -> str:
322
+ """
323
+ Transform multicharacter IPA phonemes to single characters using the MULTICHAR_TO_SINGLECHAR mapping.
324
+
325
+ Args:
326
+ phoneme_sequence (str): A string containing phonemes separated by spaces
327
+
328
+ Returns:
329
+ str: The transformed phoneme sequence with multicharacter phonemes replaced by single characters
330
+ """
331
+ # Split the sequence into individual phonemes
332
+ phonemes = phoneme_sequence.split()
333
+ transformed_phonemes = []
334
+
335
+ for phoneme in phonemes:
336
+ # Check if the phoneme exists in our mapping
337
+ if phoneme in MULTICHAR_TO_SINGLECHAR:
338
+ transformed_phonemes.append(MULTICHAR_TO_SINGLECHAR[phoneme])
339
+ else:
340
+ transformed_phonemes.append(phoneme)
341
+
342
+ return " ".join(transformed_phonemes)
343
+
344
+
phonemizer/modulo1y2/modulo1y2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c122bd6197e5e360d534957322f8d98a06cb3bcb4d412ee9978e891ae1b43e8a
3
+ size 2245952