arrandi committed on
Commit
fdc4749
·
verified ·
1 Parent(s): bd694d7

Add phonemizer folder

Browse files
.gitattributes CHANGED
@@ -3,3 +3,6 @@
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
  *.ckpt filter=lfs diff=lfs merge=lfs -text
5
  *.t7 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
  *.ckpt filter=lfs diff=lfs merge=lfs -text
5
  *.t7 filter=lfs diff=lfs merge=lfs -text
6
+ phonemizer/dict/es_dicc.dic filter=lfs diff=lfs merge=lfs -text
7
+ phonemizer/dict/eu_dicc.dic filter=lfs diff=lfs merge=lfs -text
8
+ phonemizer/modulo1y2/modulo1y2 filter=lfs diff=lfs merge=lfs -text
phonemizer/dict/es_dicc.dic ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3880d688565dcfc4c1a239cb94c6cc0466b603cbf86fbf8a20ca411d64cb3c03
3
+ size 141770
phonemizer/dict/eu_dicc.dic ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a4c6553965ac7c7937b599d3e8a3d8d94df48a0bdef943a84c63f4b261172f8
3
+ size 865575
phonemizer/eu_phonemizer.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import logging
3
+ import string
4
+ from pathlib import Path
5
+ from collections import OrderedDict
6
+ from nltk.tokenize import TweetTokenizer
7
+ from typing import List, Dict, Optional
8
+ import re
9
+
10
+ # Constants
11
+ SUPPORTED_LANGUAGES = {'eu', 'es'}
12
+ SUPPORTED_SYMBOLS = {'sampa', 'ipa'}
13
+ SAMPA_TO_IPA = OrderedDict([
14
+ ("p", "p"), ("b", "b"), ("t", "t"), ("c", "c"), ("d", "d"),
15
+ ("k", "k"), ("g", "ɡ"), ("tS", "tʃ"), ("ts", "ts"), ("ts`", "tʂ"),
16
+ ("gj", "ɟ"), ("jj", "ʝ"), ("f", "f"), ("B", "β"), ("T", "θ"),
17
+ ("D", "ð"), ("s", "s"), ("s`", "ʂ"), ("S", "ʃ"), ("x", "x"),
18
+ ("G", "ɣ"), ("m", "m"), ("n", "n"), ("J", "ɲ"), ("l", "l"),
19
+ ("L", "ʎ"), ("r", "ɾ"), ("rr", "r"), ("j", "j"), ("w", "w"),
20
+ ("i", "i"), ("'i", "'i"), ("e", "e"), ("'e", "'e"), ("a", "a"),
21
+ ("'a", "'a"), ("o", "o"), ("'o", "'o"), ("u", "u"), ("'u", "'u"),
22
+ ("y", "y"), ("Z", "ʒ"), ("h", "h"), ("ph", "pʰ"), ("kh", "kʰ"),
23
+ ("th", "tʰ")
24
+ ])
25
+
26
+ MULTICHAR_TO_SINGLECHAR = {
27
+ "tʃ": "C",
28
+ "ts": "V",
29
+ "tʂ": "P",
30
+ "'i": "I",
31
+ "'e": "E",
32
+ "'a": "A",
33
+ "'o": "O",
34
+ "'u": "U",
35
+ "pʰ": "H",
36
+ "kʰ": "K",
37
+ "tʰ": "T"
38
+ }
39
+
40
+ class PhonemizerError(Exception):
41
+ """Custom exception for Phonemizer errors."""
42
+ pass
43
+
44
+ class Phonemizer:
45
+ def __init__(self, language: str = "eu", symbol: str = "sampa",
46
+ path_modulo1y2: str = "modulo1y2/modulo1y2",
47
+ path_dicts: str = "dict") -> None:
48
+ """Initialize the Phonemizer with the given language and symbol."""
49
+ if language not in SUPPORTED_LANGUAGES:
50
+ raise PhonemizerError(f"Unsupported language: {language}")
51
+ if symbol not in SUPPORTED_SYMBOLS:
52
+ raise PhonemizerError(f"Unsupported symbol type: {symbol}")
53
+
54
+ self.language = language
55
+ self.symbol = symbol
56
+ self.path_modulo1y2 = Path(path_modulo1y2)
57
+ self.path_dicts = Path(path_dicts)
58
+ self.logger = logging.getLogger(__name__)
59
+
60
+ # Initialize SAMPA to IPA dictionary
61
+ self._sampa_to_ipa_dict = SAMPA_TO_IPA
62
+
63
+ # Initialize word splitter regex
64
+ self._word_splitter = re.compile(r'\w+|[^\w\s]', re.UNICODE)
65
+
66
+ self._validate_paths()
67
+
68
+ def normalize(self, text: str) -> str:
69
+ """Normalize the given text using an external command."""
70
+ try:
71
+ command = self._build_normalization_command()
72
+ process = subprocess.Popen(
73
+ command,
74
+ stdin=subprocess.PIPE,
75
+ stdout=subprocess.PIPE,
76
+ stderr=subprocess.PIPE,
77
+ text=True,
78
+ encoding='ISO-8859-15',
79
+ shell=True
80
+ )
81
+ stdout, stderr = process.communicate(input=text)
82
+
83
+ if process.returncode != 0:
84
+ # Filter out the SetDur warning from the error message
85
+ filtered_stderr = '\n'.join(line for line in stderr.split('\n')
86
+ if 'Warning: argument not used SetDur' not in line)
87
+ if filtered_stderr.strip(): # Only raise error if there are other errors
88
+ error_msg = f"Normalization failed: {filtered_stderr}"
89
+ self.logger.error(error_msg)
90
+ raise PhonemizerError(error_msg)
91
+
92
+ return stdout.strip()
93
+
94
+ except Exception as e:
95
+ error_msg = f"Error during normalization: {str(e)}"
96
+ self.logger.error(error_msg)
97
+ return text
98
+
99
+ def getPhonemes(self, text: str, separate_phonemes: bool = False, use_single_char: bool = False) -> str:
100
+ """Extract phonemes from the given text.
101
+
102
+ Args:
103
+ text (str): The input text to convert to phonemes
104
+ separate_phonemes (bool): If True, keeps spaces between phonemes. If False, produces compact phoneme strings.
105
+ Defaults to False.
106
+ use_single_char (bool): When `symbol` is "ipa" and True, collapse multichar IPA sequences
107
+ into mapped single characters (uses `_transform_multichar_phonemes`).
108
+ Defaults to False.
109
+
110
+ Returns:
111
+ str: The phoneme sequence with words separated by " | "
112
+ """
113
+ try:
114
+ # Pre-process text to handle dots consistently
115
+ # Replace multiple dots with a single dot to avoid issues with ellipsis
116
+ text = re.sub(r'\.{2,}', '.', text)
117
+
118
+ # Process input line-by-line so we preserve original newlines
119
+ lines = text.split('\n')
120
+ per_line_outputs = []
121
+ for line in lines:
122
+ # If the input line is empty, preserve empty line
123
+ if not line.strip():
124
+ per_line_outputs.append('')
125
+ continue
126
+
127
+ command = self._build_phoneme_extraction_command()
128
+ proc = subprocess.Popen(
129
+ command,
130
+ stdin=subprocess.PIPE,
131
+ stdout=subprocess.PIPE,
132
+ stderr=subprocess.PIPE,
133
+ text=True,
134
+ encoding='ISO-8859-15',
135
+ shell=True
136
+ )
137
+ stdout, stderr = proc.communicate(input=line)
138
+ if proc.returncode != 0:
139
+ error_msg = f"Phoneme extraction failed: {stderr}"
140
+ self.logger.error(error_msg)
141
+ raise PhonemizerError(error_msg)
142
+
143
+ # Replace any internal newlines in tool output with sentinel (shouldn't normally occur for single line)
144
+ stdout_line = stdout.replace('\n', ' | _ | ')
145
+
146
+ # Split into words and handle each separately for this line
147
+ word_phonemes = stdout_line.split(" | ")
148
+ result_phonemes = []
149
+ cleaned_phonemes = []
150
+ for phoneme_seq in word_phonemes:
151
+ if not phoneme_seq.strip():
152
+ continue
153
+ if phoneme_seq.strip() == "_":
154
+ continue
155
+ cleaned_phonemes.append(phoneme_seq.strip())
156
+ # Tokenize the original line into words/punctuation
157
+ words = self._word_splitter.findall(line)
158
+
159
+ # Count non-punctuation words
160
+ non_punct_words = [w for w in words if w not in string.punctuation]
161
+
162
+ # Ensure we have enough phonemes for all non-punctuation words
163
+ if len(cleaned_phonemes) < len(non_punct_words):
164
+ while len(cleaned_phonemes) < len(non_punct_words):
165
+ if cleaned_phonemes:
166
+ cleaned_phonemes.append(cleaned_phonemes[-1])
167
+ else:
168
+ cleaned_phonemes.append("a")
169
+
170
+ # Process words and phonemes together for this line
171
+ phoneme_idx = 0
172
+ word_idx = 0
173
+ line_result = []
174
+
175
+ while word_idx < len(words):
176
+ word = words[word_idx]
177
+
178
+ if word in string.punctuation:
179
+ line_result.append(word)
180
+ word_idx += 1
181
+ continue
182
+
183
+ # Regular word processing
184
+ if phoneme_idx < len(cleaned_phonemes):
185
+ phonemes = cleaned_phonemes[phoneme_idx].split()
186
+ if self.symbol == "sampa":
187
+ if separate_phonemes:
188
+ processed_phonemes = " ".join(p for p in phonemes if p != "-")
189
+ else:
190
+ processed_phonemes = "".join(p for p in phonemes if p != "-")
191
+ else:
192
+ ipa_phonemes = [self._sampa_to_ipa_dict.get(p, p) for p in phonemes if p != "-"]
193
+ if separate_phonemes:
194
+ processed_phonemes = " ".join(ipa_phonemes)
195
+ else:
196
+ # Start with spaced IPA tokens to allow matching multichar tokens
197
+ processed_phonemes = " ".join(ipa_phonemes)
198
+ if use_single_char:
199
+ processed_phonemes = self._transform_multichar_phonemes(processed_phonemes)
200
+ # Remove spaces for compact form
201
+ processed_phonemes = processed_phonemes.replace(" ", "")
202
+
203
+ line_result.append(processed_phonemes)
204
+ phoneme_idx += 1
205
+ word_idx += 1
206
+ else:
207
+ # No phoneme left for this word: skip it
208
+ word_idx += 1
209
+
210
+ # If there are leftover phonemes, append them
211
+ while phoneme_idx < len(cleaned_phonemes):
212
+ phonemes = cleaned_phonemes[phoneme_idx].split()
213
+ if self.symbol == "sampa":
214
+ processed_phonemes = " ".join(p for p in phonemes if p != "-")
215
+ else:
216
+ ipa_phonemes = [self._sampa_to_ipa_dict.get(p, p) for p in phonemes if p != "-"]
217
+ if separate_phonemes:
218
+ processed_phonemes = " ".join(ipa_phonemes)
219
+ else:
220
+ processed_phonemes = " ".join(ipa_phonemes)
221
+ if use_single_char:
222
+ processed_phonemes = self._transform_multichar_phonemes(processed_phonemes)
223
+ processed_phonemes = processed_phonemes.replace(" ", "")
224
+
225
+ line_result.append(processed_phonemes)
226
+ phoneme_idx += 1
227
+
228
+ # Format final output for this line using spacing rules
229
+ out_parts = []
230
+ # Keep a parallel map to the original words so we can decide sentence splits
231
+ orig_map = []
232
+ for idx, token in enumerate(line_result):
233
+ is_punct = token in string.punctuation
234
+ if not is_punct:
235
+ normalized = re.sub(r"\s+", " ", token.strip())
236
+ out_parts.append(normalized)
237
+ # Map this output token to the corresponding original word (if available)
238
+ if idx < len(words):
239
+ orig_map.append(words[idx])
240
+ else:
241
+ orig_map.append(None)
242
+ else:
243
+ out_parts.append(token)
244
+ if idx < len(words):
245
+ orig_map.append(words[idx])
246
+ else:
247
+ orig_map.append(None)
248
+
249
+ final_line = ""
250
+ for i, tok in enumerate(out_parts):
251
+ if i == 0:
252
+ final_line += tok
253
+ continue
254
+
255
+ prev = out_parts[i-1]
256
+
257
+ if tok in string.punctuation:
258
+ final_line = final_line.rstrip(' ')
259
+ final_line += (' ' if separate_phonemes else ' ') + tok
260
+ # Preserve input line boundaries: do NOT insert newlines mid-line.
261
+ # Always add the standard separator after punctuation.
262
+ if i < len(out_parts) - 1:
263
+ final_line += (' ' if separate_phonemes else ' ')
264
+ else:
265
+ if prev in string.punctuation:
266
+ final_line += tok
267
+ else:
268
+ sep = ' ' if separate_phonemes else ' '
269
+ final_line += sep + tok
270
+
271
+ # If a sentence-ending punctuation is followed by a capital letter,
272
+ # split into separate lines (keeps numeric periods like "1980. urtean" intact).
273
+ # This turns "... ? Ni ..." into two lines at the sentence boundary.
274
+ split_line = re.sub(r"(?<=[\?\!\.])\s+(?=[A-ZÁÉÍÓÚÜÑ])", "\n", final_line)
275
+ per_line_outputs.append(split_line)
276
+
277
+ return "\n".join(per_line_outputs)
278
+
279
+ except Exception as e:
280
+ error_msg = f"Error in phoneme extraction: {str(e)}"
281
+ self.logger.error(error_msg)
282
+ return ""
283
+
284
+ def _build_normalization_command(self) -> str:
285
+ """Build the command string for normalization."""
286
+ modulo_path = self._get_file_path() / self.path_modulo1y2
287
+ dict_path = self._get_file_path() / self.path_dicts
288
+ dict_file = f"{self.language}_dicc"
289
+ return f'{modulo_path} -TxtMode=Word -Lang={self.language} -HDic={dict_path/dict_file}'
290
+
291
+ def _build_phoneme_extraction_command(self) -> str:
292
+ """Build the command string for phoneme extraction."""
293
+ modulo_path = self._get_file_path() / self.path_modulo1y2
294
+ dict_path = self._get_file_path() / self.path_dicts
295
+ dict_file = f"{self.language}_dicc"
296
+ return f'{modulo_path} -Lang={self.language} -HDic={dict_path/dict_file}'
297
+
298
+ def _get_file_path(self) -> Path:
299
+ return Path(__file__).parent
300
+
301
+ def _validate_paths(self) -> None:
302
+ """Validate paths with enhanced error reporting."""
303
+ try:
304
+ if not self.path_modulo1y2.exists():
305
+ raise PhonemizerError(f"Modulo1y2 executable not found at: {self.path_modulo1y2}")
306
+ if not self.path_dicts.exists():
307
+ raise PhonemizerError(f"Dictionary directory not found at: {self.path_dicts}")
308
+
309
+ # Check for both possible dictionary files
310
+ dict_file = self.path_dicts / f"{self.language}_dicc"
311
+ if not dict_file.exists():
312
+ # Try with .dic extension as fallback
313
+ dict_file_alt = self.path_dicts / f"{self.language}_dicc.dic"
314
+ if not dict_file_alt.exists():
315
+ raise PhonemizerError(f"Dictionary file not found at either {dict_file} or {dict_file_alt}")
316
+
317
+ except Exception as e:
318
+ self.logger.error(f"Path validation error: {str(e)}")
319
+ raise
320
+
321
+ def _transform_multichar_phonemes(self, phoneme_sequence: str) -> str:
322
+ """
323
+ Transform multicharacter IPA phonemes to single characters using the MULTICHAR_TO_SINGLECHAR mapping.
324
+
325
+ Args:
326
+ phoneme_sequence (str): A string containing phonemes separated by spaces
327
+
328
+ Returns:
329
+ str: The transformed phoneme sequence with multicharacter phonemes replaced by single characters
330
+ """
331
+ # Split the sequence into individual phonemes
332
+ phonemes = phoneme_sequence.split()
333
+ transformed_phonemes = []
334
+
335
+ for phoneme in phonemes:
336
+ # Check if the phoneme exists in our mapping
337
+ if phoneme in MULTICHAR_TO_SINGLECHAR:
338
+ transformed_phonemes.append(MULTICHAR_TO_SINGLECHAR[phoneme])
339
+ else:
340
+ transformed_phonemes.append(phoneme)
341
+
342
+ return " ".join(transformed_phonemes)
343
+
344
+
phonemizer/modulo1y2/modulo1y2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c122bd6197e5e360d534957322f8d98a06cb3bcb4d412ee9978e891ae1b43e8a
3
+ size 2245952