| | |
| | |
| | |
| | |
| |
|
| | """ This code is modified from https://github.com/keithito/tacotron """ |
| |
|
import os
import re
| |
|
| |
|
# Two-letter symbols that appear both bare and with a "0", "1" or "2" suffix.
_SUFFIXED_SYMBOLS = (
    "AA", "AE", "AH", "AO", "AW", "AY", "EH", "ER", "EY", "IH", "IY",
    "OW", "OY", "UH", "UW",
)
_SUFFIXES = ("", "0", "1", "2")

# Symbols that only ever appear bare.
_PLAIN_SYMBOLS = (
    "B", "CH", "D", "DH", "F", "G", "HH", "JH", "K", "L", "M", "N", "NG",
    "P", "R", "S", "SH", "T", "TH", "V", "W", "Y", "Z", "ZH",
)

# Full symbol inventory as a list in plain string sort order
# ("AA", "AA0", "AA1", "AA2", "AE", ..., "Z", "ZH").
valid_symbols = sorted(
    [base + suffix for base in _SUFFIXED_SYMBOLS for suffix in _SUFFIXES]
    + list(_PLAIN_SYMBOLS)
)

# Set form of the same inventory for constant-time membership tests.
_valid_symbol_set = set(valid_symbols)
| |
|
| |
|
class CMUDict:
    """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict

    Holds a mapping from each word to the list of its pronunciation strings,
    as returned by ``_parse_cmudict``.
    """

    def __init__(self, file_or_path, keep_ambiguous=True):
        """Load the dictionary from a path or from an iterable of lines.

        Args:
            file_or_path: Either a filesystem path (``str``, ``bytes`` or any
                ``os.PathLike`` such as ``pathlib.Path``), which is opened
                with latin-1 encoding, or an already-open text file / other
                iterable of lines, which is parsed directly.
            keep_ambiguous: When false, words that have more than one
                pronunciation are dropped from the dictionary.
        """
        # Any path-like value is opened here; everything else is assumed to
        # be iterable line by line and is handed to the parser unchanged.
        if isinstance(file_or_path, (str, bytes, os.PathLike)):
            with open(file_or_path, encoding="latin-1") as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {
                word: prons for word, prons in entries.items() if len(prons) == 1
            }
        self._entries = entries

    def __len__(self):
        """Return the number of distinct words in the dictionary."""
        return len(self._entries)

    def lookup(self, word):
        """Returns list of ARPAbet pronunciations of the given word.

        The word is upper-cased before the lookup; ``None`` is returned when
        the word is not in the dictionary.
        """
        return self._entries.get(word.upper())
|
| |
|
# Matches a parenthesised number such as "(1)"; it is stripped from the word
# field so that numbered variants of a word share one dictionary key.
_alt_re = re.compile(r"\([0-9]+\)")


def _parse_cmudict(file):
    """Parse dictionary lines into a ``{word: [pronunciation, ...]}`` mapping.

    Each entry line is a word, a run of whitespace, then the pronunciation.
    Only lines whose first character is ``A``-``Z`` or an apostrophe are
    treated as entries. Lines without a pronunciation field, and entries whose
    pronunciation is rejected by ``_get_pronunciation``, are skipped.

    Args:
        file: Iterable yielding text lines (for example an open text file).

    Returns:
        dict mapping each word (with any ``(N)`` marker removed) to the list
        of its pronunciation strings, in the order encountered.
    """
    cmudict = {}
    for line in file:
        first = line[:1]  # empty string for an empty line, so no IndexError
        if not ("A" <= first <= "Z" or first == "'"):
            continue
        # Split once on the first whitespace run so the whole remainder of the
        # line is the pronunciation, whether the separator is one space or two.
        fields = line.split(None, 1)
        if len(fields) < 2:
            # A word with no pronunciation field: nothing to record.
            continue
        word = _alt_re.sub("", fields[0])
        pronunciation = _get_pronunciation(fields[1])
        if pronunciation:
            cmudict.setdefault(word, []).append(pronunciation)
    return cmudict
| |
|
| |
|
def _get_pronunciation(s):
    """Validate a pronunciation string against the known symbol set.

    Args:
        s: Raw pronunciation text; surrounding whitespace is ignored and the
            remainder is split on single spaces.

    Returns:
        The symbols re-joined with single spaces, or ``None`` as soon as any
        token is not a member of ``_valid_symbol_set``.
    """
    symbols = s.strip().split(" ")
    if any(symbol not in _valid_symbol_set for symbol in symbols):
        return None
    return " ".join(symbols)
| |
|