| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import itertools |
| | import os |
| | from argparse import ArgumentParser |
| | from typing import Dict |
| |
|
| | from syllabify import syllabify |
| |
|
| |
|
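"""
Usage:
    cd NeMo/scripts && python dataset_processing/g2p/convert_cmu_arpabet_to_ipa.py --ipa_out=<path to output IPA dictionary>

    --ipa_out is required; --cmu_arpabet and --mapping default to files under tts_dataset_files/.
"""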
| |
|
| |
|
def parse_args(argv=None):
    """
    Parses command-line arguments of the ARPABET to IPA conversion script.

    Args:
        argv: optional list of argument strings to parse instead of the process
            command line; when None, argparse reads sys.argv[1:]

    Returns:
        argparse.Namespace with `cmu_arpabet`, `ipa_out` and `mapping` attributes
    """
    # The title must be passed as `description`: the first positional argument of
    # ArgumentParser is `prog`, which would replace the program name in the usage line.
    parser = ArgumentParser(description="ARPABET to IPA conversion script")
    parser.add_argument(
        '--cmu_arpabet',
        help="Path to CMU ARPABET dictionary file",
        type=str,
        default="tts_dataset_files/cmudict-0.7b_nv22.10",
    )
    parser.add_argument("--ipa_out", help="Path to save IPA version of the dictionary", type=str, required=True)
    parser.add_argument(
        "--mapping",
        help="ARPABET to IPA phoneme mapping file",
        type=str,
        default="tts_dataset_files/cmudict-arpabet_to_ipa_nv22.10.tsv",
    )
    return parser.parse_args(argv)
| |
|
| |
|
def convert_arp_to_ipa(arp_to_ipa_dict: Dict[str, str], arp_input: str, remove_space: bool = False) -> str:
    """
    Converts an ARPABET pronunciation to IPA using the arp_to_ipa_dict mapping.

    The input is split on whitespace into phonemes and grouped into syllables by `syllabify`.
    Each syllable is emitted as its stress mark (if any) followed by the IPA phonemes.

    Args:
        arp_to_ipa_dict: ARPABET to IPA phonemes mapping
        arp_input: whitespace-separated ARPABET phonemes
        remove_space: set to True to remove all spaces from the IPA output

    Returns:
        input word in IPA form

    Raises:
        ValueError: if a phoneme ends with a digit other than 0, 1 or 2, or if a phoneme
            (with and without its trailing digit) is missing from arp_to_ipa_dict
    """
    # trailing stress digit of an ARPABET phoneme -> IPA stress mark placed before the syllable
    stress_marks = {"0": "", "1": "ˈ", "2": "ˌ"}

    ipa_syllables = []
    for syllable in syllabify(arp_input.split()):
        stress_mark = ""
        ipa_phones = []

        # each syllable is iterated as a sequence of phoneme groups; flatten it into single phonemes
        for phoneme in itertools.chain.from_iterable(syllable):
            ends_with_digit = phoneme[-1].isdigit()
            if ends_with_digit:
                digit = phoneme[-1]
                if digit not in stress_marks:
                    raise ValueError(f"{digit} unknown")
                # the last digit-carrying phoneme of the syllable decides its stress mark
                stress_mark = stress_marks[digit]

            # prefer an exact mapping entry; otherwise look the phoneme up without its digit
            if ends_with_digit and phoneme not in arp_to_ipa_dict:
                phoneme = phoneme[:-1]

            if phoneme not in arp_to_ipa_dict:
                raise ValueError(f"|{phoneme}| phoneme not found in |{arp_input}|")
            ipa_phones.append(arp_to_ipa_dict[phoneme])

        ipa_syllables.append(stress_mark + " ".join(ipa_phones).strip())

    ipa_word = " ".join(ipa_syllables).strip()
    return ipa_word.replace(" ", "") if remove_space else ipa_word
| |
|
| |
|
def _get_arpabet_to_ipa_mapping(arp_ipa_map_file: str) -> Dict[str, str]:
    """
    Reads the ARPABET to IPA phoneme mapping from a tab-separated file.

    Every non-blank line must hold exactly two tab-separated fields: the ARPABET symbol
    and its IPA counterpart. Blank lines are skipped. If a symbol occurs more than once,
    the last occurrence wins.

    Args:
        arp_ipa_map_file: path to the ARPABET to IPA phonemes mapping file (UTF-8)

    Returns:
        dictionary that maps ARPABET symbols to IPA symbols

    Raises:
        ValueError: if a non-blank line does not consist of exactly two tab-separated fields
    """
    arp_to_ipa = {}
    with open(arp_ipa_map_file, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, start=1):
            entry = line.strip()
            if not entry:
                # tolerate empty lines, e.g. extra newlines at the end of the file
                continue

            fields = entry.split("\t")
            if len(fields) != 2:
                raise ValueError(
                    f"{arp_ipa_map_file}:{line_num}: expected '<ARPABET>\\t<IPA>', got {line!r}"
                )
            arp, ipa = fields
            arp_to_ipa[arp] = ipa
    return arp_to_ipa
| |
|
| |
|
def convert_cmu_arpabet_to_ipa(arp_ipa_map_file: str, arp_dict_file: str, output_ipa_file: str):
    """
    Converts CMU ARPABET-based dictionary to IPA.

    Lines starting with ";;;" are copied to the output unchanged. For every other non-blank
    line, everything from the first " #" on is dropped, the remainder is split into the word
    and its ARPABET pronunciation, and the word followed by the IPA pronunciation (without
    spaces between phonemes) is written to the output. Blank lines are skipped.

    Args:
        arp_ipa_map_file: ARPABET to IPA phoneme mapping file
        arp_dict_file: path to ARPABET version of CMU dictionary
        output_ipa_file: path to output IPA version of CMU dictionary

    Raises:
        ValueError: if a dictionary entry has no pronunciation after the word; errors raised
            while loading the mapping or converting a pronunciation are propagated as well
    """
    arp_to_ipa_dict = _get_arpabet_to_ipa_mapping(arp_ipa_map_file)
    with open(arp_dict_file, "r", encoding="utf-8") as f_arp, open(output_ipa_file, "w", encoding="utf-8") as f_ipa:
        for line_num, line in enumerate(f_arp, start=1):
            if line.startswith(";;;"):
                f_ipa.write(line)
                continue

            # drop a trailing " #..." comment that may follow the dictionary entry
            entry = line.split(" #")[0].strip()
            if not entry:
                continue

            # Split only once, on the first run of whitespace: the word never contains spaces,
            # while the pronunciation is itself a space-separated phoneme sequence. This works
            # whether the word and the pronunciation are separated by one space or several.
            fields = entry.split(maxsplit=1)
            if len(fields) != 2:
                raise ValueError(f"{arp_dict_file}:{line_num}: expected '<word> <phonemes>', got {line!r}")
            graphemes, phonemes = fields

            ipa_form = convert_arp_to_ipa(arp_to_ipa_dict, phonemes, remove_space=True)
            f_ipa.write(f"{graphemes} {ipa_form}\n")

    print(f"IPA version of {os.path.abspath(arp_dict_file)} saved in {os.path.abspath(output_ipa_file)}")
| |
|
| |
|
# Script entry point: parse the command line and run the dictionary conversion.
if __name__ == "__main__":
    cli_args = parse_args()
    convert_cmu_arpabet_to_ipa(
        arp_ipa_map_file=cli_args.mapping,
        arp_dict_file=cli_args.cmu_arpabet,
        output_ipa_file=cli_args.ipa_out,
    )
| |
|