| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| """ |
| This file takes a lexicon.txt as input and outputs a new lexicon, |
| in which each word has a unique pronunciation. |
| |
| The way to do this is to keep only the first pronunciation of a word |
| in lexicon.txt. |
| """ |
|
|
|
|
| import argparse |
| import logging |
| from pathlib import Path |
| from typing import List, Tuple |
|
|
| from icefall.lexicon import read_lexicon, write_lexicon |
|
|
|
|
def get_args() -> argparse.Namespace:
    """Parse the command-line arguments of this script.

    Returns:
      The parsed arguments. The attribute ``lang_dir`` holds the value
      given to ``--lang-dir`` as a string.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang-dir",
        type=str,
        # Without this, omitting the option yields lang_dir=None and the
        # script fails later with an unhelpful TypeError from Path(None).
        required=True,
        help="""Input and output directory.
        It should contain a file lexicon.txt.
        This file will generate a new file uniq_lexicon.txt
        in it.
        """,
    )

    return parser.parse_args()
|
|
|
|
def filter_multiple_pronunications(
    lexicon: List[Tuple[str, List[str]]]
) -> List[Tuple[str, List[str]]]:
    """Keep only the first pronunciation of every word in a lexicon.

    Entries are visited in order. The first entry seen for a word is
    kept; every later entry for the same word is dropped. The relative
    order of the kept entries is the same as in the input.

    Args:
      lexicon:
        The input lexicon, a list of (word, tokens) pairs where
        "tokens" is the list of tokens of one pronunciation of "word".
        A word may appear in several pairs.
    Returns:
      Return a new list of (word, tokens) pairs in which each word
      appears exactly once. The input list is not modified.
    """
    # dict keeps insertion order, and setdefault() only stores a value
    # when the key is new, so later pronunciations are ignored.
    first_pron = {}
    for entry_word, entry_tokens in lexicon:
        first_pron.setdefault(entry_word, entry_tokens)
    return list(first_pron.items())
|
|
|
|
def main():
    """Read lexicon.txt from the lang dir and write uniq_lexicon.txt.

    The directory comes from the --lang-dir command-line option.
    After writing, the entry counts of both lexicons are logged.
    """
    lang_dir = Path(get_args().lang_dir)
    src_path = lang_dir / "lexicon.txt"
    dst_path = lang_dir / "uniq_lexicon.txt"

    in_lexicon = read_lexicon(src_path)
    out_lexicon = filter_multiple_pronunications(in_lexicon)
    write_lexicon(dst_path, out_lexicon)

    # Report both sizes so the number of dropped entries is visible.
    logging.info(f"Number of entries in lexicon.txt: {len(in_lexicon)}")
    logging.info(f"Number of entries in uniq_lexicon.txt: {len(out_lexicon)}")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: set up INFO-level logging with timestamp and
    # source location in each record, then run the conversion.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    main()
|
|