| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
| """ |
This script takes as input the word list without ids:
- words_no_ids.txt
and generates the new words.txt, in which every word is followed by its integer id:
- words.txt
| """ |
|
|
|
|
| import argparse |
| import logging |
|
|
| from tqdm import tqdm |
|
|
|
|
def get_parser():
    """Build the command-line parser for this script.

    Two string options are registered, each with a default path:

    * ``--input-file``: the words file without ids.
    * ``--output-file``: the words file with ids.

    Returns:
      The configured ``argparse.ArgumentParser``; defaults are shown in
      ``--help`` output via ``ArgumentDefaultsHelpFormatter``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Prepare words.txt",
    )

    # (flag, default path, help text) for every option, in registration order.
    path_options = (
        (
            "--input-file",
            "data/lang_char/words_no_ids.txt",
            "the words file without ids for WenetSpeech",
        ),
        (
            "--output-file",
            "data/lang_char/words.txt",
            "the words file with ids for WenetSpeech",
        ),
    )
    for flag, default_path, help_text in path_options:
        cli.add_argument(flag, default=default_path, type=str, help=help_text)

    return cli
|
|
|
|
def main():
    """Assign an integer id to every word and write the resulting words file.

    Reads ``--input-file`` (one entry per line, no ids) and writes
    ``--output-file`` with one ``<word> <id>`` entry per line:

    * ids 0-3 are reserved for ``<eps>``, ``!SIL``, ``<SPOKEN_NOISE>``
      and ``<UNK>``;
    * every input line receives the next id, starting at 4, in input order;
    * ``#0``, ``<s>`` and ``</s>`` receive the three ids that follow the
      last input word.

    Only the trailing newline is stripped from each input line; any other
    whitespace is kept as part of the word.
    """
    parser = get_parser()
    args = parser.parse_args()

    input_file = args.input_file
    output_file = args.output_file

    # Context managers guarantee the handles are closed (and the output is
    # flushed) even if reading or writing raises.
    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    new_lines = ["<eps> 0", "!SIL 1", "<SPOKEN_NOISE> 2", "<UNK> 3"]

    logging.info("Starting reading the input file")
    # Ids 0-3 are taken by the special symbols above, so words start at 4.
    for idx, x in enumerate(tqdm(lines), start=4):
        new_lines.append(x.strip("\n") + " " + str(idx))

    # add: #0 <s> </s>
    # They take the first three unused ids after the last word.
    next_id = len(new_lines)
    new_lines.extend(
        [
            "#0 " + str(next_id),
            "<s> " + str(next_id + 1),
            "</s> " + str(next_id + 2),
        ]
    )

    logging.info("Starting writing the words.txt")
    with open(output_file, "w", encoding="utf-8") as f_out:
        f_out.writelines(line + "\n" for line in new_lines)
|
|
|
|
if __name__ == "__main__":
    # The root logger defaults to WARNING, which silently drops the
    # logging.info() progress messages emitted by main(). Enable INFO here,
    # and only when run as a script, so importing this module never
    # reconfigures the caller's logging.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    main()
|
|