| |
| import os |
| import json |
# Accumulator filled by the directory scan further down:
# maps a language key to that language's token -> id dictionary.
all_dict = dict()

# Directory that is scanned for vocabulary files.
folder_path = "./vocabs"
|
|
def parse_file(filename):
    """Build a token-to-id mapping from a whitespace-delimited vocab file.

    The mapping starts with four reserved entries (``<pad>``=0, ``<s>``=1,
    ``</s>``=2, ``<unk>``=3). The first whitespace-separated field of every
    non-blank line is then taken as a token and given the next free id,
    starting at 4, in file order. Any further fields on a line are ignored.

    A token that is already in the mapping (one of the reserved tokens, or a
    repeat of an earlier line) keeps its existing id and does not consume a
    new one, so the assigned ids stay contiguous.

    Args:
        filename: Path of the vocab file to read. It is decoded as UTF-8.

    Returns:
        A dict mapping each token to its integer id.
    """
    dictionary = {
        "</s>": 2,
        "<pad>": 0,
        "<s>": 1,
        "<unk>": 3,
    }

    # Explicit encoding: without it the platform locale decides how tokens
    # are decoded, which breaks non-ASCII vocab entries on some systems.
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line.
                continue
            token = fields[0]
            # Keep the first id for a token; because no id is ever skipped,
            # the current size of the mapping is always the next free id.
            if token not in dictionary:
                dictionary[token] = len(dictionary)

    return dictionary
|
|
# Parse every regular file directly inside the vocab folder. The text before
# the first "." in the file name is used as that file's key in all_dict.
for filename in os.listdir(folder_path):
    filepath = os.path.join(folder_path, filename)
    lang = filename.partition(".")[0]
    if not os.path.isfile(filepath):
        # Sub-directories and other non-file entries are ignored.
        continue
    all_dict[lang] = parse_file(filepath)

output_path = "vocab.json"

# Serialise the combined mapping with sorted keys and 4-space indentation.
serialized = json.dumps(all_dict, indent=4, sort_keys=True)
with open(output_path, 'w') as output_file:
    output_file.write(serialized)
|
|