# One-time conversion script: rebuilds the "vocab" entry of tokenizer.json
# from the plain-text vocab.vocab file, then writes new_tokenizer.json.
# Kept commented out after the converted file was generated.
#
# from tokenization_gptpangu import GPTPanguTokenizer
# import json
#
# tokenizer = GPTPanguTokenizer.from_pretrained(".")
# with open("tokenizer.json", encoding="utf-8") as f:
#     config = json.load(f)
#
# # Each line of vocab.vocab carries the token in its first tab-separated
# # field; the line index becomes the token id, giving the [token, id]
# # pairs that tokenizer.json's "model" section expects.
# with open("vocab.vocab", encoding="utf-8") as f:
#     lines = f.readlines()
#
# vocab = []
# for idx, line in enumerate(lines):
#     key = line.split('\t')[0]
#     vocab.append([key, idx])
#
# config['model']['vocab'] = vocab
#
# with open("new_tokenizer.json", "w", encoding="utf-8") as w:
#     # ensure_ascii=False keeps non-ASCII tokens human-readable in the output
#     w.write(json.dumps(config, ensure_ascii=False))
#
# print("ok")
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")
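
# A minimal smoke test (the sample sentence below is illustrative, not from
# the original script): round-trip a short string to check that the rebuilt
# vocab loads and tokenizes.
sample = "今天天气不错"
ids = tokenizer.encode(sample)
print(ids)
print(tokenizer.decode(ids))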