Upload bert_tokenizer.py
Browse files- bert_tokenizer.py +25 -3
bert_tokenizer.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
-
from pathlib import Path
|
| 4 |
from typing import List
|
| 5 |
|
|
|
|
| 6 |
import tokenizers
|
| 7 |
import torch
|
| 8 |
from pypinyin import pinyin, Style
|
|
@@ -14,26 +14,48 @@ except:
|
|
| 14 |
|
| 15 |
from transformers import BertTokenizerFast
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
class ChineseBertTokenizer(BertTokenizerFast):
|
| 19 |
|
| 20 |
def __init__(self, **kwargs):
|
| 21 |
super(ChineseBertTokenizer, self).__init__(**kwargs)
|
| 22 |
|
| 23 |
-
bert_path =
|
| 24 |
-
print("bert_path", bert_path)
|
| 25 |
vocab_file = os.path.join(bert_path, 'vocab.txt')
|
| 26 |
config_path = os.path.join(bert_path, 'config')
|
| 27 |
self.max_length = 512
|
|
|
|
|
|
|
| 28 |
self.tokenizer = BertWordPieceTokenizer(vocab_file)
|
| 29 |
|
| 30 |
# load pinyin map dict
|
|
|
|
| 31 |
with open(os.path.join(config_path, 'pinyin_map.json'), encoding='utf8') as fin:
|
| 32 |
self.pinyin_dict = json.load(fin)
|
|
|
|
| 33 |
# load char id map tensor
|
|
|
|
| 34 |
with open(os.path.join(config_path, 'id2pinyin.json'), encoding='utf8') as fin:
|
| 35 |
self.id2pinyin = json.load(fin)
|
|
|
|
| 36 |
# load pinyin map tensor
|
|
|
|
| 37 |
with open(os.path.join(config_path, 'pinyin2tensor.json'), encoding='utf8') as fin:
|
| 38 |
self.pinyin2tensor = json.load(fin)
|
| 39 |
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
|
|
|
| 3 |
from typing import List
|
| 4 |
|
| 5 |
+
import requests
|
| 6 |
import tokenizers
|
| 7 |
import torch
|
| 8 |
from pypinyin import pinyin, Style
|
|
|
|
| 14 |
|
| 15 |
from transformers import BertTokenizerFast
|
| 16 |
|
| 17 |
+
# Fallback download locations for the ChineseBERT resource files.
# NOTE: URLs must use /resolve/ (raw file content) rather than /blob/
# (which serves the Hugging Face HTML viewer page), and the
# "pinyin2tensor.json" entry must point at its own file — the original
# copy-pasted the id2pinyin.json URL.
SOURCE_FILES_URL = {
    "vocab.txt": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/vocab.txt",
    "pinyin_map.json": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/config/pinyin_map.json",
    "id2pinyin.json": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/config/id2pinyin.json",
    "pinyin2tensor.json": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/config/pinyin2tensor.json",
}
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def download_file(url, filename):
    """Download *url* to *filename*, skipping work when the file already
    exists (the local file acts as a cache).

    Raises:
        requests.HTTPError: when the server answers with a 4xx/5xx status.
            Without this check an HTML error page would be silently written
            to disk and then treated as a valid cached file forever by the
            early-return above.
    """
    if os.path.exists(filename):
        return

    # The target may live in a not-yet-created subdirectory
    # (e.g. "<model>/config/") — create it before writing.
    target_dir = os.path.dirname(os.path.abspath(filename))
    os.makedirs(target_dir, exist_ok=True)

    res = requests.get(url)
    res.raise_for_status()
    with open(filename, 'wb') as file:
        file.write(res.content)
|
| 32 |
+
|
| 33 |
|
| 34 |
class ChineseBertTokenizer(BertTokenizerFast):
|
| 35 |
|
| 36 |
def __init__(self, **kwargs):
    """Build the tokenizer and load the ChineseBERT pinyin resources.

    Resolves ``vocab.txt`` and the pinyin config files relative to the
    model path (``self.name_or_path``); any file missing locally is first
    fetched from ``SOURCE_FILES_URL``.

    NOTE(review): this assumes ``name_or_path`` is a writable local
    directory — a bare hub model id (no local snapshot) would make the
    downloads fail. Confirm against how callers instantiate this class.
    """
    super(ChineseBertTokenizer, self).__init__(**kwargs)

    bert_path = self.name_or_path
    vocab_file = os.path.join(bert_path, 'vocab.txt')
    config_path = os.path.join(bert_path, 'config')
    self.max_length = 512

    # The config files are downloaded into a subdirectory that may not
    # exist yet; create it up front so the writes below cannot fail with
    # FileNotFoundError.
    os.makedirs(config_path, exist_ok=True)

    download_file(SOURCE_FILES_URL["vocab.txt"], vocab_file)
    self.tokenizer = BertWordPieceTokenizer(vocab_file)

    # load pinyin map dict
    download_file(SOURCE_FILES_URL["pinyin_map.json"], os.path.join(config_path, 'pinyin_map.json'))
    with open(os.path.join(config_path, 'pinyin_map.json'), encoding='utf8') as fin:
        self.pinyin_dict = json.load(fin)

    # load char id map tensor
    download_file(SOURCE_FILES_URL["id2pinyin.json"], os.path.join(config_path, 'id2pinyin.json'))
    with open(os.path.join(config_path, 'id2pinyin.json'), encoding='utf8') as fin:
        self.id2pinyin = json.load(fin)

    # load pinyin map tensor
    download_file(SOURCE_FILES_URL["pinyin2tensor.json"], os.path.join(config_path, 'pinyin2tensor.json'))
    with open(os.path.join(config_path, 'pinyin2tensor.json'), encoding='utf8') as fin:
        self.pinyin2tensor = json.load(fin)
|
| 61 |
|