iioSnail/ChineseBERT-base

@@ -1,12 +1,13 @@
 import json
 import os
 import time
 from pathlib import Path
-from types import NoneType
 from typing import List, Union, Optional
 import tokenizers
 import torch
 from huggingface_hub import hf_hub_download
 from huggingface_hub.file_download import http_user_agent
 from pypinyin import pinyin, Style
@@ -24,10 +25,14 @@ from transformers import BertTokenizerFast, BatchEncoding
 cache_path = Path(os.path.abspath(__file__)).parent
-def download_file(filename: str):
     if os.path.exists(cache_path / filename):
         return
     hf_hub_download(
         "iioSnail/ChineseBERT-base",
         filename,
@@ -42,25 +47,29 @@ class ChineseBertTokenizer(BertTokenizerFast):
     def __init__(self, **kwargs):
         super(ChineseBertTokenizer, self).__init__(**kwargs)
         vocab_file = cache_path / 'vocab.txt'
         config_path = cache_path / 'config'
         self.max_length = 512
-        download_file('vocab.txt')
         self.tokenizer = BertWordPieceTokenizer(str(vocab_file))
         # load pinyin map dict
-        download_file('config/pinyin_map.json')
         with open(config_path / 'pinyin_map.json', encoding='utf8') as fin:
             self.pinyin_dict = json.load(fin)
         # load char id map tensor
-        download_file('config/id2pinyin.json')
         with open(config_path / 'id2pinyin.json', encoding='utf8') as fin:
             self.id2pinyin = json.load(fin)
         # load pinyin map tensor
-        download_file('config/pinyin2tensor.json')
         with open(config_path / 'pinyin2tensor.json', encoding='utf8') as fin:
             self.pinyin2tensor = json.load(fin)

 import json
 import os
+import shutil
 import time
 from pathlib import Path
 from typing import List, Union, Optional
 import tokenizers
 import torch
+from torch import NoneType
 from huggingface_hub import hf_hub_download
 from huggingface_hub.file_download import http_user_agent
 from pypinyin import pinyin, Style
 cache_path = Path(os.path.abspath(__file__)).parent
+def download_file(filename: str, path: Path):
     if os.path.exists(cache_path / filename):
         return
+    if os.path.exists(path / filename):
+        shutil.copyfile(path / filename, cache_path / filename)
+        return
     hf_hub_download(
         "iioSnail/ChineseBERT-base",
         filename,
     def __init__(self, **kwargs):
         super(ChineseBertTokenizer, self).__init__(**kwargs)
+        self.path = Path(kwargs['name_or_path'])
         vocab_file = cache_path / 'vocab.txt'
         config_path = cache_path / 'config'
+        if not os.path.exists(config_path):
+            os.makedirs(config_path)
         self.max_length = 512
+        download_file('vocab.txt', self.path)
         self.tokenizer = BertWordPieceTokenizer(str(vocab_file))
         # load pinyin map dict
+        download_file('config/pinyin_map.json', self.path)
         with open(config_path / 'pinyin_map.json', encoding='utf8') as fin:
             self.pinyin_dict = json.load(fin)
         # load char id map tensor
+        download_file('config/id2pinyin.json', self.path)
         with open(config_path / 'id2pinyin.json', encoding='utf8') as fin:
             self.id2pinyin = json.load(fin)
         # load pinyin map tensor
+        download_file('config/pinyin2tensor.json', self.path)
         with open(config_path / 'pinyin2tensor.json', encoding='utf8') as fin:
             self.pinyin2tensor = json.load(fin)

modeling_glycebert.py CHANGED

@@ -10,6 +10,7 @@
 """
 import json
 import os
 import time
 import warnings
 from pathlib import Path
@@ -32,14 +33,17 @@ except:
 from transformers.modeling_outputs import BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput, \
     QuestionAnsweringModelOutput, TokenClassifierOutput
 cache_path = Path(os.path.abspath(__file__)).parent
-def download_file(filename: str):
     if os.path.exists(cache_path / filename):
         return
     hf_hub_download(
         "iioSnail/ChineseBERT-base",
         filename,
@@ -565,18 +569,22 @@ class FusionBertEmbeddings(nn.Module):
     def __init__(self, config):
         super(FusionBertEmbeddings, self).__init__()
         config_path = cache_path / 'config'
         font_files = []
-        download_file("config/STFANGSO.TTF24.npy")
-        download_file("config/STXINGKA.TTF24.npy")
-        download_file("config/方正古隶繁体.ttf24.npy")
         for file in os.listdir(config_path):
             if file.endswith(".npy"):
                 font_files.append(str(config_path / file))
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
         self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
-        self.pinyin_embeddings = PinyinEmbedding(embedding_size=128, pinyin_out_dim=config.hidden_size)
         self.glyph_embeddings = GlyphEmbedding(font_npy_files=font_files)
         # self.LayerNorm is not snake-cased to stick with TensorFlow models variable name and be able to load
@@ -624,7 +632,8 @@ class FusionBertEmbeddings(nn.Module):
 class PinyinEmbedding(nn.Module):
-    def __init__(self, embedding_size: int, pinyin_out_dim: int):
         """
             Pinyin Embedding Module
         Args:
@@ -632,7 +641,7 @@ class PinyinEmbedding(nn.Module):
             pinyin_out_dim: kernel number of conv
         """
         super(PinyinEmbedding, self).__init__()
-        download_file('config/pinyin_map.json')
         with open(cache_path / 'config' / 'pinyin_map.json') as fin:
             pinyin_dict = json.load(fin)
         self.pinyin_out_dim = pinyin_out_dim

 """
 import json
 import os
+import shutil
 import time
 import warnings
 from pathlib import Path
 from transformers.modeling_outputs import BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput, \
     QuestionAnsweringModelOutput, TokenClassifierOutput
 cache_path = Path(os.path.abspath(__file__)).parent
+def download_file(filename: str, path: Path):
     if os.path.exists(cache_path / filename):
         return
+    if os.path.exists(path / filename):
+        shutil.copyfile(path / filename, cache_path / filename)
+        return
     hf_hub_download(
         "iioSnail/ChineseBERT-base",
         filename,
     def __init__(self, config):
         super(FusionBertEmbeddings, self).__init__()
+        self.path = Path(config._name_or_path)
         config_path = cache_path / 'config'
+        if not os.path.exists(config_path):
+            os.makedirs(config_path)
         font_files = []
+        download_file("config/STFANGSO.TTF24.npy", self.path)
+        download_file("config/STXINGKA.TTF24.npy", self.path)
+        download_file("config/方正古隶繁体.ttf24.npy", self.path)
         for file in os.listdir(config_path):
             if file.endswith(".npy"):
                 font_files.append(str(config_path / file))
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
         self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+        self.pinyin_embeddings = PinyinEmbedding(embedding_size=128, pinyin_out_dim=config.hidden_size, config=config)
         self.glyph_embeddings = GlyphEmbedding(font_npy_files=font_files)
         # self.LayerNorm is not snake-cased to stick with TensorFlow models variable name and be able to load
 class PinyinEmbedding(nn.Module):
+    def __init__(self, embedding_size: int, pinyin_out_dim: int, config):
         """
             Pinyin Embedding Module
         Args:
             pinyin_out_dim: kernel number of conv
         """
         super(PinyinEmbedding, self).__init__()
+        download_file('config/pinyin_map.json', Path(config._name_or_path))
         with open(cache_path / 'config' / 'pinyin_map.json') as fin:
             pinyin_dict = json.load(fin)
         self.pinyin_out_dim = pinyin_out_dim