Upload 2 files
Browse files
- bert_tokenizer.py +15 -6
- modeling_glycebert.py +17 -8
bert_tokenizer.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
|
|
|
| 3 |
import time
|
| 4 |
from pathlib import Path
|
| 5 |
-
from types import NoneType
|
| 6 |
from typing import List, Union, Optional
|
| 7 |
|
| 8 |
import tokenizers
|
| 9 |
import torch
|
|
|
|
| 10 |
from huggingface_hub import hf_hub_download
|
| 11 |
from huggingface_hub.file_download import http_user_agent
|
| 12 |
from pypinyin import pinyin, Style
|
|
@@ -24,10 +25,14 @@ from transformers import BertTokenizerFast, BatchEncoding
|
|
| 24 |
cache_path = Path(os.path.abspath(__file__)).parent
|
| 25 |
|
| 26 |
|
| 27 |
-
def download_file(filename: str):
|
| 28 |
if os.path.exists(cache_path / filename):
|
| 29 |
return
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
hf_hub_download(
|
| 32 |
"iioSnail/ChineseBERT-base",
|
| 33 |
filename,
|
|
@@ -42,25 +47,29 @@ class ChineseBertTokenizer(BertTokenizerFast):
|
|
| 42 |
def __init__(self, **kwargs):
|
| 43 |
super(ChineseBertTokenizer, self).__init__(**kwargs)
|
| 44 |
|
|
|
|
| 45 |
vocab_file = cache_path / 'vocab.txt'
|
| 46 |
config_path = cache_path / 'config'
|
|
|
|
|
|
|
|
|
|
| 47 |
self.max_length = 512
|
| 48 |
|
| 49 |
-
download_file('vocab.txt')
|
| 50 |
self.tokenizer = BertWordPieceTokenizer(str(vocab_file))
|
| 51 |
|
| 52 |
# load pinyin map dict
|
| 53 |
-
download_file('config/pinyin_map.json')
|
| 54 |
with open(config_path / 'pinyin_map.json', encoding='utf8') as fin:
|
| 55 |
self.pinyin_dict = json.load(fin)
|
| 56 |
|
| 57 |
# load char id map tensor
|
| 58 |
-
download_file('config/id2pinyin.json')
|
| 59 |
with open(config_path / 'id2pinyin.json', encoding='utf8') as fin:
|
| 60 |
self.id2pinyin = json.load(fin)
|
| 61 |
|
| 62 |
# load pinyin map tensor
|
| 63 |
-
download_file('config/pinyin2tensor.json')
|
| 64 |
with open(config_path / 'pinyin2tensor.json', encoding='utf8') as fin:
|
| 65 |
self.pinyin2tensor = json.load(fin)
|
| 66 |
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
+
import shutil
|
| 4 |
import time
|
| 5 |
from pathlib import Path
|
|
|
|
| 6 |
from typing import List, Union, Optional
|
| 7 |
|
| 8 |
import tokenizers
|
| 9 |
import torch
|
| 10 |
+
from torch import NoneType
|
| 11 |
from huggingface_hub import hf_hub_download
|
| 12 |
from huggingface_hub.file_download import http_user_agent
|
| 13 |
from pypinyin import pinyin, Style
|
|
|
|
| 25 |
cache_path = Path(os.path.abspath(__file__)).parent
|
| 26 |
|
| 27 |
|
| 28 |
+
def download_file(filename: str, path: Path):
|
| 29 |
if os.path.exists(cache_path / filename):
|
| 30 |
return
|
| 31 |
|
| 32 |
+
if os.path.exists(path / filename):
|
| 33 |
+
shutil.copyfile(path / filename, cache_path / filename)
|
| 34 |
+
return
|
| 35 |
+
|
| 36 |
hf_hub_download(
|
| 37 |
"iioSnail/ChineseBERT-base",
|
| 38 |
filename,
|
|
|
|
| 47 |
def __init__(self, **kwargs):
|
| 48 |
super(ChineseBertTokenizer, self).__init__(**kwargs)
|
| 49 |
|
| 50 |
+
self.path = Path(kwargs['name_or_path'])
|
| 51 |
vocab_file = cache_path / 'vocab.txt'
|
| 52 |
config_path = cache_path / 'config'
|
| 53 |
+
if not os.path.exists(config_path):
|
| 54 |
+
os.makedirs(config_path)
|
| 55 |
+
|
| 56 |
self.max_length = 512
|
| 57 |
|
| 58 |
+
download_file('vocab.txt', self.path)
|
| 59 |
self.tokenizer = BertWordPieceTokenizer(str(vocab_file))
|
| 60 |
|
| 61 |
# load pinyin map dict
|
| 62 |
+
download_file('config/pinyin_map.json', self.path)
|
| 63 |
with open(config_path / 'pinyin_map.json', encoding='utf8') as fin:
|
| 64 |
self.pinyin_dict = json.load(fin)
|
| 65 |
|
| 66 |
# load char id map tensor
|
| 67 |
+
download_file('config/id2pinyin.json', self.path)
|
| 68 |
with open(config_path / 'id2pinyin.json', encoding='utf8') as fin:
|
| 69 |
self.id2pinyin = json.load(fin)
|
| 70 |
|
| 71 |
# load pinyin map tensor
|
| 72 |
+
download_file('config/pinyin2tensor.json', self.path)
|
| 73 |
with open(config_path / 'pinyin2tensor.json', encoding='utf8') as fin:
|
| 74 |
self.pinyin2tensor = json.load(fin)
|
| 75 |
|
modeling_glycebert.py
CHANGED
|
@@ -10,6 +10,7 @@
|
|
| 10 |
"""
|
| 11 |
import json
|
| 12 |
import os
|
|
|
|
| 13 |
import time
|
| 14 |
import warnings
|
| 15 |
from pathlib import Path
|
|
@@ -32,14 +33,17 @@ except:
|
|
| 32 |
from transformers.modeling_outputs import BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput, \
|
| 33 |
QuestionAnsweringModelOutput, TokenClassifierOutput
|
| 34 |
|
| 35 |
-
|
| 36 |
cache_path = Path(os.path.abspath(__file__)).parent
|
| 37 |
|
| 38 |
|
| 39 |
-
def download_file(filename: str):
|
| 40 |
if os.path.exists(cache_path / filename):
|
| 41 |
return
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
hf_hub_download(
|
| 44 |
"iioSnail/ChineseBERT-base",
|
| 45 |
filename,
|
|
@@ -565,18 +569,22 @@ class FusionBertEmbeddings(nn.Module):
|
|
| 565 |
|
| 566 |
def __init__(self, config):
|
| 567 |
super(FusionBertEmbeddings, self).__init__()
|
|
|
|
| 568 |
config_path = cache_path / 'config'
|
|
|
|
|
|
|
|
|
|
| 569 |
font_files = []
|
| 570 |
-
download_file("config/STFANGSO.TTF24.npy")
|
| 571 |
-
download_file("config/STXINGKA.TTF24.npy")
|
| 572 |
-
download_file("config/方正古隶繁体.ttf24.npy")
|
| 573 |
for file in os.listdir(config_path):
|
| 574 |
if file.endswith(".npy"):
|
| 575 |
font_files.append(str(config_path / file))
|
| 576 |
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
|
| 577 |
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
|
| 578 |
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
|
| 579 |
-
self.pinyin_embeddings = PinyinEmbedding(embedding_size=128, pinyin_out_dim=config.hidden_size)
|
| 580 |
self.glyph_embeddings = GlyphEmbedding(font_npy_files=font_files)
|
| 581 |
|
| 582 |
# self.LayerNorm is not snake-cased to stick with TensorFlow models variable name and be able to load
|
|
@@ -624,7 +632,8 @@ class FusionBertEmbeddings(nn.Module):
|
|
| 624 |
|
| 625 |
|
| 626 |
class PinyinEmbedding(nn.Module):
|
| 627 |
-
|
|
|
|
| 628 |
"""
|
| 629 |
Pinyin Embedding Module
|
| 630 |
Args:
|
|
@@ -632,7 +641,7 @@ class PinyinEmbedding(nn.Module):
|
|
| 632 |
pinyin_out_dim: kernel number of conv
|
| 633 |
"""
|
| 634 |
super(PinyinEmbedding, self).__init__()
|
| 635 |
-
download_file('config/pinyin_map.json')
|
| 636 |
with open(cache_path / 'config' / 'pinyin_map.json') as fin:
|
| 637 |
pinyin_dict = json.load(fin)
|
| 638 |
self.pinyin_out_dim = pinyin_out_dim
|
|
|
|
| 10 |
"""
|
| 11 |
import json
|
| 12 |
import os
|
| 13 |
+
import shutil
|
| 14 |
import time
|
| 15 |
import warnings
|
| 16 |
from pathlib import Path
|
|
|
|
| 33 |
from transformers.modeling_outputs import BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput, \
|
| 34 |
QuestionAnsweringModelOutput, TokenClassifierOutput
|
| 35 |
|
|
|
|
| 36 |
cache_path = Path(os.path.abspath(__file__)).parent
|
| 37 |
|
| 38 |
|
| 39 |
+
def download_file(filename: str, path: Path):
|
| 40 |
if os.path.exists(cache_path / filename):
|
| 41 |
return
|
| 42 |
|
| 43 |
+
if os.path.exists(path / filename):
|
| 44 |
+
shutil.copyfile(path / filename, cache_path / filename)
|
| 45 |
+
return
|
| 46 |
+
|
| 47 |
hf_hub_download(
|
| 48 |
"iioSnail/ChineseBERT-base",
|
| 49 |
filename,
|
|
|
|
| 569 |
|
| 570 |
def __init__(self, config):
|
| 571 |
super(FusionBertEmbeddings, self).__init__()
|
| 572 |
+
self.path = Path(config._name_or_path)
|
| 573 |
config_path = cache_path / 'config'
|
| 574 |
+
if not os.path.exists(config_path):
|
| 575 |
+
os.makedirs(config_path)
|
| 576 |
+
|
| 577 |
font_files = []
|
| 578 |
+
download_file("config/STFANGSO.TTF24.npy", self.path)
|
| 579 |
+
download_file("config/STXINGKA.TTF24.npy", self.path)
|
| 580 |
+
download_file("config/方正古隶繁体.ttf24.npy", self.path)
|
| 581 |
for file in os.listdir(config_path):
|
| 582 |
if file.endswith(".npy"):
|
| 583 |
font_files.append(str(config_path / file))
|
| 584 |
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
|
| 585 |
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
|
| 586 |
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
|
| 587 |
+
self.pinyin_embeddings = PinyinEmbedding(embedding_size=128, pinyin_out_dim=config.hidden_size, config=config)
|
| 588 |
self.glyph_embeddings = GlyphEmbedding(font_npy_files=font_files)
|
| 589 |
|
| 590 |
# self.LayerNorm is not snake-cased to stick with TensorFlow models variable name and be able to load
|
|
|
|
| 632 |
|
| 633 |
|
| 634 |
class PinyinEmbedding(nn.Module):
|
| 635 |
+
|
| 636 |
+
def __init__(self, embedding_size: int, pinyin_out_dim: int, config):
|
| 637 |
"""
|
| 638 |
Pinyin Embedding Module
|
| 639 |
Args:
|
|
|
|
| 641 |
pinyin_out_dim: kernel number of conv
|
| 642 |
"""
|
| 643 |
super(PinyinEmbedding, self).__init__()
|
| 644 |
+
download_file('config/pinyin_map.json', Path(config._name_or_path))
|
| 645 |
with open(cache_path / 'config' / 'pinyin_map.json') as fin:
|
| 646 |
pinyin_dict = json.load(fin)
|
| 647 |
self.pinyin_out_dim = pinyin_out_dim
|