#!/usr/bin/env python
# coding: utf-8
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script creates a tiny random model
#
# It will then be used as "hf-internal-testing/tiny-deberta"
# ***To build from scratch***
#
# 1. clone sentencepiece into a parent dir
# git clone https://github.com/google/sentencepiece
#
# 2. create a new repo at https://huggingface.co/new
# make sure to choose 'hf-internal-testing' as the Owner
#
# 3. clone
# git clone https://huggingface.co/hf-internal-testing/tiny-deberta
# cd tiny-deberta
# 4. start with some pre-existing script from one of the https://huggingface.co/hf-internal-testing/ tiny model repos, e.g.
# wget https://huggingface.co/hf-internal-testing/tiny-albert/raw/main/make-tiny-albert.py
# chmod a+x ./make-tiny-albert.py
# mv ./make-tiny-albert.py ./make-tiny-deberta.py
#
# 5. automatically rename things from the old names to new ones
# perl -pi -e 's|Albert|Deberta|g' make-*
# perl -pi -e 's|albert|deberta|g' make-*
#
# 6. edit and re-run this script while fixing it up
# ./make-tiny-deberta.py
#
# 7. add/commit/push
# git add *
# git commit -m "new tiny model"
# git push
# ***To update***
#
# 1. clone the existing repo
# git clone https://huggingface.co/hf-internal-testing/tiny-deberta
# cd tiny-deberta
#
# 2. edit and re-run this script after doing whatever changes are needed
# ./make-tiny-deberta.py
#
# 3. commit/push
# git commit -m "new tiny model"
# git push
import sys
import os
# workaround for fast tokenizer protobuf issue, and it's much faster too!
# NOTE(review): this assignment sits ahead of the `transformers` import below;
# presumably the variable is read when protobuf is first imported, so keep this
# ordering - confirm before moving either line.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
from transformers import DebertaTokenizer, DebertaTokenizerFast, DebertaConfig, DebertaForMaskedLM
# Name used for the generated tiny model in log messages, the temp dir of the
# (currently disabled) vocab-shrinking notes, and the README text.
mname_tiny = "tiny-deberta"
# Hub id of the full-size checkpoint whose tokenizer and config are the starting point.
mname_orig = "microsoft/deberta-base"
### Tokenizer
# XXX: can't figure out how to shrink this tokenizer's vocab! Help?
# # Shrink the orig vocab to keep things small (just enough to tokenize any word, so letters+symbols)
# # DebertaTokenizerFast is fully defined by a tokenizer.json, which contains the vocab and the ids, so we just need to truncate it wisely
# import subprocess
# tokenizer_fast = DebertaTokenizerFast.from_pretrained(mname_orig)
# vocab_keep_items = 50265
# tmp_dir = f"/tmp/{mname_tiny}"
# tokenizer_fast.save_pretrained(tmp_dir)
# # resize tokenizer.json (vocab.txt will be automatically resized on save_pretrained)
# # perl -pi -e 's|(2999).*|$1}}}|' tokenizer.json # 0-indexed, so vocab_keep_items-1!
# closing_pat = "}}}"
# cmd = (f"perl -pi -e s|({vocab_keep_items-1}).*|$1{closing_pat}| {tmp_dir}/tokenizer.json").split()
# result = subprocess.run(cmd, capture_output=True, text=True)
# # reload with modified tokenizer
# tokenizer_fast_tiny = DebertaTokenizerFast.from_pretrained(tmp_dir)
# # it seems that DebertaTokenizer is not needed and DebertaTokenizerFast does the job
# # Shrink the orig vocab to keep things small (just enough to tokenize any word, so letters+symbols)
# # DebertaTokenizerFast is fully defined by a tokenizer.json, which contains the vocab and the ids, so we just need to truncate it wisely
# import subprocess
# tokenizer_fast = DebertaTokenizerFast.from_pretrained(mname_orig)
# vocab_keep_items = 5120
# tmp_dir = f"/tmp/{mname_tiny}"
# vocab_short_path = f"{tmp_dir}/vocab.json"
# tokenizer_fast.save_pretrained(tmp_dir)
# # resize tokenizer.json (vocab.txt will be automatically resized on save_pretrained)
# # perl -pi -e 's|(2999).*|$1}}}|' tokenizer.json # 0-indexed, so vocab_keep_items-1!
# closing_pat = "}"
# cmd = (f"perl -pi -e s|({vocab_keep_items-1}).*|$1{closing_pat}| {tmp_dir}/vocab.json").split()
# result = subprocess.run(cmd, capture_output=True, text=True)
# # reload with modified tokenizer
# #tokenizer_fast_tiny = DebertaTokenizerFast.from_pretrained(tmp_dir, vocab_file=vocab_short_path)
# # it seems that DebertaTokenizer is not needed and DebertaTokenizerFast does the job
# using full tokenizer for now
# The vocab-shrinking attempts are all commented out, so the "tiny" tokenizer is
# simply the original checkpoint's fast tokenizer loaded unchanged.
tokenizer_fast_tiny = DebertaTokenizerFast.from_pretrained(mname_orig)
### Config
# Start from the original checkpoint's configuration so every field that is not
# overridden below keeps its upstream value.
config_tiny = DebertaConfig.from_pretrained(mname_orig)
print(config_tiny)
# remember to update this to the actual config as each model is different and then shrink the numbers
config_tiny.update(
    dict(
        # vocab_size is left alone: the tokenizer is used at full size for now
        # vocab_size=vocab_keep_items,
        embedding_size=32,
        # DebertaConfig names the pooler width `pooler_hidden_size`. The former
        # `pooler_size` key is not a DebertaConfig field, so it was only stored
        # as an unused extra attribute and the pooler kept the original
        # checkpoint's width, which no longer matches hidden_size=32.
        pooler_hidden_size=32,
        hidden_size=32,
        intermediate_size=64,
        max_position_embeddings=128,
        num_attention_heads=2,
        num_hidden_layers=2,
    )
)
print("New config", config_tiny)
### Model
# Instantiate a randomly initialised masked-LM model from the shrunken config.
model_tiny = DebertaForMaskedLM(config_tiny)
print(f"{mname_tiny}: num of params {model_tiny.num_parameters()}")
# Size the token embeddings to the tokenizer that will be saved next to the model.
vocab_len = len(tokenizer_fast_tiny)
model_tiny.resize_token_embeddings(vocab_len)

# Test
# One forward pass on a masked sentence, as a smoke test that the tokenizer
# output is accepted by the model.
sample_text = "The capital of France is [MASK]."
inputs = tokenizer_fast_tiny(sample_text, return_tensors="pt")
#print(inputs)
outputs = model_tiny(**inputs)
first_logits = outputs.logits[0]
print("Test with normal tokenizer:", len(first_logits))
# Save
# Write the model and tokenizer files into the current directory, which is
# expected to be the clone of the target model repo.
model_tiny.half()  # makes it smaller
model_tiny.save_pretrained(".")
tokenizer_fast_tiny.save_pretrained(".")
#print(model_tiny)

# Create a stub README only when the repo does not have one yet, so an existing
# README is never overwritten.
readme = "README.md"
if not os.path.exists(readme):
    # Explicit encoding keeps the written file independent of the locale default.
    with open(readme, "w", encoding="utf-8") as f:
        f.write(f"This is a {mname_tiny} random model to be used for basic testing.\n")
print(f"Generated {mname_tiny}")
|