Commit
·
c3166be
1
Parent(s):
1a7fe47
Upload 5 files
Browse files
- spiece.model +2 -2
- spiece.vocab +0 -0
- spiece_45.model +3 -0
- spiece_45.vocab +0 -0
- tokenizer.py +36 -0
spiece.model
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:579ebba0921710bb6bd17cd678d4379b4a81ca84756dab644d7e8529bd01009d
|
| 3 |
+
size 805610
|
spiece.vocab
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
spiece_45.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c489236e2ac4df783bdb4fc930323620027ee0279d2665d263cd74385d899425
|
| 3 |
+
size 802920
|
spiece_45.vocab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
# %pip install sentencepiece
|
| 2 |
# %pip install datasets
|
|
|
|
| 3 |
|
| 4 |
import unicodedata
|
| 5 |
import os
|
|
@@ -136,3 +137,38 @@ spm.SentencePieceTrainer.train(input="text/final_file.txt", model_prefix='spiece
|
|
| 136 |
num_threads=90, input_sentence_size=45000000, shuffle_input_sentence=True)
|
| 137 |
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# %pip install sentencepiece
|
| 2 |
# %pip install datasets
|
| 3 |
+
# %pip install seqio
|
| 4 |
|
| 5 |
import unicodedata
|
| 6 |
import os
|
|
|
|
| 137 |
num_threads=90, input_sentence_size=45000000, shuffle_input_sentence=True)
|
| 138 |
|
| 139 |
|
| 140 |
+
# Add 100 extra tokens to the model
|
| 141 |
+
from seqio import SentencePieceVocabulary
|
| 142 |
+
import os
|
| 143 |
+
import tensorflow as tf
|
| 144 |
+
from sentencepiece import SentencePieceProcessor, sentencepiece_model_pb2
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def add_100extra(vocab: SentencePieceVocabulary, out_dir: str) -> None:
    """Export a vocabulary's SentencePiece model and a piece listing to *out_dir*.

    Two files are written:

    * ``spiece.model`` -- the serialized model proto taken from
      ``vocab.sp_model``.
    * ``spiece.vocab`` -- one ``<piece>\\t<score>`` line per entry of the
      parsed model's ``pieces`` field, joined with newlines (no trailing
      newline).

    Args:
        vocab: Vocabulary whose ``sp_model`` attribute holds the serialized
            SentencePiece ``ModelProto``.
        out_dir: Destination directory; created if it does not exist.
    """
    tf.io.gfile.makedirs(out_dir)

    serialized_model = vocab.sp_model

    # Context managers guarantee the handles are flushed and closed instead
    # of relying on garbage collection of the anonymous GFile objects.
    # The payload is serialized proto bytes, hence binary mode.
    with tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.model'), 'wb') as model_file:
        model_file.write(serialized_model)

    model = sentencepiece_model_pb2.ModelProto.FromString(serialized_model)
    with tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.vocab'), 'w') as vocab_file:
        vocab_file.write(
            '\n'.join(f'{p.piece}\t{p.score}' for p in model.pieces)
        )
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
# vocab = t5.data.get_default_vocabulary()
|
| 158 |
+
# out_dir = "../vocabulary/cc_all.32000.100extra"
|
| 159 |
+
#
|
| 160 |
+
# add_100extra(vocab, out_dir)
|
| 161 |
+
#
|
| 162 |
+
# vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=100)
|
| 163 |
+
# out_dir = "../vocabulary/nedd.32000.100extra"
|
| 164 |
+
# add_100extra(vocab, out_dir)
|
| 165 |
+
#
|
| 166 |
+
# vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=128)
|
| 167 |
+
# out_dir = "../vocabulary/nedd.32000.128extra"
|
| 168 |
+
# add_100extra(vocab, out_dir)
|
| 169 |
+
#
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
# Driver: build a vocabulary from the spiece_45 model with extra_ids=100 and
# export it into the "conv" directory via add_100extra.
# NOTE(review): the model path is an absolute path on one developer's machine;
# confirm/adjust before running elsewhere.
vocab = SentencePieceVocabulary("/Users/sdeshpande/Desktop/Challenges/patents/spiece_45.model", extra_ids=100)
|
| 173 |
+
out_dir = "conv"
|
| 174 |
+
add_100extra(vocab, out_dir)
|