Upload TFBilma
Browse files- config.json +6 -4
- configuration_bilma.py +13 -5
- modeling_bilma.py +52 -19
- tf_model.h5 +1 -1
config.json
CHANGED
|
@@ -1,17 +1,19 @@
|
|
| 1 |
{
|
| 2 |
-
"
|
|
|
|
|
|
|
|
|
|
| 3 |
"auto_map": {
|
| 4 |
"AutoConfig": "configuration_bilma.BilmaConfig",
|
| 5 |
-
"TFAutoModel": "modeling_bilma.TFBilma"
|
| 6 |
-
"TFAutoModelForMaskedLM": "modeling_bilma.TFBilma"
|
| 7 |
},
|
| 8 |
"hidden_dropout_prob": 0.1,
|
| 9 |
"hidden_size": 512,
|
| 10 |
-
"include_head": null,
|
| 11 |
"include_top": true,
|
| 12 |
"model_type": "bilma",
|
| 13 |
"num_attention_heads": 4,
|
| 14 |
"num_hidden_layers": 2,
|
|
|
|
| 15 |
"seq_max_length": 280,
|
| 16 |
"transformers_version": "4.30.2",
|
| 17 |
"vocab_size": 29025,
|
|
|
|
| 1 |
{
|
| 2 |
+
"add_head": null,
|
| 3 |
+
"architectures": [
|
| 4 |
+
"Bilma"
|
| 5 |
+
],
|
| 6 |
"auto_map": {
|
| 7 |
"AutoConfig": "configuration_bilma.BilmaConfig",
|
| 8 |
+
"TFAutoModel": "modeling_bilma.TFBilma"
|
|
|
|
| 9 |
},
|
| 10 |
"hidden_dropout_prob": 0.1,
|
| 11 |
"hidden_size": 512,
|
|
|
|
| 12 |
"include_top": true,
|
| 13 |
"model_type": "bilma",
|
| 14 |
"num_attention_heads": 4,
|
| 15 |
"num_hidden_layers": 2,
|
| 16 |
+
"pooling": null,
|
| 17 |
"seq_max_length": 280,
|
| 18 |
"transformers_version": "4.30.2",
|
| 19 |
"vocab_size": 29025,
|
configuration_bilma.py
CHANGED
|
@@ -7,7 +7,8 @@ class BilmaConfig(PretrainedConfig):
|
|
| 7 |
self,
|
| 8 |
weights="MX",
|
| 9 |
include_top = True,
|
| 10 |
-
|
|
|
|
| 11 |
num_attention_heads: int = 4,
|
| 12 |
num_hidden_layers: int = 2,
|
| 13 |
seq_max_length: int = 280,
|
|
@@ -17,14 +18,20 @@ class BilmaConfig(PretrainedConfig):
|
|
| 17 |
**kwargs,
|
| 18 |
):
|
| 19 |
countries = ["MX"]
|
|
|
|
| 20 |
if weights not in countries:
|
| 21 |
raise ValueError(f"`weights` must be one of {countries}, got {weights}.")
|
| 22 |
-
if
|
| 23 |
-
raise ValueError(f"To
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
if weights is not None:
|
| 25 |
self.weights = weights
|
| 26 |
self.include_top = include_top
|
| 27 |
-
self.
|
|
|
|
| 28 |
self.num_attention_heads = 4
|
| 29 |
self.num_hidden_layers = 2
|
| 30 |
self.seq_max_length = 280
|
|
@@ -36,7 +43,8 @@ class BilmaConfig(PretrainedConfig):
|
|
| 36 |
|
| 37 |
self.weights = weights
|
| 38 |
self.include_top = include_top
|
| 39 |
-
self.
|
|
|
|
| 40 |
self.num_attention_heads = num_attention_heads
|
| 41 |
self.num_hidden_layers = num_hidden_layers
|
| 42 |
self.seq_max_length = seq_max_length
|
|
|
|
| 7 |
self,
|
| 8 |
weights="MX",
|
| 9 |
include_top = True,
|
| 10 |
+
add_head = None,
|
| 11 |
+
pooling = None,
|
| 12 |
num_attention_heads: int = 4,
|
| 13 |
num_hidden_layers: int = 2,
|
| 14 |
seq_max_length: int = 280,
|
|
|
|
| 18 |
**kwargs,
|
| 19 |
):
|
| 20 |
countries = ["MX"]
|
| 21 |
+
poolings = ["mean", "cls", "max"]
|
| 22 |
if weights not in countries:
|
| 23 |
raise ValueError(f"`weights` must be one of {countries}, got {weights}.")
|
| 24 |
+
if add_head is not None and include_top == True:
|
| 25 |
+
raise ValueError(f"To add a head, 'include_top' must be False")
|
| 26 |
+
if pooling is not None and include_top == True:
|
| 27 |
+
raise ValueError(f"To specify a pooling, 'include_top' must be False")
|
| 28 |
+
if pooling is not None and pooling not in poolings:
|
| 29 |
+
raise ValueError(f"`pooling` must be one of {poolings}, got {pooling}.")
|
| 30 |
if weights is not None:
|
| 31 |
self.weights = weights
|
| 32 |
self.include_top = include_top
|
| 33 |
+
self.add_head = add_head
|
| 34 |
+
self.pooling = pooling
|
| 35 |
self.num_attention_heads = 4
|
| 36 |
self.num_hidden_layers = 2
|
| 37 |
self.seq_max_length = 280
|
|
|
|
| 43 |
|
| 44 |
self.weights = weights
|
| 45 |
self.include_top = include_top
|
| 46 |
+
self.add_head = add_head
|
| 47 |
+
self.pooling = pooling
|
| 48 |
self.num_attention_heads = num_attention_heads
|
| 49 |
self.num_hidden_layers = num_hidden_layers
|
| 50 |
self.seq_max_length = seq_max_length
|
modeling_bilma.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
-
from transformers import TFPreTrainedModel, PreTrainedTokenizer
|
|
|
|
| 2 |
from tensorflow.keras.models import Model, load_model, Sequential
|
| 3 |
from tensorflow.keras.layers import Layer, Dense, concatenate, Input, add, Dropout, LayerNormalization, MultiHeadAttention, Embedding
|
| 4 |
import tensorflow as tf
|
|
@@ -9,7 +10,7 @@ from typing import Dict
|
|
| 9 |
import re
|
| 10 |
import unicodedata
|
| 11 |
|
| 12 |
-
from
|
| 13 |
|
| 14 |
# copied from preprocessing.py
|
| 15 |
BLANK = ' '
|
|
@@ -38,7 +39,7 @@ class TFBilma(TFPreTrainedModel):
|
|
| 38 |
def __init__(self, config):
|
| 39 |
self.seq_max_length = config.seq_max_length
|
| 40 |
self.include_top = config.include_top
|
| 41 |
-
self.
|
| 42 |
super().__init__(config)
|
| 43 |
|
| 44 |
self.model = bilma(num_enc=config.num_hidden_layers,
|
|
@@ -49,7 +50,8 @@ class TFBilma(TFPreTrainedModel):
|
|
| 49 |
vocab_size=config.vocab_size,
|
| 50 |
rate=config.hidden_dropout_prob,
|
| 51 |
include_top = config.include_top,
|
| 52 |
-
|
|
|
|
| 53 |
|
| 54 |
@property
|
| 55 |
def dummy_inputs(self) -> Dict[str, tf.Tensor]:
|
|
@@ -72,19 +74,26 @@ class TFBilma(TFPreTrainedModel):
|
|
| 72 |
|
| 73 |
|
| 74 |
def call(self, inputs):
|
| 75 |
-
if isinstance(inputs, Dict):
|
| 76 |
ins = tf.cast(inputs["input_ids"], tf.float32)
|
| 77 |
else:
|
| 78 |
ins = inputs
|
| 79 |
if self.include_top:
|
| 80 |
output = {"logits":self.model(ins)}
|
| 81 |
else:
|
| 82 |
-
if self.
|
| 83 |
output = {"last_hidden_state":self.model(ins)}
|
| 84 |
else:
|
| 85 |
-
output = {"
|
| 86 |
return output
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
# copied from bilma_model.py
|
| 89 |
# --------------------------
|
| 90 |
|
|
@@ -115,18 +124,38 @@ def accuracy_function(ignore_id=0):
|
|
| 115 |
|
| 116 |
def mean_vectors(inputs, enc_vectors, max_length):
|
| 117 |
p = tf.where(inputs == 3)
|
| 118 |
-
count, _ = inputs.shape
|
| 119 |
pos = tf.transpose(p)[1]
|
| 120 |
C = tf.sequence_mask(pos, maxlen=max_length, dtype=tf.float32)
|
| 121 |
-
#C = tf.ragged.constant([[1]*i for i in pos.numpy()], dtype=tf.float32)
|
| 122 |
-
#C = C.to_tensor(0, shape=(count, max_length))
|
| 123 |
C = tf.reshape(C, (-1, max_length, 1))
|
| 124 |
S = tf.reduce_sum(enc_vectors * C, 1)
|
| 125 |
x = S / tf.expand_dims(tf.cast(pos, tf.float32), (1))
|
| 126 |
return x
|
| 127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
-
|
|
|
|
| 130 |
capt_inputs_ids = Input(shape=(max_length, ), name='input_ids')
|
| 131 |
capt_embedding = Embedding(vocab_size, embed_dim, mask_zero=False, name="bilma/embedding")
|
| 132 |
capt_inputs = capt_embedding(capt_inputs_ids)
|
|
@@ -136,14 +165,20 @@ def bilma(num_enc=6, embed_dim=300, max_length=50, num_heads=6, ff_dim=512, voca
|
|
| 136 |
if include_top:
|
| 137 |
fin_output = Dense(vocab_size, use_bias=True, name="bilma/dense_final")(enc_output)
|
| 138 |
else:
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
else:
|
| 142 |
-
x = enc_output
|
| 143 |
x = mean_vectors(capt_inputs_ids, x, max_length)
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
x = Dense(m, use_bias=True, activation="relu", name=f"bilma/dense_ex_{i}")(x)
|
| 146 |
-
fin_output =
|
| 147 |
|
| 148 |
caption_model = Model(inputs=capt_inputs_ids, outputs=fin_output, name="bilma_model")
|
| 149 |
return caption_model
|
|
@@ -160,7 +195,6 @@ def load(model_file):
|
|
| 160 |
#
|
| 161 |
# Copied from transformer_text.py
|
| 162 |
# -------------------------------
|
| 163 |
-
|
| 164 |
class EncoderBlock(Layer):
|
| 165 |
def __init__(self, layer_num, patch_dim, num_heads, ff_dim, rate=0.1, **kwargs):
|
| 166 |
super(EncoderBlock, self).__init__(**kwargs)
|
|
@@ -242,7 +276,6 @@ class DecoderBlock(Layer):
|
|
| 242 |
|
| 243 |
return final_output, attn_output1, attn_encoder
|
| 244 |
|
| 245 |
-
|
| 246 |
class Encoder(Layer):
|
| 247 |
def __init__(self, n, embed_dim, max_length, num_heads, ff_dim, rate=0.1, **kwargs):
|
| 248 |
super(Encoder, self).__init__(**kwargs)
|
|
|
|
| 1 |
+
from transformers import TFPreTrainedModel, PreTrainedTokenizer, BatchEncoding
|
| 2 |
+
|
| 3 |
from tensorflow.keras.models import Model, load_model, Sequential
|
| 4 |
from tensorflow.keras.layers import Layer, Dense, concatenate, Input, add, Dropout, LayerNormalization, MultiHeadAttention, Embedding
|
| 5 |
import tensorflow as tf
|
|
|
|
| 10 |
import re
|
| 11 |
import unicodedata
|
| 12 |
|
| 13 |
+
from configuration_bilma import BilmaConfig
|
| 14 |
|
| 15 |
# copied from preprocessing.py
|
| 16 |
BLANK = ' '
|
|
|
|
| 39 |
def __init__(self, config):
|
| 40 |
self.seq_max_length = config.seq_max_length
|
| 41 |
self.include_top = config.include_top
|
| 42 |
+
self.add_head = config.add_head
|
| 43 |
super().__init__(config)
|
| 44 |
|
| 45 |
self.model = bilma(num_enc=config.num_hidden_layers,
|
|
|
|
| 50 |
vocab_size=config.vocab_size,
|
| 51 |
rate=config.hidden_dropout_prob,
|
| 52 |
include_top = config.include_top,
|
| 53 |
+
add_head = config.add_head,
|
| 54 |
+
pooling = config.pooling)
|
| 55 |
|
| 56 |
@property
|
| 57 |
def dummy_inputs(self) -> Dict[str, tf.Tensor]:
|
|
|
|
| 74 |
|
| 75 |
|
| 76 |
def call(self, inputs):
    """Run the wrapped Keras model and return its result in a one-entry dict.

    Args:
        inputs: Either a mapping (``Dict`` or ``BatchEncoding``) holding an
            ``"input_ids"`` entry, or a value that is passed to the wrapped
            model unchanged.

    Returns:
        A dict with exactly one key:
        ``"logits"`` when ``self.include_top`` is truthy,
        ``"last_hidden_state"`` when there is no top and ``self.add_head``
        is ``None``, and ``"label"`` when an extra head was configured.
    """
    # Mapping-style inputs carry the ids under "input_ids"; they are cast to
    # float32 before being fed to the model. Anything else goes in as-is.
    token_ids = inputs
    if isinstance(inputs, (Dict, BatchEncoding)):
        token_ids = tf.cast(inputs["input_ids"], tf.float32)

    # Only the name of the output entry depends on the configuration.
    if self.include_top:
        key = "logits"
    elif self.add_head is None:
        key = "last_hidden_state"
    else:
        key = "label"

    return {key: self.model(token_ids)}
|
| 89 |
|
| 90 |
+
def get_loss_function():
    """Return the loss built by ``loss_function()`` with its default arguments.

    NOTE(review): this takes no ``self``; if it lives inside the ``TFBilma``
    class body it can only be called on the class, not on an instance, unless
    it is decorated with ``@staticmethod`` -- confirm the intended placement.
    """
    # The original called ``loss_funtion`` (missing "c"). The factory itself is
    # outside this view; presumably it is spelled like its sibling
    # ``accuracy_function`` -- verify against its definition in this module.
    return loss_function()
|
| 92 |
+
|
| 93 |
+
def get_acc_function():
    """Return the metric built by ``accuracy_function()`` with its defaults.

    NOTE(review): this takes no ``self``; if it lives inside the ``TFBilma``
    class body it can only be called on the class, not on an instance, unless
    it is decorated with ``@staticmethod`` -- confirm the intended placement.
    """
    metric = accuracy_function()
    return metric
|
| 95 |
+
|
| 96 |
+
|
| 97 |
# copied from bilma_model.py
|
| 98 |
# --------------------------
|
| 99 |
|
|
|
|
| 124 |
|
| 125 |
def mean_vectors(inputs, enc_vectors, max_length):
    """Average the encoder vectors located before token id 3 in each row.

    Args:
        inputs: Token-id tensor; positions equal to 3 mark where a row ends
            (presumably the end-of-text token -- confirm against the
            tokenizer).
        enc_vectors: Encoder output, multiplied by a ``(-1, max_length, 1)``
            mask and reduced over axis 1 -- assumed to be
            ``(batch, max_length, dim)``.
        max_length: Sequence length used to build the mask.

    Returns:
        The sum of the unmasked vectors divided by the marker position.

    NOTE(review): ``tf.where`` yields one entry per match, so this relies on
    every row containing exactly one id-3 token; a marker at position 0
    divides by zero.
    """
    # Column index of each id-3 token == number of positions that precede it.
    marker_cols = tf.transpose(tf.where(inputs == 3))[1]
    lengths = tf.cast(marker_cols, tf.float32)

    # 1.0 for positions before the marker, 0.0 from the marker onwards.
    keep = tf.reshape(
        tf.sequence_mask(marker_cols, maxlen=max_length, dtype=tf.float32),
        (-1, max_length, 1),
    )

    total = tf.reduce_sum(enc_vectors * keep, 1)
    return total / tf.expand_dims(lengths, 1)
|
| 133 |
|
| 134 |
+
def mean_diff_vectors(inputs, enc_vectors, max_length):
    """Pool each row as the length-normalised sum of ``mean - vector``.

    Args:
        inputs: Token-id tensor; positions equal to 3 mark where a row ends
            (presumably the end-of-text token -- confirm against the
            tokenizer).
        enc_vectors: Encoder output, multiplied by a ``(-1, max_length, 1)``
            mask and reduced over axis 1 -- assumed to be
            ``(batch, max_length, dim)``.
        max_length: Sequence length used to build the mask.

    Returns:
        One pooled vector per row.

    NOTE(review): as in the original, masked (zeroed) positions are still
    included in the difference sum, each contributing the row mean. Confirm
    whether they were meant to be excluded -- if they were, the result would
    be identically zero, so the intended statistic needs clarifying.
    """
    p = tf.where(inputs == 3)
    pos = tf.transpose(p)[1]
    # Hoisted: the same divisor is used for the mean and for the final result.
    lengths = tf.expand_dims(tf.cast(pos, tf.float32), 1)

    C = tf.sequence_mask(pos, maxlen=max_length, dtype=tf.float32)
    C = tf.reshape(C, (-1, max_length, 1))
    vecs = enc_vectors * C

    mu = tf.reduce_sum(vecs, 1) / lengths

    # ``mu`` has one vector per row while ``vecs`` keeps the sequence axis.
    # Subtracting them directly aligns the batch axis of ``mu`` with the
    # sequence axis of ``vecs`` (only valid for a batch of one). Insert the
    # sequence axis explicitly so each row's mean is compared with that row's
    # own vectors for any batch size.
    diffs = tf.expand_dims(mu, 1) - vecs
    return tf.reduce_sum(diffs, 1) / lengths
|
| 144 |
+
|
| 145 |
+
def max_vectors(inputs, enc_vectors, max_length):
    """Max-pool the encoder vectors located before token id 3 in each row.

    Args:
        inputs: Token-id tensor; positions equal to 3 mark where a row ends
            (presumably the end-of-text token -- confirm against the
            tokenizer).
        enc_vectors: Encoder output, reduced over axis 1 -- assumed to be
            ``(batch, max_length, dim)``.
        max_length: Sequence length used to build the mask.

    Returns:
        The element-wise maximum over the positions before the marker. Rows
        with no such position (marker at index 0) yield zeros, as before.
    """
    p = tf.where(inputs == 3)
    pos = tf.transpose(p)[1]

    # Boolean mask: True for positions before the marker.
    valid = tf.sequence_mask(pos, maxlen=max_length)
    valid = tf.reshape(valid, (-1, max_length, 1))

    # The original zeroed the padded positions by multiplication, which lets
    # the padding's 0 win the max whenever every real value of a feature is
    # negative. Replace padded positions with the lowest representable value
    # instead so they can never be selected.
    floor = tf.constant(enc_vectors.dtype.min, dtype=enc_vectors.dtype)
    pooled = tf.reduce_max(tf.where(valid, enc_vectors, floor), 1)

    # Keep the previous all-zero output for rows without any valid position
    # rather than leaking the sentinel value.
    has_tokens = tf.reduce_any(valid, 1)
    return tf.where(has_tokens, pooled, tf.zeros_like(pooled))
|
| 152 |
+
|
| 153 |
+
def cls_vectors(inputs, enc_vectors, max_length):
    """Return the encoder vector at sequence position 0 of every row.

    ``inputs`` and ``max_length`` are not used; they are accepted so this
    function has the same signature as the other pooling helpers.

    Args:
        inputs: Unused.
        enc_vectors: Encoder output indexed on three axes; axis 1 is taken to
            be the sequence axis.
        max_length: Unused.

    Returns:
        ``enc_vectors`` with axis 1 reduced to its first entry and dropped.
    """
    # Integer indexing on axis 1 selects position 0 and removes that axis.
    return enc_vectors[:, 0, :]
|
| 156 |
|
| 157 |
+
|
| 158 |
+
def bilma(num_enc=6, embed_dim=300, max_length=50, num_heads=6, ff_dim=512, vocab_size=9739, rate=0.1, include_top=True, add_head=None, pooling=None):
|
| 159 |
capt_inputs_ids = Input(shape=(max_length, ), name='input_ids')
|
| 160 |
capt_embedding = Embedding(vocab_size, embed_dim, mask_zero=False, name="bilma/embedding")
|
| 161 |
capt_inputs = capt_embedding(capt_inputs_ids)
|
|
|
|
| 165 |
if include_top:
|
| 166 |
fin_output = Dense(vocab_size, use_bias=True, name="bilma/dense_final")(enc_output)
|
| 167 |
else:
|
| 168 |
+
x = enc_output
|
| 169 |
+
if pooling == "mean":
|
|
|
|
|
|
|
| 170 |
x = mean_vectors(capt_inputs_ids, x, max_length)
|
| 171 |
+
elif pooling == "cls":
|
| 172 |
+
x = cls_vectors(capt_inputs_ids, x, max_length)
|
| 173 |
+
elif pooling == "max":
|
| 174 |
+
x = max_vectors(capt_inputs_ids, x, max_length)
|
| 175 |
+
|
| 176 |
+
if add_head is None:
|
| 177 |
+
fin_output = x
|
| 178 |
+
else:
|
| 179 |
+
for i, m in enumerate(add_head[:-1]):
|
| 180 |
x = Dense(m, use_bias=True, activation="relu", name=f"bilma/dense_ex_{i}")(x)
|
| 181 |
+
fin_output = Dense(add_head[-1], use_bias=True, activation="softmax", name=f"bilma/dense_ex_final")(x)
|
| 182 |
|
| 183 |
caption_model = Model(inputs=capt_inputs_ids, outputs=fin_output, name="bilma_model")
|
| 184 |
return caption_model
|
|
|
|
| 195 |
#
|
| 196 |
# Copied from transformer_text.py
|
| 197 |
# -------------------------------
|
|
|
|
| 198 |
class EncoderBlock(Layer):
|
| 199 |
def __init__(self, layer_num, patch_dim, num_heads, ff_dim, rate=0.1, **kwargs):
|
| 200 |
super(EncoderBlock, self).__init__(**kwargs)
|
|
|
|
| 276 |
|
| 277 |
return final_output, attn_output1, attn_encoder
|
| 278 |
|
|
|
|
| 279 |
class Encoder(Layer):
|
| 280 |
def __init__(self, n, embed_dim, max_length, num_heads, ff_dim, rate=0.1, **kwargs):
|
| 281 |
super(Encoder, self).__init__(**kwargs)
|
tf_model.h5
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 156875820
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2cc8b04b7a93e6fa9eb46a7a30d89f2e97e4b8ac52da1c0e35239ded8a29482c
|
| 3 |
size 156875820
|