| """ | |
| Title: Text classification with Transformer | |
| Author: [Apoorv Nandan](https://twitter.com/NandanApoorv) | |
| Date created: 2020/05/10 | |
| Last modified: 2024/01/18 | |
| Description: Implement a Transformer block as a Keras layer and use it for text classification. | |
| Accelerator: GPU | |
| Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) | |
| """ | |
| """ | |
| ## Setup | |
| """ | |
| import keras | |
| from keras import ops | |
| from keras import layers | |
| """ | |
| ## Implement a Transformer block as a layer | |
| """ | |


class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        # Multi-head self-attention over the input sequence.
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Position-wise feed-forward network applied to each time step.
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        # Self-attention, then residual connection and layer normalization.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        # Feed-forward network, then residual connection and layer normalization.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)
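
"""
The block is shape-preserving: a batch of embedded sequences maps to a tensor of
the same shape. As a minimal sanity check (a sketch, not part of the original
example; the batch size, sequence length, and dimensions below are arbitrary):
"""

demo_block = TransformerBlock(embed_dim=32, num_heads=2, ff_dim=32)
demo_inputs = ops.ones((4, 10, 32))  # (batch, sequence length, embed_dim)
print(demo_block(demo_inputs).shape)  # expected: (4, 10, 32)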
| """ | |
| ## Implement embedding layer | |
| Two separate embedding layers, one for tokens, one for token index (positions). | |
| """ | |


class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        # One embedding table for the token ids, one for the positions.
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = ops.shape(x)[-1]
        # Position indices 0..maxlen-1, embedded and added to the token embeddings.
        positions = ops.arange(start=0, stop=maxlen, step=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
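
"""
Likewise, a quick shape check for the embedding layer (again a sketch with
arbitrary sizes): a batch of integer token ids comes out as one
`embed_dim`-sized vector per position.
"""

demo_emb = TokenAndPositionEmbedding(maxlen=10, vocab_size=100, embed_dim=32)
demo_tokens = ops.zeros((4, 10), dtype="int32")  # (batch, sequence length)
print(demo_emb(demo_tokens).shape)  # expected: (4, 10, 32)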
| """ | |
| ## Download and prepare dataset | |
| """ | |

vocab_size = 20000  # Only consider the top 20k words
maxlen = 200  # Keep only 200 words of each movie review (truncated from the front by default)

(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(
    num_words=vocab_size
)
print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")
x_train = keras.utils.pad_sequences(x_train, maxlen=maxlen)
x_val = keras.utils.pad_sequences(x_val, maxlen=maxlen)
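
"""
If you want to inspect a sample in plain text, the word index can be inverted
(a sketch, not part of the original example; `load_data` reserves indices 0-2
for padding/start/out-of-vocabulary markers by default, hence the offset of 3):
"""

word_index = keras.datasets.imdb.get_word_index()
# Shift by 3 to account for the reserved indices; unknown ids print as "[?]".
inverted_index = {v + 3: k for k, v in word_index.items()}
print(" ".join(inverted_index.get(int(i), "[?]") for i in x_train[0][-30:]))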
| """ | |
| ## Create classifier model using transformer layer | |
| Transformer layer outputs one vector for each time step of our input sequence. | |
| Here, we take the mean across all time steps and | |
| use a feed forward network on top of it to classify text. | |
| """ | |

embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)  # mean over all time steps
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
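
"""
A quick `model.summary()` confirms the layer stack and parameter counts before
training (not part of the original example, but a standard Keras check). Note a
design choice here: the model ends in a 2-unit softmax, which pairs with the
`sparse_categorical_crossentropy` loss used below; a single sigmoid unit with
binary crossentropy would be an equivalent alternative for this binary task.
"""

model.summary()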
| """ | |
| ## Train and Evaluate | |
| """ | |
| model.compile( | |
| optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] | |
| ) | |
| history = model.fit( | |
| x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val) | |
| ) | |
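
"""
`fit` records the per-epoch metrics in `history.history`; the final validation
accuracy can be read from it, and the trained model scores new (padded)
sequences directly. A short sketch (not part of the original example):
"""

# Validation accuracy of the last epoch, as recorded by fit().
print("Final val_accuracy:", history.history["val_accuracy"][-1])

# Class probabilities (negative, positive) for a few validation reviews.
print(model.predict(x_val[:3]))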