Spaces:
Runtime error
Runtime error
Upload 12 files
Browse files
- aglib/meliad/transformer/configs/size/large_1200M.gin +7 -0
- aglib/meliad/transformer/configs/size/large_600M.gin +8 -0
- aglib/meliad/transformer/configs/size/layer13.gin +4 -0
- aglib/meliad/transformer/configs/size/layer24.gin +4 -0
- aglib/meliad/transformer/configs/size/layer26.gin +4 -0
- aglib/meliad/transformer/configs/size/medium_150M.gin +8 -0
- aglib/meliad/transformer/configs/size/medium_300M.gin +8 -0
- aglib/meliad/transformer/configs/size/small.gin +7 -0
- aglib/meliad/transformer/configs/size/small_37M.gin +8 -0
- aglib/meliad/transformer/configs/size/small_75M.gin +8 -0
- aglib/meliad/transformer/configs/size/small_test.gin +37 -0
- aglib/meliad/transformer/configs/size/tiny_test.gin +27 -0
aglib/meliad/transformer/configs/size/large_1200M.gin
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Number of parameters = 1208M
|
| 3 |
+
NUM_LAYERS = 12
|
| 4 |
+
EMBED_DIM = 2048
|
| 5 |
+
NUM_HEADS = 32
|
| 6 |
+
HEAD_DIM = 128
|
| 7 |
+
MLP_DIM = 16384
|
aglib/meliad/transformer/configs/size/large_600M.gin
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Number of parameters = 605M
|
| 3 |
+
NUM_LAYERS = 12
|
| 4 |
+
EMBED_DIM = 2048
|
| 5 |
+
NUM_HEADS = 16
|
| 6 |
+
HEAD_DIM = 128
|
| 7 |
+
MLP_DIM = 8192
|
| 8 |
+
|
aglib/meliad/transformer/configs/size/layer13.gin
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Use 13 layers, for comparison against recurrent transformers.
|
| 3 |
+
|
| 4 |
+
NUM_LAYERS = 13
|
aglib/meliad/transformer/configs/size/layer24.gin
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Tall configuration: doubles the number of layers (24 vs. the 12 used by the other size configs), and with it roughly doubles the per-layer parameter count.
|
| 3 |
+
|
| 4 |
+
NUM_LAYERS = 24
|
aglib/meliad/transformer/configs/size/layer26.gin
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Use 26 layers, for comparison against tall recurrent transformers.
|
| 3 |
+
|
| 4 |
+
NUM_LAYERS = 26
|
aglib/meliad/transformer/configs/size/medium_150M.gin
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Number of parameters = 151M
|
| 3 |
+
NUM_LAYERS = 12
|
| 4 |
+
EMBED_DIM = 1024
|
| 5 |
+
NUM_HEADS = 8
|
| 6 |
+
HEAD_DIM = 128
|
| 7 |
+
MLP_DIM = 4096
|
| 8 |
+
|
aglib/meliad/transformer/configs/size/medium_300M.gin
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Number of parameters = 302M
|
| 3 |
+
NUM_LAYERS = 12
|
| 4 |
+
EMBED_DIM = 1024
|
| 5 |
+
NUM_HEADS = 16
|
| 6 |
+
HEAD_DIM = 128
|
| 7 |
+
MLP_DIM = 8192
|
| 8 |
+
|
aglib/meliad/transformer/configs/size/small.gin
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
NUM_LAYERS = 6
|
| 3 |
+
EMBED_DIM = 512
|
| 4 |
+
NUM_HEADS = 8
|
| 5 |
+
HEAD_DIM = 128
|
| 6 |
+
MLP_DIM = 2048
|
| 7 |
+
|
aglib/meliad/transformer/configs/size/small_37M.gin
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Number of parameters = 37M.
|
| 3 |
+
NUM_LAYERS = 12
|
| 4 |
+
EMBED_DIM = 512
|
| 5 |
+
NUM_HEADS = 8
|
| 6 |
+
HEAD_DIM = 64
|
| 7 |
+
MLP_DIM = 2048
|
| 8 |
+
|
aglib/meliad/transformer/configs/size/small_75M.gin
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Number of parameters = 75M
|
| 3 |
+
NUM_LAYERS = 12
|
| 4 |
+
EMBED_DIM = 512
|
| 5 |
+
NUM_HEADS = 8
|
| 6 |
+
HEAD_DIM = 128
|
| 7 |
+
MLP_DIM = 4096
|
| 8 |
+
|
aglib/meliad/transformer/configs/size/small_test.gin
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Small config for testing purposes.
|
| 2 |
+
|
| 3 |
+
NUM_LAYERS = 6
|
| 4 |
+
EMBED_DIM = 512
|
| 5 |
+
NUM_HEADS = 8
|
| 6 |
+
HEAD_DIM = 128
|
| 7 |
+
MLP_DIM = 2048
|
| 8 |
+
DROPOUT_RATE = 0.1
|
| 9 |
+
ATTN_DROPOUT_RATE = 0.1
|
| 10 |
+
|
| 11 |
+
decoder_stack.TransformerTaskConfig:
|
| 12 |
+
sequence_length = 512
|
| 13 |
+
batch_size = 2
|
| 14 |
+
|
| 15 |
+
transformer_layer.TransformerLayer:
|
| 16 |
+
window_length = 256
|
| 17 |
+
use_long_xl_architecture = True
|
| 18 |
+
max_unrolled_windows = -1
|
| 19 |
+
recurrent_num_states = 384 # Deliberately unusual (non-power-of-two) value, for debugging purposes.
|
| 20 |
+
recurrent_gate_type = "bias"
|
| 21 |
+
recurrent_single_gate = False
|
| 22 |
+
recurrent_skip_ffn = True
|
| 23 |
+
|
| 24 |
+
decoder_stack.DecoderStack:
|
| 25 |
+
dstack_window_length = 0
|
| 26 |
+
recurrent_layer_indices = () # Alternative value: (-1,)
|
| 27 |
+
feedback_recurrence = False
|
| 28 |
+
|
| 29 |
+
training_loop.Trainer:
|
| 30 |
+
num_steps = 10_000
|
| 31 |
+
status_every_steps = 5
|
| 32 |
+
log_every_steps = 20
|
| 33 |
+
test_every_steps = 50
|
| 34 |
+
num_test_steps = 2
|
| 35 |
+
generate_every_steps = 100
|
| 36 |
+
print_input_every_steps = 100
|
| 37 |
+
checkpoint_every_steps = 200
|
aglib/meliad/transformer/configs/size/tiny_test.gin
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Tiny config for testing purposes.
|
| 2 |
+
|
| 3 |
+
NUM_LAYERS = 2
|
| 4 |
+
EMBED_DIM = 128
|
| 5 |
+
NUM_HEADS = 4
|
| 6 |
+
HEAD_DIM = 32
|
| 7 |
+
MLP_DIM = 256
|
| 8 |
+
DROPOUT_RATE = 0.1
|
| 9 |
+
ATTN_DROPOUT_RATE = 0.1
|
| 10 |
+
|
| 11 |
+
decoder_stack.TransformerTaskConfig:
|
| 12 |
+
sequence_length = 256
|
| 13 |
+
batch_size = 1
|
| 14 |
+
|
| 15 |
+
transformer_layer.TransformerLayer:
|
| 16 |
+
window_length = 128
|
| 17 |
+
use_long_xl_architecture = True
|
| 18 |
+
|
| 19 |
+
training_loop.Trainer:
|
| 20 |
+
num_steps = 1000
|
| 21 |
+
warmup_steps = 100
|
| 22 |
+
log_every_steps = 10
|
| 23 |
+
test_every_steps = 10
|
| 24 |
+
num_test_steps = 1
|
| 25 |
+
generate_every_steps = 100
|
| 26 |
+
print_input_every_steps = 100
|
| 27 |
+
checkpoint_every_steps = 100
|