- 4x_smaller_bs_half_lr_half_seq_len
- attention_kindselective_n_heads2_seed1338
- attention_kindselective_n_heads2_seed1339
- attention_kindselective_n_heads2_seed1340
- attention_kindselective_n_heads2_seed1341
- attention_kindselective_n_heads4_seed1338
- attention_kindselective_n_heads4_seed1339
- attention_kindselective_n_heads4_seed1340
- attention_kindselective_n_heads4_seed1341
- attention_kindselective_n_heads4_seed1342
- attention_kindselective_n_heads4_seed1343
- attention_kindselective_n_heads4_seed1344
- attention_kindselective_n_heads4_seed1345
- attention_kindselective_n_heads8_seed1338
- attention_kindselective_n_heads8_seed1339
- attention_kindselective_n_heads8_seed1340
- attention_kindselective_n_heads8_seed1341
- attention_kindself_n_heads2_seed1338
- attention_kindself_n_heads2_seed1339
- attention_kindself_n_heads2_seed1340
- attention_kindself_n_heads2_seed1341
- attention_kindself_n_heads4_seed1338
- attention_kindself_n_heads4_seed1339
- attention_kindself_n_heads4_seed1340
- attention_kindself_n_heads4_seed1341
- baseline
- half_total_bs_sqrt_lr
- lr0.75e-4_total_batch_size5120_seq_len256_decay_lrfalse_attention_kindselective
- lr1.5e-3_total_batch_size20480_seq_len128
- lr1.5e-3_total_batch_size20480_seq_len512
- lr1.5e-3_total_batch_size40960_seq_len128
- lr1.5e-3_total_batch_size40960_seq_len512
- lr1.5e-3_total_batch_size5120_seq_len128
- lr1.5e-3_total_batch_size5120_seq_len512
- lr1.5e-4_total_batch_size10240_n_heads2_seed1338
- lr1.5e-4_total_batch_size10240_n_heads2_seed1339
- lr1.5e-4_total_batch_size10240_n_heads2_seed1340
- lr1.5e-4_total_batch_size10240_n_heads4_seed1338
- lr1.5e-4_total_batch_size10240_n_heads4_seed1340
- lr1.5e-4_total_batch_size5120_n_heads2_seed1338
- lr1.5e-4_total_batch_size5120_n_heads2_seed1339
- lr1.5e-4_total_batch_size5120_n_heads2_seed1340
- lr1.5e-4_total_batch_size5120_n_heads4_seed1338
- lr1.5e-4_total_batch_size5120_n_heads4_seed1339
- lr1.5e-4_total_batch_size5120_n_heads4_seed1340
- lr1.5e-4_total_batch_size5120_seq_len256_decay_lrfalse_attention_kindselective
- lr1.5e-4_total_batch_size5120_seq_len256_decay_lrtrue_attention_kindselective_warmup_steps1
- lr1.75e-4_total_batch_size10240_n_heads2_seed1339
- lr1.75e-4_total_batch_size10240_n_heads2_seed1340
- lr1.75e-4_total_batch_size10240_n_heads4_seed1338