| ## Model parameters | |
| residual_channels = 64 | |
| residual_blocks = 6 | |
| se_ratio = 8 | |
| vit_input_channels = 320 # input dimension to ViT | |
| transformer_input_dim = 1024 | |
| model_embedding_size = 512 | |
| transformer_depth = 12 | |
| attention_heads = 8 | |
| mlp_dim = 2048 | |
| dim_head = 64 # per-head dimension of the query/key/value projections (possibly risky to tune) | |
| dropout = 0. | |
| emb_dropout = 0. | |
| similarity_weight_init = 10. | |
| similarity_bias_init = -5. | |
| ## Training parameters | |
| learning_rate_init = 0.005 | |
| players_per_batch = 36 | |
| games_per_player = 10 | |
| v_players_per_batch = 40 | |
| v_games_per_player = 10 | |
| num_validate = 10 |