dimension: 1024  # Used as dim_in and dim_out for ContinuousTransformerWrapper
ff_mult: 4  # Multiplier for the feedforward dimension
depth: 12  # Number of layers in the Encoder
heads: 8  # Number of attention heads in the Encoder
rotary_pos_emb: true
attn_flash: true  # FA-2 if installed
attn_kv_heads: 2  # GQA
qk_norm: true
pre_norm: true
residual_attn: false  # Set pre_norm to false if residual_attn is true
num_memory_tokens: 0  # Number of memory tokens, 0 means no memory tokens
causal: false  # Enable causal masking for autoregressive attention
direction_loss_bins: 16
pos_scale_factor: 1.0