File size: 613 Bytes
d67ca3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
dimension: 1024  # Used as dim_in and dim_out for ContinuousTransformerWrapper
ff_mult: 4  # Multiplier for the feedforward dimension
depth: 12  # Number of layers in the Encoder
heads: 8  # Number of attention heads in the Encoder
rotary_pos_emb: true
attn_flash: true  # FA-2 if installed
attn_kv_heads: 2  # GQA
qk_norm: true
pre_norm: true
residual_attn: false  # Set pre_norm to false if residual_attn is true
num_memory_tokens: 0  # Number of memory tokens, 0 means no memory tokens
causal: false  # Enable causal masking for autoregressive attention

direction_loss_bins: 16
pos_scale_factor: 1.0