# Source viewer metadata (not part of the config): size 2,152 bytes, ref d899b9f
# Settings grouped under `common`.
common:
  # Number of historical images
  img_history_size: 2
  # Number of future actions to predict
  action_chunk_size: 64
  # Number of cameras used by the model
  num_cameras: 3
  # Dimension of the state/action vector; the same space is used for both
  # state and action.
  # This MUST be equal to the value in configs/state_vec.py
  state_dim: 128
# Dataset buffering and episode-filtering settings.
dataset:
  # Data is extracted from the raw dataset and stored in an on-disk buffer
  # by a producer. During training, a consumer reads samples from the
  # buffer at random, and the producer replaces data that the consumer has
  # already read with new data.

  # Path to the buffer (at least 400GB)
  # NOTE(review): this is an absolute, user-specific path; adjust it for
  # the machine the job runs on.
  buf_path: /home/jellyho/RDTBuffer
  # Number of chunks in the buffer
  buf_num_chunks: 128
  # Number of samples (steps, not episodes) in each chunk
  buf_chunk_size: 128

  # Episodes shorter than `epsd_len_thresh_low` are filtered out
  epsd_len_thresh_low: 32
  # For episodes longer than `epsd_len_thresh_high`, `epsd_len_thresh_high`
  # steps are randomly sampled each time the episode is loaded, to better
  # balance the training datasets
  epsd_len_thresh_high: 2048
  # How to fit the image size
  image_aspect_ratio: pad
  # Maximum number of language tokens
  tokenizer_max_length: 1024
# Model settings: condition adaptors, RDT structure, noise scheduler, EMA.
model:
  # Config for condition adaptors
  lang_adaptor: mlp2x_gelu
  img_adaptor: mlp2x_gelu
  state_adaptor: mlp3x_gelu
  lang_token_dim: 4096
  img_token_dim: 1152
  # Dim of the action or proprioception vector.
  # A `state` refers to an action or a proprioception vector.
  state_token_dim: 128

  # Config for RDT structure
  rdt:
    # 1B: num_head 32 hidden_size 2048
    hidden_size: 2048
    depth: 28
    num_heads: 32
    cond_pos_embed_type: multimodal

  # For noise scheduler
  noise_scheduler:
    type: ddpm
    num_train_timesteps: 1000
    num_inference_timesteps: 5
    beta_schedule: squaredcos_cap_v2  # Critical choice
    prediction_type: sample
    clip_sample: false

  # For EMA (params averaging)
  # We do not use EMA currently
  ema:
    update_after_step: 0
    inv_gamma: 1.0
    power: 0.75
    min_value: 0.0
    max_value: 0.9999