| { | |
| "action_depth": 5, | |
| "action_dropout": 0.0, | |
| "action_hidden_dim": 96, | |
| "action_obs_dim": 0, | |
| "action_prev_dim": 10, | |
| "action_state_dim": 5, | |
| "action_target_dim": 10, | |
| "action_wide_dim": 512, | |
| "architectures": [ | |
| "LAMModel" | |
| ], | |
| "decoder_attention_head_dim": 64, | |
| "decoder_attn_implementation": "flash_attention_2", | |
| "decoder_encoder_hidden_dim": 5, | |
| "decoder_eps": 1e-06, | |
| "decoder_ffn_dim": 768, | |
| "decoder_freq_dim": 64, | |
| "decoder_in_channels": 3, | |
| "decoder_num_attention_heads": 3, | |
| "decoder_num_layers": 12, | |
| "decoder_out_channels": 3, | |
| "decoder_patch_size": [ | |
| 4, | |
| 4 | |
| ], | |
| "decoder_pos_embed_seq_len": null, | |
| "decoder_rope_max_seq_len": 1024, | |
| "dtype": "bfloat16", | |
| "encoder_height": 64, | |
| "encoder_width": 64, | |
| "fsq_levels": [ | |
| 7, | |
| 5, | |
| 5, | |
| 5, | |
| 5 | |
| ], | |
| "initializer_range": 0.02, | |
| "is_action_discrete": false, | |
| "is_diffusion": true, | |
| "latent_channels": 5, | |
| "max_tokens": 128, | |
| "min_tokens": 1, | |
| "model_type": "lam", | |
| "null_latent": 0, | |
| "transformers_version": "4.57.1", | |
| "use_tail_drop": true, | |
| "videomae_config": { | |
| "attn_drop_rate": 0.0, | |
| "cos_attn": false, | |
| "depth": 8, | |
| "drop_path_rate": 0.0, | |
| "drop_rate": 0.0, | |
| "embed_dim": 192, | |
| "img_size": [ | |
| 64, | |
| 64 | |
| ], | |
| "in_chans": 3, | |
| "init_values": 0.0, | |
| "layer_norm_eps": 1e-06, | |
| "mlp_ratio": 4, | |
| "norm_layer": "nn.LayerNorm", | |
| "num_classes": 0, | |
| "num_frames": 2, | |
| "num_heads": 3, | |
| "patch_size": 4, | |
| "qk_scale": null, | |
| "qkv_bias": true, | |
| "tubelet_size": 2, | |
| "use_learnable_pos_emb": false, | |
| "use_mean_pooling": false, | |
| "with_cp": false | |
| }, | |
| "videomae_from_pretrained": null, | |
| "vocab_size": 4375 | |
| } | |