{ "action_depth": 5, "action_dropout": 0.0, "action_hidden_dim": 96, "action_obs_dim": 0, "action_prev_dim": 10, "action_state_dim": 5, "action_target_dim": 10, "action_wide_dim": 512, "architectures": [ "LAMModel" ], "decoder_attention_head_dim": 64, "decoder_attn_implementation": "flash_attention_2", "decoder_encoder_hidden_dim": 5, "decoder_eps": 1e-06, "decoder_ffn_dim": 768, "decoder_freq_dim": 64, "decoder_in_channels": 3, "decoder_num_attention_heads": 3, "decoder_num_layers": 12, "decoder_out_channels": 3, "decoder_patch_size": [ 4, 4 ], "decoder_pos_embed_seq_len": null, "decoder_rope_max_seq_len": 1024, "dtype": "bfloat16", "encoder_height": 64, "encoder_width": 64, "fsq_levels": [ 7, 5, 5, 5, 5 ], "initializer_range": 0.02, "is_diffusion": true, "latent_channels": 5, "max_tokens": 128, "min_tokens": 1, "model_type": "lam", "null_latent": 0, "transformers_version": "4.57.1", "use_tail_drop": true, "videomae_config": { "attn_drop_rate": 0.0, "cos_attn": false, "depth": 8, "drop_path_rate": 0.0, "drop_rate": 0.0, "embed_dim": 192, "img_size": [ 64, 64 ], "in_chans": 3, "init_values": 0.0, "layer_norm_eps": 1e-06, "mlp_ratio": 4, "norm_layer": "nn.LayerNorm", "num_classes": 0, "num_frames": 2, "num_heads": 3, "patch_size": 4, "qk_scale": null, "qkv_bias": true, "tubelet_size": 2, "use_learnable_pos_emb": false, "use_mean_pooling": false, "with_cp": false }, "videomae_from_pretrained": null, "vocab_size": 4375 }