| { | |
| "name": "boxing_delta_iris_world_model", | |
| "env": "BoxingNoFrameskip-v4", | |
| "model_type": "delta_iris", | |
| "metadata": { | |
| "latent_dim": [1, 4, 1024], | |
| "two_hot_rews": false, | |
| "tokens_per_block": 6, | |
| "num_tokens": 4, | |
| "tokens_grid_res": 2, | |
| "token_res": 4 | |
| }, | |
| "util_folders":{ | |
| "data": "../../src/data", | |
| "models": "../../src/models" | |
| }, | |
| "requirements":{ | |
| "-r": "requirements.txt" | |
| }, | |
| "models": [ | |
| { | |
| "name": "world_model", | |
| "framework": null, | |
| "format": "state_dict", | |
| "source": { | |
| "weights_path": "world_model.pt", | |
| "class_path": "../../src/world_model.py", | |
| "class_name": "WorldModel", | |
| "class_args": [ | |
| { | |
| "latent_vocab_size": 1024, | |
| "num_actions": 18, | |
| "image_channels": 3, | |
| "image_size": 64, | |
| "two_hot_rews": false, | |
| "transformer_config": { | |
| "tokens_per_block": 6, | |
| "max_blocks": 26, | |
| "num_layers": 3, | |
| "num_heads": 4, | |
| "embed_dim": 256, | |
| "attention": "causal", | |
| "embed_pdrop": 0.0, | |
| "resid_pdrop": 0.0, | |
| "attn_pdrop": 0.0 | |
| }, | |
| "frame_cnn_config": { | |
| "image_channels": 3, | |
| "latent_dim": 4, | |
| "num_channels": 32, | |
| "mult": [1, 1, 2, 2, 4], | |
| "down": [1, 0, 1, 1, 0] | |
| } | |
| }] | |
| }, | |
| "signature": { | |
| "inputs": ["wm_input_sequence", "use_kv_cache"], | |
| "call_mode": "positional" | |
| }, | |
| "sub_models": | |
| [ | |
| { | |
| "name": "act_emb", | |
| "sub_model_name": "act_emb", | |
| "signature": | |
| { | |
| "inputs": ["act"], | |
| "call_mode": "positional" | |
| } | |
| }, | |
| { | |
| "name": "latents_emb", | |
| "sub_model_name": "latents_emb", | |
| "signature": | |
| { | |
| "inputs": ["latent_tokens"], | |
| "call_mode": "positional" | |
| } | |
| }, | |
| { | |
| "name": "transformer", | |
| "sub_model_name": "transformer", | |
| "signature": | |
| { | |
| "call_mode": "auto" | |
| } | |
| }, | |
| { | |
| "name": "frame_cnn", | |
| "sub_model_name": "frame_cnn", | |
| "signature": | |
| { | |
| "inputs": ["obs"], | |
| "call_mode": "auto" | |
| } | |
| } | |
| ], | |
| "methods": | |
| [ | |
| { | |
| "name": "blocks_left_in_kv_cache", | |
| "method_name": "blocks_left_in_kv_cache" | |
| }, | |
| { | |
| "name": "reset_kv_cache", | |
| "method_name": "reset_kv_cache" | |
| } | |
| ] | |
| }, | |
| { | |
| "name": "tokenizer", | |
| "framework": null, | |
| "format": "state_dict", | |
| "source": { | |
| "weights_path": "tokenizer.pt", | |
| "class_path": "../../src/tokenizer.py", | |
| "class_name": "Tokenizer", | |
| "class_args": [{ | |
| "image_channels": 3, | |
| "image_size": 64, | |
| "num_actions": 18, | |
| "num_tokens": 4, | |
| "decoder_act_channels": 4, | |
| "codebook_size": 1024, | |
| "codebook_dim": 64, | |
| "max_codebook_updates_with_revival": 0, | |
| "encoder_config": { | |
| "image_channels": 7, | |
| "latent_dim": 64, | |
| "num_channels": 64, | |
| "mult": [1, 1, 2, 2, 4], | |
| "down": [1, 0, 1, 1, 0] | |
| }, | |
| "decoder_config": { | |
| "image_channels": 3, | |
| "latent_dim": 84, | |
| "num_channels": 64, | |
| "mult": [1, 1, 2, 2, 4], | |
| "down": [1, 0, 1, 1, 0] | |
| }, | |
| "frame_cnn_config": { | |
| "image_channels": 3, | |
| "latent_dim": 16, | |
| "num_channels": 32, | |
| "mult": [1, 1, 2, 2, 4], | |
| "down": [1, 0, 1, 1, 0] | |
| } | |
| }] | |
| }, | |
| "signature": { | |
| "inputs": ["o1", "a", "o2"], | |
| "call_mode": "positional" | |
| }, | |
| "sub_models": | |
| [ | |
| { | |
| "name": "decode", | |
| "sub_model_name": "decode", | |
| "signature": | |
| { | |
| "inputs": ["obs", "act", "q", "should_clamp"], | |
| "call_mode": "positional" | |
| } | |
| } | |
| ], | |
| "methods": | |
| [ | |
| { | |
| "name": "embed_tokens", | |
| "method_name": "embed_tokens" | |
| } | |
| ] | |
| } | |
| ] | |
| } |