{
  "name": "boxing_delta_iris_world_model",
  "env": "BoxingNoFrameSkip-v4",
  "model_type": "delta_iris",
  "metadata": {
    "latent_dim": [1, 4, 1024],
    "two_hot_rews": false,
    "tokens_per_block": 6,
    "num_tokens": 4,
    "tokens_grid_res": 2,
    "token_res": 4
  },
  "util_folders": {
    "data": "../../src/data",
    "models": "../../src/models"
  },
  "requirements": {
    "-r": "requirements.txt"
  },
  "models": [
    {
      "name": "world_model",
      "framework": null,
      "format": "state_dict",
      "source": {
        "weights_path": "world_model.pt",
        "class_path": "../../src/world_model.py",
        "class_name": "WorldModel",
        "class_args": [
          {
            "latent_vocab_size": 1024,
            "num_actions": 18,
            "image_channels": 3,
            "image_size": 64,
            "two_hot_rews": false,
            "transformer_config": {
              "tokens_per_block": 6,
              "max_blocks": 26,
              "num_layers": 3,
              "num_heads": 4,
              "embed_dim": 256,
              "attention": "causal",
              "embed_pdrop": 0.0,
              "resid_pdrop": 0.0,
              "attn_pdrop": 0.0
            },
            "frame_cnn_config": {
              "image_channels": 3,
              "latent_dim": 4,
              "num_channels": 32,
              "mult": [1, 1, 2, 2, 4],
              "down": [1, 0, 1, 1, 0]
            }
          }
        ]
      },
      "signature": {
        "inputs": ["wm_input_sequence", "use_kv_cache"],
        "call_mode": "positional"
      },
      "sub_models": [
        {
          "name": "act_emb",
          "sub_model_name": "act_emb",
          "signature": {
            "inputs": ["act"],
            "call_mode": "positional"
          }
        },
        {
          "name": "latents_emb",
          "sub_model_name": "latents_emb",
          "signature": {
            "inputs": ["latent_tokens"],
            "call_mode": "positional"
          }
        },
        {
          "name": "transformer",
          "sub_model_name": "transformer",
          "signature": {
            "call_mode": "auto"
          }
        },
        {
          "name": "frame_cnn",
          "sub_model_name": "frame_cnn",
          "signature": {
            "inputs": ["obs"],
            "call_mode": "auto"
          }
        }
      ],
      "methods": [
        {
          "name": "blocks_left_in_kv_cache",
          "method_name": "blocks_left_in_kv_cache"
        },
        {
          "name": "reset_kv_cache",
          "method_name": "reset_kv_cache"
        }
      ]
    },
    {
      "name": "tokenizer",
      "framework": null,
      "format": "state_dict",
      "source": {
        "weights_path": "tokenizer.pt",
        "class_path": "../../src/tokenizer.py",
        "class_name": "Tokenizer",
        "class_args": [
          {
            "image_channels": 3,
            "image_size": 64,
            "num_actions": 18,
            "num_tokens": 4,
            "decoder_act_channels": 4,
            "codebook_size": 1024,
            "codebook_dim": 64,
            "max_codebook_updates_with_revival": 0,
            "encoder_config": {
              "image_channels": 7,
              "latent_dim": 64,
              "num_channels": 64,
              "mult": [1, 1, 2, 2, 4],
              "down": [1, 0, 1, 1, 0]
            },
            "decoder_config": {
              "image_channels": 3,
              "latent_dim": 84,
              "num_channels": 64,
              "mult": [1, 1, 2, 2, 4],
              "down": [1, 0, 1, 1, 0]
            },
            "frame_cnn_config": {
              "image_channels": 3,
              "latent_dim": 16,
              "num_channels": 32,
              "mult": [1, 1, 2, 2, 4],
              "down": [1, 0, 1, 1, 0]
            }
          }
        ]
      },
      "signature": {
        "inputs": ["o1", "a", "o2"],
        "call_mode": "positional"
      },
      "sub_models": [
        {
          "name": "decode",
          "sub_model_name": "decode",
          "signature": {
            "inputs": ["obs", "act", "q", "should_clamp"],
            "call_mode": "positional"
          }
        }
      ],
      "methods": [
        {
          "name": "embed_tokens",
          "method_name": "embed_tokens"
        }
      ]
    }
  ]
}