{
  "name": "boxing_delta_iris_world_model",
  "env": "BoxingNoFrameSkip-v4",
  "model_type": "delta_iris",
  "metadata": {
    "latent_dim": [1, 4, 1024],
    "two_hot_rews": false,
    "tokens_per_block": 6,
    "num_tokens": 4,
    "tokens_grid_res": 2,
    "token_res": 4
  },
  "util_folders": {
    "data": "../../src/data",
    "models": "../../src/models"
  },
  "requirements": {
    "-r": "requirements.txt"
  },
  "models": [
    {
      "name": "world_model",
      "framework": null,
      "format": "state_dict",
      "source": {
        "weights_path": "world_model.pt",
        "class_path": "../../src/world_model.py",
        "class_name": "WorldModel",
        "class_args": [
          {
            "latent_vocab_size": 1024,
            "num_actions": 18,
            "image_channels": 3,
            "image_size": 64,
            "two_hot_rews": false,
            "transformer_config": {
              "tokens_per_block": 6,
              "max_blocks": 26,
              "num_layers": 3,
              "num_heads": 4,
              "embed_dim": 256,
              "attention": "causal",
              "embed_pdrop": 0.0,
              "resid_pdrop": 0.0,
              "attn_pdrop": 0.0
            },
            "frame_cnn_config": {
              "image_channels": 3,
              "latent_dim": 4,
              "num_channels": 32,
              "mult": [1, 1, 2, 2, 4],
              "down": [1, 0, 1, 1, 0]
            }
          }
        ]
      },
      "signature": {
        "inputs": ["wm_input_sequence", "use_kv_cache"],
        "call_mode": "positional"
      },
      "sub_models": [
        {
          "name": "act_emb",
          "sub_model_name": "act_emb",
          "signature": {
            "inputs": ["act"],
            "call_mode": "positional"
          }
        },
        {
          "name": "latents_emb",
          "sub_model_name": "latents_emb",
          "signature": {
            "inputs": ["latent_tokens"],
            "call_mode": "positional"
          }
        },
        {
          "name": "transformer",
          "sub_model_name": "transformer",
          "signature": {
            "call_mode": "auto"
          }
        },
        {
          "name": "frame_cnn",
          "sub_model_name": "frame_cnn",
          "signature": {
            "inputs": ["obs"],
            "call_mode": "auto"
          }
        }
      ],
      "methods": [
        {
          "name": "blocks_left_in_kv_cache",
          "method_name": "blocks_left_in_kv_cache"
        },
        {
          "name": "reset_kv_cache",
          "method_name": "reset_kv_cache"
        }
      ]
    },
    {
      "name": "tokenizer",
      "framework": null,
      "format": "state_dict",
      "source": {
        "weights_path": "tokenizer.pt",
        "class_path": "../../src/tokenizer.py",
        "class_name": "Tokenizer",
        "class_args": [
          {
            "image_channels": 3,
            "image_size": 64,
            "num_actions": 18,
            "num_tokens": 4,
            "decoder_act_channels": 4,
            "codebook_size": 1024,
            "codebook_dim": 64,
            "max_codebook_updates_with_revival": 0,
            "encoder_config": {
              "image_channels": 7,
              "latent_dim": 64,
              "num_channels": 64,
              "mult": [1, 1, 2, 2, 4],
              "down": [1, 0, 1, 1, 0]
            },
            "decoder_config": {
              "image_channels": 3,
              "latent_dim": 84,
              "num_channels": 64,
              "mult": [1, 1, 2, 2, 4],
              "down": [1, 0, 1, 1, 0]
            },
            "frame_cnn_config": {
              "image_channels": 3,
              "latent_dim": 16,
              "num_channels": 32,
              "mult": [1, 1, 2, 2, 4],
              "down": [1, 0, 1, 1, 0]
            }
          }
        ]
      },
      "signature": {
        "inputs": ["o1", "a", "o2"],
        "call_mode": "positional"
      },
      "sub_models": [
        {
          "name": "decode",
          "sub_model_name": "decode",
          "signature": {
            "inputs": ["obs", "act", "q", "should_clamp"],
            "call_mode": "positional"
          }
        }
      ],
      "methods": [
        {
          "name": "embed_tokens",
          "method_name": "embed_tokens"
        }
      ]
    }
  ]
}