{
  "transformer": {
    "depth": 2,
    "config": {
      "rotary_pos_emb": true,
      "dim_heads": 32
    }
  },
  "encoder": {
    "config": {
      "in_channels": 2,
      "channels": 128,
      "c_mults": [1, 2, 4, 8, 16],
      "strides": [2, 4, 4, 4, 8],
      "latent_dim": 128,
      "use_snake": true
    }
  },
  "decoder": {
    "config": {
      "out_channels": 2,
      "channels": 128,
      "c_mults": [1, 2, 4, 8, 16],
      "strides": [2, 4, 4, 4, 8],
      "latent_dim": 64,
      "use_nearest_upsample": false,
      "use_snake": true,
      "final_tanh": false
    }
  },
  "latent_dim": 64,
  "downsampling_ratio": 1024,
  "io_channels": 2
}