freeze: true
# 24*24 (resolution) + 2 (<img> and <\img>); must agree with
# model_config.max_vision_token_length and dataset_config.image_size
max_vision_token_length: 578
params:
  embed_dim: 1024  # debug
  ckpt_path: vqgan.ckpt
  codebook_size: 512
  num_codebook: 2
  ddconfig:
    # only_auto_encoder: true
    encoder_name: openai-clip-vit-large-patch14-336
    select_layer: [2, 10, 18, 22]
    double_z: false
    z_channels: 1024
    resolution: 336
    in_channels: 3
    out_ch: 3
    ch: 128
    ch_mult: [1, 1, 2, 4, 8]  # num_down = len(ch_mult)-1
    num_res_blocks: 2
    attn_resolutions: [24]
    dropout: 0.0
    initial_resolution: 24
    num_attn_head: 8