{
  "auto_map": {
    "AutoConfig": "configuration_dualvitok.DualViTokConfig",
    "AutoModel": "modeling_dualvitok.DualViTok"
  },
  "architectures": [
    "DualViTok"
  ],
  "semantic_encoder": {
    "pretrained_semantic_encoder": "Emova-ollm/qwen2vit600m",
    "z_channels": 32,
    "num_blocks": 4,
    "out_layer": "linear",
    "embed_dim": 1280,
    "target_mlp": "norm"
  },
  "semantic_decoder": {
    "z_channels": 32,
    "num_blocks": 4,
    "embed_dim": 1280,
    "out_layer": "linear_norm",
    "out_channels": 3584
  },
  "semantic_quantizer_type": "simvq",
  "pixel_quantizer_type": "simvq",
  "semantic_quantizer_codebook_size": 32768,
  "pixel_quantizer_codebook_size": 98304,
  "attn_implementation": "eager",
  "pixel_encoder": {
    "codebook_size": 98304,
    "embed_dim": 32,
    "z_channels": 32,
    "double_z": false,
    "in_channels": 3,
    "out_channels": 3,
    "ch": 128,
    "ch_mult": [
      1,
      1,
      2,
      2,
      4
    ],
    "num_res_blocks": 2,
    "attn_resolutions": [
      4
    ],
    "dropout": 0.0,
    "use_dc_up_down_blocks": true
  },
  "pixel_decoder": {
    "codebook_size": 98304,
    "embed_dim": 64,
    "z_channels": 64,
    "double_z": false,
    "in_channels": 3,
    "out_channels": 3,
    "ch": 384,
    "ch_mult": [
      1,
      1,
      2,
      2,
      4
    ],
    "num_res_blocks": 2,
    "attn_resolutions": [
      4
    ],
    "dropout": 0.0,
    "use_dc_up_down_blocks": true
  },
  "torch_dtype": "float16",
  "transformers_version": "4.44.2"
}