{
  "auto_map": {
    "AutoConfig": "configuration_dualvitok.DualViTokConfig",
    "AutoModel": "modeling_dualvitok.DualViTok"
  },
  "architectures": [
    "DualViTok"
  ],
  "semantic_encoder": {
    "pretrained_semantic_encoder": "Emova-ollm/qwen2vit600m",
    "z_channels": 32,
    "num_blocks": 4,
    "out_layer": "linear",
    "embed_dim": 1280,
    "target_mlp": "norm"
  },
  "semantic_decoder": {
    "z_channels": 32,
    "num_blocks": 4,
    "embed_dim": 1280,
    "out_layer": "linear_norm",
    "out_channels": 3584
  },
  "semantic_quantizer_type": "simvq",
  "pixel_quantizer_type": "simvq",
  "semantic_quantizer_codebook_size": 32768,
  "pixel_quantizer_codebook_size": 98304,
  "attn_implementation": "eager",
  "pixel_encoder": {
    "codebook_size": 98304,
    "embed_dim": 32,
    "z_channels": 32,
    "double_z": false,
    "in_channels": 3,
    "out_channels": 3,
    "ch": 128,
    "ch_mult": [
      1,
      1,
      2,
      2,
      4
    ],
    "num_res_blocks": 2,
    "attn_resolutions": [
      4
    ],
    "dropout": 0.0,
    "use_dc_up_down_blocks": true
  },
  "pixel_decoder": {
    "codebook_size": 98304,
    "embed_dim": 64,
    "z_channels": 64,
    "double_z": false,
    "in_channels": 3,
    "out_channels": 3,
    "ch": 384,
    "ch_mult": [
      1,
      1,
      2,
      2,
      4
    ],
    "num_res_blocks": 2,
    "attn_resolutions": [
      4
    ],
    "dropout": 0.0,
    "use_dc_up_down_blocks": true
  },
  "torch_dtype": "float16",
  "transformers_version": "4.44.2"
}