{
  "_name_or_path": "",
  "architectures": [
    "RQVAESIGLIPTransformer"
  ],
  "hidden_size": 1024,
  "model_type": "rqvaesigliptransformer_model",
  "rqtransformer": {
    "architectures": [
      "RQTransformer"
    ],
    "block_size": [
      16,
      16,
      4
    ],
    "embed_dim": 2560,
    "head": {
      "block": {
        "n_head": 40
      },
      "n_layer": 6
    },
    "input_embed_dim_1": 1024,
    "input_embed_dim_2": 4096,
    "model_type": "rqtransformer_model",
    "torch_dtype": "float32",
    "transformers_version": "4.36.2",
    "vocab_size": 16384
  },
  "rqvaesiglip": {
    "architectures": [
      "RQVAESiglip"
    ],
    "bottleneck_type": "rq",
    "checkpointing": true,
    "ckpt_path": null,
    "code_shape": [
      16,
      16,
      4
    ],
    "ddconfig": {
      "attn_resolutions": [
        16
      ],
      "ch": 128,
      "ch_mult": [
        1,
        1,
        2,
        2,
        4
      ],
      "double_z": false,
      "dropout": 0.0,
      "in_channels": 3,
      "num_res_blocks": 2,
      "out_ch": 3,
      "resolution": 256,
      "z_channels": 256
    },
    "decay": 0.99,
    "embed_dim": 1024,
    "hidden_size": 1024,
    "ignore_keys": null,
    "latent_loss_weight": 0.25,
    "latent_shape": [
      16,
      16,
      1024
    ],
    "loss_type": "mse",
    "model_type": "rqvaesiglip_model",
    "n_embed": 16384,
    "pretrained_model": "google/siglip-large-patch16-256",
    "restart_unused_codes": true,
    "shared_codebook": true,
    "torch_dtype": "float32",
    "transformers_version": "4.36.2"
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.36.2"
}