| { |
| "_name_or_path": "", |
| "architectures": [ |
| "RQVAESIGLIPTransformer" |
| ], |
| "hidden_size": 1024, |
| "model_type": "rqvaesigliptransformer_model", |
| "rqtransformer": { |
| "architectures": [ |
| "RQTransformer" |
| ], |
| "block_size": [ |
| 16, |
| 16, |
| 4 |
| ], |
| "embed_dim": 2560, |
| "head": { |
| "block": { |
| "n_head": 40 |
| }, |
| "n_layer": 6 |
| }, |
| "input_embed_dim_1": 1024, |
| "input_embed_dim_2": 4096, |
| "model_type": "rqtransformer_model", |
| "torch_dtype": "float32", |
| "transformers_version": "4.36.2", |
| "vocab_size": 16384 |
| } |
| , |
| "rqvaesiglip": { |
| "architectures": [ |
| "RQVAESiglip" |
| ], |
| "bottleneck_type": "rq", |
| "checkpointing": true, |
| "ckpt_path": null, |
| "code_shape": [ |
| 16, |
| 16, |
| 4 |
| ], |
| "ddconfig": { |
| "attn_resolutions": [ |
| 16 |
| ], |
| "ch": 128, |
| "ch_mult": [ |
| 1, |
| 1, |
| 2, |
| 2, |
| 4 |
| ], |
| "double_z": false, |
| "dropout": 0.0, |
| "in_channels": 3, |
| "num_res_blocks": 2, |
| "out_ch": 3, |
| "resolution": 256, |
| "z_channels": 256 |
| }, |
| "decay": 0.99, |
| "embed_dim": 1024, |
| "hidden_size": 1024, |
| "ignore_keys": null, |
| "latent_loss_weight": 0.25, |
| "latent_shape": [ |
| 16, |
| 16, |
| 1024 |
| ], |
| "loss_type": "mse", |
| "model_type": "rqvaesiglip_model", |
| "n_embed": 16384, |
| "pretrained_model": "google/siglip-large-patch16-256", |
| "restart_unused_codes": true, |
| "shared_codebook": true, |
| "torch_dtype": "float32", |
| "transformers_version": "4.36.2" |
| } |
| , |
| "torch_dtype": "bfloat16", |
| "transformers_version": "4.36.2" |
| } |
|
|