{ "_name_or_path": "PLAN-Lab/CALICO", "architectures": [ "CALICOForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, "auto_initialize_adaptors": true, "auto_initialize_qformer": true, "bbox_token_idx": 32002, "bos_token_id": 1, "dino_hidden_size": 768, "dino_model_name": "dinov2_vitb14_reg", "eos_token_id": 2, "global_image_size": 224, "grounding_encoder": "sam_vit_h", "grounding_image_size": 1024, "hidden_act": "silu", "hidden_size": 4096, "image_aspect": "square", "image_aspect_ratio": "square", "initializer_range": 0.02, "intermediate_size": 11008, "layer_type": "linear", "max_position_embeddings": 4096, "mlp_bias": false, "mm_projector_type": "linear", "model_type": "calico", "num_attention_heads": 32, "num_attn_heads": 8, "num_hidden_layers": 32, "num_key_value_heads": 32, "num_level_reg_features": 4, "num_query_tokens": 32, "out_dim": 256, "pad_token_id": 0, "pretraining_tp": 1, "q_former_model": "", "qformer_hidden_size": 768, "qformer_vision_encoder": "eva_clip_g", "qformer_vision_width": 1408, "rms_norm_eps": 1e-05, "rope_scaling": null, "rope_theta": 10000.0, "seg_image_tokens": [ [ 32004, 313, 2382, 29896 ], [ 32004, 313, 2382, 29906 ] ], "seg_token_idx": 32004, "tie_word_embeddings": false, "torch_dtype": "bfloat16", "train_mask_decoder": true, "transformers_version": "4.42.3", "update_layers": [ 11, 22 ], "use_cache": false, "use_mm_proj": true, "vocab_size": 32007, "with_region": true }