{
  "resolution_mode": "native",
  "init_method": "xavier",
  "pe_type": "rope2d",
  "rope_theta": 10000,
  "patch_size": 14,
  "temporal_patch_size": 2,
  "spatial_merge_size": 1,
  "num_hidden_layers": 32,
  "num_attention_heads": 16,
  "hidden_size": 1280,
  "intermediate_size": 5184,
  "patch_embedding_bias": true,
  "qk_normalization": true,
  "qkv_bias": false,
  "use_pre_norm": false,
  "use_flash_attn": true,
  "norm_type": "RMSNorm",
  "layer_norm_eps": 1e-06,
  "hidden_act": "SwiGLU",
  "patch_dropout": 0.5,
  "attention_dropout": 0.0,
  "drop_path_rate": 0.0,
  "initializer_range": 1e-10,
  "initializer_factor": 0.1,
  "image_size": 1792,
  "min_tokens": 576,
  "max_tokens": 5832,
  "min_tokens_video": 512,
  "max_tokens_video": 28672,
  "min_tokens_video_vit": 576,
  "max_tokens_video_vit": 156800,
  "max_frames": 128,
  "image_mean": [0.485, 0.456, 0.406],
  "image_std": [0.229, 0.224, 0.225],
  "resize_factor": 8,
  "relarge_ratio": 1.0,
  "adaptor_dim": 7168
}