{
  "tr": 1.49,
  "num_subjects": 4,
  "num_rois": 1000,
  "modality_fusion_transformer_num_layers": 1,
  "modality_fusion_transformer_num_heads": 4,
  "modality_fusion_transformer_dim": 1024,
  "modality_fusion_transformer_fuse_mode": "concat",
  "modality_fusion_transformer_num_projection_layers": 1,
  "predictor_transformer_num_heads": 8,
  "predictor_transformer_num_layers": 3,
  "text_extractor": "Qwen/Qwen2.5-14B",
  "text_extractor_feature_size": 5120,
  "text_extractor_dtype": "float16",
  "text_extractor_num_last_hidden_states": 4,
  "video_extractor": "facebook/vjepa2-vitg-fpc64-256",
  "video_extractor_feature_size": 1408,
  "video_extractor_pool_size": 2,
  "video_extractor_dtype": "bfloat16",
  "video_extractor_num_last_hidden_states": 3,
  "video_extractor_chunk_length_seconds": 16,
  "video_extractor_batch_size": 8,
  "audio_extractor_last_layer_index": 2,
  "audio_extractor_batch_size": 32,
  "audio_extractor_feature_size": 1536,
  "architectures": [
    "VIBE"
  ],
  "model_type": "vibe"
}