tea-base / config.json
zhang
Upload folder using huggingface_hub
1491531 verified
{
"_name_or_path": "/data/zhangyan/data/pretrained_model/videot5-mod/t5-instruct-qformer-vision",
"am_adapter_channels": 384,
"am_adapter_kernel_size_l": 3,
"am_adapter_kernel_size_t": 3,
"am_adapter_mask_zero": false,
"am_adapter_scope": "all",
"am_bbox_scale": 100,
"am_ocr_multi_gran": true,
"am_space_attn": "bros-level",
"am_time_mlp_freq": 1,
"am_time_mlp_pretrained": false,
"am_time_type": "adapter-conv",
"am_type": "divided-space-time",
"architectures": [
"VideoQformerT5"
],
"classifier_dropout": 0.0,
"d_ff": 3072,
"d_kv": 64,
"d_model": 768,
"decoder_start_token_id": 0,
"dense_act_fn": "relu",
"dropout_rate": 0.1,
"encoder_hidden_states_scope": "all",
"eos_token_id": 1,
"feed_forward_proj": "relu",
"initializer_factor": 1.0,
"initializer_range": 0.02,
"is_encoder_decoder": true,
"is_gated_act": false,
"key_frame_module_batch_attn": true,
"keyframe_module_model_name": "/data/zhangyan/data/pretrained_model/clip-vit-base-patch32",
"layer_norm_epsilon": 1e-06,
"max_length": 65,
"model_type": "multimodalt5",
"n_positions": 512,
"num_decoder_layers": 12,
"num_frames": 10,
"num_heads": 12,
"num_images": null,
"num_key_frames": 2,
"num_layers": 12,
"obj_len": 60,
"ocr_len": 250,
"output_past": true,
"pad_token_id": 0,
"qformer_module_model_name": "/data/zhangyan/data/pretrained_model/Salesforce/instructblip-vicuna-7b",
"qm_decoder_input": "all",
"qm_global_query_init": true,
"qm_interact_type": "self-cross",
"qm_loss": "none",
"qm_loss_scale": 0.5,
"qm_num_layers": 12,
"qm_num_query_tokens": 32,
"qm_query_decoder_qkv": "kv",
"qm_scope": "ocr",
"qm_use_ques_guide": true,
"ques_len": 45,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"torch_dtype": "float32",
"transformers_version": "4.42.3",
"use_bbox": true,
"use_cache": true,
"use_feature": false,
"use_frame_type": true,
"use_frcn": false,
"use_key_frame_module": false,
"use_qformer_module": true,
"video_feature_dim": 1024,
"vocab_size": 32128
}