# NOTE: "Spaces: Sleeping" page-scrape residue (Hugging Face Spaces header) removed — not part of this config.
model:
  arch: mini_gpt4_llama_v2
  freeze_vit: True
  freeze_qformer: True
  max_txt_len: 512
  low_resource: True
  image_size: 224
  end_sym: "</s>"
  llama_model: "Qwen/Qwen2.5-7B-Instruct"
  ckpt: "checkpoints/video_llama_checkpoint_last.pth"
  use_grad_checkpoint: True
  chat_template: True
  lora_r: 64
  lora_alpha: 16
  length: 45
  use_grad_checkpoint_llm: True
  max_context_len: 3072
  architectures: [
    "MiniGPT4_Video"
  ]
  device: "cuda"
  drop_path_rate: 0.1
  img_size: 224
  model_type: "minigpt4_video"
  num_query_token: 32
  prompt: ""
  torch_dtype: "float16"
  transformers_version: "4.37.2"
  vit_precision: "fp16"
  vit_model: "eva_clip_g"
  token_pooling: true
  lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj"]
  lora_dropout: 0.05
  remove_template: false
  prompt_path: ""
  minigpt4_gpu_id: 0
  whisper_gpu_id: 0
  answer_module_gpu_id: 0
  gradient_accumulation_steps: 1
  warmup_steps: 100
  save_steps: 1000
  logging_steps: 50
  eval_steps: 500
  max_new_tokens: 512
  temperature: 0.7
  top_p: 0.9
  do_sample: true
  num_beams: 1

datasets:
  video_chatgpt: # 99378 rows - 13224 videos
    batch_size: 2
    num_workers: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 224
        mean: [0.48145466, 0.4578275, 0.40821073]
        std: [0.26862954, 0.26130258, 0.27577711]
      eval:
        name: "blip2_image_eval"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
        max_words: 512
      eval:
        name: "blip_caption"
        max_words: 512
    sample_ratio: 100

run:
  seed: 42
  amp: true
  distributed: false
  gpu_id: 0
  world_size: 1
  rank: 0
  dataloader_num_workers: 2
  pin_memory: true
  persistent_workers: true
  prefetch_factor: 2
  clip_grad_norm: 1.0
  weight_decay: 0.01
  adam_epsilon: 1e-8
  adam_beta1: 0.9
  adam_beta2: 0.999

inference:
  batch_size: 1
  max_frames: 45
  frame_interval: 2
  subtitle_max_len: 400
  enable_subtitles: true
  whisper_model: "base"
  response_format: "detailed"
  include_timestamps: false