Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
- .gitattributes +12 -0
- .gitignore +2 -0
- .ipynb_checkpoints/act_log-checkpoint.err +107 -0
- act_log.err +264 -0
- config/eval/activitynet_qa/.ipynb_checkpoints/llava_next_video_eval0_config-checkpoint.json +23 -0
- config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config-checkpoint.json +22 -0
- config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix1-checkpoint.json +23 -0
- config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix2-checkpoint.json +23 -0
- config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix3-checkpoint.json +23 -0
- config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix4-checkpoint.json +23 -0
- config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix5-checkpoint.json +23 -0
- config/eval/activitynet_qa/llava_next_video_eval0_config.json +23 -0
- config/eval/activitynet_qa/llava_next_video_eval1_config.json +23 -0
- config/eval/activitynet_qa/llava_next_video_eval2_config.json +23 -0
- config/eval/activitynet_qa/qwen2_vl_eval_config.json +22 -0
- config/eval/activitynet_qa/qwen2_vl_eval_config_prefix1.json +23 -0
- config/eval/activitynet_qa/qwen2_vl_eval_config_prefix2.json +23 -0
- config/eval/activitynet_qa/qwen2_vl_eval_config_prefix3.json +23 -0
- config/eval/activitynet_qa/qwen2_vl_eval_config_prefix4.json +23 -0
- config/eval/activitynet_qa/qwen2_vl_eval_config_prefix5.json +23 -0
- config/eval/activitynet_qa/test_config.json +23 -0
- config/eval/msrvtt_qa/.ipynb_checkpoints/llava_next_video_eval0_config-checkpoint.json +23 -0
- config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config-checkpoint.json +22 -0
- config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix1-checkpoint.json +23 -0
- config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix2-checkpoint.json +23 -0
- config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix3-checkpoint.json +23 -0
- config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix4-checkpoint.json +23 -0
- config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix5-checkpoint.json +23 -0
- config/eval/msrvtt_qa/llava_next_video_eval0_config.json +23 -0
- config/eval/msrvtt_qa/llava_next_video_eval1_config.json +23 -0
- config/eval/msrvtt_qa/llava_next_video_eval2_config.json +23 -0
- config/eval/msrvtt_qa/qwen2_vl_eval_config.json +22 -0
- config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix1.json +23 -0
- config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix2.json +23 -0
- config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix3.json +23 -0
- config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix4.json +23 -0
- config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix5.json +23 -0
- config/eval/msrvtt_qa/test_config.json +23 -0
- config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config-checkpoint.json +22 -0
- config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix1-checkpoint.json +23 -0
- config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix2-checkpoint.json +23 -0
- config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix3-checkpoint.json +23 -0
- config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix4-checkpoint.json +23 -0
- config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix5-checkpoint.json +23 -0
- config/eval/msvd_qa/llava_next_video_eval0_config.json +23 -0
- config/eval/msvd_qa/llava_next_video_eval1_config.json +23 -0
- config/eval/msvd_qa/llava_next_video_eval2_config.json +23 -0
- config/eval/msvd_qa/qwen2_vl_eval_config.json +22 -0
- config/eval/msvd_qa/qwen2_vl_eval_config_prefix1.json +23 -0
- config/eval/msvd_qa/qwen2_vl_eval_config_prefix2.json +23 -0
.gitattributes CHANGED
@@ -33,3 +33,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+result/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_7b_msrvtt_prefix2-checkpoint.json filter=lfs diff=lfs merge=lfs -text
+result/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_7b_msrvtt_prefix3-checkpoint.json filter=lfs diff=lfs merge=lfs -text
+result/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_7b_msrvtt_prefix4-checkpoint.json filter=lfs diff=lfs merge=lfs -text
+result/eval/msrvtt_qa/llava_next_video_msrvtt_0.json filter=lfs diff=lfs merge=lfs -text
+result/eval/msrvtt_qa/llava_next_video_msrvtt_1.json filter=lfs diff=lfs merge=lfs -text
+result/eval/msrvtt_qa/llava_next_video_msrvtt_2.json filter=lfs diff=lfs merge=lfs -text
+result/eval/msrvtt_qa/qwen2_vl_7b_msrvtt.json filter=lfs diff=lfs merge=lfs -text
+result/eval/msrvtt_qa/qwen2_vl_7b_msrvtt_prefix1.json filter=lfs diff=lfs merge=lfs -text
+result/eval/msrvtt_qa/qwen2_vl_7b_msrvtt_prefix2.json filter=lfs diff=lfs merge=lfs -text
+result/eval/msrvtt_qa/qwen2_vl_7b_msrvtt_prefix3.json filter=lfs diff=lfs merge=lfs -text
+result/eval/msrvtt_qa/qwen2_vl_7b_msrvtt_prefix4.json filter=lfs diff=lfs merge=lfs -text
+result/eval/msrvtt_qa/qwen2_vl_7b_msrvtt_prefix5.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+**/__pycache__/
+./act_log.err
.ipynb_checkpoints/act_log-checkpoint.err ADDED
@@ -0,0 +1,107 @@
+[h264 @ 0xb32da00] mmco: unref short failure
+[h264 @ 0xb32da00] mmco: unref short failure
+[h264 @ 0xa5b8040] mmco: unref short failure
+[h264 @ 0xa5b8040] mmco: unref short failure
+[h264 @ 0xb32da00] mmco: unref short failure
+[h264 @ 0xb32da00] mmco: unref short failure
+[h264 @ 0xa2c0a80] mmco: unref short failure
+[h264 @ 0xa2c0a80] mmco: unref short failure
+[h264 @ 0xa31e800] mmco: unref short failure
+[h264 @ 0xa9dad00] mmco: unref short failure
+[h264 @ 0xa9dad00] mmco: unref short failure
+[h264 @ 0xa9dad00] mmco: unref short failure
+[h264 @ 0xa9dad00] mmco: unref short failure
+[h264 @ 0x92ff200] mmco: unref short failure
+[h264 @ 0xa6f26c0] mmco: unref short failure
+[h264 @ 0xa6f26c0] mmco: unref short failure
+[h264 @ 0xa239f80] mmco: unref short failure
+Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
+
+Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+
+Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+[h264 @ 0x9e195c0] mmco: unref short failure
+
+`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
+Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
+
+
+
+
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+
+
+
+
+[h264 @ 0x92ff200] mmco: unref short failure
+[h264 @ 0x92ff200] mmco: unref short failure
+
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
+
+
+[h264 @ 0x9e195c0] mmco: unref short failure
+[h264 @ 0x9e195c0] mmco: unref short failure
+Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
+
+multiprocessing.pool.RemoteTraceback:
+"""
+Traceback (most recent call last):
+  File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/multiprocessing/pool.py", line 125, in worker
+    result = (True, func(*args, **kwds))
+  File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/multiprocessing/pool.py", line 48, in mapstar
+    return list(map(*args))
+  File "/jizhicfs/hymiezhao/ml/video_llm_template/task/eval/qwen2_vl_qa_eval.py", line 205, in batch_run
+    llm_messages = qwen2_vl_answer(model, processor, batch_samples, device=rank)
+  File "/jizhicfs/hymiezhao/ml/video_llm_template/task/eval/qwen2_vl_qa_eval.py", line 164, in qwen2_vl_answer
+    generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False).cpu()
+  File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
+    return func(*args, **kwargs)
+  File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/site-packages/transformers/generation/utils.py", line 2057, in generate
+    result = self._sample(
+  File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/site-packages/transformers/generation/utils.py", line 3021, in _sample
+    outputs = self(**model_inputs, return_dict=True)
+  File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/site-packages/transformers/models/qwen2_vl/modeling_qwen2_vl.py", line 1719, in forward
+    logits = logits.float()
+RuntimeError: r.nvmlDeviceGetNvLinkRemoteDeviceType_ INTERNAL ASSERT FAILED at "../c10/cuda/driver_api.cpp":27, please report a bug to PyTorch. Can't find nvmlDeviceGetNvLinkRemoteDeviceType: /lib64/libnvidia-ml.so.1: undefined symbol: nvmlDeviceGetNvLinkRemoteDeviceType
+"""
+
+The above exception was the direct cause of the following exception:
+
+Traceback (most recent call last):
+  File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/jizhicfs/hymiezhao/ml/video_llm_template/task/eval/qwen2_vl_qa_eval.py", line 253, in <module>
+    main(args)
+  File "/jizhicfs/hymiezhao/ml/video_llm_template/task/eval/qwen2_vl_qa_eval.py", line 240, in main
+    result_lists = pool.map(func, range(world_size))
+  File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/multiprocessing/pool.py", line 367, in map
+    return self._map_async(func, iterable, mapstar, chunksize).get()
+  File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/multiprocessing/pool.py", line 774, in get
+    raise self._value
+RuntimeError: r.nvmlDeviceGetNvLinkRemoteDeviceType_ INTERNAL ASSERT FAILED at "../c10/cuda/driver_api.cpp":27, please report a bug to PyTorch. Can't find nvmlDeviceGetNvLinkRemoteDeviceType: /lib64/libnvidia-ml.so.1: undefined symbol: nvmlDeviceGetNvLinkRemoteDeviceType
+/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown
+  warnings.warn('resource_tracker: There appear to be %d '
act_log.err ADDED
@@ -0,0 +1,264 @@
+[h264 @ 0x9b59c40] mmco: unref short failure
+[h264 @ 0x9b59c40] mmco: unref short failure
+[h264 @ 0x86a1a40] mmco: unref short failure
+[h264 @ 0x86a1a40] mmco: unref short failure
+[h264 @ 0x9eeff00] mmco: unref short failure
+[h264 @ 0x9eeff00] mmco: unref short failure
+[h264 @ 0x9741000] mmco: unref short failure
+[h264 @ 0x8a2a980] mmco: unref short failure
+[h264 @ 0x8a2a980] mmco: unref short failure
+[h264 @ 0x8241080] mmco: unref short failure
+[h264 @ 0x8241080] mmco: unref short failure
+[h264 @ 0x831f5c0] mmco: unref short failure
+[h264 @ 0x9064ac0] mmco: unref short failure
+[h264 @ 0x9064ac0] mmco: unref short failure
+[h264 @ 0x83a9cc0] mmco: unref short failure
+[h264 @ 0x83a9cc0] mmco: unref short failure
+[h264 @ 0xed7b140] mmco: unref short failure
+[h264 @ 0xed7b140] mmco: unref short failure
+[h264 @ 0x8892600] mmco: unref short failure
+[h264 @ 0x20ace480] mmco: unref short failure
+[h264 @ 0x20ace480] mmco: unref short failure
+[h264 @ 0x9aedd80] mmco: unref short failure
+[h264 @ 0x15769000] mmco: unref short failure
+[h264 @ 0x15769000] mmco: unref short failure
+[h264 @ 0x9821380] mmco: unref short failure
+[h264 @ 0x9821380] mmco: unref short failure
+[h264 @ 0x111680c0] mmco: unref short failure
+[h264 @ 0x83b4dc0] mmco: unref short failure
+[h264 @ 0x83b4dc0] mmco: unref short failure
+[h264 @ 0x9d5e500] mmco: unref short failure
+[h264 @ 0x9d5e500] mmco: unref short failure
+[h264 @ 0x12ff4840] mmco: unref short failure
+[h264 @ 0x12ff4840] mmco: unref short failure
+[h264 @ 0x976dfc0] mmco: unref short failure
+[h264 @ 0xd4e1180] mmco: unref short failure
+[h264 @ 0xd4e1180] mmco: unref short failure
+[h264 @ 0x976ab80] mmco: unref short failure
+[h264 @ 0x976ab80] mmco: unref short failure
+[h264 @ 0x9d6b5c0] mmco: unref short failure
+[h264 @ 0xfceee00] mmco: unref short failure
+[h264 @ 0xfceee00] mmco: unref short failure
+[h264 @ 0x838ff40] mmco: unref short failure
+[h264 @ 0x838ff40] mmco: unref short failure
+[h264 @ 0x1e7dba40] mmco: unref short failure
+[h264 @ 0x1e7dba40] mmco: unref short failure
+[h264 @ 0x953b3c0] mmco: unref short failure
+[h264 @ 0x953b3c0] mmco: unref short failure
+[h264 @ 0x9d5e500] mmco: unref short failure
+[h264 @ 0x9d5e500] mmco: unref short failure
+[h264 @ 0x11e6fcc0] mmco: unref short failure
+[h264 @ 0x93d7480] mmco: unref short failure
+[h264 @ 0x93d7480] mmco: unref short failure
+[h264 @ 0x12ff4840] mmco: unref short failure
+[h264 @ 0x12ff4840] mmco: unref short failure
+[h264 @ 0xfceee00] mmco: unref short failure
+[h264 @ 0xfceee00] mmco: unref short failure
+[h264 @ 0xfceee00] mmco: unref short failure
+[h264 @ 0x8bc3980] mmco: unref short failure
+[h264 @ 0x8bc3980] mmco: unref short failure
+[h264 @ 0x9c304c0] mmco: unref short failure
+[h264 @ 0x9c304c0] mmco: unref short failure
+[h264 @ 0xb6e6a40] mmco: unref short failure
+[h264 @ 0x8b27a00] mmco: unref short failure
+[h264 @ 0x8b27a00] mmco: unref short failure
+[h264 @ 0x1b4bdc40] mmco: unref short failure
+[h264 @ 0x1b4bdc40] mmco: unref short failure
+[h264 @ 0x175507c0] mmco: unref short failure
+[h264 @ 0x175507c0] mmco: unref short failure
+[h264 @ 0x95b2cc0] mmco: unref short failure
+[h264 @ 0x13094500] mmco: unref short failure
+[h264 @ 0x97cf300] mmco: unref short failure
+[h264 @ 0x97cf300] mmco: unref short failure
+[h264 @ 0x8338900] mmco: unref short failure
+[h264 @ 0x846fbc0] mmco: unref short failure
+[h264 @ 0x846fbc0] mmco: unref short failure
+[h264 @ 0x8654080] mmco: unref short failure
+[h264 @ 0x15e09fc0] mmco: unref short failure
+[h264 @ 0x15e09fc0] mmco: unref short failure
+[h264 @ 0x8654080] mmco: unref short failure
+[h264 @ 0x120ec300] mmco: unref short failure
+[h264 @ 0x120ec300] mmco: unref short failure
+[h264 @ 0x9518d00] mmco: unref short failure
+[h264 @ 0x9068a00] mmco: unref short failure
+[h264 @ 0x9068a00] mmco: unref short failure
+[h264 @ 0x9068a00] mmco: unref short failure
+[h264 @ 0x9068a00] mmco: unref short failure
+[h264 @ 0x9068a00] mmco: unref short failure
+[h264 @ 0x15769000] mmco: unref short failure
+[h264 @ 0x84bffc0] mmco: unref short failure
+[h264 @ 0x84bffc0] mmco: unref short failure
+[h264 @ 0x15769000] mmco: unref short failure
+[h264 @ 0x1aa11900] mmco: unref short failure
+[h264 @ 0x1aa11900] mmco: unref short failure
+[h264 @ 0x82bb480] mmco: unref short failure
+[h264 @ 0x82bb480] mmco: unref short failure
+[h264 @ 0x10d43940] mmco: unref short failure
+[h264 @ 0x9973540] mmco: unref short failure
+[h264 @ 0x9973540] mmco: unref short failure
+[h264 @ 0xa2da800] mmco: unref short failure
+[h264 @ 0xa2da800] mmco: unref short failure
+[h264 @ 0x9973540] mmco: unref short failure
+[h264 @ 0x9973540] mmco: unref short failure
+[h264 @ 0x10d43940] mmco: unref short failure
+[h264 @ 0x15e09fc0] mmco: unref short failure
+[h264 @ 0x15e09fc0] mmco: unref short failure
+[h264 @ 0x10d43940] mmco: unref short failure
+[h264 @ 0x8bc3980] mmco: unref short failure
+[h264 @ 0x8bc3980] mmco: unref short failure
+[h264 @ 0x90b2bc0] mmco: unref short failure
+[h264 @ 0x715d0d00] mmco: unref short failure
+[h264 @ 0x28108040] mmco: unref short failure
+[h264 @ 0x846fbc0] mmco: unref short failure
+[h264 @ 0x846fbc0] mmco: unref short failure
+[h264 @ 0x90b2bc0] mmco: unref short failure
+[h264 @ 0x90b2bc0] mmco: unref short failure
+[h264 @ 0x9973540] mmco: unref short failure
+[h264 @ 0x9973540] mmco: unref short failure
+[h264 @ 0x715d0d00] mmco: unref short failure
+[h264 @ 0x715d0d00] mmco: unref short failure
+[h264 @ 0x82bb480] mmco: unref short failure
+[h264 @ 0x82bb480] mmco: unref short failure
+[h264 @ 0x13c79580] mmco: unref short failure
+[h264 @ 0x13c79580] mmco: unref short failure
+[h264 @ 0x9518d00] mmco: unref short failure
+[h264 @ 0x85513c0] mmco: unref short failure
+[h264 @ 0x85513c0] mmco: unref short failure
+[h264 @ 0x1029a380] mmco: unref short failure
+[h264 @ 0x1029a380] mmco: unref short failure
+[h264 @ 0x20e53000] mmco: unref short failure
+[h264 @ 0x120ec300] mmco: unref short failure
+[h264 @ 0x93eb780] mmco: unref short failure
+[h264 @ 0x93eb780] mmco: unref short failure
+[h264 @ 0xd3c0080] mmco: unref short failure
+[h264 @ 0xd3c0080] mmco: unref short failure
+[h264 @ 0x1029a380] mmco: unref short failure
+[h264 @ 0x1b4bdc40] mmco: unref short failure
+[h264 @ 0x1b4bdc40] mmco: unref short failure
+[h264 @ 0x20e53000] mmco: unref short failure
+[h264 @ 0x20e53000] mmco: unref short failure
+[h264 @ 0x1aa11900] mmco: unref short failure
+[h264 @ 0x1aa11900] mmco: unref short failure
+[h264 @ 0x8b27a00] mmco: unref short failure
+[h264 @ 0x8b27a00] mmco: unref short failure
+[h264 @ 0x47b64100] mmco: unref short failure
+[h264 @ 0x47b64100] mmco: unref short failure
+[h264 @ 0x8b27a00] mmco: unref short failure
+[h264 @ 0x8b27a00] mmco: unref short failure
+[h264 @ 0xe857000] mmco: unref short failure
+[h264 @ 0xe857000] mmco: unref short failure
+[h264 @ 0x838ff40] mmco: unref short failure
+[h264 @ 0x838ff40] mmco: unref short failure
+[h264 @ 0x97b04c0] mmco: unref short failure
+[h264 @ 0x66aeab00] mmco: unref short failure
+[h264 @ 0x66aeab00] mmco: unref short failure
+[h264 @ 0xf7cf380] mmco: unref short failure
+[h264 @ 0xf7cf380] mmco: unref short failure
+[h264 @ 0xf616140] mmco: unref short failure
+[h264 @ 0xf616140] mmco: unref short failure
+[h264 @ 0x41642140] mmco: unref short failure
+[h264 @ 0x41642140] mmco: unref short failure
+[h264 @ 0x8654080] mmco: unref short failure
+[h264 @ 0x8654080] mmco: unref short failure
+[h264 @ 0x8414080] mmco: unref short failure
+[h264 @ 0xa2da800] mmco: unref short failure
+[h264 @ 0x47b64100] mmco: unref short failure
+[h264 @ 0x47b64100] mmco: unref short failure
+[h264 @ 0x8b27a00] mmco: unref short failure
+[h264 @ 0x8b27a00] mmco: unref short failure
+[h264 @ 0x1aa11900] mmco: unref short failure
+[h264 @ 0x1aa11900] mmco: unref short failure
+[h264 @ 0x18542300] mmco: unref short failure
+[h264 @ 0x18542300] mmco: unref short failure
+[h264 @ 0x1cac37c0] mmco: unref short failure
+[h264 @ 0x1cac37c0] mmco: unref short failure
+[h264 @ 0x551b7980] mmco: unref short failure
+[h264 @ 0x551b7980] mmco: unref short failure
+[h264 @ 0x130a5d00] mmco: unref short failure
+[h264 @ 0x130a5d00] mmco: unref short failure
+[h264 @ 0x47b64100] mmco: unref short failure
+[h264 @ 0x1029a380] mmco: unref short failure
+[h264 @ 0x1029a380] mmco: unref short failure
+[h264 @ 0x551b7980] mmco: unref short failure
+[h264 @ 0x9518d00] mmco: unref short failure
+[h264 @ 0x9518d00] mmco: unref short failure
+[h264 @ 0x12ff4840] mmco: unref short failure
+[h264 @ 0x12ff4840] mmco: unref short failure
+[h264 @ 0x37079f80] mmco: unref short failure
+[h264 @ 0x37079f80] mmco: unref short failure
+[h264 @ 0x130a5d00] mmco: unref short failure
+[h264 @ 0x37079f80] mmco: unref short failure
+[h264 @ 0x37079f80] mmco: unref short failure
+You are using a model of type llava_next to instantiate a model of type llava_next_video. This is not supported for all configurations of models and can yield errors.
+[h264 @ 0x838ff40] mmco: unref short failure
+[h264 @ 0xe857000] mmco: unref short failure
+[h264 @ 0x13002b80] mmco: unref short failure
+[h264 @ 0x13002b80] mmco: unref short failure
+[h264 @ 0x13c79580] mmco: unref short failure
+[h264 @ 0x13c79580] mmco: unref short failure
+
+[h264 @ 0x1cac37c0] mmco: unref short failure
+[h264 @ 0x1cac37c0] mmco: unref short failure
+[h264 @ 0x9518d00] mmco: unref short failure
+[h264 @ 0x9518d00] mmco: unref short failure
+[h264 @ 0x21946e00] mmco: unref short failure
+[h264 @ 0x21946e00] mmco: unref short failure
+[h264 @ 0x869f680] mmco: unref short failure
+[h264 @ 0x9518d00] mmco: unref short failure
+
+[h264 @ 0x20e53000] mmco: unref short failure
+[h264 @ 0x37f53c80] mmco: unref short failure
+[h264 @ 0x37f53c80] mmco: unref short failure
+
+[h264 @ 0x2bd85600] mmco: unref short failure
+[h264 @ 0x175507c0] mmco: unref short failure
+[h264 @ 0x175507c0] mmco: unref short failure
+You are using a model of type llava_next to instantiate a model of type llava_next_video. This is not supported for all configurations of models and can yield errors.
+You are using a model of type llava_next to instantiate a model of type llava_next_video. This is not supported for all configurations of models and can yield errors.
+[h264 @ 0x37f53c80] mmco: unref short failure
+You are using a model of type llava_next to instantiate a model of type llava_next_video. This is not supported for all configurations of models and can yield errors.
+You are using a model of type llava_next to instantiate a model of type llava_next_video. This is not supported for all configurations of models and can yield errors.
+[h264 @ 0x2034de00] mmco: unref short failure
+You are using a model of type llava_next to instantiate a model of type llava_next_video. This is not supported for all configurations of models and can yield errors.
+
+
+
+
+
+
+Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
+
+
+
+
+
+
+
+Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
+Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
+Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
+Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
+Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
+Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
+Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
+Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
+Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
+Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
+Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
+Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
+Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
+Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
config/eval/activitynet_qa/.ipynb_checkpoints/llava_next_video_eval0_config-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name":"Activitynet_QA",
+        "q_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/home/user/students/ml/model/LLaVA-NeXT-Video-7B-DPO-hf"
+    },
+    "conv_mode":"llava_next_video_template",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"llava_next_video_activitynet_0",
+    "description":"simple inference using llava_next_video with llava_next_video_template in activitynet",
+    "batch_size":4
+}
config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config-checkpoint.json ADDED
@@ -0,0 +1,22 @@
+{
+    "dataset":{
+        "dataset_name": "Activitynet_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"qwen2_vl_7b_activitynet",
+    "description":"simple inference using qwen2 in activitynet",
+    "batch_size":4
+}
config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix1-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "Activitynet_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"Watch this video carefully and answer the question: ",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"qwen2_vl_7b_activitynet_prefix1",
+    "description":"prefix1 inference using qwen2 in activitynet",
+    "batch_size":4
+}
config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix2-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "Activitynet_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"You are an expert at video question answering. Please watch this video carefully and answer the question: ",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"qwen2_vl_7b_activitynet_prefix2",
+    "description":"prefix2 inference using qwen2 in activitynet",
+    "batch_size":4
+}
config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix3-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "Activitynet_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"You are an expert at video question answering. Please watch this video carefully and pay close attention to the actions and states of the main characters. Then, answer the question: ",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"qwen2_vl_7b_activitynet_prefix3",
+    "description":"prefix3 inference using qwen2 in activitynet",
+    "batch_size":4
+}
config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix4-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "Activitynet_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"You are an expert at video question answering. Please watch this video carefully, focus on the content described in the following question. Then, answer the question: ",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"qwen2_vl_7b_activitynet_prefix4",
+    "description":"prefix4 inference using qwen2 in activitynet",
+    "batch_size":4
+}
config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix5-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "Activitynet_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"You are an expert at video question answering. Please watch this video carefully, focus on the frames and information which is highly related to the following question. Then, answer the question: ",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"qwen2_vl_7b_activitynet_prefix5",
+    "description":"prefix5 inference using qwen2 in activitynet",
+    "batch_size":4
+}
config/eval/activitynet_qa/llava_next_video_eval0_config.json ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name":"Activitynet_QA",
+        "q_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/home/user/students/ml/model/LLaVA-NeXT-Video-7B-DPO-hf"
+    },
+    "conv_mode":"llava_next_video_template",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"llava_next_video_activitynet_0",
+    "description":"simple inference using llava_next_video with llava_next_video_template in activitynet",
+    "batch_size":4
+}
config/eval/activitynet_qa/llava_next_video_eval1_config.json ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name":"Activitynet_QA",
+        "q_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/home/user/students/ml/model/LLaVA-NeXT-Video-7B-DPO-hf"
+    },
+    "conv_mode":"llava_next_video_template_with_space_after_assistant",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"llava_next_video_activitynet_1",
+    "description":"blank after assistant",
+    "batch_size":4
+}
config/eval/activitynet_qa/llava_next_video_eval2_config.json ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name":"Activitynet_QA",
+        "q_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/home/user/students/ml/model/LLaVA-NeXT-Video-7B-DPO-hf"
+    },
+    "conv_mode":"llava_next_video_template_with_space_after_user",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"llava_next_video_activitynet_2",
+    "description":"llava_next_video_template_with_space_after_user in activitynet_qa",
+    "batch_size":4
+}
config/eval/activitynet_qa/qwen2_vl_eval_config.json ADDED
@@ -0,0 +1,22 @@
+{
+    "dataset":{
+        "dataset_name": "Activitynet_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"qwen2_vl_7b_activitynet",
+    "description":"simple inference using qwen2 in activitynet",
+    "batch_size":4
+}
config/eval/activitynet_qa/qwen2_vl_eval_config_prefix1.json ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "Activitynet_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"Watch this video carefully and answer the question: ",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"qwen2_vl_7b_activitynet_prefix1",
+    "description":"prefix1 inference using qwen2 in activitynet",
+    "batch_size":4
+}
config/eval/activitynet_qa/qwen2_vl_eval_config_prefix2.json ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "Activitynet_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"You are an expert at video question answering. Please watch this video carefully and answer the question: ",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"qwen2_vl_7b_activitynet_prefix2",
+    "description":"prefix2 inference using qwen2 in activitynet",
+    "batch_size":4
+}
config/eval/activitynet_qa/qwen2_vl_eval_config_prefix3.json ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "Activitynet_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"You are an expert at video question answering. Please watch this video carefully and pay close attention to the actions and states of the main characters. Then, answer the question: ",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"qwen2_vl_7b_activitynet_prefix3",
+    "description":"prefix3 inference using qwen2 in activitynet",
+    "batch_size":4
+}
config/eval/activitynet_qa/qwen2_vl_eval_config_prefix4.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "Activitynet_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"You are an expert at video question answering. Please watch this video carefully, focus on the content described in the following question. Then, answer the question: ",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"qwen2_vl_7b_activitynet_prefix4",
+    "description":"prefix4 inference using qwen2 in activitynet",
+    "batch_size":4
+}

config/eval/activitynet_qa/qwen2_vl_eval_config_prefix5.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "Activitynet_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"You are an expert at video question answering. Please watch this video carefully, focus on the frames and information which is highly related to the following question. Then, answer the question: ",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"qwen2_vl_7b_activitynet_prefix5",
+    "description":"prefix5 inference using qwen2 in activitynet",
+    "batch_size":4
+}

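The five files above form a prompt ablation over the same dataset and model; only "prefix", "experiment_name", and "description" change between them. A sketch (hypothetical, assuming the configs are read from the repo root) that enumerates the ablation:

```python
import json
from pathlib import Path

# Sketch only: list the prompt ablation, i.e. the prefix each of the
# five variant configs would prepend to every question.
cfg_dir = Path("config/eval/activitynet_qa")
for i in range(1, 6):
    cfg = json.loads((cfg_dir / f"qwen2_vl_eval_config_prefix{i}.json").read_text())
    print(cfg["experiment_name"], "->", repr(cfg["prefix"]))
```
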
config/eval/activitynet_qa/test_config.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name":"Activitynet_QA",
+        "q_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_a.json",
+        "video_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/all_test",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4",".mkv"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/home/user/students/ml/model/LLaVA-NeXT-Video-7B-DPO-hf"
+    },
+    "conv_mode":"llava_next_video_template",
+    "save_path":"./result/eval/activitynet_qa",
+    "experiment_name":"test",
+    "description":"simple inference using llava_next_video with llava_next_video_template in activitynet_qa",
+    "batch_size":4
+}

config/eval/msrvtt_qa/.ipynb_checkpoints/llava_next_video_eval0_config-checkpoint.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name":"MSRVTT_QA",
+        "q_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/model/llava-next-video-7b-DPO-hf"
+    },
+    "conv_mode":"llava_next_video_template",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"llava_next_video_msrvtt_0",
+    "description":"simple inference using llava_next_video with llava_next_video_template in MSRVTT",
+    "batch_size":32
+}

config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config-checkpoint.json
ADDED
@@ -0,0 +1,22 @@
+{
+    "dataset":{
+        "dataset_name": "MSRVTT_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"qwen2_vl_7b_msrvtt",
+    "description":"simple inference using qwen2 in msrvtt",
+    "batch_size":8
+}

config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix1-checkpoint.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSRVTT_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix": "Watch this video carefully and answer the question: ",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"qwen2_vl_7b_msrvtt_prefix1",
+    "description":"prefix1 inference using qwen2 in msrvtt",
+    "batch_size":8
+}

config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix2-checkpoint.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSRVTT_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix": "You are an expert at video question answering. Please watch this video carefully and answer the question: ",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"qwen2_vl_7b_msrvtt_prefix2",
+    "description":"prefix2 inference using qwen2 in msrvtt",
+    "batch_size":8
+}

config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix3-checkpoint.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSRVTT_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix": "You are an expert at video question answering. Please watch this video carefully and pay close attention to the actions and states of the main characters. Then, answer the question: ",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"qwen2_vl_7b_msrvtt_prefix3",
+    "description":"prefix3 inference using qwen2 in msrvtt",
+    "batch_size":8
+}

config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix4-checkpoint.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSRVTT_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix": "You are an expert at video question answering. Please watch this video carefully, focus on the content described in the following question. Then, answer the question: ",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"qwen2_vl_7b_msrvtt_prefix4",
+    "description":"prefix4 inference using qwen2 in msrvtt",
+    "batch_size":8
+}

config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix5-checkpoint.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSRVTT_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix": "You are an expert at video question answering. Please watch this video carefully, focus on the frames and information which is highly related to the following question. Then, answer the question: ",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"qwen2_vl_7b_msrvtt_prefix5",
+    "description":"prefix5 inference using qwen2 in msrvtt",
+    "batch_size":8
+}

config/eval/msrvtt_qa/llava_next_video_eval0_config.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name":"MSRVTT_QA",
+        "q_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/model/llava-next-video-7b-DPO-hf"
+    },
+    "conv_mode":"llava_next_video_template",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"llava_next_video_msrvtt_0",
+    "description":"simple inference using llava_next_video with llava_next_video_template in MSRVTT",
+    "batch_size":32
+}

config/eval/msrvtt_qa/llava_next_video_eval1_config.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name":"MSRVTT_QA",
+        "q_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/model/llava-next-video-7b-DPO-hf"
+    },
+    "conv_mode":"llava_next_video_template_with_space_after_assistant",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"llava_next_video_msrvtt_1",
+    "description":"blank after assistant",
+    "batch_size":32
+}

config/eval/msrvtt_qa/llava_next_video_eval2_config.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name":"MSRVTT_QA",
+        "q_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/model/llava-next-video-7b-DPO-hf"
+    },
+    "conv_mode":"llava_next_video_template_with_space_after_user",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"llava_next_video_msrvtt_2",
+    "description":"llava_next_video_template_with_space_after_user in msrvtt_qa",
+    "batch_size":32
+}

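The three llava_next_video configs differ only in "conv_mode", which selects the conversation template; judging by the names and the "blank after assistant" description, the variants likely differ only in where a space is inserted around the USER/ASSISTANT tags. This is an illustrative guess only; the real templates live in the repo's conversation code:

```python
# Illustrative guess at what the three conv_mode values select; the
# actual template strings are defined elsewhere in the repo.
TEMPLATES = {
    "llava_next_video_template":
        "USER: <video>\n{question} ASSISTANT:",
    # trailing space after the ASSISTANT tag (guess)
    "llava_next_video_template_with_space_after_assistant":
        "USER: <video>\n{question} ASSISTANT: ",
    # extra space after the user message (guess)
    "llava_next_video_template_with_space_after_user":
        "USER: <video>\n{question}  ASSISTANT:",
}
print(TEMPLATES["llava_next_video_template"].format(question="what happens first?"))
```
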
config/eval/msrvtt_qa/qwen2_vl_eval_config.json
ADDED
@@ -0,0 +1,22 @@
+{
+    "dataset":{
+        "dataset_name": "MSRVTT_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"qwen2_vl_7b_msrvtt",
+    "description":"simple inference using qwen2 in msrvtt",
+    "batch_size":8
+}

config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix1.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSRVTT_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix": "Watch this video carefully and answer the question: ",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"qwen2_vl_7b_msrvtt_prefix1",
+    "description":"prefix1 inference using qwen2 in msrvtt",
+    "batch_size":8
+}

config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix2.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSRVTT_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix": "You are an expert at video question answering. Please watch this video carefully and answer the question: ",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"qwen2_vl_7b_msrvtt_prefix2",
+    "description":"prefix2 inference using qwen2 in msrvtt",
+    "batch_size":8
+}

config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix3.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSRVTT_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix": "You are an expert at video question answering. Please watch this video carefully and pay close attention to the actions and states of the main characters. Then, answer the question: ",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"qwen2_vl_7b_msrvtt_prefix3",
+    "description":"prefix3 inference using qwen2 in msrvtt",
+    "batch_size":8
+}

config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix4.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSRVTT_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix": "You are an expert at video question answering. Please watch this video carefully, focus on the content described in the following question. Then, answer the question: ",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"qwen2_vl_7b_msrvtt_prefix4",
+    "description":"prefix4 inference using qwen2 in msrvtt",
+    "batch_size":8
+}

config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix5.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSRVTT_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix": "You are an expert at video question answering. Please watch this video carefully, focus on the frames and information which is highly related to the following question. Then, answer the question: ",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"qwen2_vl_7b_msrvtt_prefix5",
+    "description":"prefix5 inference using qwen2 in msrvtt",
+    "batch_size":8
+}

config/eval/msrvtt_qa/test_config.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name":"MSRVTT_QA",
+        "q_json_path":"/home/user/students/ml/dataset/MSRVTT_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/home/user/students/ml/dataset/MSRVTT_Zero_Shot_QA/test_a.json",
+        "video_path":"/home/user/students/ml/dataset/MSRVTT_Zero_Shot_QA/videos/all",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".mp4"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/home/user/students/ml/model/LLaVA-NeXT-Video-7B-DPO-hf"
+    },
+    "conv_mode":"llava_next_video_template",
+    "save_path":"./result/eval/msrvtt_qa",
+    "experiment_name":"test",
+    "description":"simple inference using llava_next_video with llava_next_video_template in msrvtt_qa",
+    "batch_size":4
+}

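"video_postfix" varies per dataset (ActivityNet allows .mp4 and .mkv, MSRVTT .mp4, MSVD .avi), so a loader presumably resolves each "name_key" value by trying the listed extensions in order. A sketch of that resolution, with an assumed helper name:

```python
import os

# Sketch, with an assumed helper name: resolve a dataset video file by
# trying each extension in the config's "video_postfix" list, in order.
def resolve_video(video_dir: str, video_name: str, postfixes: list[str]) -> str:
    for ext in postfixes:
        candidate = os.path.join(video_dir, video_name + ext)
        if os.path.exists(candidate):
            return candidate
    raise FileNotFoundError(f"{video_name} with any of {postfixes} in {video_dir}")
```
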
config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config-checkpoint.json
ADDED
@@ -0,0 +1,22 @@
+{
+    "dataset":{
+        "dataset_name": "MSVD_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".avi"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "save_path":"./result/eval/msvd_qa",
+    "experiment_name":"qwen2_vl_7b_msvd",
+    "description":"simple inference using qwen2 in msvd",
+    "batch_size":4
+}

config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix1-checkpoint.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSVD_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".avi"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"Watch this video carefully and answer the question: ",
+    "save_path":"./result/eval/msvd_qa",
+    "experiment_name":"qwen2_vl_7b_msvd_prefix1",
+    "description":"prefix1 inference using qwen2 in msvd",
+    "batch_size":4
+}

config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix2-checkpoint.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSVD_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".avi"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"You are an expert at video question answering. Please watch this video carefully and answer the question: ",
+    "save_path":"./result/eval/msvd_qa",
+    "experiment_name":"qwen2_vl_7b_msvd_prefix2",
+    "description":"prefix2 inference using qwen2 in msvd",
+    "batch_size":4
+}

config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix3-checkpoint.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSVD_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".avi"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"You are an expert at video question answering. Please watch this video carefully and pay close attention to the actions and states of the main characters. Then, answer the question: ",
+    "save_path":"./result/eval/msvd_qa",
+    "experiment_name":"qwen2_vl_7b_msvd_prefix3",
+    "description":"prefix3 inference using qwen2 in msvd",
+    "batch_size":4
+}

config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix4-checkpoint.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSVD_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".avi"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"You are an expert at video question answering. Please watch this video carefully, focus on the content described in the following question. Then, answer the question: ",
+    "save_path":"./result/eval/msvd_qa",
+    "experiment_name":"qwen2_vl_7b_msvd_prefix4",
+    "description":"prefix4 inference using qwen2 in msvd",
+    "batch_size":4
+}

config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix5-checkpoint.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSVD_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".avi"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"You are an expert at video question answering. Please watch this video carefully, focus on the frames and information which is highly related to the following question. Then, answer the question: ",
+    "save_path":"./result/eval/msvd_qa",
+    "experiment_name":"qwen2_vl_7b_msvd_prefix5",
+    "description":"prefix5 inference using qwen2 in msvd",
+    "batch_size":4
+}

config/eval/msvd_qa/llava_next_video_eval0_config.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSVD_QA",
+        "q_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/test_a.json",
+        "video_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/videos",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".avi"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/model/llava-next-video-7b-DPO-hf"
+    },
+    "conv_mode":"llava_next_video_template",
+    "save_path":"./result/eval/msvd_qa",
+    "experiment_name":"llava_next_video_msvd_0",
+    "description":"simple inference using llava_next_video with llava_next_video_template in msvd",
+    "batch_size":32
+}

config/eval/msvd_qa/llava_next_video_eval1_config.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSVD_QA",
+        "q_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/test_a.json",
+        "video_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/videos",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".avi"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/model/llava-next-video-7b-DPO-hf"
+    },
+    "conv_mode":"llava_next_video_template_with_space_after_assistant",
+    "save_path":"./result/eval/msvd_qa",
+    "experiment_name":"llava_next_video_msvd_1",
+    "description":"blank after assistant",
+    "batch_size":32
+}

config/eval/msvd_qa/llava_next_video_eval2_config.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSVD_QA",
+        "q_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/test_a.json",
+        "video_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/videos",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".avi"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/model/llava-next-video-7b-DPO-hf"
+    },
+    "conv_mode":"llava_next_video_template_with_space_after_user",
+    "save_path":"./result/eval/msvd_qa",
+    "experiment_name":"llava_next_video_msvd_2",
+    "description":"llava_next_video_template_with_space_after_user in msvd",
+    "batch_size":32
+}

config/eval/msvd_qa/qwen2_vl_eval_config.json
ADDED
@@ -0,0 +1,22 @@
+{
+    "dataset":{
+        "dataset_name": "MSVD_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".avi"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "save_path":"./result/eval/msvd_qa",
+    "experiment_name":"qwen2_vl_7b_msvd",
+    "description":"simple inference using qwen2 in msvd",
+    "batch_size":4
+}

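All of these configs fix "num_segments":8, i.e. eight frames per clip. A sketch of the uniform sampling this plausibly implies (the repo's actual video decoder may differ):

```python
# Sketch of the uniform sampling "num_segments": 8 plausibly implies:
# take one index from the middle of each of 8 equal segments.
def sample_indices(total_frames: int, num_segments: int = 8) -> list[int]:
    seg = total_frames / num_segments
    return [int(seg * (i + 0.5)) for i in range(num_segments)]

print(sample_indices(240))  # [15, 45, 75, 105, 135, 165, 195, 225]
```
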
config/eval/msvd_qa/qwen2_vl_eval_config_prefix1.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSVD_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".avi"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"Watch this video carefully and answer the question: ",
+    "save_path":"./result/eval/msvd_qa",
+    "experiment_name":"qwen2_vl_7b_msvd_prefix1",
+    "description":"prefix1 inference using qwen2 in msvd",
+    "batch_size":4
+}

config/eval/msvd_qa/qwen2_vl_eval_config_prefix2.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "dataset":{
+        "dataset_name": "MSVD_QA",
+        "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+        "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+        "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+        "data_type":"video",
+        "bound":"False",
+        "question_key":"question",
+        "answer_key":"answer",
+        "name_key":"video_name",
+        "video_postfix":[".avi"],
+        "num_segments":8
+    },
+    "model":{
+        "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+    },
+    "prefix":"You are an expert at video question answering. Please watch this video carefully and answer the question: ",
+    "save_path":"./result/eval/msvd_qa",
+    "experiment_name":"qwen2_vl_7b_msvd_prefix2",
+    "description":"prefix2 inference using qwen2 in msvd",
+    "batch_size":4
+}