RoadQAQ committed · verified
Commit 710b71f · 1 Parent(s): 31f579d

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .gitattributes +12 -0
  2. .gitignore +2 -0
  3. .ipynb_checkpoints/act_log-checkpoint.err +107 -0
  4. act_log.err +264 -0
  5. config/eval/activitynet_qa/.ipynb_checkpoints/llava_next_video_eval0_config-checkpoint.json +23 -0
  6. config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config-checkpoint.json +22 -0
  7. config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix1-checkpoint.json +23 -0
  8. config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix2-checkpoint.json +23 -0
  9. config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix3-checkpoint.json +23 -0
  10. config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix4-checkpoint.json +23 -0
  11. config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix5-checkpoint.json +23 -0
  12. config/eval/activitynet_qa/llava_next_video_eval0_config.json +23 -0
  13. config/eval/activitynet_qa/llava_next_video_eval1_config.json +23 -0
  14. config/eval/activitynet_qa/llava_next_video_eval2_config.json +23 -0
  15. config/eval/activitynet_qa/qwen2_vl_eval_config.json +22 -0
  16. config/eval/activitynet_qa/qwen2_vl_eval_config_prefix1.json +23 -0
  17. config/eval/activitynet_qa/qwen2_vl_eval_config_prefix2.json +23 -0
  18. config/eval/activitynet_qa/qwen2_vl_eval_config_prefix3.json +23 -0
  19. config/eval/activitynet_qa/qwen2_vl_eval_config_prefix4.json +23 -0
  20. config/eval/activitynet_qa/qwen2_vl_eval_config_prefix5.json +23 -0
  21. config/eval/activitynet_qa/test_config.json +23 -0
  22. config/eval/msrvtt_qa/.ipynb_checkpoints/llava_next_video_eval0_config-checkpoint.json +23 -0
  23. config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config-checkpoint.json +22 -0
  24. config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix1-checkpoint.json +23 -0
  25. config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix2-checkpoint.json +23 -0
  26. config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix3-checkpoint.json +23 -0
  27. config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix4-checkpoint.json +23 -0
  28. config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix5-checkpoint.json +23 -0
  29. config/eval/msrvtt_qa/llava_next_video_eval0_config.json +23 -0
  30. config/eval/msrvtt_qa/llava_next_video_eval1_config.json +23 -0
  31. config/eval/msrvtt_qa/llava_next_video_eval2_config.json +23 -0
  32. config/eval/msrvtt_qa/qwen2_vl_eval_config.json +22 -0
  33. config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix1.json +23 -0
  34. config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix2.json +23 -0
  35. config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix3.json +23 -0
  36. config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix4.json +23 -0
  37. config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix5.json +23 -0
  38. config/eval/msrvtt_qa/test_config.json +23 -0
  39. config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config-checkpoint.json +22 -0
  40. config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix1-checkpoint.json +23 -0
  41. config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix2-checkpoint.json +23 -0
  42. config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix3-checkpoint.json +23 -0
  43. config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix4-checkpoint.json +23 -0
  44. config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix5-checkpoint.json +23 -0
  45. config/eval/msvd_qa/llava_next_video_eval0_config.json +23 -0
  46. config/eval/msvd_qa/llava_next_video_eval1_config.json +23 -0
  47. config/eval/msvd_qa/llava_next_video_eval2_config.json +23 -0
  48. config/eval/msvd_qa/qwen2_vl_eval_config.json +22 -0
  49. config/eval/msvd_qa/qwen2_vl_eval_config_prefix1.json +23 -0
  50. config/eval/msvd_qa/qwen2_vl_eval_config_prefix2.json +23 -0
.gitattributes CHANGED
@@ -33,3 +33,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ result/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_7b_msrvtt_prefix2-checkpoint.json filter=lfs diff=lfs merge=lfs -text
+ result/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_7b_msrvtt_prefix3-checkpoint.json filter=lfs diff=lfs merge=lfs -text
+ result/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_7b_msrvtt_prefix4-checkpoint.json filter=lfs diff=lfs merge=lfs -text
+ result/eval/msrvtt_qa/llava_next_video_msrvtt_0.json filter=lfs diff=lfs merge=lfs -text
+ result/eval/msrvtt_qa/llava_next_video_msrvtt_1.json filter=lfs diff=lfs merge=lfs -text
+ result/eval/msrvtt_qa/llava_next_video_msrvtt_2.json filter=lfs diff=lfs merge=lfs -text
+ result/eval/msrvtt_qa/qwen2_vl_7b_msrvtt.json filter=lfs diff=lfs merge=lfs -text
+ result/eval/msrvtt_qa/qwen2_vl_7b_msrvtt_prefix1.json filter=lfs diff=lfs merge=lfs -text
+ result/eval/msrvtt_qa/qwen2_vl_7b_msrvtt_prefix2.json filter=lfs diff=lfs merge=lfs -text
+ result/eval/msrvtt_qa/qwen2_vl_7b_msrvtt_prefix3.json filter=lfs diff=lfs merge=lfs -text
+ result/eval/msrvtt_qa/qwen2_vl_7b_msrvtt_prefix4.json filter=lfs diff=lfs merge=lfs -text
+ result/eval/msrvtt_qa/qwen2_vl_7b_msrvtt_prefix5.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ **/__pycache__/
+ ./act_log.err
.ipynb_checkpoints/act_log-checkpoint.err ADDED
@@ -0,0 +1,107 @@
+ [h264 @ 0xb32da00] mmco: unref short failure
+ [h264 @ 0xb32da00] mmco: unref short failure
+ [h264 @ 0xa5b8040] mmco: unref short failure
+ [h264 @ 0xa5b8040] mmco: unref short failure
+ [h264 @ 0xb32da00] mmco: unref short failure
+ [h264 @ 0xb32da00] mmco: unref short failure
+ [h264 @ 0xa2c0a80] mmco: unref short failure
+ [h264 @ 0xa2c0a80] mmco: unref short failure
+ [h264 @ 0xa31e800] mmco: unref short failure
+ [h264 @ 0xa9dad00] mmco: unref short failure
+ [h264 @ 0xa9dad00] mmco: unref short failure
+ [h264 @ 0xa9dad00] mmco: unref short failure
+ [h264 @ 0xa9dad00] mmco: unref short failure
+ [h264 @ 0x92ff200] mmco: unref short failure
+ [h264 @ 0xa6f26c0] mmco: unref short failure
+ [h264 @ 0xa6f26c0] mmco: unref short failure
+ [h264 @ 0xa239f80] mmco: unref short failure
+ Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+ `Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
+
+ Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+
+ Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ [h264 @ 0x9e195c0] mmco: unref short failure
+
+ `Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
+ Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+ `Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
+
+
+
+
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+
+
+
+
+ [h264 @ 0x92ff200] mmco: unref short failure
+ [h264 @ 0x92ff200] mmco: unref short failure
+
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+ `Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
+
+
+ [h264 @ 0x9e195c0] mmco: unref short failure
+ [h264 @ 0x9e195c0] mmco: unref short failure
+ Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+ `Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
+
+ multiprocessing.pool.RemoteTraceback:
+ """
+ Traceback (most recent call last):
+   File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/multiprocessing/pool.py", line 125, in worker
+     result = (True, func(*args, **kwds))
+   File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/multiprocessing/pool.py", line 48, in mapstar
+     return list(map(*args))
+   File "/jizhicfs/hymiezhao/ml/video_llm_template/task/eval/qwen2_vl_qa_eval.py", line 205, in batch_run
+     llm_messages = qwen2_vl_answer(model, processor, batch_samples, device=rank)
+   File "/jizhicfs/hymiezhao/ml/video_llm_template/task/eval/qwen2_vl_qa_eval.py", line 164, in qwen2_vl_answer
+     generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False).cpu()
+   File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
+     return func(*args, **kwargs)
+   File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/site-packages/transformers/generation/utils.py", line 2057, in generate
+     result = self._sample(
+   File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/site-packages/transformers/generation/utils.py", line 3021, in _sample
+     outputs = self(**model_inputs, return_dict=True)
+   File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
+     return self._call_impl(*args, **kwargs)
+   File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
+     return forward_call(*args, **kwargs)
+   File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/site-packages/transformers/models/qwen2_vl/modeling_qwen2_vl.py", line 1719, in forward
+     logits = logits.float()
+ RuntimeError: r.nvmlDeviceGetNvLinkRemoteDeviceType_ INTERNAL ASSERT FAILED at "../c10/cuda/driver_api.cpp":27, please report a bug to PyTorch. Can't find nvmlDeviceGetNvLinkRemoteDeviceType: /lib64/libnvidia-ml.so.1: undefined symbol: nvmlDeviceGetNvLinkRemoteDeviceType
+ """
+
+ The above exception was the direct cause of the following exception:
+
+ Traceback (most recent call last):
+   File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+     return _run_code(code, main_globals, None,
+   File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/runpy.py", line 86, in _run_code
+     exec(code, run_globals)
+   File "/jizhicfs/hymiezhao/ml/video_llm_template/task/eval/qwen2_vl_qa_eval.py", line 253, in <module>
+     main(args)
+   File "/jizhicfs/hymiezhao/ml/video_llm_template/task/eval/qwen2_vl_qa_eval.py", line 240, in main
+     result_lists = pool.map(func, range(world_size))
+   File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/multiprocessing/pool.py", line 367, in map
+     return self._map_async(func, iterable, mapstar, chunksize).get()
+   File "/jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/multiprocessing/pool.py", line 774, in get
+     raise self._value
+ RuntimeError: r.nvmlDeviceGetNvLinkRemoteDeviceType_ INTERNAL ASSERT FAILED at "../c10/cuda/driver_api.cpp":27, please report a bug to PyTorch. Can't find nvmlDeviceGetNvLinkRemoteDeviceType: /lib64/libnvidia-ml.so.1: undefined symbol: nvmlDeviceGetNvLinkRemoteDeviceType
+ /jizhicfs/hymiezhao/miniconda3_lh/envs/ml_hf/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown
+   warnings.warn('resource_tracker: There appear to be %d '
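Note on the log above: it mixes FFmpeg h264 decoder noise ("mmco: unref short failure"), transformers setup warnings (Flash Attention 2.0 used on a CPU-initialized model and without an explicit torch dtype), and a fatal RuntimeError whose root cause is an NVML symbol missing from the host's /lib64/libnvidia-ml.so.1, i.e. a driver/environment problem that no application code can fix. The sketch below is not the repository's qwen2_vl_qa_eval.py; it only illustrates the load-and-generate pattern the traceback points at, with the dtype/device arguments that would silence the two Flash Attention warnings.

import torch
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Model path taken from the Qwen2-VL eval configs in this commit.
MODEL_PATH = "/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"

# An explicit dtype plus an explicit move to GPU addresses the
# "Flash Attention 2.0 without ... torch dtype" and
# "model not initialized on GPU" warnings recorded above.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).to("cuda:0")
processor = AutoProcessor.from_pretrained(MODEL_PATH)

def generate_answers(inputs: dict) -> torch.Tensor:
    # Mirrors the call at qwen2_vl_qa_eval.py:164 in the traceback above.
    with torch.no_grad():
        return model.generate(**inputs, max_new_tokens=100, do_sample=False).cpu()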
act_log.err ADDED
@@ -0,0 +1,264 @@
+ [h264 @ 0x9b59c40] mmco: unref short failure
+ [h264 @ 0x9b59c40] mmco: unref short failure
+ [h264 @ 0x86a1a40] mmco: unref short failure
+ [h264 @ 0x86a1a40] mmco: unref short failure
+ [h264 @ 0x9eeff00] mmco: unref short failure
+ [h264 @ 0x9eeff00] mmco: unref short failure
+ [h264 @ 0x9741000] mmco: unref short failure
+ [h264 @ 0x8a2a980] mmco: unref short failure
+ [h264 @ 0x8a2a980] mmco: unref short failure
+ [h264 @ 0x8241080] mmco: unref short failure
+ [h264 @ 0x8241080] mmco: unref short failure
+ [h264 @ 0x831f5c0] mmco: unref short failure
+ [h264 @ 0x9064ac0] mmco: unref short failure
+ [h264 @ 0x9064ac0] mmco: unref short failure
+ [h264 @ 0x83a9cc0] mmco: unref short failure
+ [h264 @ 0x83a9cc0] mmco: unref short failure
+ [h264 @ 0xed7b140] mmco: unref short failure
+ [h264 @ 0xed7b140] mmco: unref short failure
+ [h264 @ 0x8892600] mmco: unref short failure
+ [h264 @ 0x20ace480] mmco: unref short failure
+ [h264 @ 0x20ace480] mmco: unref short failure
+ [h264 @ 0x9aedd80] mmco: unref short failure
+ [h264 @ 0x15769000] mmco: unref short failure
+ [h264 @ 0x15769000] mmco: unref short failure
+ [h264 @ 0x9821380] mmco: unref short failure
+ [h264 @ 0x9821380] mmco: unref short failure
+ [h264 @ 0x111680c0] mmco: unref short failure
+ [h264 @ 0x83b4dc0] mmco: unref short failure
+ [h264 @ 0x83b4dc0] mmco: unref short failure
+ [h264 @ 0x9d5e500] mmco: unref short failure
+ [h264 @ 0x9d5e500] mmco: unref short failure
+ [h264 @ 0x12ff4840] mmco: unref short failure
+ [h264 @ 0x12ff4840] mmco: unref short failure
+ [h264 @ 0x976dfc0] mmco: unref short failure
+ [h264 @ 0xd4e1180] mmco: unref short failure
+ [h264 @ 0xd4e1180] mmco: unref short failure
+ [h264 @ 0x976ab80] mmco: unref short failure
+ [h264 @ 0x976ab80] mmco: unref short failure
+ [h264 @ 0x9d6b5c0] mmco: unref short failure
+ [h264 @ 0xfceee00] mmco: unref short failure
+ [h264 @ 0xfceee00] mmco: unref short failure
+ [h264 @ 0x838ff40] mmco: unref short failure
+ [h264 @ 0x838ff40] mmco: unref short failure
+ [h264 @ 0x1e7dba40] mmco: unref short failure
+ [h264 @ 0x1e7dba40] mmco: unref short failure
+ [h264 @ 0x953b3c0] mmco: unref short failure
+ [h264 @ 0x953b3c0] mmco: unref short failure
+ [h264 @ 0x9d5e500] mmco: unref short failure
+ [h264 @ 0x9d5e500] mmco: unref short failure
+ [h264 @ 0x11e6fcc0] mmco: unref short failure
+ [h264 @ 0x93d7480] mmco: unref short failure
+ [h264 @ 0x93d7480] mmco: unref short failure
+ [h264 @ 0x12ff4840] mmco: unref short failure
+ [h264 @ 0x12ff4840] mmco: unref short failure
+ [h264 @ 0xfceee00] mmco: unref short failure
+ [h264 @ 0xfceee00] mmco: unref short failure
+ [h264 @ 0xfceee00] mmco: unref short failure
+ [h264 @ 0x8bc3980] mmco: unref short failure
+ [h264 @ 0x8bc3980] mmco: unref short failure
+ [h264 @ 0x9c304c0] mmco: unref short failure
+ [h264 @ 0x9c304c0] mmco: unref short failure
+ [h264 @ 0xb6e6a40] mmco: unref short failure
+ [h264 @ 0x8b27a00] mmco: unref short failure
+ [h264 @ 0x8b27a00] mmco: unref short failure
+ [h264 @ 0x1b4bdc40] mmco: unref short failure
+ [h264 @ 0x1b4bdc40] mmco: unref short failure
+ [h264 @ 0x175507c0] mmco: unref short failure
+ [h264 @ 0x175507c0] mmco: unref short failure
+ [h264 @ 0x95b2cc0] mmco: unref short failure
+ [h264 @ 0x13094500] mmco: unref short failure
+ [h264 @ 0x97cf300] mmco: unref short failure
+ [h264 @ 0x97cf300] mmco: unref short failure
+ [h264 @ 0x8338900] mmco: unref short failure
+ [h264 @ 0x846fbc0] mmco: unref short failure
+ [h264 @ 0x846fbc0] mmco: unref short failure
+ [h264 @ 0x8654080] mmco: unref short failure
+ [h264 @ 0x15e09fc0] mmco: unref short failure
+ [h264 @ 0x15e09fc0] mmco: unref short failure
+ [h264 @ 0x8654080] mmco: unref short failure
+ [h264 @ 0x120ec300] mmco: unref short failure
+ [h264 @ 0x120ec300] mmco: unref short failure
+ [h264 @ 0x9518d00] mmco: unref short failure
+ [h264 @ 0x9068a00] mmco: unref short failure
+ [h264 @ 0x9068a00] mmco: unref short failure
+ [h264 @ 0x9068a00] mmco: unref short failure
+ [h264 @ 0x9068a00] mmco: unref short failure
+ [h264 @ 0x9068a00] mmco: unref short failure
+ [h264 @ 0x15769000] mmco: unref short failure
+ [h264 @ 0x84bffc0] mmco: unref short failure
+ [h264 @ 0x84bffc0] mmco: unref short failure
+ [h264 @ 0x15769000] mmco: unref short failure
+ [h264 @ 0x1aa11900] mmco: unref short failure
+ [h264 @ 0x1aa11900] mmco: unref short failure
+ [h264 @ 0x82bb480] mmco: unref short failure
+ [h264 @ 0x82bb480] mmco: unref short failure
+ [h264 @ 0x10d43940] mmco: unref short failure
+ [h264 @ 0x9973540] mmco: unref short failure
+ [h264 @ 0x9973540] mmco: unref short failure
+ [h264 @ 0xa2da800] mmco: unref short failure
+ [h264 @ 0xa2da800] mmco: unref short failure
+ [h264 @ 0x9973540] mmco: unref short failure
+ [h264 @ 0x9973540] mmco: unref short failure
+ [h264 @ 0x10d43940] mmco: unref short failure
+ [h264 @ 0x15e09fc0] mmco: unref short failure
+ [h264 @ 0x15e09fc0] mmco: unref short failure
+ [h264 @ 0x10d43940] mmco: unref short failure
+ [h264 @ 0x8bc3980] mmco: unref short failure
+ [h264 @ 0x8bc3980] mmco: unref short failure
+ [h264 @ 0x90b2bc0] mmco: unref short failure
+ [h264 @ 0x715d0d00] mmco: unref short failure
+ [h264 @ 0x28108040] mmco: unref short failure
+ [h264 @ 0x846fbc0] mmco: unref short failure
+ [h264 @ 0x846fbc0] mmco: unref short failure
+ [h264 @ 0x90b2bc0] mmco: unref short failure
+ [h264 @ 0x90b2bc0] mmco: unref short failure
+ [h264 @ 0x9973540] mmco: unref short failure
+ [h264 @ 0x9973540] mmco: unref short failure
+ [h264 @ 0x715d0d00] mmco: unref short failure
+ [h264 @ 0x715d0d00] mmco: unref short failure
+ [h264 @ 0x82bb480] mmco: unref short failure
+ [h264 @ 0x82bb480] mmco: unref short failure
+ [h264 @ 0x13c79580] mmco: unref short failure
+ [h264 @ 0x13c79580] mmco: unref short failure
+ [h264 @ 0x9518d00] mmco: unref short failure
+ [h264 @ 0x85513c0] mmco: unref short failure
+ [h264 @ 0x85513c0] mmco: unref short failure
+ [h264 @ 0x1029a380] mmco: unref short failure
+ [h264 @ 0x1029a380] mmco: unref short failure
+ [h264 @ 0x20e53000] mmco: unref short failure
+ [h264 @ 0x120ec300] mmco: unref short failure
+ [h264 @ 0x93eb780] mmco: unref short failure
+ [h264 @ 0x93eb780] mmco: unref short failure
+ [h264 @ 0xd3c0080] mmco: unref short failure
+ [h264 @ 0xd3c0080] mmco: unref short failure
+ [h264 @ 0x1029a380] mmco: unref short failure
+ [h264 @ 0x1b4bdc40] mmco: unref short failure
+ [h264 @ 0x1b4bdc40] mmco: unref short failure
+ [h264 @ 0x20e53000] mmco: unref short failure
+ [h264 @ 0x20e53000] mmco: unref short failure
+ [h264 @ 0x1aa11900] mmco: unref short failure
+ [h264 @ 0x1aa11900] mmco: unref short failure
+ [h264 @ 0x8b27a00] mmco: unref short failure
+ [h264 @ 0x8b27a00] mmco: unref short failure
+ [h264 @ 0x47b64100] mmco: unref short failure
+ [h264 @ 0x47b64100] mmco: unref short failure
+ [h264 @ 0x8b27a00] mmco: unref short failure
+ [h264 @ 0x8b27a00] mmco: unref short failure
+ [h264 @ 0xe857000] mmco: unref short failure
+ [h264 @ 0xe857000] mmco: unref short failure
+ [h264 @ 0x838ff40] mmco: unref short failure
+ [h264 @ 0x838ff40] mmco: unref short failure
+ [h264 @ 0x97b04c0] mmco: unref short failure
+ [h264 @ 0x66aeab00] mmco: unref short failure
+ [h264 @ 0x66aeab00] mmco: unref short failure
+ [h264 @ 0xf7cf380] mmco: unref short failure
+ [h264 @ 0xf7cf380] mmco: unref short failure
+ [h264 @ 0xf616140] mmco: unref short failure
+ [h264 @ 0xf616140] mmco: unref short failure
+ [h264 @ 0x41642140] mmco: unref short failure
+ [h264 @ 0x41642140] mmco: unref short failure
+ [h264 @ 0x8654080] mmco: unref short failure
+ [h264 @ 0x8654080] mmco: unref short failure
+ [h264 @ 0x8414080] mmco: unref short failure
+ [h264 @ 0xa2da800] mmco: unref short failure
+ [h264 @ 0x47b64100] mmco: unref short failure
+ [h264 @ 0x47b64100] mmco: unref short failure
+ [h264 @ 0x8b27a00] mmco: unref short failure
+ [h264 @ 0x8b27a00] mmco: unref short failure
+ [h264 @ 0x1aa11900] mmco: unref short failure
+ [h264 @ 0x1aa11900] mmco: unref short failure
+ [h264 @ 0x18542300] mmco: unref short failure
+ [h264 @ 0x18542300] mmco: unref short failure
+ [h264 @ 0x1cac37c0] mmco: unref short failure
+ [h264 @ 0x1cac37c0] mmco: unref short failure
+ [h264 @ 0x551b7980] mmco: unref short failure
+ [h264 @ 0x551b7980] mmco: unref short failure
+ [h264 @ 0x130a5d00] mmco: unref short failure
+ [h264 @ 0x130a5d00] mmco: unref short failure
+ [h264 @ 0x47b64100] mmco: unref short failure
+ [h264 @ 0x1029a380] mmco: unref short failure
+ [h264 @ 0x1029a380] mmco: unref short failure
+ [h264 @ 0x551b7980] mmco: unref short failure
+ [h264 @ 0x9518d00] mmco: unref short failure
+ [h264 @ 0x9518d00] mmco: unref short failure
+ [h264 @ 0x12ff4840] mmco: unref short failure
+ [h264 @ 0x12ff4840] mmco: unref short failure
+ [h264 @ 0x37079f80] mmco: unref short failure
+ [h264 @ 0x37079f80] mmco: unref short failure
+ [h264 @ 0x130a5d00] mmco: unref short failure
+ [h264 @ 0x37079f80] mmco: unref short failure
+ [h264 @ 0x37079f80] mmco: unref short failure
+ You are using a model of type llava_next to instantiate a model of type llava_next_video. This is not supported for all configurations of models and can yield errors.
+ [h264 @ 0x838ff40] mmco: unref short failure
+ [h264 @ 0xe857000] mmco: unref short failure
+ [h264 @ 0x13002b80] mmco: unref short failure
+ [h264 @ 0x13002b80] mmco: unref short failure
+ [h264 @ 0x13c79580] mmco: unref short failure
+ [h264 @ 0x13c79580] mmco: unref short failure
+
+ [h264 @ 0x1cac37c0] mmco: unref short failure
+ [h264 @ 0x1cac37c0] mmco: unref short failure
+ [h264 @ 0x9518d00] mmco: unref short failure
+ [h264 @ 0x9518d00] mmco: unref short failure
+ [h264 @ 0x21946e00] mmco: unref short failure
+ [h264 @ 0x21946e00] mmco: unref short failure
+ [h264 @ 0x869f680] mmco: unref short failure
+ [h264 @ 0x9518d00] mmco: unref short failure
+
+ [h264 @ 0x20e53000] mmco: unref short failure
+ [h264 @ 0x37f53c80] mmco: unref short failure
+ [h264 @ 0x37f53c80] mmco: unref short failure
+
+ [h264 @ 0x2bd85600] mmco: unref short failure
+ [h264 @ 0x175507c0] mmco: unref short failure
+ [h264 @ 0x175507c0] mmco: unref short failure
+ You are using a model of type llava_next to instantiate a model of type llava_next_video. This is not supported for all configurations of models and can yield errors.
+ You are using a model of type llava_next to instantiate a model of type llava_next_video. This is not supported for all configurations of models and can yield errors.
+ [h264 @ 0x37f53c80] mmco: unref short failure
+ You are using a model of type llava_next to instantiate a model of type llava_next_video. This is not supported for all configurations of models and can yield errors.
+ You are using a model of type llava_next to instantiate a model of type llava_next_video. This is not supported for all configurations of models and can yield errors.
+ [h264 @ 0x2034de00] mmco: unref short failure
+ You are using a model of type llava_next to instantiate a model of type llava_next_video. This is not supported for all configurations of models and can yield errors.
+
+
+
+
+
+
+ Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
+
+
+
+
+
+
+
+ Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
+ Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
+ Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
+ Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
+ Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
+ Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
+ Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+ Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
+ Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+ Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+ Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+ Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+ Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
+ Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+ Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+ Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+ Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+ Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+ Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+ Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+ Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+ Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
+ Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
+ Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
+ Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
+ Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
+ Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
+ Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
config/eval/activitynet_qa/.ipynb_checkpoints/llava_next_video_eval0_config-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name":"Activitynet_QA",
+         "q_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/home/user/students/ml/model/LLaVA-NeXT-Video-7B-DPO-hf"
+     },
+     "conv_mode":"llava_next_video_template",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"llava_next_video_activitynet_0",
+     "description":"simple inference using llava_next_video with llava_next_video_template in activitynet",
+     "batch_size":4
+ }
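Each eval config in this commit follows the same schema: a "dataset" block (question/answer JSON paths, video directory, accepted file extensions, number of sampled segments), a "model" block, and run-level keys such as "experiment_name" and "batch_size". The sketch below is a hedged illustration of how such a config could be consumed; load_eval_config is an illustrative name, not the repository's actual loader. Note that "bound" is stored as the string "False", not a JSON boolean.

import json

def load_eval_config(path: str) -> dict:
    # Illustrative loader for the config schema shown above.
    with open(path) as f:
        cfg = json.load(f)
    # "bound" is serialized as the string "True"/"False", so coerce it here.
    cfg["dataset"]["bound"] = cfg["dataset"]["bound"] == "True"
    return cfg

cfg = load_eval_config("config/eval/activitynet_qa/llava_next_video_eval0_config.json")
print(cfg["experiment_name"], cfg["dataset"]["num_segments"])
# llava_next_video_activitynet_0 8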
config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config-checkpoint.json ADDED
@@ -0,0 +1,22 @@
+ {
+     "dataset":{
+         "dataset_name": "Activitynet_QA",
+         "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+     },
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"qwen2_vl_7b_activitynet",
+     "description":"simple inference using qwen2 in activitynet",
+     "batch_size":4
+ }
config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix1-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name": "Activitynet_QA",
+         "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+     },
+     "prefix":"Watch this video carefully and answer the question: ",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"qwen2_vl_7b_activitynet_prefix1",
+     "description":"prefix1 inference using qwen2 in activitynet",
+     "batch_size":4
+ }
config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix2-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name": "Activitynet_QA",
+         "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+     },
+     "prefix":"You are an expert at video question answering. Please watch this video carefully and answer the question: ",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"qwen2_vl_7b_activitynet_prefix2",
+     "description":"prefix2 inference using qwen2 in activitynet",
+     "batch_size":4
+ }
config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix3-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name": "Activitynet_QA",
+         "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+     },
+     "prefix":"You are an expert at video question answering. Please watch this video carefully and pay close attention to the actions and states of the main characters. Then, answer the question: ",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"qwen2_vl_7b_activitynet_prefix3",
+     "description":"prefix3 inference using qwen2 in activitynet",
+     "batch_size":4
+ }
config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix4-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name": "Activitynet_QA",
+         "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+     },
+     "prefix":"You are an expert at video question answering. Please watch this video carefully, focus on the content described in the following question. Then, answer the question: ",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"qwen2_vl_7b_activitynet_prefix4",
+     "description":"prefix4 inference using qwen2 in activitynet",
+     "batch_size":4
+ }
config/eval/activitynet_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix5-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name": "Activitynet_QA",
+         "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+     },
+     "prefix":"You are an expert at video question answering. Please watch this video carefully, focus on the frames and information which is highly related to the following question. Then, answer the question: ",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"qwen2_vl_7b_activitynet_prefix5",
+     "description":"prefix5 inference using qwen2 in activitynet",
+     "batch_size":4
+ }
config/eval/activitynet_qa/llava_next_video_eval0_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name":"Activitynet_QA",
+         "q_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/home/user/students/ml/model/LLaVA-NeXT-Video-7B-DPO-hf"
+     },
+     "conv_mode":"llava_next_video_template",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"llava_next_video_activitynet_0",
+     "description":"simple inference using llava_next_video with llava_next_video_template in activitynet",
+     "batch_size":4
+ }
config/eval/activitynet_qa/llava_next_video_eval1_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name":"Activitynet_QA",
+         "q_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/home/user/students/ml/model/LLaVA-NeXT-Video-7B-DPO-hf"
+     },
+     "conv_mode":"llava_next_video_template_with_space_after_assistant",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"llava_next_video_activitynet_1",
+     "description":"blank after assistant",
+     "batch_size":4
+ }
config/eval/activitynet_qa/llava_next_video_eval2_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name":"Activitynet_QA",
+         "q_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/home/user/students/ml/model/LLaVA-NeXT-Video-7B-DPO-hf"
+     },
+     "conv_mode":"llava_next_video_template_with_space_after_user",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"llava_next_video_activitynet_2",
+     "description":"llava_next_video_template_with_space_after_user in activitynet_qa",
+     "batch_size":4
+ }
config/eval/activitynet_qa/qwen2_vl_eval_config.json ADDED
@@ -0,0 +1,22 @@
+ {
+     "dataset":{
+         "dataset_name": "Activitynet_QA",
+         "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+     },
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"qwen2_vl_7b_activitynet",
+     "description":"simple inference using qwen2 in activitynet",
+     "batch_size":4
+ }
config/eval/activitynet_qa/qwen2_vl_eval_config_prefix1.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name": "Activitynet_QA",
+         "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+     },
+     "prefix":"Watch this video carefully and answer the question: ",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"qwen2_vl_7b_activitynet_prefix1",
+     "description":"prefix1 inference using qwen2 in activitynet",
+     "batch_size":4
+ }
config/eval/activitynet_qa/qwen2_vl_eval_config_prefix2.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name": "Activitynet_QA",
+         "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+     },
+     "prefix":"You are an expert at video question answering. Please watch this video carefully and answer the question: ",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"qwen2_vl_7b_activitynet_prefix2",
+     "description":"prefix2 inference using qwen2 in activitynet",
+     "batch_size":4
+ }
config/eval/activitynet_qa/qwen2_vl_eval_config_prefix3.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name": "Activitynet_QA",
+         "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+     },
+     "prefix":"You are an expert at video question answering. Please watch this video carefully and pay close attention to the actions and states of the main characters. Then, answer the question: ",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"qwen2_vl_7b_activitynet_prefix3",
+     "description":"prefix3 inference using qwen2 in activitynet",
+     "batch_size":4
+ }
config/eval/activitynet_qa/qwen2_vl_eval_config_prefix4.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name": "Activitynet_QA",
+         "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+     },
+     "prefix":"You are an expert at video question answering. Please watch this video carefully, focus on the content described in the following question. Then, answer the question: ",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"qwen2_vl_7b_activitynet_prefix4",
+     "description":"prefix4 inference using qwen2 in activitynet",
+     "batch_size":4
+ }
config/eval/activitynet_qa/qwen2_vl_eval_config_prefix5.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name": "Activitynet_QA",
+         "q_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/jizhicfs/hymiezhao/datasets/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+     },
+     "prefix":"You are an expert at video question answering. Please watch this video carefully, focus on the frames and information which is highly related to the following question. Then, answer the question: ",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"qwen2_vl_7b_activitynet_prefix5",
+     "description":"prefix5 inference using qwen2 in activitynet",
+     "batch_size":4
+ }
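The five prefix configs above differ only in the "prefix" string prepended to each question, forming a small prompt ablation from a bare instruction (prefix1) to increasingly specific guidance (prefix5). The helper below is illustrative only, showing the plausible combination of prefix and question as plain string concatenation; the real logic lives in qwen2_vl_qa_eval.py, which this sketch does not reproduce.

def build_prompt(cfg: dict, question: str) -> str:
    # Hypothetical helper: prepend the config's optional "prefix" to the question.
    return cfg.get("prefix", "") + question

print(build_prompt(
    {"prefix": "Watch this video carefully and answer the question: "},
    "what is the person doing?",
))
# Watch this video carefully and answer the question: what is the person doing?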
config/eval/activitynet_qa/test_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name":"Activitynet_QA",
+         "q_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/test_a.json",
+         "video_path":"/home/user/students/ml/dataset/Activitynet_Zero_Shot_QA/all_test",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4",".mkv"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/home/user/students/ml/model/LLaVA-NeXT-Video-7B-DPO-hf"
+     },
+     "conv_mode":"llava_next_video_template",
+     "save_path":"./result/eval/activitynet_qa",
+     "experiment_name":"test",
+     "description":"simple inference using llava_next_video with llava_next_video_template in activitynet_qa",
+     "batch_size":4
+ }
config/eval/msrvtt_qa/.ipynb_checkpoints/llava_next_video_eval0_config-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "dataset":{
+         "dataset_name":"MSRVTT_QA",
+         "q_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_a.json",
+         "video_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/videos/all",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/model/llava-next-video-7b-DPO-hf"
+     },
+     "conv_mode":"llava_next_video_template",
+     "save_path":"./result/eval/msrvtt_qa",
+     "experiment_name":"llava_next_video_msrvtt_0",
+     "description":"simple inference using llava_next_video with llava_next_video_template in MSRVTT",
+     "batch_size":32
+ }
config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config-checkpoint.json ADDED
@@ -0,0 +1,22 @@
+ {
+     "dataset":{
+         "dataset_name": "MSRVTT_QA",
+         "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+         "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+         "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+         "data_type":"video",
+         "bound":"False",
+         "question_key":"question",
+         "answer_key":"answer",
+         "name_key":"video_name",
+         "video_postfix":[".mp4"],
+         "num_segments":8
+     },
+     "model":{
+         "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+     },
+     "save_path":"./result/eval/msrvtt_qa",
+     "experiment_name":"qwen2_vl_7b_msrvtt",
+     "description":"simple inference using qwen2 in msrvtt",
+     "batch_size":8
+ }
config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix1-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSRVTT_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix": "Watch this video carefully and answer the question: ",
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"qwen2_vl_7b_msrvtt_prefix1",
+ "description":"prefix1 inference using qwen2 in msrvtt",
+ "batch_size":8
+ }
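The five `qwen2_vl_eval_config_prefixN` variants (checkpoint copies here, tracked copies below) differ only in the `prefix` string, the experiment name, and the description. A plausible reading, assumed rather than verified against this repo's inference code, is that the prefix is prepended verbatim to each question before prompting the model; `apply_prefix` is a hypothetical helper:

```python
def apply_prefix(config, question):
    # "prefix" is optional: the baseline configs omit it, so fall
    # back to the empty string and pass the question through unchanged.
    return config.get("prefix", "") + question

cfg = {"prefix": "Watch this video carefully and answer the question: "}
print(apply_prefix(cfg, "what is the man doing?"))
# Watch this video carefully and answer the question: what is the man doing?
```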
config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix2-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSRVTT_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix": "You are an expert at video question answering. Please watch this video carefully and answer the question: ",
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"qwen2_vl_7b_msrvtt_prefix2",
+ "description":"prefix2 inference using qwen2 in msrvtt",
+ "batch_size":8
+ }
config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix3-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSRVTT_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix": "You are an expert at video question answering. Please watch this video carefully and pay close attention to the actions and states of the main characters. Then, answer the question: ",
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"qwen2_vl_7b_msrvtt_prefix3",
+ "description":"prefix3 inference using qwen2 in msrvtt",
+ "batch_size":8
+ }
config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix4-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSRVTT_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix": "You are an expert at video question answering. Please watch this video carefully, focus on the content described in the following question. Then, answer the question: ",
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"qwen2_vl_7b_msrvtt_prefix4",
+ "description":"prefix4 inference using qwen2 in msrvtt",
+ "batch_size":8
+ }
config/eval/msrvtt_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix5-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSRVTT_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix": "You are an expert at video question answering. Please watch this video carefully, focus on the frames and information which is highly related to the following question. Then, answer the question: ",
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"qwen2_vl_7b_msrvtt_prefix5",
+ "description":"prefix5 inference using qwen2 in msrvtt",
+ "batch_size":8
+ }
config/eval/msrvtt_qa/llava_next_video_eval0_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name":"MSRVTT_QA",
+ "q_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/model/llava-next-video-7b-DPO-hf"
+ },
+ "conv_mode":"llava_next_video_template",
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"llava_next_video_msrvtt_0",
+ "description":"simple inference using llava_next_video with llava_next_video_template in MSRVTT",
+ "batch_size":32
+ }
config/eval/msrvtt_qa/llava_next_video_eval1_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name":"MSRVTT_QA",
+ "q_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/model/llava-next-video-7b-DPO-hf"
+ },
+ "conv_mode":"llava_next_video_template_with_space_after_assistant",
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"llava_next_video_msrvtt_1",
+ "description":"blank after assistant",
+ "batch_size":32
+ }
config/eval/msrvtt_qa/llava_next_video_eval2_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name":"MSRVTT_QA",
+ "q_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/model/llava-next-video-7b-DPO-hf"
+ },
+ "conv_mode":"llava_next_video_template_with_space_after_user",
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"llava_next_video_msrvtt_2",
+ "description":"llava_next_video_template_with_space_after_user in msrvtt_qa",
+ "batch_size":32
+ }
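The three llava_next_video eval configs differ only in `conv_mode`; judging by the names and the "blank after assistant" description, the variants probe where a single space sits around the role tags in the chat template. A hypothetical illustration — the template text below is an assumption, not this repo's actual templates:

```python
# Hypothetical registry mapping the "conv_mode" strings above to
# prompt templates; only the whitespace around the role tags varies.
CONV_TEMPLATES = {
    "llava_next_video_template":
        "USER: <video>\n{question} ASSISTANT:",
    "llava_next_video_template_with_space_after_assistant":
        "USER: <video>\n{question} ASSISTANT: ",
    "llava_next_video_template_with_space_after_user":
        "USER:  <video>\n{question} ASSISTANT:",
}

def build_conversation(conv_mode, question):
    # Look up the template selected by the config's "conv_mode" key.
    return CONV_TEMPLATES[conv_mode].format(question=question)
```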
config/eval/msrvtt_qa/qwen2_vl_eval_config.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "dataset":{
+ "dataset_name": "MSRVTT_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"qwen2_vl_7b_msrvtt",
+ "description":"simple inference using qwen2 in msrvtt",
+ "batch_size":8
+ }
config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix1.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSRVTT_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix": "Watch this video carefully and answer the question: ",
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"qwen2_vl_7b_msrvtt_prefix1",
+ "description":"prefix1 inference using qwen2 in msrvtt",
+ "batch_size":8
+ }
config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix2.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSRVTT_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix": "You are an expert at video question answering. Please watch this video carefully and answer the question: ",
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"qwen2_vl_7b_msrvtt_prefix2",
+ "description":"prefix2 inference using qwen2 in msrvtt",
+ "batch_size":8
+ }
config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix3.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSRVTT_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix": "You are an expert at video question answering. Please watch this video carefully and pay close attention to the actions and states of the main characters. Then, answer the question: ",
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"qwen2_vl_7b_msrvtt_prefix3",
+ "description":"prefix3 inference using qwen2 in msrvtt",
+ "batch_size":8
+ }
config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix4.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSRVTT_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix": "You are an expert at video question answering. Please watch this video carefully, focus on the content described in the following question. Then, answer the question: ",
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"qwen2_vl_7b_msrvtt_prefix4",
+ "description":"prefix4 inference using qwen2 in msrvtt",
+ "batch_size":8
+ }
config/eval/msrvtt_qa/qwen2_vl_eval_config_prefix5.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSRVTT_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix": "You are an expert at video question answering. Please watch this video carefully, focus on the frames and information which is highly related to the following question. Then, answer the question: ",
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"qwen2_vl_7b_msrvtt_prefix5",
+ "description":"prefix5 inference using qwen2 in msrvtt",
+ "batch_size":8
+ }
config/eval/msrvtt_qa/test_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name":"MSRVTT_QA",
+ "q_json_path":"/home/user/students/ml/dataset/MSRVTT_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/home/user/students/ml/dataset/MSRVTT_Zero_Shot_QA/test_a.json",
+ "video_path":"/home/user/students/ml/dataset/MSRVTT_Zero_Shot_QA/videos/all",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".mp4"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/home/user/students/ml/model/LLaVA-NeXT-Video-7B-DPO-hf"
+ },
+ "conv_mode":"llava_next_video_template",
+ "save_path":"./result/eval/msrvtt_qa",
+ "experiment_name":"test",
+ "description":"simple inference using llava_next_video with llava_next_video_template in msrvtt_qa",
+ "batch_size":4
+ }
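Every config in this commit sets `"num_segments":8`, i.e. eight frames are sampled per video. A common convention for this kind of field — assumed here; the repo's actual video loader may differ — is uniform sampling of segment midpoints:

```python
import numpy as np

def sample_frame_indices(total_frames, num_segments=8):
    # Split the video into num_segments equal spans and take the
    # midpoint frame of each, so samples cover the full duration.
    edges = np.linspace(0, total_frames, num_segments + 1)
    return ((edges[:-1] + edges[1:]) / 2).astype(int)

print(sample_frame_indices(240))  # [ 15  45  75 105 135 165 195 225]
```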
config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config-checkpoint.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "dataset":{
+ "dataset_name": "MSVD_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".avi"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "save_path":"./result/eval/msvd_qa",
+ "experiment_name":"qwen2_vl_7b_msvd",
+ "description":"simple inference using qwen2 in msvd",
+ "batch_size":4
+ }
config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix1-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSVD_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".avi"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix":"Watch this video carefully and answer the question: ",
+ "save_path":"./result/eval/msvd_qa",
+ "experiment_name":"qwen2_vl_7b_msvd_prefix1",
+ "description":"prefix1 inference using qwen2 in msvd",
+ "batch_size":4
+ }
config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix2-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSVD_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".avi"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix":"You are an expert at video question answering. Please watch this video carefully and answer the question: ",
+ "save_path":"./result/eval/msvd_qa",
+ "experiment_name":"qwen2_vl_7b_msvd_prefix2",
+ "description":"prefix2 inference using qwen2 in msvd",
+ "batch_size":4
+ }
config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix3-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSVD_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".avi"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix":"You are an expert at video question answering. Please watch this video carefully and pay close attention to the actions and states of the main characters. Then, answer the question: ",
+ "save_path":"./result/eval/msvd_qa",
+ "experiment_name":"qwen2_vl_7b_msvd_prefix3",
+ "description":"prefix3 inference using qwen2 in msvd",
+ "batch_size":4
+ }
config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix4-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSVD_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".avi"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix":"You are an expert at video question answering. Please watch this video carefully, focus on the content described in the following question. Then, answer the question: ",
+ "save_path":"./result/eval/msvd_qa",
+ "experiment_name":"qwen2_vl_7b_msvd_prefix4",
+ "description":"prefix4 inference using qwen2 in msvd",
+ "batch_size":4
+ }
config/eval/msvd_qa/.ipynb_checkpoints/qwen2_vl_eval_config_prefix5-checkpoint.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSVD_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".avi"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix":"You are an expert at video question answering. Please watch this video carefully, focus on the frames and information which is highly related to the following question. Then, answer the question: ",
+ "save_path":"./result/eval/msvd_qa",
+ "experiment_name":"qwen2_vl_7b_msvd_prefix5",
+ "description":"prefix5 inference using qwen2 in msvd",
+ "batch_size":4
+ }
config/eval/msvd_qa/llava_next_video_eval0_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSVD_QA",
+ "q_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/test_a.json",
+ "video_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/videos",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".avi"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/model/llava-next-video-7b-DPO-hf"
+ },
+ "conv_mode":"llava_next_video_template",
+ "save_path":"./result/eval/msvd_qa",
+ "experiment_name":"llava_next_video_msvd_0",
+ "description":"simple inference using llava_next_video with llava_next_video_template in msvd",
+ "batch_size":32
+ }
config/eval/msvd_qa/llava_next_video_eval1_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSVD_QA",
+ "q_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/test_a.json",
+ "video_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/videos",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".avi"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/model/llava-next-video-7b-DPO-hf"
+ },
+ "conv_mode":"llava_next_video_template_with_space_after_assistant",
+ "save_path":"./result/eval/msvd_qa",
+ "experiment_name":"llava_next_video_msvd_1",
+ "description":"blank after assistant",
+ "batch_size":32
+ }
config/eval/msvd_qa/llava_next_video_eval2_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSVD_QA",
+ "q_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/test_a.json",
+ "video_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/dataset/MSVD_Zero_Shot_QA/videos",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".avi"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/mnt/hwfile/opendatalab/chenjingzhou/ml/model/llava-next-video-7b-DPO-hf"
+ },
+ "conv_mode":"llava_next_video_template_with_space_after_user",
+ "save_path":"./result/eval/msvd_qa",
+ "experiment_name":"llava_next_video_msvd_2",
+ "description":"llava_next_video_template_with_space_after_user in msvd",
+ "batch_size":32
+ }
config/eval/msvd_qa/qwen2_vl_eval_config.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "dataset":{
+ "dataset_name": "MSVD_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".avi"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "save_path":"./result/eval/msvd_qa",
+ "experiment_name":"qwen2_vl_7b_msvd",
+ "description":"simple inference using qwen2 in msvd",
+ "batch_size":4
+ }
config/eval/msvd_qa/qwen2_vl_eval_config_prefix1.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSVD_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".avi"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix":"Watch this video carefully and answer the question: ",
+ "save_path":"./result/eval/msvd_qa",
+ "experiment_name":"qwen2_vl_7b_msvd_prefix1",
+ "description":"prefix1 inference using qwen2 in msvd",
+ "batch_size":4
+ }
config/eval/msvd_qa/qwen2_vl_eval_config_prefix2.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "dataset":{
+ "dataset_name": "MSVD_QA",
+ "q_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_q.json",
+ "a_json_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/test_a.json",
+ "video_path":"/jizhicfs/hymiezhao/datasets/MSVD_Zero_Shot_QA/videos",
+ "data_type":"video",
+ "bound":"False",
+ "question_key":"question",
+ "answer_key":"answer",
+ "name_key":"video_name",
+ "video_postfix":[".avi"],
+ "num_segments":8
+ },
+ "model":{
+ "model_path":"/jizhicfs/hymiezhao/models/Qwen2-VL-7B-Instruct"
+ },
+ "prefix":"You are an expert at video question answering. Please watch this video carefully and answer the question: ",
+ "save_path":"./result/eval/msvd_qa",
+ "experiment_name":"qwen2_vl_7b_msvd_prefix2",
+ "description":"prefix2 inference using qwen2 in msvd",
+ "batch_size":4
+ }