DavidNguyen commited on
Commit
cf4b8a4
·
verified ·
1 Parent(s): 4c1e80c

c6958928eaa0728e8f86d477818d55e904c4c81d569b351f238be73418fef9ff

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/infovqa_val.json +3 -0
  3. sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/mmmu_pro_standard.json +0 -0
  4. sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/mmmu_pro_vision.json +0 -0
  5. sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank0_metric_eval_done.txt +1 -0
  6. sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank1_metric_eval_done.txt +1 -0
  7. sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank2_metric_eval_done.txt +1 -0
  8. sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank3_metric_eval_done.txt +1 -0
  9. sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/realworldqa.json +0 -0
  10. sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/results.json +245 -0
  11. sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json +0 -0
  12. sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json +0 -0
  13. sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/hallusion_bench_image.json +0 -0
  14. sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/mathvista_testmini.json +0 -0
  15. sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt +1 -0
  16. sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt +1 -0
  17. sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/results.json +146 -0
  18. sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json +0 -0
  19. sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json +0 -0
  20. sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json +0 -0
  21. sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/hallusion_bench_image.json +0 -0
  22. sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/mathvista_testmini.json +0 -0
  23. sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt +1 -0
  24. sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt +1 -0
  25. sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/results.json +146 -0
  26. sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json +0 -0
  27. sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json +0 -0
  28. sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json +0 -0
  29. sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/hallusion_bench_image.json +0 -0
  30. sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/mathvista_testmini.json +0 -0
  31. sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt +1 -0
  32. sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt +1 -0
  33. sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/results.json +146 -0
  34. sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json +0 -0
  35. sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json +82 -0
  36. sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt +1 -0
  37. sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/rank2_metric_eval_done.txt +1 -0
  38. sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/rank3_metric_eval_done.txt +1 -0
  39. sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json +0 -0
  40. sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json +0 -0
  41. sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/hallusion_bench_image.json +0 -0
  42. sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/mathvista_testmini.json +0 -0
  43. sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt +1 -0
  44. sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt +1 -0
  45. sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/results.json +146 -0
  46. sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json +0 -0
  47. sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/chartqa.json +0 -0
  48. sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/rank0_metric_eval_done.txt +1 -0
  49. sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/rank1_metric_eval_done.txt +1 -0
  50. sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/results.json +168 -0
.gitattributes CHANGED
@@ -197,3 +197,4 @@ sft/1M3/revise_Full_remoe/logs/0625_1719_llava..._pope_llava_model_args_179bff/m
197
  sft/1M3/revise_Full_remoe/logs/0625_1719_llava..._pope_llava_model_args_179bff/pope.json filter=lfs diff=lfs merge=lfs -text
198
  sft/1M3/revise_Full_remoe/logs/0625_1719_llava..._pope_llava_model_args_179bff/submissions/mmbench_en_dev_results.xlsx filter=lfs diff=lfs merge=lfs -text
199
  sft/1M3/revise_Full_remoe/logs/0625_1719_llava..._pope_llava_model_args_179bff/textvqa_val.json filter=lfs diff=lfs merge=lfs -text
 
 
197
  sft/1M3/revise_Full_remoe/logs/0625_1719_llava..._pope_llava_model_args_179bff/pope.json filter=lfs diff=lfs merge=lfs -text
198
  sft/1M3/revise_Full_remoe/logs/0625_1719_llava..._pope_llava_model_args_179bff/submissions/mmbench_en_dev_results.xlsx filter=lfs diff=lfs merge=lfs -text
199
  sft/1M3/revise_Full_remoe/logs/0625_1719_llava..._pope_llava_model_args_179bff/textvqa_val.json filter=lfs diff=lfs merge=lfs -text
200
+ sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/infovqa_val.json filter=lfs diff=lfs merge=lfs -text
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/infovqa_val.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:919df637cc0e9a4231bc4b409d9d6870bfa4cfc858793bca4b4348a654587d48
3
+ size 576443538
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/mmmu_pro_standard.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/mmmu_pro_vision.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank0_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 0 eval done
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank1_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 1 eval done
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank2_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 2 eval done
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank3_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 3 eval done
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/realworldqa.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/results.json ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "infovqa_val": {
4
+ "anls,none": 0.26990360585505174,
5
+ "anls_stderr,none": 0.00838910112249176,
6
+ "alias": "infovqa_val"
7
+ },
8
+ "mmmu_pro": {
9
+ "mmmu_acc,none": 0.18872999999999998,
10
+ "mmmu_acc_stderr,none": 0.026738864276958818,
11
+ "alias": "mmmu_pro"
12
+ },
13
+ "mmmu_pro_standard": {
14
+ "mmmu_acc,none": 0.2422,
15
+ "mmmu_acc_stderr,none": "N/A",
16
+ "alias": " - mmmu_pro_standard"
17
+ },
18
+ "mmmu_pro_vision": {
19
+ "mmmu_acc,none": 0.13526,
20
+ "mmmu_acc_stderr,none": "N/A",
21
+ "alias": " - mmmu_pro_vision"
22
+ },
23
+ "realworldqa": {
24
+ "exact_match,flexible-extract": 0.4418300653594771,
25
+ "exact_match_stderr,flexible-extract": 0.01796652860560496,
26
+ "alias": "realworldqa"
27
+ }
28
+ },
29
+ "groups": {
30
+ "mmmu_pro": {
31
+ "mmmu_acc,none": 0.18872999999999998,
32
+ "mmmu_acc_stderr,none": 0.026738864276958818,
33
+ "alias": "mmmu_pro"
34
+ }
35
+ },
36
+ "configs": {
37
+ "infovqa_val": {
38
+ "task": "infovqa_val",
39
+ "dataset_path": "lmms-lab/DocVQA",
40
+ "dataset_name": "InfographicVQA",
41
+ "dataset_kwargs": {
42
+ "token": true
43
+ },
44
+ "test_split": "validation",
45
+ "doc_to_visual": "<function infovqa_doc_to_visual at 0x7f3f21688a60>",
46
+ "doc_to_text": "<function infovqa_doc_to_text at 0x7f3f21688d30>",
47
+ "doc_to_target": "answers",
48
+ "description": "",
49
+ "target_delimiter": " ",
50
+ "fewshot_delimiter": "\n\n",
51
+ "metric_list": [
52
+ {
53
+ "metric": "anls",
54
+ "aggregation": "mean",
55
+ "higher_is_better": true
56
+ }
57
+ ],
58
+ "output_type": "generate_until",
59
+ "generation_kwargs": {
60
+ "max_new_tokens": 32,
61
+ "temperature": 0.0,
62
+ "do_sample": false,
63
+ "until": [
64
+ "\n\n"
65
+ ]
66
+ },
67
+ "repeats": 1,
68
+ "should_decontaminate": false,
69
+ "model_specific_prompt_kwargs": {
70
+ "default": {
71
+ "pre_prompt": "",
72
+ "post_prompt": "\nAnswer the question using a single word or phrase."
73
+ }
74
+ }
75
+ },
76
+ "mmmu_pro_standard": {
77
+ "task": "mmmu_pro_standard",
78
+ "dataset_path": "MMMU/MMMU_Pro",
79
+ "dataset_name": "standard (10 options)",
80
+ "test_split": "test",
81
+ "doc_to_visual": "<function mmmu_pro_doc_to_visual at 0x7f3f4cd48820>",
82
+ "doc_to_text": "<function mmmu_pro_doc_to_text at 0x7f3f4cd525e0>",
83
+ "doc_to_target": "{{answer}}",
84
+ "process_results": "<function mmmu_pro_process_results at 0x7f3f4cd5c550>",
85
+ "description": "",
86
+ "target_delimiter": " ",
87
+ "fewshot_delimiter": "\n\n",
88
+ "metric_list": [
89
+ {
90
+ "metric": "mmmu_acc",
91
+ "aggregation": "<function mmmu_pro_aggregate_results at 0x7f3f4cd664c0>",
92
+ "higher_is_better": true
93
+ }
94
+ ],
95
+ "output_type": "generate_until",
96
+ "generation_kwargs": {
97
+ "max_new_tokens": 256,
98
+ "until": [
99
+ "\n\n"
100
+ ]
101
+ },
102
+ "repeats": 1,
103
+ "should_decontaminate": false,
104
+ "metadata": {
105
+ "version": 0.0,
106
+ "interleaved_format": false
107
+ },
108
+ "model_specific_prompt_kwargs": {
109
+ "default": {
110
+ "pre_prompt": "",
111
+ "post_prompt": "Answer with the option letter from the given choices directly."
112
+ }
113
+ }
114
+ },
115
+ "mmmu_pro_vision": {
116
+ "task": "mmmu_pro_vision",
117
+ "dataset_path": "MMMU/MMMU_Pro",
118
+ "dataset_name": "vision",
119
+ "test_split": "test",
120
+ "doc_to_visual": "<function mmmu_pro_doc_to_visual at 0x7f3f4cd36d30>",
121
+ "doc_to_text": "Answer with the option letter from the given choices directly.",
122
+ "doc_to_target": "{{answer}}",
123
+ "process_results": "<function mmmu_pro_process_results at 0x7f3f4cd3dc10>",
124
+ "description": "",
125
+ "target_delimiter": " ",
126
+ "fewshot_delimiter": "\n\n",
127
+ "metric_list": [
128
+ {
129
+ "metric": "mmmu_acc",
130
+ "aggregation": "<function mmmu_pro_aggregate_results at 0x7f3f4cd44b80>",
131
+ "higher_is_better": true
132
+ }
133
+ ],
134
+ "output_type": "generate_until",
135
+ "generation_kwargs": {
136
+ "max_new_tokens": 256,
137
+ "until": [
138
+ "\n\n"
139
+ ]
140
+ },
141
+ "repeats": 1,
142
+ "should_decontaminate": false,
143
+ "metadata": {
144
+ "version": 0.0,
145
+ "interleaved_format": false
146
+ }
147
+ },
148
+ "realworldqa": {
149
+ "task": "realworldqa",
150
+ "dataset_path": "lmms-lab/RealWorldQA",
151
+ "dataset_kwargs": {
152
+ "token": true
153
+ },
154
+ "test_split": "test",
155
+ "doc_to_visual": "<function realworldqa_doc_to_visual at 0x7f3f18164af0>",
156
+ "doc_to_text": "<function realworldqa_doc_to_text at 0x7f3f1811c160>",
157
+ "doc_to_target": "answer",
158
+ "description": "",
159
+ "target_delimiter": " ",
160
+ "fewshot_delimiter": "\n\n",
161
+ "metric_list": [
162
+ {
163
+ "metric": "exact_match",
164
+ "aggregation": "mean",
165
+ "higher_is_better": true,
166
+ "ignore_case": true,
167
+ "ignore_punctuation": true
168
+ }
169
+ ],
170
+ "output_type": "generate_until",
171
+ "generation_kwargs": {
172
+ "max_new_tokens": 16,
173
+ "temperature": 0.0,
174
+ "top_p": 1.0,
175
+ "num_beams": 1,
176
+ "do_sample": false,
177
+ "until": [
178
+ "\n\n"
179
+ ]
180
+ },
181
+ "repeats": 1,
182
+ "filter_list": [
183
+ {
184
+ "name": "flexible-extract",
185
+ "filter": [
186
+ {
187
+ "function": "<class 'utils.NumberWordsToDigitsFilter'>"
188
+ },
189
+ {
190
+ "function": "<class 'utils.MultiChoiceRegexFilter'>",
191
+ "group_select": 0,
192
+ "ignore_case": true,
193
+ "ignore_punctuation": true,
194
+ "regex_pattern": "(\\([A-Z]\\))"
195
+ }
196
+ ]
197
+ }
198
+ ],
199
+ "should_decontaminate": false,
200
+ "metadata": [
201
+ {
202
+ "version": 0.0
203
+ }
204
+ ],
205
+ "model_specific_prompt_kwargs": {
206
+ "default": {
207
+ "pre_prompt": "",
208
+ "post_prompt": ""
209
+ },
210
+ "gpt4v": {
211
+ "pre_prompt": "",
212
+ "post_prompt": ""
213
+ },
214
+ "xcomposer2_4khd": {
215
+ "pre_prompt": "[UNUSED_TOKEN_146]user\nQuestion: ",
216
+ "post_prompt": "[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is"
217
+ }
218
+ }
219
+ }
220
+ },
221
+ "versions": {
222
+ "infovqa_val": "Yaml",
223
+ "mmmu_pro": "N/A",
224
+ "mmmu_pro_standard": "Yaml",
225
+ "mmmu_pro_vision": "Yaml",
226
+ "realworldqa": "Yaml"
227
+ },
228
+ "n-shot": {
229
+ "infovqa_val": 0,
230
+ "mmmu_pro": 0,
231
+ "mmmu_pro_standard": 0,
232
+ "mmmu_pro_vision": 0,
233
+ "realworldqa": 0
234
+ },
235
+ "model_configs": {
236
+ "model": "llava",
237
+ "model_args": "pretrained=/cm/archive/namnv78/checkpoints/Xphi35-siglip224/SMOE/1M3/revise_Full_remoe,conv_template=phi35",
238
+ "batch_size": "1",
239
+ "device": null,
240
+ "limit": null,
241
+ "bootstrap_iters": 100000,
242
+ "gen_kwargs": ""
243
+ },
244
+ "git_hash": "289c7fe5"
245
+ }
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/hallusion_bench_image.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/mathvista_testmini.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 0 eval done
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 1 eval done
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/results.json ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hallusion_bench_image": {
4
+ "aAcc,none": 42.061,
5
+ "aAcc_stderr,none": "N/A",
6
+ "fAcc,none": 16.474,
7
+ "fAcc_stderr,none": "N/A",
8
+ "qAcc,none": 12.7473,
9
+ "qAcc_stderr,none": "N/A",
10
+ "alias": "hallusion_bench_image"
11
+ },
12
+ "mathvista_testmini": {
13
+ "gpt_eval_score,none": 31.9,
14
+ "gpt_eval_score_stderr,none": "N/A",
15
+ "alias": "mathvista_testmini"
16
+ }
17
+ },
18
+ "configs": {
19
+ "hallusion_bench_image": {
20
+ "task": "hallusion_bench_image",
21
+ "dataset_path": "lmms-lab/HallusionBench",
22
+ "dataset_kwargs": {
23
+ "token": true
24
+ },
25
+ "test_split": "image",
26
+ "doc_to_visual": "<function hb_doc_to_visual at 0x7f1b6331a1f0>",
27
+ "doc_to_text": "<function hb_doc_to_text at 0x7f1b6331a940>",
28
+ "doc_to_target": "gt_answer_details",
29
+ "process_results": "<function hb_process_results at 0x7f1b63331160>",
30
+ "description": "",
31
+ "target_delimiter": " ",
32
+ "fewshot_delimiter": "\n\n",
33
+ "metric_list": [
34
+ {
35
+ "metric": "aAcc",
36
+ "aggregation": "<function hb_aggregation_result_aAcc at 0x7f1b63331af0>",
37
+ "higher_is_better": true
38
+ },
39
+ {
40
+ "metric": "qAcc",
41
+ "aggregation": "<function hb_aggregation_result_qAcc at 0x7f1b6325e160>",
42
+ "higher_is_better": true
43
+ },
44
+ {
45
+ "metric": "fAcc",
46
+ "aggregation": "<function hb_aggregation_result_fAcc at 0x7f1b6325e940>",
47
+ "higher_is_better": true
48
+ }
49
+ ],
50
+ "output_type": "generate_until",
51
+ "generation_kwargs": {
52
+ "max_new_tokens": 128,
53
+ "temperature": 0.0,
54
+ "top_p": 1.0,
55
+ "num_beams": 1,
56
+ "do_sample": false,
57
+ "until": [
58
+ "\n\n"
59
+ ]
60
+ },
61
+ "repeats": 1,
62
+ "should_decontaminate": false,
63
+ "metadata": [
64
+ {
65
+ "version": 0.0
66
+ }
67
+ ],
68
+ "model_specific_prompt_kwargs": {
69
+ "default": {
70
+ "pre_prompt": "",
71
+ "post_prompt": ""
72
+ }
73
+ }
74
+ },
75
+ "mathvista_testmini": {
76
+ "task": "mathvista_testmini",
77
+ "dataset_path": "AI4Math/MathVista",
78
+ "dataset_kwargs": {
79
+ "token": true
80
+ },
81
+ "test_split": "testmini",
82
+ "doc_to_visual": "<function mathvista_doc_to_visual at 0x7f1b388a5c10>",
83
+ "doc_to_text": "<function mathvista_doc_to_text at 0x7f1b3849f310>",
84
+ "doc_to_target": "answer",
85
+ "process_results": "<function mathvista_process_results at 0x7f1b384a99d0>",
86
+ "description": "",
87
+ "target_delimiter": " ",
88
+ "fewshot_delimiter": "\n\n",
89
+ "metric_list": [
90
+ {
91
+ "metric": "gpt_eval_score",
92
+ "aggregation": "<function mathvista_aggregate_results at 0x7f1b384b50d0>",
93
+ "higher_is_better": true
94
+ }
95
+ ],
96
+ "output_type": "generate_until",
97
+ "generation_kwargs": {
98
+ "until": [
99
+ "ASSISTANT:"
100
+ ],
101
+ "max_new_tokens": 1024,
102
+ "temperature": 0.0,
103
+ "top_p": 1.0,
104
+ "num_beams": 1,
105
+ "do_sample": false,
106
+ "image_aspect_ratio": "original"
107
+ },
108
+ "repeats": 1,
109
+ "should_decontaminate": false,
110
+ "model_specific_prompt_kwargs": {
111
+ "default": {
112
+ "shot_type": "format-prompt",
113
+ "shot": 0,
114
+ "use_caption": false,
115
+ "use_ocr": false
116
+ },
117
+ "phi3v": {
118
+ "shot_type": "solution"
119
+ }
120
+ },
121
+ "model_specific_generation_kwargs": {
122
+ "llava": {
123
+ "image_aspect_ratio": "original"
124
+ }
125
+ }
126
+ }
127
+ },
128
+ "versions": {
129
+ "hallusion_bench_image": "Yaml",
130
+ "mathvista_testmini": "Yaml"
131
+ },
132
+ "n-shot": {
133
+ "hallusion_bench_image": 0,
134
+ "mathvista_testmini": 0
135
+ },
136
+ "model_configs": {
137
+ "model": "llava",
138
+ "model_args": "pretrained=/cm/archive/namnv78/checkpoints/Xphi35-siglip224/SMOE/1M3/revise_Full_remoe,conv_template=phi35",
139
+ "batch_size": "1",
140
+ "device": null,
141
+ "limit": null,
142
+ "bootstrap_iters": 100000,
143
+ "gen_kwargs": ""
144
+ },
145
+ "git_hash": "289c7fe5"
146
+ }
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/hallusion_bench_image.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/mathvista_testmini.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 0 eval done
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 1 eval done
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/results.json ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hallusion_bench_image": {
4
+ "aAcc,none": 42.3764,
5
+ "aAcc_stderr,none": "N/A",
6
+ "fAcc,none": 14.7399,
7
+ "fAcc_stderr,none": "N/A",
8
+ "qAcc,none": 12.967,
9
+ "qAcc_stderr,none": "N/A",
10
+ "alias": "hallusion_bench_image"
11
+ },
12
+ "mathvista_testmini": {
13
+ "gpt_eval_score,none": 31.9,
14
+ "gpt_eval_score_stderr,none": "N/A",
15
+ "alias": "mathvista_testmini"
16
+ }
17
+ },
18
+ "configs": {
19
+ "hallusion_bench_image": {
20
+ "task": "hallusion_bench_image",
21
+ "dataset_path": "lmms-lab/HallusionBench",
22
+ "dataset_kwargs": {
23
+ "token": true
24
+ },
25
+ "test_split": "image",
26
+ "doc_to_visual": "<function hb_doc_to_visual at 0x7ff3306fa0d0>",
27
+ "doc_to_text": "<function hb_doc_to_text at 0x7ff3306fa820>",
28
+ "doc_to_target": "gt_answer_details",
29
+ "process_results": "<function hb_process_results at 0x7ff330733040>",
30
+ "description": "",
31
+ "target_delimiter": " ",
32
+ "fewshot_delimiter": "\n\n",
33
+ "metric_list": [
34
+ {
35
+ "metric": "aAcc",
36
+ "aggregation": "<function hb_aggregation_result_aAcc at 0x7ff3307339d0>",
37
+ "higher_is_better": true
38
+ },
39
+ {
40
+ "metric": "qAcc",
41
+ "aggregation": "<function hb_aggregation_result_qAcc at 0x7ff3309ac040>",
42
+ "higher_is_better": true
43
+ },
44
+ {
45
+ "metric": "fAcc",
46
+ "aggregation": "<function hb_aggregation_result_fAcc at 0x7ff3309ac820>",
47
+ "higher_is_better": true
48
+ }
49
+ ],
50
+ "output_type": "generate_until",
51
+ "generation_kwargs": {
52
+ "max_new_tokens": 128,
53
+ "temperature": 0.0,
54
+ "top_p": 1.0,
55
+ "num_beams": 1,
56
+ "do_sample": false,
57
+ "until": [
58
+ "\n\n"
59
+ ]
60
+ },
61
+ "repeats": 1,
62
+ "should_decontaminate": false,
63
+ "metadata": [
64
+ {
65
+ "version": 0.0
66
+ }
67
+ ],
68
+ "model_specific_prompt_kwargs": {
69
+ "default": {
70
+ "pre_prompt": "",
71
+ "post_prompt": ""
72
+ }
73
+ }
74
+ },
75
+ "mathvista_testmini": {
76
+ "task": "mathvista_testmini",
77
+ "dataset_path": "AI4Math/MathVista",
78
+ "dataset_kwargs": {
79
+ "token": true
80
+ },
81
+ "test_split": "testmini",
82
+ "doc_to_visual": "<function mathvista_doc_to_visual at 0x7ff3068a5a60>",
83
+ "doc_to_text": "<function mathvista_doc_to_text at 0x7ff306149160>",
84
+ "doc_to_target": "answer",
85
+ "process_results": "<function mathvista_process_results at 0x7ff306152820>",
86
+ "description": "",
87
+ "target_delimiter": " ",
88
+ "fewshot_delimiter": "\n\n",
89
+ "metric_list": [
90
+ {
91
+ "metric": "gpt_eval_score",
92
+ "aggregation": "<function mathvista_aggregate_results at 0x7ff30615bee0>",
93
+ "higher_is_better": true
94
+ }
95
+ ],
96
+ "output_type": "generate_until",
97
+ "generation_kwargs": {
98
+ "until": [
99
+ "ASSISTANT:"
100
+ ],
101
+ "max_new_tokens": 1024,
102
+ "temperature": 0.0,
103
+ "top_p": 1.0,
104
+ "num_beams": 1,
105
+ "do_sample": false,
106
+ "image_aspect_ratio": "original"
107
+ },
108
+ "repeats": 1,
109
+ "should_decontaminate": false,
110
+ "model_specific_prompt_kwargs": {
111
+ "default": {
112
+ "shot_type": "format-prompt",
113
+ "shot": 0,
114
+ "use_caption": false,
115
+ "use_ocr": false
116
+ },
117
+ "phi3v": {
118
+ "shot_type": "solution"
119
+ }
120
+ },
121
+ "model_specific_generation_kwargs": {
122
+ "llava": {
123
+ "image_aspect_ratio": "original"
124
+ }
125
+ }
126
+ }
127
+ },
128
+ "versions": {
129
+ "hallusion_bench_image": "Yaml",
130
+ "mathvista_testmini": "Yaml"
131
+ },
132
+ "n-shot": {
133
+ "hallusion_bench_image": 0,
134
+ "mathvista_testmini": 0
135
+ },
136
+ "model_configs": {
137
+ "model": "llava",
138
+ "model_args": "pretrained=/cm/archive/namnv78/checkpoints/Xphi35-siglip224/SMOE/1M3/revise_Full_remoe,conv_template=phi35",
139
+ "batch_size": "1",
140
+ "device": null,
141
+ "limit": null,
142
+ "bootstrap_iters": 100000,
143
+ "gen_kwargs": ""
144
+ },
145
+ "git_hash": "289c7fe5"
146
+ }
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/hallusion_bench_image.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/mathvista_testmini.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 0 eval done
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 1 eval done
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/results.json ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hallusion_bench_image": {
4
+ "aAcc,none": 41.9558,
5
+ "aAcc_stderr,none": "N/A",
6
+ "fAcc,none": 15.0289,
7
+ "fAcc_stderr,none": "N/A",
8
+ "qAcc,none": 12.967,
9
+ "qAcc_stderr,none": "N/A",
10
+ "alias": "hallusion_bench_image"
11
+ },
12
+ "mathvista_testmini": {
13
+ "gpt_eval_score,none": 31.9,
14
+ "gpt_eval_score_stderr,none": "N/A",
15
+ "alias": "mathvista_testmini"
16
+ }
17
+ },
18
+ "configs": {
19
+ "hallusion_bench_image": {
20
+ "task": "hallusion_bench_image",
21
+ "dataset_path": "lmms-lab/HallusionBench",
22
+ "dataset_kwargs": {
23
+ "token": true
24
+ },
25
+ "test_split": "image",
26
+ "doc_to_visual": "<function hb_doc_to_visual at 0x7f11359811f0>",
27
+ "doc_to_text": "<function hb_doc_to_text at 0x7f1135981940>",
28
+ "doc_to_target": "gt_answer_details",
29
+ "process_results": "<function hb_process_results at 0x7f11358e7160>",
30
+ "description": "",
31
+ "target_delimiter": " ",
32
+ "fewshot_delimiter": "\n\n",
33
+ "metric_list": [
34
+ {
35
+ "metric": "aAcc",
36
+ "aggregation": "<function hb_aggregation_result_aAcc at 0x7f11358e7af0>",
37
+ "higher_is_better": true
38
+ },
39
+ {
40
+ "metric": "qAcc",
41
+ "aggregation": "<function hb_aggregation_result_qAcc at 0x7f1135851160>",
42
+ "higher_is_better": true
43
+ },
44
+ {
45
+ "metric": "fAcc",
46
+ "aggregation": "<function hb_aggregation_result_fAcc at 0x7f1135851940>",
47
+ "higher_is_better": true
48
+ }
49
+ ],
50
+ "output_type": "generate_until",
51
+ "generation_kwargs": {
52
+ "max_new_tokens": 128,
53
+ "temperature": 0.0,
54
+ "top_p": 1.0,
55
+ "num_beams": 1,
56
+ "do_sample": false,
57
+ "until": [
58
+ "\n\n"
59
+ ]
60
+ },
61
+ "repeats": 1,
62
+ "should_decontaminate": false,
63
+ "metadata": [
64
+ {
65
+ "version": 0.0
66
+ }
67
+ ],
68
+ "model_specific_prompt_kwargs": {
69
+ "default": {
70
+ "pre_prompt": "",
71
+ "post_prompt": ""
72
+ }
73
+ }
74
+ },
75
+ "mathvista_testmini": {
76
+ "task": "mathvista_testmini",
77
+ "dataset_path": "AI4Math/MathVista",
78
+ "dataset_kwargs": {
79
+ "token": true
80
+ },
81
+ "test_split": "testmini",
82
+ "doc_to_visual": "<function mathvista_doc_to_visual at 0x7f110aa65c10>",
83
+ "doc_to_text": "<function mathvista_doc_to_text at 0x7f110a65f310>",
84
+ "doc_to_target": "answer",
85
+ "process_results": "<function mathvista_process_results at 0x7f110a6679d0>",
86
+ "description": "",
87
+ "target_delimiter": " ",
88
+ "fewshot_delimiter": "\n\n",
89
+ "metric_list": [
90
+ {
91
+ "metric": "gpt_eval_score",
92
+ "aggregation": "<function mathvista_aggregate_results at 0x7f110a6730d0>",
93
+ "higher_is_better": true
94
+ }
95
+ ],
96
+ "output_type": "generate_until",
97
+ "generation_kwargs": {
98
+ "until": [
99
+ "ASSISTANT:"
100
+ ],
101
+ "max_new_tokens": 1024,
102
+ "temperature": 0.0,
103
+ "top_p": 1.0,
104
+ "num_beams": 1,
105
+ "do_sample": false,
106
+ "image_aspect_ratio": "original"
107
+ },
108
+ "repeats": 1,
109
+ "should_decontaminate": false,
110
+ "model_specific_prompt_kwargs": {
111
+ "default": {
112
+ "shot_type": "format-prompt",
113
+ "shot": 0,
114
+ "use_caption": false,
115
+ "use_ocr": false
116
+ },
117
+ "phi3v": {
118
+ "shot_type": "solution"
119
+ }
120
+ },
121
+ "model_specific_generation_kwargs": {
122
+ "llava": {
123
+ "image_aspect_ratio": "original"
124
+ }
125
+ }
126
+ }
127
+ },
128
+ "versions": {
129
+ "hallusion_bench_image": "Yaml",
130
+ "mathvista_testmini": "Yaml"
131
+ },
132
+ "n-shot": {
133
+ "hallusion_bench_image": 0,
134
+ "mathvista_testmini": 0
135
+ },
136
+ "model_configs": {
137
+ "model": "llava",
138
+ "model_args": "pretrained=/cm/archive/namnv78/checkpoints/Xphi35-siglip224/SMOE/1M3/revise_Full_remoe,conv_template=phi35",
139
+ "batch_size": "1",
140
+ "device": null,
141
+ "limit": null,
142
+ "bootstrap_iters": 100000,
143
+ "gen_kwargs": ""
144
+ },
145
+ "git_hash": "289c7fe5"
146
+ }
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "category": "VD",
4
+ "subcategory": "illusion",
5
+ "visual_input": "1",
6
+ "set_id": "0",
7
+ "figure_id": "0",
8
+ "sample_note": "circle",
9
+ "question_id": "0",
10
+ "question": "Is the right orange circle the same size as the left orange circle?",
11
+ "gt_answer_details": "The right orange circle is the same size as the left orange circle.",
12
+ "gt_answer": "1",
13
+ "filename": "./VD/illusion/0_0.png",
14
+ "model_prediction": "no",
15
+ "gpt4v_output_gpt_check": "0",
16
+ "gpt_answer": "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. If the prediction answer does not conflict with the reference answer, please generate \u201ccorrect\u201d. If the prediction answer conflict with the reference answer, please generate \u201cincorrect\u201d. If the prediction answer is unclear about the answer, please generate \"unclear\". \n\n Question:Is the right orange circle the same size as the left orange circle?\nReference answer: The right orange circle is the same size as the left orange circle.\nPrediction answer:no\nOutput:incorrect"
17
+ },
18
+ {
19
+ "category": "VD",
20
+ "subcategory": "illusion",
21
+ "visual_input": "2",
22
+ "set_id": "0",
23
+ "figure_id": "1",
24
+ "sample_note": "circle",
25
+ "question_id": "1",
26
+ "question": "Is the right orange circle larger than the left orange circle?",
27
+ "gt_answer_details": "The right orange circle is larger than the left orange circle.",
28
+ "gt_answer": "1",
29
+ "filename": "./VD/illusion/0_1.png",
30
+ "model_prediction": "Yes, the right orange circle is larger than the left orange circle.",
31
+ "gpt4v_output_gpt_check": "1",
32
+ "gpt_answer": "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. If the prediction answer does not conflict with the reference answer, please generate \u201ccorrect\u201d. If the prediction answer conflict with the reference answer, please generate \u201cincorrect\u201d. If the prediction answer is unclear about the answer, please generate \"unclear\". \n\n Question:Is the right orange circle larger than the left orange circle?\nReference answer: The right orange circle is larger than the left orange circle.\nPrediction answer:Yes, the right orange circle is larger than the left orange circle.\nOutput:correct"
33
+ },
34
+ {
35
+ "category": "VD",
36
+ "subcategory": "illusion",
37
+ "visual_input": "1",
38
+ "set_id": "1",
39
+ "figure_id": "0",
40
+ "sample_note": "box",
41
+ "question_id": "2",
42
+ "question": "Is the right grey box the same size as the left grey box?",
43
+ "gt_answer_details": "The right grey box is the same size as the left grey box.",
44
+ "gt_answer": "1",
45
+ "filename": "./VD/illusion/1_0.png",
46
+ "model_prediction": "Yes, the right grey box is the same size as the left grey box.",
47
+ "gpt4v_output_gpt_check": "1",
48
+ "gpt_answer": "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. If the prediction answer does not conflict with the reference answer, please generate \u201ccorrect\u201d. If the prediction answer conflict with the reference answer, please generate \u201cincorrect\u201d. If the prediction answer is unclear about the answer, please generate \"unclear\". \n\n Question:Is the right grey box the same size as the left grey box?\nReference answer: The right grey box is the same size as the left grey box.\nPrediction answer:Yes, the right grey box is the same size as the left grey box.\nOutput:correct"
49
+ },
50
+ {
51
+ "category": "VD",
52
+ "subcategory": "illusion",
53
+ "visual_input": "1",
54
+ "set_id": "2",
55
+ "figure_id": "0",
56
+ "sample_note": "line",
57
+ "question_id": "0",
58
+ "question": "Is the top line longer than the bottom line?",
59
+ "gt_answer_details": "The top line is the same size as the bottom line.",
60
+ "gt_answer": "0",
61
+ "filename": "./VD/illusion/2_0.png",
62
+ "model_prediction": "Yes, the top line is longer than the bottom line.",
63
+ "gpt4v_output_gpt_check": "0",
64
+ "gpt_answer": "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. If the prediction answer does not conflict with the reference answer, please generate \u201ccorrect\u201d. If the prediction answer conflict with the reference answer, please generate \u201cincorrect\u201d. If the prediction answer is unclear about the answer, please generate \"unclear\". \n\n Question:Is the top line longer than the bottom line?\nReference answer: The top line is the same size as the bottom line.\nPrediction answer:Yes, the top line is longer than the bottom line.\nOutput:incorrect"
65
+ },
66
+ {
67
+ "category": "VD",
68
+ "subcategory": "illusion",
69
+ "visual_input": "2",
70
+ "set_id": "2",
71
+ "figure_id": "1",
72
+ "sample_note": "line",
73
+ "question_id": "1",
74
+ "question": "Is the top line shorter than the bottom line?",
75
+ "gt_answer_details": "The top line is longer than the bottom line.",
76
+ "gt_answer": "0",
77
+ "filename": "./VD/illusion/2_1.png",
78
+ "model_prediction": "Yes, the top line is shorter than the bottom line.",
79
+ "gpt4v_output_gpt_check": "0",
80
+ "gpt_answer": "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. If the prediction answer does not conflict with the reference answer, please generate \u201ccorrect\u201d. If the prediction answer conflict with the reference answer, please generate \u201cincorrect\u201d. If the prediction answer is unclear about the answer, please generate \"unclear\". \n\n Question:Is the top line shorter than the bottom line?\nReference answer: The top line is longer than the bottom line.\nPrediction answer:Yes, the top line is shorter than the bottom line.\nOutput:incorrect"
81
+ }
82
+ ]
sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 1 eval done
sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/rank2_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 2 eval done
sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/rank3_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 3 eval done
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/hallusion_bench_image.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/mathvista_testmini.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 0 eval done
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 1 eval done
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/results.json ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hallusion_bench_image": {
4
+ "aAcc,none": 42.3764,
5
+ "aAcc_stderr,none": "N/A",
6
+ "fAcc,none": 14.7399,
7
+ "fAcc_stderr,none": "N/A",
8
+ "qAcc,none": 12.967,
9
+ "qAcc_stderr,none": "N/A",
10
+ "alias": "hallusion_bench_image"
11
+ },
12
+ "mathvista_testmini": {
13
+ "gpt_eval_score,none": 32.2,
14
+ "gpt_eval_score_stderr,none": "N/A",
15
+ "alias": "mathvista_testmini"
16
+ }
17
+ },
18
+ "configs": {
19
+ "hallusion_bench_image": {
20
+ "task": "hallusion_bench_image",
21
+ "dataset_path": "lmms-lab/HallusionBench",
22
+ "dataset_kwargs": {
23
+ "token": true
24
+ },
25
+ "test_split": "image",
26
+ "doc_to_visual": "<function hb_doc_to_visual at 0x7f5490024160>",
27
+ "doc_to_text": "<function hb_doc_to_text at 0x7f54900248b0>",
28
+ "doc_to_target": "gt_answer_details",
29
+ "process_results": "<function hb_process_results at 0x7f548ff1a0d0>",
30
+ "description": "",
31
+ "target_delimiter": " ",
32
+ "fewshot_delimiter": "\n\n",
33
+ "metric_list": [
34
+ {
35
+ "metric": "aAcc",
36
+ "aggregation": "<function hb_aggregation_result_aAcc at 0x7f548ff1aa60>",
37
+ "higher_is_better": true
38
+ },
39
+ {
40
+ "metric": "qAcc",
41
+ "aggregation": "<function hb_aggregation_result_qAcc at 0x7f548fef00d0>",
42
+ "higher_is_better": true
43
+ },
44
+ {
45
+ "metric": "fAcc",
46
+ "aggregation": "<function hb_aggregation_result_fAcc at 0x7f548fef08b0>",
47
+ "higher_is_better": true
48
+ }
49
+ ],
50
+ "output_type": "generate_until",
51
+ "generation_kwargs": {
52
+ "max_new_tokens": 128,
53
+ "temperature": 0.0,
54
+ "top_p": 1.0,
55
+ "num_beams": 1,
56
+ "do_sample": false,
57
+ "until": [
58
+ "\n\n"
59
+ ]
60
+ },
61
+ "repeats": 1,
62
+ "should_decontaminate": false,
63
+ "metadata": [
64
+ {
65
+ "version": 0.0
66
+ }
67
+ ],
68
+ "model_specific_prompt_kwargs": {
69
+ "default": {
70
+ "pre_prompt": "",
71
+ "post_prompt": ""
72
+ }
73
+ }
74
+ },
75
+ "mathvista_testmini": {
76
+ "task": "mathvista_testmini",
77
+ "dataset_path": "AI4Math/MathVista",
78
+ "dataset_kwargs": {
79
+ "token": true
80
+ },
81
+ "test_split": "testmini",
82
+ "doc_to_visual": "<function mathvista_doc_to_visual at 0x7f54654a7b80>",
83
+ "doc_to_text": "<function mathvista_doc_to_text at 0x7f54650a0280>",
84
+ "doc_to_target": "answer",
85
+ "process_results": "<function mathvista_process_results at 0x7f54650a8940>",
86
+ "description": "",
87
+ "target_delimiter": " ",
88
+ "fewshot_delimiter": "\n\n",
89
+ "metric_list": [
90
+ {
91
+ "metric": "gpt_eval_score",
92
+ "aggregation": "<function mathvista_aggregate_results at 0x7f54650b5040>",
93
+ "higher_is_better": true
94
+ }
95
+ ],
96
+ "output_type": "generate_until",
97
+ "generation_kwargs": {
98
+ "until": [
99
+ "ASSISTANT:"
100
+ ],
101
+ "max_new_tokens": 1024,
102
+ "temperature": 0.0,
103
+ "top_p": 1.0,
104
+ "num_beams": 1,
105
+ "do_sample": false,
106
+ "image_aspect_ratio": "original"
107
+ },
108
+ "repeats": 1,
109
+ "should_decontaminate": false,
110
+ "model_specific_prompt_kwargs": {
111
+ "default": {
112
+ "shot_type": "format-prompt",
113
+ "shot": 0,
114
+ "use_caption": false,
115
+ "use_ocr": false
116
+ },
117
+ "phi3v": {
118
+ "shot_type": "solution"
119
+ }
120
+ },
121
+ "model_specific_generation_kwargs": {
122
+ "llava": {
123
+ "image_aspect_ratio": "original"
124
+ }
125
+ }
126
+ }
127
+ },
128
+ "versions": {
129
+ "hallusion_bench_image": "Yaml",
130
+ "mathvista_testmini": "Yaml"
131
+ },
132
+ "n-shot": {
133
+ "hallusion_bench_image": 0,
134
+ "mathvista_testmini": 0
135
+ },
136
+ "model_configs": {
137
+ "model": "llava",
138
+ "model_args": "pretrained=/cm/archive/namnv78/checkpoints/Xphi35-siglip224/SMOE/1M3/revise_Full_remoe,conv_template=phi35",
139
+ "batch_size": "1",
140
+ "device": null,
141
+ "limit": null,
142
+ "bootstrap_iters": 100000,
143
+ "gen_kwargs": ""
144
+ },
145
+ "git_hash": "289c7fe5"
146
+ }
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/chartqa.json ADDED
The diff for this file is too large to render. See raw diff
 
sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/rank0_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 0 eval done
sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/rank1_metric_eval_done.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rank 1 eval done
sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/results.json ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "chartqa": {
4
+ "relaxed_overall,none": 0.1252,
5
+ "relaxed_overall_stderr,none": 0.0066202352681760356,
6
+ "relaxed_human_split,none": 0.1344,
7
+ "relaxed_human_split_stderr,none": 0.009651104965839433,
8
+ "relaxed_augmented_split,none": 0.116,
9
+ "relaxed_augmented_split_stderr,none": 0.009060953631079097,
10
+ "alias": "chartqa"
11
+ },
12
+ "seedbench_2_plus": {
13
+ "seedbench_2_plus_Chart,none": 0.4617283950617284,
14
+ "seedbench_2_plus_Chart_stderr,none": "N/A",
15
+ "seedbench_2_plus_all,none": 0.4782608695652174,
16
+ "seedbench_2_plus_all_stderr,none": "N/A",
17
+ "seedbench_2_plus_Web,none": 0.5287878787878788,
18
+ "seedbench_2_plus_Web_stderr,none": "N/A",
19
+ "seedbench_2_plus_Map,none": 0.45353159851301117,
20
+ "seedbench_2_plus_Map_stderr,none": "N/A",
21
+ "alias": "seedbench_2_plus"
22
+ }
23
+ },
24
+ "configs": {
25
+ "chartqa": {
26
+ "task": "chartqa",
27
+ "dataset_path": "lmms-lab/ChartQA",
28
+ "dataset_kwargs": {
29
+ "token": true
30
+ },
31
+ "test_split": "test",
32
+ "doc_to_visual": "<function chartqa_doc_to_visual at 0x7f1006b5fdc0>",
33
+ "doc_to_text": "<function chartqa_doc_to_text at 0x7f1006b6a820>",
34
+ "doc_to_target": "answer",
35
+ "process_results": "<function chartqa_process_results at 0x7f1006b6aaf0>",
36
+ "description": "",
37
+ "target_delimiter": " ",
38
+ "fewshot_delimiter": "\n\n",
39
+ "metric_list": [
40
+ {
41
+ "metric": "relaxed_overall",
42
+ "aggregation": "mean",
43
+ "higher_is_better": true
44
+ },
45
+ {
46
+ "metric": "relaxed_human_split",
47
+ "aggregation": "mean",
48
+ "higher_is_better": true
49
+ },
50
+ {
51
+ "metric": "relaxed_augmented_split",
52
+ "aggregation": "mean",
53
+ "higher_is_better": true
54
+ }
55
+ ],
56
+ "output_type": "generate_until",
57
+ "generation_kwargs": {
58
+ "max_new_tokens": 16,
59
+ "temperature": 0.0,
60
+ "do_sample": false,
61
+ "until": [
62
+ "\n\n"
63
+ ]
64
+ },
65
+ "repeats": 1,
66
+ "should_decontaminate": false,
67
+ "metadata": [
68
+ {
69
+ "version": 0.0
70
+ }
71
+ ],
72
+ "model_specific_prompt_kwargs": {
73
+ "default": {
74
+ "pre_prompt": "",
75
+ "post_prompt": "\nAnswer the question with a single word."
76
+ },
77
+ "qwen_vl": {
78
+ "pre_prompt": "",
79
+ "post_prompt": " Answer:"
80
+ }
81
+ }
82
+ },
83
+ "seedbench_2_plus": {
84
+ "task": "seedbench_2_plus",
85
+ "dataset_path": "doolayer/SEED-Bench-2-Plus",
86
+ "dataset_kwargs": {
87
+ "token": true
88
+ },
89
+ "test_split": "test",
90
+ "doc_to_visual": "<function seed_doc_to_visual at 0x7f10209cdd30>",
91
+ "doc_to_text": "<function seed_doc_to_text at 0x7f10208ec3a0>",
92
+ "doc_to_target": "answer",
93
+ "process_results": "<function seed_process_result at 0x7f10208ec8b0>",
94
+ "description": "",
95
+ "target_delimiter": " ",
96
+ "fewshot_delimiter": "\n\n",
97
+ "metric_list": [
98
+ {
99
+ "metric": "seedbench_2_plus_Chart",
100
+ "aggregation": "<function seed_aggregation_result at 0x7f10208ecdc0>",
101
+ "higher_is_better": true
102
+ },
103
+ {
104
+ "metric": "seedbench_2_plus_Map",
105
+ "aggregation": "<function seed_aggregation_result at 0x7f102090d280>",
106
+ "higher_is_better": true
107
+ },
108
+ {
109
+ "metric": "seedbench_2_plus_Web",
110
+ "aggregation": "<function seed_aggregation_result at 0x7f102090d700>",
111
+ "higher_is_better": true
112
+ },
113
+ {
114
+ "metric": "seedbench_2_plus_all",
115
+ "aggregation": "<function seed_aggregation_result at 0x7f102090db80>",
116
+ "higher_is_better": true
117
+ }
118
+ ],
119
+ "output_type": "generate_until",
120
+ "generation_kwargs": {
121
+ "until": [
122
+ "ASSISTANT:"
123
+ ],
124
+ "max_new_tokens": 16,
125
+ "image_aspect_ratio": "original"
126
+ },
127
+ "repeats": 1,
128
+ "should_decontaminate": false,
129
+ "metadata": [
130
+ {
131
+ "version": 0.0
132
+ }
133
+ ],
134
+ "model_specific_prompt_kwargs": {
135
+ "llava": {
136
+ "img_token": "<image>",
137
+ "post_prompt": "Answer with the option's letter from the given choices directly."
138
+ },
139
+ "gpt4V": {
140
+ "img_token": "<image>",
141
+ "post_prompt": "Answer with the option's letter from the given choices directly."
142
+ },
143
+ "default": {
144
+ "img_token": "<image>",
145
+ "post_prompt": "Answer with the option's letter from the given choices directly."
146
+ }
147
+ }
148
+ }
149
+ },
150
+ "versions": {
151
+ "chartqa": "Yaml",
152
+ "seedbench_2_plus": "Yaml"
153
+ },
154
+ "n-shot": {
155
+ "chartqa": 0,
156
+ "seedbench_2_plus": 0
157
+ },
158
+ "model_configs": {
159
+ "model": "llava",
160
+ "model_args": "pretrained=/cm/archive/namnv78/checkpoints/Xphi35-siglip224/SMOE/1M3/revise_Full_remoe,conv_template=phi35",
161
+ "batch_size": "1",
162
+ "device": null,
163
+ "limit": null,
164
+ "bootstrap_iters": 100000,
165
+ "gen_kwargs": ""
166
+ },
167
+ "git_hash": "289c7fe5"
168
+ }