c6958928eaa0728e8f86d477818d55e904c4c81d569b351f238be73418fef9ff
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/infovqa_val.json +3 -0
- sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/mmmu_pro_standard.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/mmmu_pro_vision.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank0_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank1_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank2_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank3_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/realworldqa.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/results.json +245 -0
- sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/hallusion_bench_image.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/mathvista_testmini.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/results.json +146 -0
- sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/hallusion_bench_image.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/mathvista_testmini.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/results.json +146 -0
- sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/hallusion_bench_image.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/mathvista_testmini.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/results.json +146 -0
- sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json +82 -0
- sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/rank2_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/rank3_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/hallusion_bench_image.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/mathvista_testmini.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/results.json +146 -0
- sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/chartqa.json +0 -0
- sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/rank0_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/rank1_metric_eval_done.txt +1 -0
- sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/results.json +168 -0
.gitattributes
CHANGED
|
@@ -197,3 +197,4 @@ sft/1M3/revise_Full_remoe/logs/0625_1719_llava..._pope_llava_model_args_179bff/m
|
|
| 197 |
sft/1M3/revise_Full_remoe/logs/0625_1719_llava..._pope_llava_model_args_179bff/pope.json filter=lfs diff=lfs merge=lfs -text
|
| 198 |
sft/1M3/revise_Full_remoe/logs/0625_1719_llava..._pope_llava_model_args_179bff/submissions/mmbench_en_dev_results.xlsx filter=lfs diff=lfs merge=lfs -text
|
| 199 |
sft/1M3/revise_Full_remoe/logs/0625_1719_llava..._pope_llava_model_args_179bff/textvqa_val.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 197 |
sft/1M3/revise_Full_remoe/logs/0625_1719_llava..._pope_llava_model_args_179bff/pope.json filter=lfs diff=lfs merge=lfs -text
|
| 198 |
sft/1M3/revise_Full_remoe/logs/0625_1719_llava..._pope_llava_model_args_179bff/submissions/mmbench_en_dev_results.xlsx filter=lfs diff=lfs merge=lfs -text
|
| 199 |
sft/1M3/revise_Full_remoe/logs/0625_1719_llava..._pope_llava_model_args_179bff/textvqa_val.json filter=lfs diff=lfs merge=lfs -text
|
| 200 |
+
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/infovqa_val.json filter=lfs diff=lfs merge=lfs -text
|
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/infovqa_val.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:919df637cc0e9a4231bc4b409d9d6870bfa4cfc858793bca4b4348a654587d48
|
| 3 |
+
size 576443538
|
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/mmmu_pro_standard.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/mmmu_pro_vision.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank0_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 0 eval done
|
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank1_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 1 eval done
|
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank2_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 2 eval done
|
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/rank3_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 3 eval done
|
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/realworldqa.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0625_1751_llava...a_val_llava_model_args_179bff/results.json
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"infovqa_val": {
|
| 4 |
+
"anls,none": 0.26990360585505174,
|
| 5 |
+
"anls_stderr,none": 0.00838910112249176,
|
| 6 |
+
"alias": "infovqa_val"
|
| 7 |
+
},
|
| 8 |
+
"mmmu_pro": {
|
| 9 |
+
"mmmu_acc,none": 0.18872999999999998,
|
| 10 |
+
"mmmu_acc_stderr,none": 0.026738864276958818,
|
| 11 |
+
"alias": "mmmu_pro"
|
| 12 |
+
},
|
| 13 |
+
"mmmu_pro_standard": {
|
| 14 |
+
"mmmu_acc,none": 0.2422,
|
| 15 |
+
"mmmu_acc_stderr,none": "N/A",
|
| 16 |
+
"alias": " - mmmu_pro_standard"
|
| 17 |
+
},
|
| 18 |
+
"mmmu_pro_vision": {
|
| 19 |
+
"mmmu_acc,none": 0.13526,
|
| 20 |
+
"mmmu_acc_stderr,none": "N/A",
|
| 21 |
+
"alias": " - mmmu_pro_vision"
|
| 22 |
+
},
|
| 23 |
+
"realworldqa": {
|
| 24 |
+
"exact_match,flexible-extract": 0.4418300653594771,
|
| 25 |
+
"exact_match_stderr,flexible-extract": 0.01796652860560496,
|
| 26 |
+
"alias": "realworldqa"
|
| 27 |
+
}
|
| 28 |
+
},
|
| 29 |
+
"groups": {
|
| 30 |
+
"mmmu_pro": {
|
| 31 |
+
"mmmu_acc,none": 0.18872999999999998,
|
| 32 |
+
"mmmu_acc_stderr,none": 0.026738864276958818,
|
| 33 |
+
"alias": "mmmu_pro"
|
| 34 |
+
}
|
| 35 |
+
},
|
| 36 |
+
"configs": {
|
| 37 |
+
"infovqa_val": {
|
| 38 |
+
"task": "infovqa_val",
|
| 39 |
+
"dataset_path": "lmms-lab/DocVQA",
|
| 40 |
+
"dataset_name": "InfographicVQA",
|
| 41 |
+
"dataset_kwargs": {
|
| 42 |
+
"token": true
|
| 43 |
+
},
|
| 44 |
+
"test_split": "validation",
|
| 45 |
+
"doc_to_visual": "<function infovqa_doc_to_visual at 0x7f3f21688a60>",
|
| 46 |
+
"doc_to_text": "<function infovqa_doc_to_text at 0x7f3f21688d30>",
|
| 47 |
+
"doc_to_target": "answers",
|
| 48 |
+
"description": "",
|
| 49 |
+
"target_delimiter": " ",
|
| 50 |
+
"fewshot_delimiter": "\n\n",
|
| 51 |
+
"metric_list": [
|
| 52 |
+
{
|
| 53 |
+
"metric": "anls",
|
| 54 |
+
"aggregation": "mean",
|
| 55 |
+
"higher_is_better": true
|
| 56 |
+
}
|
| 57 |
+
],
|
| 58 |
+
"output_type": "generate_until",
|
| 59 |
+
"generation_kwargs": {
|
| 60 |
+
"max_new_tokens": 32,
|
| 61 |
+
"temperature": 0.0,
|
| 62 |
+
"do_sample": false,
|
| 63 |
+
"until": [
|
| 64 |
+
"\n\n"
|
| 65 |
+
]
|
| 66 |
+
},
|
| 67 |
+
"repeats": 1,
|
| 68 |
+
"should_decontaminate": false,
|
| 69 |
+
"model_specific_prompt_kwargs": {
|
| 70 |
+
"default": {
|
| 71 |
+
"pre_prompt": "",
|
| 72 |
+
"post_prompt": "\nAnswer the question using a single word or phrase."
|
| 73 |
+
}
|
| 74 |
+
}
|
| 75 |
+
},
|
| 76 |
+
"mmmu_pro_standard": {
|
| 77 |
+
"task": "mmmu_pro_standard",
|
| 78 |
+
"dataset_path": "MMMU/MMMU_Pro",
|
| 79 |
+
"dataset_name": "standard (10 options)",
|
| 80 |
+
"test_split": "test",
|
| 81 |
+
"doc_to_visual": "<function mmmu_pro_doc_to_visual at 0x7f3f4cd48820>",
|
| 82 |
+
"doc_to_text": "<function mmmu_pro_doc_to_text at 0x7f3f4cd525e0>",
|
| 83 |
+
"doc_to_target": "{{answer}}",
|
| 84 |
+
"process_results": "<function mmmu_pro_process_results at 0x7f3f4cd5c550>",
|
| 85 |
+
"description": "",
|
| 86 |
+
"target_delimiter": " ",
|
| 87 |
+
"fewshot_delimiter": "\n\n",
|
| 88 |
+
"metric_list": [
|
| 89 |
+
{
|
| 90 |
+
"metric": "mmmu_acc",
|
| 91 |
+
"aggregation": "<function mmmu_pro_aggregate_results at 0x7f3f4cd664c0>",
|
| 92 |
+
"higher_is_better": true
|
| 93 |
+
}
|
| 94 |
+
],
|
| 95 |
+
"output_type": "generate_until",
|
| 96 |
+
"generation_kwargs": {
|
| 97 |
+
"max_new_tokens": 256,
|
| 98 |
+
"until": [
|
| 99 |
+
"\n\n"
|
| 100 |
+
]
|
| 101 |
+
},
|
| 102 |
+
"repeats": 1,
|
| 103 |
+
"should_decontaminate": false,
|
| 104 |
+
"metadata": {
|
| 105 |
+
"version": 0.0,
|
| 106 |
+
"interleaved_format": false
|
| 107 |
+
},
|
| 108 |
+
"model_specific_prompt_kwargs": {
|
| 109 |
+
"default": {
|
| 110 |
+
"pre_prompt": "",
|
| 111 |
+
"post_prompt": "Answer with the option letter from the given choices directly."
|
| 112 |
+
}
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"mmmu_pro_vision": {
|
| 116 |
+
"task": "mmmu_pro_vision",
|
| 117 |
+
"dataset_path": "MMMU/MMMU_Pro",
|
| 118 |
+
"dataset_name": "vision",
|
| 119 |
+
"test_split": "test",
|
| 120 |
+
"doc_to_visual": "<function mmmu_pro_doc_to_visual at 0x7f3f4cd36d30>",
|
| 121 |
+
"doc_to_text": "Answer with the option letter from the given choices directly.",
|
| 122 |
+
"doc_to_target": "{{answer}}",
|
| 123 |
+
"process_results": "<function mmmu_pro_process_results at 0x7f3f4cd3dc10>",
|
| 124 |
+
"description": "",
|
| 125 |
+
"target_delimiter": " ",
|
| 126 |
+
"fewshot_delimiter": "\n\n",
|
| 127 |
+
"metric_list": [
|
| 128 |
+
{
|
| 129 |
+
"metric": "mmmu_acc",
|
| 130 |
+
"aggregation": "<function mmmu_pro_aggregate_results at 0x7f3f4cd44b80>",
|
| 131 |
+
"higher_is_better": true
|
| 132 |
+
}
|
| 133 |
+
],
|
| 134 |
+
"output_type": "generate_until",
|
| 135 |
+
"generation_kwargs": {
|
| 136 |
+
"max_new_tokens": 256,
|
| 137 |
+
"until": [
|
| 138 |
+
"\n\n"
|
| 139 |
+
]
|
| 140 |
+
},
|
| 141 |
+
"repeats": 1,
|
| 142 |
+
"should_decontaminate": false,
|
| 143 |
+
"metadata": {
|
| 144 |
+
"version": 0.0,
|
| 145 |
+
"interleaved_format": false
|
| 146 |
+
}
|
| 147 |
+
},
|
| 148 |
+
"realworldqa": {
|
| 149 |
+
"task": "realworldqa",
|
| 150 |
+
"dataset_path": "lmms-lab/RealWorldQA",
|
| 151 |
+
"dataset_kwargs": {
|
| 152 |
+
"token": true
|
| 153 |
+
},
|
| 154 |
+
"test_split": "test",
|
| 155 |
+
"doc_to_visual": "<function realworldqa_doc_to_visual at 0x7f3f18164af0>",
|
| 156 |
+
"doc_to_text": "<function realworldqa_doc_to_text at 0x7f3f1811c160>",
|
| 157 |
+
"doc_to_target": "answer",
|
| 158 |
+
"description": "",
|
| 159 |
+
"target_delimiter": " ",
|
| 160 |
+
"fewshot_delimiter": "\n\n",
|
| 161 |
+
"metric_list": [
|
| 162 |
+
{
|
| 163 |
+
"metric": "exact_match",
|
| 164 |
+
"aggregation": "mean",
|
| 165 |
+
"higher_is_better": true,
|
| 166 |
+
"ignore_case": true,
|
| 167 |
+
"ignore_punctuation": true
|
| 168 |
+
}
|
| 169 |
+
],
|
| 170 |
+
"output_type": "generate_until",
|
| 171 |
+
"generation_kwargs": {
|
| 172 |
+
"max_new_tokens": 16,
|
| 173 |
+
"temperature": 0.0,
|
| 174 |
+
"top_p": 1.0,
|
| 175 |
+
"num_beams": 1,
|
| 176 |
+
"do_sample": false,
|
| 177 |
+
"until": [
|
| 178 |
+
"\n\n"
|
| 179 |
+
]
|
| 180 |
+
},
|
| 181 |
+
"repeats": 1,
|
| 182 |
+
"filter_list": [
|
| 183 |
+
{
|
| 184 |
+
"name": "flexible-extract",
|
| 185 |
+
"filter": [
|
| 186 |
+
{
|
| 187 |
+
"function": "<class 'utils.NumberWordsToDigitsFilter'>"
|
| 188 |
+
},
|
| 189 |
+
{
|
| 190 |
+
"function": "<class 'utils.MultiChoiceRegexFilter'>",
|
| 191 |
+
"group_select": 0,
|
| 192 |
+
"ignore_case": true,
|
| 193 |
+
"ignore_punctuation": true,
|
| 194 |
+
"regex_pattern": "(\\([A-Z]\\))"
|
| 195 |
+
}
|
| 196 |
+
]
|
| 197 |
+
}
|
| 198 |
+
],
|
| 199 |
+
"should_decontaminate": false,
|
| 200 |
+
"metadata": [
|
| 201 |
+
{
|
| 202 |
+
"version": 0.0
|
| 203 |
+
}
|
| 204 |
+
],
|
| 205 |
+
"model_specific_prompt_kwargs": {
|
| 206 |
+
"default": {
|
| 207 |
+
"pre_prompt": "",
|
| 208 |
+
"post_prompt": ""
|
| 209 |
+
},
|
| 210 |
+
"gpt4v": {
|
| 211 |
+
"pre_prompt": "",
|
| 212 |
+
"post_prompt": ""
|
| 213 |
+
},
|
| 214 |
+
"xcomposer2_4khd": {
|
| 215 |
+
"pre_prompt": "[UNUSED_TOKEN_146]user\nQuestion: ",
|
| 216 |
+
"post_prompt": "[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is"
|
| 217 |
+
}
|
| 218 |
+
}
|
| 219 |
+
}
|
| 220 |
+
},
|
| 221 |
+
"versions": {
|
| 222 |
+
"infovqa_val": "Yaml",
|
| 223 |
+
"mmmu_pro": "N/A",
|
| 224 |
+
"mmmu_pro_standard": "Yaml",
|
| 225 |
+
"mmmu_pro_vision": "Yaml",
|
| 226 |
+
"realworldqa": "Yaml"
|
| 227 |
+
},
|
| 228 |
+
"n-shot": {
|
| 229 |
+
"infovqa_val": 0,
|
| 230 |
+
"mmmu_pro": 0,
|
| 231 |
+
"mmmu_pro_standard": 0,
|
| 232 |
+
"mmmu_pro_vision": 0,
|
| 233 |
+
"realworldqa": 0
|
| 234 |
+
},
|
| 235 |
+
"model_configs": {
|
| 236 |
+
"model": "llava",
|
| 237 |
+
"model_args": "pretrained=/cm/archive/namnv78/checkpoints/Xphi35-siglip224/SMOE/1M3/revise_Full_remoe,conv_template=phi35",
|
| 238 |
+
"batch_size": "1",
|
| 239 |
+
"device": null,
|
| 240 |
+
"limit": null,
|
| 241 |
+
"bootstrap_iters": 100000,
|
| 242 |
+
"gen_kwargs": ""
|
| 243 |
+
},
|
| 244 |
+
"git_hash": "289c7fe5"
|
| 245 |
+
}
|
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/hallusion_bench_image.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/mathvista_testmini.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 0 eval done
|
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 1 eval done
|
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/results.json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"hallusion_bench_image": {
|
| 4 |
+
"aAcc,none": 42.061,
|
| 5 |
+
"aAcc_stderr,none": "N/A",
|
| 6 |
+
"fAcc,none": 16.474,
|
| 7 |
+
"fAcc_stderr,none": "N/A",
|
| 8 |
+
"qAcc,none": 12.7473,
|
| 9 |
+
"qAcc_stderr,none": "N/A",
|
| 10 |
+
"alias": "hallusion_bench_image"
|
| 11 |
+
},
|
| 12 |
+
"mathvista_testmini": {
|
| 13 |
+
"gpt_eval_score,none": 31.9,
|
| 14 |
+
"gpt_eval_score_stderr,none": "N/A",
|
| 15 |
+
"alias": "mathvista_testmini"
|
| 16 |
+
}
|
| 17 |
+
},
|
| 18 |
+
"configs": {
|
| 19 |
+
"hallusion_bench_image": {
|
| 20 |
+
"task": "hallusion_bench_image",
|
| 21 |
+
"dataset_path": "lmms-lab/HallusionBench",
|
| 22 |
+
"dataset_kwargs": {
|
| 23 |
+
"token": true
|
| 24 |
+
},
|
| 25 |
+
"test_split": "image",
|
| 26 |
+
"doc_to_visual": "<function hb_doc_to_visual at 0x7f1b6331a1f0>",
|
| 27 |
+
"doc_to_text": "<function hb_doc_to_text at 0x7f1b6331a940>",
|
| 28 |
+
"doc_to_target": "gt_answer_details",
|
| 29 |
+
"process_results": "<function hb_process_results at 0x7f1b63331160>",
|
| 30 |
+
"description": "",
|
| 31 |
+
"target_delimiter": " ",
|
| 32 |
+
"fewshot_delimiter": "\n\n",
|
| 33 |
+
"metric_list": [
|
| 34 |
+
{
|
| 35 |
+
"metric": "aAcc",
|
| 36 |
+
"aggregation": "<function hb_aggregation_result_aAcc at 0x7f1b63331af0>",
|
| 37 |
+
"higher_is_better": true
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"metric": "qAcc",
|
| 41 |
+
"aggregation": "<function hb_aggregation_result_qAcc at 0x7f1b6325e160>",
|
| 42 |
+
"higher_is_better": true
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"metric": "fAcc",
|
| 46 |
+
"aggregation": "<function hb_aggregation_result_fAcc at 0x7f1b6325e940>",
|
| 47 |
+
"higher_is_better": true
|
| 48 |
+
}
|
| 49 |
+
],
|
| 50 |
+
"output_type": "generate_until",
|
| 51 |
+
"generation_kwargs": {
|
| 52 |
+
"max_new_tokens": 128,
|
| 53 |
+
"temperature": 0.0,
|
| 54 |
+
"top_p": 1.0,
|
| 55 |
+
"num_beams": 1,
|
| 56 |
+
"do_sample": false,
|
| 57 |
+
"until": [
|
| 58 |
+
"\n\n"
|
| 59 |
+
]
|
| 60 |
+
},
|
| 61 |
+
"repeats": 1,
|
| 62 |
+
"should_decontaminate": false,
|
| 63 |
+
"metadata": [
|
| 64 |
+
{
|
| 65 |
+
"version": 0.0
|
| 66 |
+
}
|
| 67 |
+
],
|
| 68 |
+
"model_specific_prompt_kwargs": {
|
| 69 |
+
"default": {
|
| 70 |
+
"pre_prompt": "",
|
| 71 |
+
"post_prompt": ""
|
| 72 |
+
}
|
| 73 |
+
}
|
| 74 |
+
},
|
| 75 |
+
"mathvista_testmini": {
|
| 76 |
+
"task": "mathvista_testmini",
|
| 77 |
+
"dataset_path": "AI4Math/MathVista",
|
| 78 |
+
"dataset_kwargs": {
|
| 79 |
+
"token": true
|
| 80 |
+
},
|
| 81 |
+
"test_split": "testmini",
|
| 82 |
+
"doc_to_visual": "<function mathvista_doc_to_visual at 0x7f1b388a5c10>",
|
| 83 |
+
"doc_to_text": "<function mathvista_doc_to_text at 0x7f1b3849f310>",
|
| 84 |
+
"doc_to_target": "answer",
|
| 85 |
+
"process_results": "<function mathvista_process_results at 0x7f1b384a99d0>",
|
| 86 |
+
"description": "",
|
| 87 |
+
"target_delimiter": " ",
|
| 88 |
+
"fewshot_delimiter": "\n\n",
|
| 89 |
+
"metric_list": [
|
| 90 |
+
{
|
| 91 |
+
"metric": "gpt_eval_score",
|
| 92 |
+
"aggregation": "<function mathvista_aggregate_results at 0x7f1b384b50d0>",
|
| 93 |
+
"higher_is_better": true
|
| 94 |
+
}
|
| 95 |
+
],
|
| 96 |
+
"output_type": "generate_until",
|
| 97 |
+
"generation_kwargs": {
|
| 98 |
+
"until": [
|
| 99 |
+
"ASSISTANT:"
|
| 100 |
+
],
|
| 101 |
+
"max_new_tokens": 1024,
|
| 102 |
+
"temperature": 0.0,
|
| 103 |
+
"top_p": 1.0,
|
| 104 |
+
"num_beams": 1,
|
| 105 |
+
"do_sample": false,
|
| 106 |
+
"image_aspect_ratio": "original"
|
| 107 |
+
},
|
| 108 |
+
"repeats": 1,
|
| 109 |
+
"should_decontaminate": false,
|
| 110 |
+
"model_specific_prompt_kwargs": {
|
| 111 |
+
"default": {
|
| 112 |
+
"shot_type": "format-prompt",
|
| 113 |
+
"shot": 0,
|
| 114 |
+
"use_caption": false,
|
| 115 |
+
"use_ocr": false
|
| 116 |
+
},
|
| 117 |
+
"phi3v": {
|
| 118 |
+
"shot_type": "solution"
|
| 119 |
+
}
|
| 120 |
+
},
|
| 121 |
+
"model_specific_generation_kwargs": {
|
| 122 |
+
"llava": {
|
| 123 |
+
"image_aspect_ratio": "original"
|
| 124 |
+
}
|
| 125 |
+
}
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"versions": {
|
| 129 |
+
"hallusion_bench_image": "Yaml",
|
| 130 |
+
"mathvista_testmini": "Yaml"
|
| 131 |
+
},
|
| 132 |
+
"n-shot": {
|
| 133 |
+
"hallusion_bench_image": 0,
|
| 134 |
+
"mathvista_testmini": 0
|
| 135 |
+
},
|
| 136 |
+
"model_configs": {
|
| 137 |
+
"model": "llava",
|
| 138 |
+
"model_args": "pretrained=/cm/archive/namnv78/checkpoints/Xphi35-siglip224/SMOE/1M3/revise_Full_remoe,conv_template=phi35",
|
| 139 |
+
"batch_size": "1",
|
| 140 |
+
"device": null,
|
| 141 |
+
"limit": null,
|
| 142 |
+
"bootstrap_iters": 100000,
|
| 143 |
+
"gen_kwargs": ""
|
| 144 |
+
},
|
| 145 |
+
"git_hash": "289c7fe5"
|
| 146 |
+
}
|
sft/1M3/revise_Full_remoe/logs/0626_0722_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/hallusion_bench_image.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/mathvista_testmini.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 0 eval done
|
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 1 eval done
|
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/results.json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"hallusion_bench_image": {
|
| 4 |
+
"aAcc,none": 42.3764,
|
| 5 |
+
"aAcc_stderr,none": "N/A",
|
| 6 |
+
"fAcc,none": 14.7399,
|
| 7 |
+
"fAcc_stderr,none": "N/A",
|
| 8 |
+
"qAcc,none": 12.967,
|
| 9 |
+
"qAcc_stderr,none": "N/A",
|
| 10 |
+
"alias": "hallusion_bench_image"
|
| 11 |
+
},
|
| 12 |
+
"mathvista_testmini": {
|
| 13 |
+
"gpt_eval_score,none": 31.9,
|
| 14 |
+
"gpt_eval_score_stderr,none": "N/A",
|
| 15 |
+
"alias": "mathvista_testmini"
|
| 16 |
+
}
|
| 17 |
+
},
|
| 18 |
+
"configs": {
|
| 19 |
+
"hallusion_bench_image": {
|
| 20 |
+
"task": "hallusion_bench_image",
|
| 21 |
+
"dataset_path": "lmms-lab/HallusionBench",
|
| 22 |
+
"dataset_kwargs": {
|
| 23 |
+
"token": true
|
| 24 |
+
},
|
| 25 |
+
"test_split": "image",
|
| 26 |
+
"doc_to_visual": "<function hb_doc_to_visual at 0x7ff3306fa0d0>",
|
| 27 |
+
"doc_to_text": "<function hb_doc_to_text at 0x7ff3306fa820>",
|
| 28 |
+
"doc_to_target": "gt_answer_details",
|
| 29 |
+
"process_results": "<function hb_process_results at 0x7ff330733040>",
|
| 30 |
+
"description": "",
|
| 31 |
+
"target_delimiter": " ",
|
| 32 |
+
"fewshot_delimiter": "\n\n",
|
| 33 |
+
"metric_list": [
|
| 34 |
+
{
|
| 35 |
+
"metric": "aAcc",
|
| 36 |
+
"aggregation": "<function hb_aggregation_result_aAcc at 0x7ff3307339d0>",
|
| 37 |
+
"higher_is_better": true
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"metric": "qAcc",
|
| 41 |
+
"aggregation": "<function hb_aggregation_result_qAcc at 0x7ff3309ac040>",
|
| 42 |
+
"higher_is_better": true
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"metric": "fAcc",
|
| 46 |
+
"aggregation": "<function hb_aggregation_result_fAcc at 0x7ff3309ac820>",
|
| 47 |
+
"higher_is_better": true
|
| 48 |
+
}
|
| 49 |
+
],
|
| 50 |
+
"output_type": "generate_until",
|
| 51 |
+
"generation_kwargs": {
|
| 52 |
+
"max_new_tokens": 128,
|
| 53 |
+
"temperature": 0.0,
|
| 54 |
+
"top_p": 1.0,
|
| 55 |
+
"num_beams": 1,
|
| 56 |
+
"do_sample": false,
|
| 57 |
+
"until": [
|
| 58 |
+
"\n\n"
|
| 59 |
+
]
|
| 60 |
+
},
|
| 61 |
+
"repeats": 1,
|
| 62 |
+
"should_decontaminate": false,
|
| 63 |
+
"metadata": [
|
| 64 |
+
{
|
| 65 |
+
"version": 0.0
|
| 66 |
+
}
|
| 67 |
+
],
|
| 68 |
+
"model_specific_prompt_kwargs": {
|
| 69 |
+
"default": {
|
| 70 |
+
"pre_prompt": "",
|
| 71 |
+
"post_prompt": ""
|
| 72 |
+
}
|
| 73 |
+
}
|
| 74 |
+
},
|
| 75 |
+
"mathvista_testmini": {
|
| 76 |
+
"task": "mathvista_testmini",
|
| 77 |
+
"dataset_path": "AI4Math/MathVista",
|
| 78 |
+
"dataset_kwargs": {
|
| 79 |
+
"token": true
|
| 80 |
+
},
|
| 81 |
+
"test_split": "testmini",
|
| 82 |
+
"doc_to_visual": "<function mathvista_doc_to_visual at 0x7ff3068a5a60>",
|
| 83 |
+
"doc_to_text": "<function mathvista_doc_to_text at 0x7ff306149160>",
|
| 84 |
+
"doc_to_target": "answer",
|
| 85 |
+
"process_results": "<function mathvista_process_results at 0x7ff306152820>",
|
| 86 |
+
"description": "",
|
| 87 |
+
"target_delimiter": " ",
|
| 88 |
+
"fewshot_delimiter": "\n\n",
|
| 89 |
+
"metric_list": [
|
| 90 |
+
{
|
| 91 |
+
"metric": "gpt_eval_score",
|
| 92 |
+
"aggregation": "<function mathvista_aggregate_results at 0x7ff30615bee0>",
|
| 93 |
+
"higher_is_better": true
|
| 94 |
+
}
|
| 95 |
+
],
|
| 96 |
+
"output_type": "generate_until",
|
| 97 |
+
"generation_kwargs": {
|
| 98 |
+
"until": [
|
| 99 |
+
"ASSISTANT:"
|
| 100 |
+
],
|
| 101 |
+
"max_new_tokens": 1024,
|
| 102 |
+
"temperature": 0.0,
|
| 103 |
+
"top_p": 1.0,
|
| 104 |
+
"num_beams": 1,
|
| 105 |
+
"do_sample": false,
|
| 106 |
+
"image_aspect_ratio": "original"
|
| 107 |
+
},
|
| 108 |
+
"repeats": 1,
|
| 109 |
+
"should_decontaminate": false,
|
| 110 |
+
"model_specific_prompt_kwargs": {
|
| 111 |
+
"default": {
|
| 112 |
+
"shot_type": "format-prompt",
|
| 113 |
+
"shot": 0,
|
| 114 |
+
"use_caption": false,
|
| 115 |
+
"use_ocr": false
|
| 116 |
+
},
|
| 117 |
+
"phi3v": {
|
| 118 |
+
"shot_type": "solution"
|
| 119 |
+
}
|
| 120 |
+
},
|
| 121 |
+
"model_specific_generation_kwargs": {
|
| 122 |
+
"llava": {
|
| 123 |
+
"image_aspect_ratio": "original"
|
| 124 |
+
}
|
| 125 |
+
}
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"versions": {
|
| 129 |
+
"hallusion_bench_image": "Yaml",
|
| 130 |
+
"mathvista_testmini": "Yaml"
|
| 131 |
+
},
|
| 132 |
+
"n-shot": {
|
| 133 |
+
"hallusion_bench_image": 0,
|
| 134 |
+
"mathvista_testmini": 0
|
| 135 |
+
},
|
| 136 |
+
"model_configs": {
|
| 137 |
+
"model": "llava",
|
| 138 |
+
"model_args": "pretrained=/cm/archive/namnv78/checkpoints/Xphi35-siglip224/SMOE/1M3/revise_Full_remoe,conv_template=phi35",
|
| 139 |
+
"batch_size": "1",
|
| 140 |
+
"device": null,
|
| 141 |
+
"limit": null,
|
| 142 |
+
"bootstrap_iters": 100000,
|
| 143 |
+
"gen_kwargs": ""
|
| 144 |
+
},
|
| 145 |
+
"git_hash": "289c7fe5"
|
| 146 |
+
}
|
sft/1M3/revise_Full_remoe/logs/0626_0748_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/hallusion_bench_image.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/mathvista_testmini.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 0 eval done
|
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 1 eval done
|
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/results.json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"hallusion_bench_image": {
|
| 4 |
+
"aAcc,none": 41.9558,
|
| 5 |
+
"aAcc_stderr,none": "N/A",
|
| 6 |
+
"fAcc,none": 15.0289,
|
| 7 |
+
"fAcc_stderr,none": "N/A",
|
| 8 |
+
"qAcc,none": 12.967,
|
| 9 |
+
"qAcc_stderr,none": "N/A",
|
| 10 |
+
"alias": "hallusion_bench_image"
|
| 11 |
+
},
|
| 12 |
+
"mathvista_testmini": {
|
| 13 |
+
"gpt_eval_score,none": 31.9,
|
| 14 |
+
"gpt_eval_score_stderr,none": "N/A",
|
| 15 |
+
"alias": "mathvista_testmini"
|
| 16 |
+
}
|
| 17 |
+
},
|
| 18 |
+
"configs": {
|
| 19 |
+
"hallusion_bench_image": {
|
| 20 |
+
"task": "hallusion_bench_image",
|
| 21 |
+
"dataset_path": "lmms-lab/HallusionBench",
|
| 22 |
+
"dataset_kwargs": {
|
| 23 |
+
"token": true
|
| 24 |
+
},
|
| 25 |
+
"test_split": "image",
|
| 26 |
+
"doc_to_visual": "<function hb_doc_to_visual at 0x7f11359811f0>",
|
| 27 |
+
"doc_to_text": "<function hb_doc_to_text at 0x7f1135981940>",
|
| 28 |
+
"doc_to_target": "gt_answer_details",
|
| 29 |
+
"process_results": "<function hb_process_results at 0x7f11358e7160>",
|
| 30 |
+
"description": "",
|
| 31 |
+
"target_delimiter": " ",
|
| 32 |
+
"fewshot_delimiter": "\n\n",
|
| 33 |
+
"metric_list": [
|
| 34 |
+
{
|
| 35 |
+
"metric": "aAcc",
|
| 36 |
+
"aggregation": "<function hb_aggregation_result_aAcc at 0x7f11358e7af0>",
|
| 37 |
+
"higher_is_better": true
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"metric": "qAcc",
|
| 41 |
+
"aggregation": "<function hb_aggregation_result_qAcc at 0x7f1135851160>",
|
| 42 |
+
"higher_is_better": true
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"metric": "fAcc",
|
| 46 |
+
"aggregation": "<function hb_aggregation_result_fAcc at 0x7f1135851940>",
|
| 47 |
+
"higher_is_better": true
|
| 48 |
+
}
|
| 49 |
+
],
|
| 50 |
+
"output_type": "generate_until",
|
| 51 |
+
"generation_kwargs": {
|
| 52 |
+
"max_new_tokens": 128,
|
| 53 |
+
"temperature": 0.0,
|
| 54 |
+
"top_p": 1.0,
|
| 55 |
+
"num_beams": 1,
|
| 56 |
+
"do_sample": false,
|
| 57 |
+
"until": [
|
| 58 |
+
"\n\n"
|
| 59 |
+
]
|
| 60 |
+
},
|
| 61 |
+
"repeats": 1,
|
| 62 |
+
"should_decontaminate": false,
|
| 63 |
+
"metadata": [
|
| 64 |
+
{
|
| 65 |
+
"version": 0.0
|
| 66 |
+
}
|
| 67 |
+
],
|
| 68 |
+
"model_specific_prompt_kwargs": {
|
| 69 |
+
"default": {
|
| 70 |
+
"pre_prompt": "",
|
| 71 |
+
"post_prompt": ""
|
| 72 |
+
}
|
| 73 |
+
}
|
| 74 |
+
},
|
| 75 |
+
"mathvista_testmini": {
|
| 76 |
+
"task": "mathvista_testmini",
|
| 77 |
+
"dataset_path": "AI4Math/MathVista",
|
| 78 |
+
"dataset_kwargs": {
|
| 79 |
+
"token": true
|
| 80 |
+
},
|
| 81 |
+
"test_split": "testmini",
|
| 82 |
+
"doc_to_visual": "<function mathvista_doc_to_visual at 0x7f110aa65c10>",
|
| 83 |
+
"doc_to_text": "<function mathvista_doc_to_text at 0x7f110a65f310>",
|
| 84 |
+
"doc_to_target": "answer",
|
| 85 |
+
"process_results": "<function mathvista_process_results at 0x7f110a6679d0>",
|
| 86 |
+
"description": "",
|
| 87 |
+
"target_delimiter": " ",
|
| 88 |
+
"fewshot_delimiter": "\n\n",
|
| 89 |
+
"metric_list": [
|
| 90 |
+
{
|
| 91 |
+
"metric": "gpt_eval_score",
|
| 92 |
+
"aggregation": "<function mathvista_aggregate_results at 0x7f110a6730d0>",
|
| 93 |
+
"higher_is_better": true
|
| 94 |
+
}
|
| 95 |
+
],
|
| 96 |
+
"output_type": "generate_until",
|
| 97 |
+
"generation_kwargs": {
|
| 98 |
+
"until": [
|
| 99 |
+
"ASSISTANT:"
|
| 100 |
+
],
|
| 101 |
+
"max_new_tokens": 1024,
|
| 102 |
+
"temperature": 0.0,
|
| 103 |
+
"top_p": 1.0,
|
| 104 |
+
"num_beams": 1,
|
| 105 |
+
"do_sample": false,
|
| 106 |
+
"image_aspect_ratio": "original"
|
| 107 |
+
},
|
| 108 |
+
"repeats": 1,
|
| 109 |
+
"should_decontaminate": false,
|
| 110 |
+
"model_specific_prompt_kwargs": {
|
| 111 |
+
"default": {
|
| 112 |
+
"shot_type": "format-prompt",
|
| 113 |
+
"shot": 0,
|
| 114 |
+
"use_caption": false,
|
| 115 |
+
"use_ocr": false
|
| 116 |
+
},
|
| 117 |
+
"phi3v": {
|
| 118 |
+
"shot_type": "solution"
|
| 119 |
+
}
|
| 120 |
+
},
|
| 121 |
+
"model_specific_generation_kwargs": {
|
| 122 |
+
"llava": {
|
| 123 |
+
"image_aspect_ratio": "original"
|
| 124 |
+
}
|
| 125 |
+
}
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"versions": {
|
| 129 |
+
"hallusion_bench_image": "Yaml",
|
| 130 |
+
"mathvista_testmini": "Yaml"
|
| 131 |
+
},
|
| 132 |
+
"n-shot": {
|
| 133 |
+
"hallusion_bench_image": 0,
|
| 134 |
+
"mathvista_testmini": 0
|
| 135 |
+
},
|
| 136 |
+
"model_configs": {
|
| 137 |
+
"model": "llava",
|
| 138 |
+
"model_args": "pretrained=/cm/archive/namnv78/checkpoints/Xphi35-siglip224/SMOE/1M3/revise_Full_remoe,conv_template=phi35",
|
| 139 |
+
"batch_size": "1",
|
| 140 |
+
"device": null,
|
| 141 |
+
"limit": null,
|
| 142 |
+
"bootstrap_iters": 100000,
|
| 143 |
+
"gen_kwargs": ""
|
| 144 |
+
},
|
| 145 |
+
"git_hash": "289c7fe5"
|
| 146 |
+
}
|
sft/1M3/revise_Full_remoe/logs/0626_0910_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"category": "VD",
|
| 4 |
+
"subcategory": "illusion",
|
| 5 |
+
"visual_input": "1",
|
| 6 |
+
"set_id": "0",
|
| 7 |
+
"figure_id": "0",
|
| 8 |
+
"sample_note": "circle",
|
| 9 |
+
"question_id": "0",
|
| 10 |
+
"question": "Is the right orange circle the same size as the left orange circle?",
|
| 11 |
+
"gt_answer_details": "The right orange circle is the same size as the left orange circle.",
|
| 12 |
+
"gt_answer": "1",
|
| 13 |
+
"filename": "./VD/illusion/0_0.png",
|
| 14 |
+
"model_prediction": "no",
|
| 15 |
+
"gpt4v_output_gpt_check": "0",
|
| 16 |
+
"gpt_answer": "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. If the prediction answer does not conflict with the reference answer, please generate \u201ccorrect\u201d. If the prediction answer conflict with the reference answer, please generate \u201cincorrect\u201d. If the prediction answer is unclear about the answer, please generate \"unclear\". \n\n Question:Is the right orange circle the same size as the left orange circle?\nReference answer: The right orange circle is the same size as the left orange circle.\nPrediction answer:no\nOutput:incorrect"
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"category": "VD",
|
| 20 |
+
"subcategory": "illusion",
|
| 21 |
+
"visual_input": "2",
|
| 22 |
+
"set_id": "0",
|
| 23 |
+
"figure_id": "1",
|
| 24 |
+
"sample_note": "circle",
|
| 25 |
+
"question_id": "1",
|
| 26 |
+
"question": "Is the right orange circle larger than the left orange circle?",
|
| 27 |
+
"gt_answer_details": "The right orange circle is larger than the left orange circle.",
|
| 28 |
+
"gt_answer": "1",
|
| 29 |
+
"filename": "./VD/illusion/0_1.png",
|
| 30 |
+
"model_prediction": "Yes, the right orange circle is larger than the left orange circle.",
|
| 31 |
+
"gpt4v_output_gpt_check": "1",
|
| 32 |
+
"gpt_answer": "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. If the prediction answer does not conflict with the reference answer, please generate \u201ccorrect\u201d. If the prediction answer conflict with the reference answer, please generate \u201cincorrect\u201d. If the prediction answer is unclear about the answer, please generate \"unclear\". \n\n Question:Is the right orange circle larger than the left orange circle?\nReference answer: The right orange circle is larger than the left orange circle.\nPrediction answer:Yes, the right orange circle is larger than the left orange circle.\nOutput:correct"
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
"category": "VD",
|
| 36 |
+
"subcategory": "illusion",
|
| 37 |
+
"visual_input": "1",
|
| 38 |
+
"set_id": "1",
|
| 39 |
+
"figure_id": "0",
|
| 40 |
+
"sample_note": "box",
|
| 41 |
+
"question_id": "2",
|
| 42 |
+
"question": "Is the right grey box the same size as the left grey box?",
|
| 43 |
+
"gt_answer_details": "The right grey box is the same size as the left grey box.",
|
| 44 |
+
"gt_answer": "1",
|
| 45 |
+
"filename": "./VD/illusion/1_0.png",
|
| 46 |
+
"model_prediction": "Yes, the right grey box is the same size as the left grey box.",
|
| 47 |
+
"gpt4v_output_gpt_check": "1",
|
| 48 |
+
"gpt_answer": "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. If the prediction answer does not conflict with the reference answer, please generate \u201ccorrect\u201d. If the prediction answer conflict with the reference answer, please generate \u201cincorrect\u201d. If the prediction answer is unclear about the answer, please generate \"unclear\". \n\n Question:Is the right grey box the same size as the left grey box?\nReference answer: The right grey box is the same size as the left grey box.\nPrediction answer:Yes, the right grey box is the same size as the left grey box.\nOutput:correct"
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"category": "VD",
|
| 52 |
+
"subcategory": "illusion",
|
| 53 |
+
"visual_input": "1",
|
| 54 |
+
"set_id": "2",
|
| 55 |
+
"figure_id": "0",
|
| 56 |
+
"sample_note": "line",
|
| 57 |
+
"question_id": "0",
|
| 58 |
+
"question": "Is the top line longer than the bottom line?",
|
| 59 |
+
"gt_answer_details": "The top line is the same size as the bottom line.",
|
| 60 |
+
"gt_answer": "0",
|
| 61 |
+
"filename": "./VD/illusion/2_0.png",
|
| 62 |
+
"model_prediction": "Yes, the top line is longer than the bottom line.",
|
| 63 |
+
"gpt4v_output_gpt_check": "0",
|
| 64 |
+
"gpt_answer": "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. If the prediction answer does not conflict with the reference answer, please generate \u201ccorrect\u201d. If the prediction answer conflict with the reference answer, please generate \u201cincorrect\u201d. If the prediction answer is unclear about the answer, please generate \"unclear\". \n\n Question:Is the top line longer than the bottom line?\nReference answer: The top line is the same size as the bottom line.\nPrediction answer:Yes, the top line is longer than the bottom line.\nOutput:incorrect"
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"category": "VD",
|
| 68 |
+
"subcategory": "illusion",
|
| 69 |
+
"visual_input": "2",
|
| 70 |
+
"set_id": "2",
|
| 71 |
+
"figure_id": "1",
|
| 72 |
+
"sample_note": "line",
|
| 73 |
+
"question_id": "1",
|
| 74 |
+
"question": "Is the top line shorter than the bottom line?",
|
| 75 |
+
"gt_answer_details": "The top line is longer than the bottom line.",
|
| 76 |
+
"gt_answer": "0",
|
| 77 |
+
"filename": "./VD/illusion/2_1.png",
|
| 78 |
+
"model_prediction": "Yes, the top line is shorter than the bottom line.",
|
| 79 |
+
"gpt4v_output_gpt_check": "0",
|
| 80 |
+
"gpt_answer": "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. If the prediction answer does not conflict with the reference answer, please generate \u201ccorrect\u201d. If the prediction answer conflict with the reference answer, please generate \u201cincorrect\u201d. If the prediction answer is unclear about the answer, please generate \"unclear\". \n\n Question:Is the top line shorter than the bottom line?\nReference answer: The top line is longer than the bottom line.\nPrediction answer:Yes, the top line is shorter than the bottom line.\nOutput:incorrect"
|
| 81 |
+
}
|
| 82 |
+
]
|
sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 1 eval done
|
sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/rank2_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 2 eval done
|
sft/1M3/revise_Full_remoe/logs/0626_1006_llava...image_llava_model_args_179bff/rank3_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 3 eval done
|
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vd_model.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/gpt_response/hallusion_output_vs_model.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/hallusion_bench_image.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/mathvista_testmini.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/rank0_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 0 eval done
|
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/rank1_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 1 eval done
|
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/results.json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"hallusion_bench_image": {
|
| 4 |
+
"aAcc,none": 42.3764,
|
| 5 |
+
"aAcc_stderr,none": "N/A",
|
| 6 |
+
"fAcc,none": 14.7399,
|
| 7 |
+
"fAcc_stderr,none": "N/A",
|
| 8 |
+
"qAcc,none": 12.967,
|
| 9 |
+
"qAcc_stderr,none": "N/A",
|
| 10 |
+
"alias": "hallusion_bench_image"
|
| 11 |
+
},
|
| 12 |
+
"mathvista_testmini": {
|
| 13 |
+
"gpt_eval_score,none": 32.2,
|
| 14 |
+
"gpt_eval_score_stderr,none": "N/A",
|
| 15 |
+
"alias": "mathvista_testmini"
|
| 16 |
+
}
|
| 17 |
+
},
|
| 18 |
+
"configs": {
|
| 19 |
+
"hallusion_bench_image": {
|
| 20 |
+
"task": "hallusion_bench_image",
|
| 21 |
+
"dataset_path": "lmms-lab/HallusionBench",
|
| 22 |
+
"dataset_kwargs": {
|
| 23 |
+
"token": true
|
| 24 |
+
},
|
| 25 |
+
"test_split": "image",
|
| 26 |
+
"doc_to_visual": "<function hb_doc_to_visual at 0x7f5490024160>",
|
| 27 |
+
"doc_to_text": "<function hb_doc_to_text at 0x7f54900248b0>",
|
| 28 |
+
"doc_to_target": "gt_answer_details",
|
| 29 |
+
"process_results": "<function hb_process_results at 0x7f548ff1a0d0>",
|
| 30 |
+
"description": "",
|
| 31 |
+
"target_delimiter": " ",
|
| 32 |
+
"fewshot_delimiter": "\n\n",
|
| 33 |
+
"metric_list": [
|
| 34 |
+
{
|
| 35 |
+
"metric": "aAcc",
|
| 36 |
+
"aggregation": "<function hb_aggregation_result_aAcc at 0x7f548ff1aa60>",
|
| 37 |
+
"higher_is_better": true
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"metric": "qAcc",
|
| 41 |
+
"aggregation": "<function hb_aggregation_result_qAcc at 0x7f548fef00d0>",
|
| 42 |
+
"higher_is_better": true
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"metric": "fAcc",
|
| 46 |
+
"aggregation": "<function hb_aggregation_result_fAcc at 0x7f548fef08b0>",
|
| 47 |
+
"higher_is_better": true
|
| 48 |
+
}
|
| 49 |
+
],
|
| 50 |
+
"output_type": "generate_until",
|
| 51 |
+
"generation_kwargs": {
|
| 52 |
+
"max_new_tokens": 128,
|
| 53 |
+
"temperature": 0.0,
|
| 54 |
+
"top_p": 1.0,
|
| 55 |
+
"num_beams": 1,
|
| 56 |
+
"do_sample": false,
|
| 57 |
+
"until": [
|
| 58 |
+
"\n\n"
|
| 59 |
+
]
|
| 60 |
+
},
|
| 61 |
+
"repeats": 1,
|
| 62 |
+
"should_decontaminate": false,
|
| 63 |
+
"metadata": [
|
| 64 |
+
{
|
| 65 |
+
"version": 0.0
|
| 66 |
+
}
|
| 67 |
+
],
|
| 68 |
+
"model_specific_prompt_kwargs": {
|
| 69 |
+
"default": {
|
| 70 |
+
"pre_prompt": "",
|
| 71 |
+
"post_prompt": ""
|
| 72 |
+
}
|
| 73 |
+
}
|
| 74 |
+
},
|
| 75 |
+
"mathvista_testmini": {
|
| 76 |
+
"task": "mathvista_testmini",
|
| 77 |
+
"dataset_path": "AI4Math/MathVista",
|
| 78 |
+
"dataset_kwargs": {
|
| 79 |
+
"token": true
|
| 80 |
+
},
|
| 81 |
+
"test_split": "testmini",
|
| 82 |
+
"doc_to_visual": "<function mathvista_doc_to_visual at 0x7f54654a7b80>",
|
| 83 |
+
"doc_to_text": "<function mathvista_doc_to_text at 0x7f54650a0280>",
|
| 84 |
+
"doc_to_target": "answer",
|
| 85 |
+
"process_results": "<function mathvista_process_results at 0x7f54650a8940>",
|
| 86 |
+
"description": "",
|
| 87 |
+
"target_delimiter": " ",
|
| 88 |
+
"fewshot_delimiter": "\n\n",
|
| 89 |
+
"metric_list": [
|
| 90 |
+
{
|
| 91 |
+
"metric": "gpt_eval_score",
|
| 92 |
+
"aggregation": "<function mathvista_aggregate_results at 0x7f54650b5040>",
|
| 93 |
+
"higher_is_better": true
|
| 94 |
+
}
|
| 95 |
+
],
|
| 96 |
+
"output_type": "generate_until",
|
| 97 |
+
"generation_kwargs": {
|
| 98 |
+
"until": [
|
| 99 |
+
"ASSISTANT:"
|
| 100 |
+
],
|
| 101 |
+
"max_new_tokens": 1024,
|
| 102 |
+
"temperature": 0.0,
|
| 103 |
+
"top_p": 1.0,
|
| 104 |
+
"num_beams": 1,
|
| 105 |
+
"do_sample": false,
|
| 106 |
+
"image_aspect_ratio": "original"
|
| 107 |
+
},
|
| 108 |
+
"repeats": 1,
|
| 109 |
+
"should_decontaminate": false,
|
| 110 |
+
"model_specific_prompt_kwargs": {
|
| 111 |
+
"default": {
|
| 112 |
+
"shot_type": "format-prompt",
|
| 113 |
+
"shot": 0,
|
| 114 |
+
"use_caption": false,
|
| 115 |
+
"use_ocr": false
|
| 116 |
+
},
|
| 117 |
+
"phi3v": {
|
| 118 |
+
"shot_type": "solution"
|
| 119 |
+
}
|
| 120 |
+
},
|
| 121 |
+
"model_specific_generation_kwargs": {
|
| 122 |
+
"llava": {
|
| 123 |
+
"image_aspect_ratio": "original"
|
| 124 |
+
}
|
| 125 |
+
}
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"versions": {
|
| 129 |
+
"hallusion_bench_image": "Yaml",
|
| 130 |
+
"mathvista_testmini": "Yaml"
|
| 131 |
+
},
|
| 132 |
+
"n-shot": {
|
| 133 |
+
"hallusion_bench_image": 0,
|
| 134 |
+
"mathvista_testmini": 0
|
| 135 |
+
},
|
| 136 |
+
"model_configs": {
|
| 137 |
+
"model": "llava",
|
| 138 |
+
"model_args": "pretrained=/cm/archive/namnv78/checkpoints/Xphi35-siglip224/SMOE/1M3/revise_Full_remoe,conv_template=phi35",
|
| 139 |
+
"batch_size": "1",
|
| 140 |
+
"device": null,
|
| 141 |
+
"limit": null,
|
| 142 |
+
"bootstrap_iters": 100000,
|
| 143 |
+
"gen_kwargs": ""
|
| 144 |
+
},
|
| 145 |
+
"git_hash": "289c7fe5"
|
| 146 |
+
}
|
sft/1M3/revise_Full_remoe/logs/0626_1014_llava...image_llava_model_args_179bff/submissions/mathvista_testmini_scores.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/chartqa.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/rank0_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 0 eval done
|
sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/rank1_metric_eval_done.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rank 1 eval done
|
sft/1M3/revise_Full_remoe/logs/0626_1036_llava..._plus_llava_model_args_179bff/results.json
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"chartqa": {
|
| 4 |
+
"relaxed_overall,none": 0.1252,
|
| 5 |
+
"relaxed_overall_stderr,none": 0.0066202352681760356,
|
| 6 |
+
"relaxed_human_split,none": 0.1344,
|
| 7 |
+
"relaxed_human_split_stderr,none": 0.009651104965839433,
|
| 8 |
+
"relaxed_augmented_split,none": 0.116,
|
| 9 |
+
"relaxed_augmented_split_stderr,none": 0.009060953631079097,
|
| 10 |
+
"alias": "chartqa"
|
| 11 |
+
},
|
| 12 |
+
"seedbench_2_plus": {
|
| 13 |
+
"seedbench_2_plus_Chart,none": 0.4617283950617284,
|
| 14 |
+
"seedbench_2_plus_Chart_stderr,none": "N/A",
|
| 15 |
+
"seedbench_2_plus_all,none": 0.4782608695652174,
|
| 16 |
+
"seedbench_2_plus_all_stderr,none": "N/A",
|
| 17 |
+
"seedbench_2_plus_Web,none": 0.5287878787878788,
|
| 18 |
+
"seedbench_2_plus_Web_stderr,none": "N/A",
|
| 19 |
+
"seedbench_2_plus_Map,none": 0.45353159851301117,
|
| 20 |
+
"seedbench_2_plus_Map_stderr,none": "N/A",
|
| 21 |
+
"alias": "seedbench_2_plus"
|
| 22 |
+
}
|
| 23 |
+
},
|
| 24 |
+
"configs": {
|
| 25 |
+
"chartqa": {
|
| 26 |
+
"task": "chartqa",
|
| 27 |
+
"dataset_path": "lmms-lab/ChartQA",
|
| 28 |
+
"dataset_kwargs": {
|
| 29 |
+
"token": true
|
| 30 |
+
},
|
| 31 |
+
"test_split": "test",
|
| 32 |
+
"doc_to_visual": "<function chartqa_doc_to_visual at 0x7f1006b5fdc0>",
|
| 33 |
+
"doc_to_text": "<function chartqa_doc_to_text at 0x7f1006b6a820>",
|
| 34 |
+
"doc_to_target": "answer",
|
| 35 |
+
"process_results": "<function chartqa_process_results at 0x7f1006b6aaf0>",
|
| 36 |
+
"description": "",
|
| 37 |
+
"target_delimiter": " ",
|
| 38 |
+
"fewshot_delimiter": "\n\n",
|
| 39 |
+
"metric_list": [
|
| 40 |
+
{
|
| 41 |
+
"metric": "relaxed_overall",
|
| 42 |
+
"aggregation": "mean",
|
| 43 |
+
"higher_is_better": true
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"metric": "relaxed_human_split",
|
| 47 |
+
"aggregation": "mean",
|
| 48 |
+
"higher_is_better": true
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"metric": "relaxed_augmented_split",
|
| 52 |
+
"aggregation": "mean",
|
| 53 |
+
"higher_is_better": true
|
| 54 |
+
}
|
| 55 |
+
],
|
| 56 |
+
"output_type": "generate_until",
|
| 57 |
+
"generation_kwargs": {
|
| 58 |
+
"max_new_tokens": 16,
|
| 59 |
+
"temperature": 0.0,
|
| 60 |
+
"do_sample": false,
|
| 61 |
+
"until": [
|
| 62 |
+
"\n\n"
|
| 63 |
+
]
|
| 64 |
+
},
|
| 65 |
+
"repeats": 1,
|
| 66 |
+
"should_decontaminate": false,
|
| 67 |
+
"metadata": [
|
| 68 |
+
{
|
| 69 |
+
"version": 0.0
|
| 70 |
+
}
|
| 71 |
+
],
|
| 72 |
+
"model_specific_prompt_kwargs": {
|
| 73 |
+
"default": {
|
| 74 |
+
"pre_prompt": "",
|
| 75 |
+
"post_prompt": "\nAnswer the question with a single word."
|
| 76 |
+
},
|
| 77 |
+
"qwen_vl": {
|
| 78 |
+
"pre_prompt": "",
|
| 79 |
+
"post_prompt": " Answer:"
|
| 80 |
+
}
|
| 81 |
+
}
|
| 82 |
+
},
|
| 83 |
+
"seedbench_2_plus": {
|
| 84 |
+
"task": "seedbench_2_plus",
|
| 85 |
+
"dataset_path": "doolayer/SEED-Bench-2-Plus",
|
| 86 |
+
"dataset_kwargs": {
|
| 87 |
+
"token": true
|
| 88 |
+
},
|
| 89 |
+
"test_split": "test",
|
| 90 |
+
"doc_to_visual": "<function seed_doc_to_visual at 0x7f10209cdd30>",
|
| 91 |
+
"doc_to_text": "<function seed_doc_to_text at 0x7f10208ec3a0>",
|
| 92 |
+
"doc_to_target": "answer",
|
| 93 |
+
"process_results": "<function seed_process_result at 0x7f10208ec8b0>",
|
| 94 |
+
"description": "",
|
| 95 |
+
"target_delimiter": " ",
|
| 96 |
+
"fewshot_delimiter": "\n\n",
|
| 97 |
+
"metric_list": [
|
| 98 |
+
{
|
| 99 |
+
"metric": "seedbench_2_plus_Chart",
|
| 100 |
+
"aggregation": "<function seed_aggregation_result at 0x7f10208ecdc0>",
|
| 101 |
+
"higher_is_better": true
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"metric": "seedbench_2_plus_Map",
|
| 105 |
+
"aggregation": "<function seed_aggregation_result at 0x7f102090d280>",
|
| 106 |
+
"higher_is_better": true
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"metric": "seedbench_2_plus_Web",
|
| 110 |
+
"aggregation": "<function seed_aggregation_result at 0x7f102090d700>",
|
| 111 |
+
"higher_is_better": true
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"metric": "seedbench_2_plus_all",
|
| 115 |
+
"aggregation": "<function seed_aggregation_result at 0x7f102090db80>",
|
| 116 |
+
"higher_is_better": true
|
| 117 |
+
}
|
| 118 |
+
],
|
| 119 |
+
"output_type": "generate_until",
|
| 120 |
+
"generation_kwargs": {
|
| 121 |
+
"until": [
|
| 122 |
+
"ASSISTANT:"
|
| 123 |
+
],
|
| 124 |
+
"max_new_tokens": 16,
|
| 125 |
+
"image_aspect_ratio": "original"
|
| 126 |
+
},
|
| 127 |
+
"repeats": 1,
|
| 128 |
+
"should_decontaminate": false,
|
| 129 |
+
"metadata": [
|
| 130 |
+
{
|
| 131 |
+
"version": 0.0
|
| 132 |
+
}
|
| 133 |
+
],
|
| 134 |
+
"model_specific_prompt_kwargs": {
|
| 135 |
+
"llava": {
|
| 136 |
+
"img_token": "<image>",
|
| 137 |
+
"post_prompt": "Answer with the option's letter from the given choices directly."
|
| 138 |
+
},
|
| 139 |
+
"gpt4V": {
|
| 140 |
+
"img_token": "<image>",
|
| 141 |
+
"post_prompt": "Answer with the option's letter from the given choices directly."
|
| 142 |
+
},
|
| 143 |
+
"default": {
|
| 144 |
+
"img_token": "<image>",
|
| 145 |
+
"post_prompt": "Answer with the option's letter from the given choices directly."
|
| 146 |
+
}
|
| 147 |
+
}
|
| 148 |
+
}
|
| 149 |
+
},
|
| 150 |
+
"versions": {
|
| 151 |
+
"chartqa": "Yaml",
|
| 152 |
+
"seedbench_2_plus": "Yaml"
|
| 153 |
+
},
|
| 154 |
+
"n-shot": {
|
| 155 |
+
"chartqa": 0,
|
| 156 |
+
"seedbench_2_plus": 0
|
| 157 |
+
},
|
| 158 |
+
"model_configs": {
|
| 159 |
+
"model": "llava",
|
| 160 |
+
"model_args": "pretrained=/cm/archive/namnv78/checkpoints/Xphi35-siglip224/SMOE/1M3/revise_Full_remoe,conv_template=phi35",
|
| 161 |
+
"batch_size": "1",
|
| 162 |
+
"device": null,
|
| 163 |
+
"limit": null,
|
| 164 |
+
"bootstrap_iters": 100000,
|
| 165 |
+
"gen_kwargs": ""
|
| 166 |
+
},
|
| 167 |
+
"git_hash": "289c7fe5"
|
| 168 |
+
}
|