qianchu commited on
Commit
77a6bdb
·
verified ·
1 Parent(s): 4ca7420

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +43 -0
  2. added_tokens.json +24 -0
  3. chat_template.json +3 -0
  4. config.json +65 -0
  5. generation_config.json +6 -0
  6. inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
  7. inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
  8. inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e100_forcestoppingwiththink.jsonl +0 -0
  9. inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e100_forcestoppingwiththink_metrics.json +29 -0
  10. inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e100_forcestoppingwiththink.jsonl +0 -0
  11. inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e100_forcestoppingwiththink_metrics.json +29 -0
  12. inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
  13. inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
  14. inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl +3 -0
  15. inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_metrics.json +29 -0
  16. inference_offline/BLINK_reasoning/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  17. inference_offline/BLINK_reasoning/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_metrics.json +29 -0
  18. inference_offline/EMMA_mini/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  19. inference_offline/EMMA_mini/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_metrics.json +29 -0
  20. inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
  21. inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
  22. inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl +3 -0
  23. inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json +41 -0
  24. inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl +3 -0
  25. inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json +41 -0
  26. inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
  27. inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
  28. inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl +3 -0
  29. inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json +29 -0
  30. inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
  31. inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
  32. inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl +0 -0
  33. inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json +29 -0
  34. inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl +0 -0
  35. inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json +29 -0
  36. inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +0 -0
  37. inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
  38. inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl +0 -0
  39. inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1_metrics.json +29 -0
  40. inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl +3 -0
  41. inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json +29 -0
  42. inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl +3 -0
  43. inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json +29 -0
  44. inference_offline/MMMU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl +0 -0
  45. inference_offline/MMMU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json +41 -0
  46. inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl +0 -0
  47. inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.0_s0_e-1_metrics.json +29 -0
  48. inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl +0 -0
  49. inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json +29 -0
  50. inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl +0 -0
.gitattributes CHANGED
@@ -33,3 +33,46 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
38
+ inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
39
+ inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
40
+ inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
41
+ inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
42
+ inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
43
+ inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
44
+ inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
45
+ inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
46
+ inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
47
+ inference_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
48
+ inference_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
49
+ inference_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
50
+ inference_offline/MathVista/testmini_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
51
+ inference_offline/Vlms_are_blind/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e1000.jsonl filter=lfs diff=lfs merge=lfs -text
52
+ inference_submit/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
53
+ inference_submit/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
54
+ inference_submit/MMLU_PRO_HEALTH/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
55
+ inference_submit/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
56
+ inference_submit/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.1_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
57
+ inference_submit/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
58
+ inference_submit/MMMU/test_chat_mc-math-cot_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
59
+ inference_submit/MMMU/test_chat_mc-openthought_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
60
+ inference_submit/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
61
+ inference_submit/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
62
+ inference_submit/MMMU_PRO/test_chat_mc-openthought_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
63
+ inference_submit/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
64
+ inference_submit/MMStar/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
65
+ inference_submit/MMStar/test_chat_mc-openthought_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
66
+ inference_submit/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
67
+ inference_submit/MathVision/test_chat_mc-math-cot_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
68
+ inference_submit/MathVista/testmini_chat_mc-math-cot_-1_seed0_t0.1_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
69
+ inference_submit/MathVista/testmini_chat_mc-math-cot_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
70
+ inference_submit/MathVista/testmini_chat_mc-openthought_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
71
+ inference_submit/MedQA_4choices/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
72
+ inference_submit/Med_QA/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
73
+ inference_submit/NEJM_Challenge/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
74
+ inference_submit/OmniMedVQA/testmini_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
75
+ inference_submit/gsm8k_main/test_chat_mc-math-cot_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
76
+ inference_submit/medxpertqa/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
77
+ inference_submit/medxpertqa/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
78
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
3
+ }
config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2_5_VLForConditionalGeneration"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "eos_token_id": 151645,
7
+ "hidden_act": "silu",
8
+ "hidden_size": 3584,
9
+ "image_token_id": 151655,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 18944,
12
+ "max_position_embeddings": 128000,
13
+ "max_window_layers": 28,
14
+ "model_type": "qwen2_5_vl",
15
+ "num_attention_heads": 28,
16
+ "num_hidden_layers": 28,
17
+ "num_key_value_heads": 4,
18
+ "pad_token_id": 151643,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": {
21
+ "mrope_section": [
22
+ 16,
23
+ 24,
24
+ 24
25
+ ],
26
+ "rope_type": "default",
27
+ "type": "default"
28
+ },
29
+ "rope_theta": 1000000.0,
30
+ "sliding_window": 32768,
31
+ "tie_word_embeddings": false,
32
+ "torch_dtype": "bfloat16",
33
+ "transformers_version": "4.51.2",
34
+ "use_cache": true,
35
+ "use_sliding_window": false,
36
+ "video_token_id": 151656,
37
+ "vision_config": {
38
+ "depth": 32,
39
+ "fullatt_block_indexes": [
40
+ 7,
41
+ 15,
42
+ 23,
43
+ 31
44
+ ],
45
+ "hidden_act": "silu",
46
+ "hidden_size": 1280,
47
+ "in_channels": 3,
48
+ "in_chans": 3,
49
+ "intermediate_size": 3420,
50
+ "model_type": "qwen2_5_vl",
51
+ "num_heads": 16,
52
+ "out_hidden_size": 3584,
53
+ "patch_size": 14,
54
+ "spatial_merge_size": 2,
55
+ "spatial_patch_size": 14,
56
+ "temporal_patch_size": 2,
57
+ "tokens_per_second": 2,
58
+ "torch_dtype": "bfloat16",
59
+ "window_size": 112
60
+ },
61
+ "vision_end_token_id": 151653,
62
+ "vision_start_token_id": 151652,
63
+ "vision_token_id": 151654,
64
+ "vocab_size": 152064
65
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 151645,
4
+ "pad_token_id": 151643,
5
+ "transformers_version": "4.51.2"
6
+ }
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64564c716c1470cb7698cb4c49aac3e016849ca095fc8df079e5865468f194c3
3
+ size 12280132
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 900,
3
+ "average_score": 56.111111111111114,
4
+ "majority_vote_score": 56.111111111111114,
5
+ "best_of_n_score": 56.111111111111114,
6
+ "score_per_run": [
7
+ 56.111111111111114
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 99.8888888888889
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 99.8888888888889
14
+ ],
15
+ "time_use_in_second": 42.90982174873352,
16
+ "time_use_in_minite": "0:42",
17
+ "length_percentiles": {
18
+ "10": 377.9,
19
+ "20": 426.8,
20
+ "30": 441.7,
21
+ "40": 474.8,
22
+ "50": 720.0,
23
+ "60": 835.4,
24
+ "70": 866.8,
25
+ "80": 931.0,
26
+ "90": 5000.0
27
+ },
28
+ "percentage_largest_length": 15.0
29
+ }
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e100_forcestoppingwiththink.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e100_forcestoppingwiththink_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 100,
3
+ "average_score": 52.0,
4
+ "majority_vote_score": 52.0,
5
+ "best_of_n_score": 52.0,
6
+ "score_per_run": [
7
+ 52.0
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 100.0
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 100.0
14
+ ],
15
+ "time_use_in_second": 45.84051179885864,
16
+ "time_use_in_minite": "0:45",
17
+ "length_percentiles": {
18
+ "10": 337.6,
19
+ "20": 349.8,
20
+ "30": 408.40000000000003,
21
+ "40": 513.6,
22
+ "50": 644.5,
23
+ "60": 1987.6000000000004,
24
+ "70": 5000.0,
25
+ "80": 5000.0,
26
+ "90": 5000.0
27
+ },
28
+ "percentage_largest_length": 35.0
29
+ }
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e100_forcestoppingwiththink.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e100_forcestoppingwiththink_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 100,
3
+ "average_score": 48.0,
4
+ "majority_vote_score": 48.0,
5
+ "best_of_n_score": 48.0,
6
+ "score_per_run": [
7
+ 48.0
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 100.0
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 100.0
14
+ ],
15
+ "time_use_in_second": 50.82205653190613,
16
+ "time_use_in_minite": "0:50",
17
+ "length_percentiles": {
18
+ "10": 316.1,
19
+ "20": 344.6,
20
+ "30": 390.6,
21
+ "40": 481.40000000000003,
22
+ "50": 2769.5,
23
+ "60": 5000.0,
24
+ "70": 5000.0,
25
+ "80": 5000.0,
26
+ "90": 5000.0
27
+ },
28
+ "percentage_largest_length": 50.0
29
+ }
inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e088e3932bdfb443ca54a1a6919a9a8ad6b7d1a724b8644857f72e38d089395
3
+ size 80145254
inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3040,
3
+ "average_score": 28.815789473684212,
4
+ "majority_vote_score": 28.815789473684212,
5
+ "best_of_n_score": 28.815789473684212,
6
+ "score_per_run": [
7
+ 28.815789473684212
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 99.57236842105263
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 99.83552631578947
14
+ ],
15
+ "time_use_in_second": 83.65901041030884,
16
+ "time_use_in_minite": "1:23",
17
+ "length_percentiles": {
18
+ "10": 640.5,
19
+ "20": 702.6,
20
+ "30": 972.6999999999999,
21
+ "40": 3433.6000000000004,
22
+ "50": 5000.0,
23
+ "60": 5000.0,
24
+ "70": 5000.0,
25
+ "80": 5000.0,
26
+ "90": 5000.0
27
+ },
28
+ "percentage_largest_length": 60.0
29
+ }
inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fe0e93f0916bb82c0dcbc885adebdca19d98b1e3de3507a8a9fe484fde1544f
3
+ size 17858690
inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1901,
3
+ "average_score": 52.23566543924251,
4
+ "majority_vote_score": 52.23566543924251,
5
+ "best_of_n_score": 52.23566543924251,
6
+ "score_per_run": [
7
+ 52.23566543924251
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 100.0
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 100.0
14
+ ],
15
+ "time_use_in_second": 4.516808748245239,
16
+ "time_use_in_minite": "0:04",
17
+ "length_percentiles": {
18
+ "10": 305.0,
19
+ "20": 305.0,
20
+ "30": 305.0,
21
+ "40": 305.0,
22
+ "50": 305.0,
23
+ "60": 305.0,
24
+ "70": 305.0,
25
+ "80": 305.0,
26
+ "90": 305.0
27
+ },
28
+ "percentage_largest_length": 100.0
29
+ }
inference_offline/BLINK_reasoning/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
inference_offline/BLINK_reasoning/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 133,
3
+ "average_score": 45.86466165413533,
4
+ "majority_vote_score": 45.86466165413533,
5
+ "best_of_n_score": 45.86466165413533,
6
+ "score_per_run": [
7
+ 45.86466165413533
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 100.0
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 100.0
14
+ ],
15
+ "time_use_in_second": 9.463228225708008,
16
+ "time_use_in_minite": "0:09",
17
+ "length_percentiles": {
18
+ "10": 359.0,
19
+ "20": 366.0,
20
+ "30": 373.0,
21
+ "40": 380.0,
22
+ "50": 387.0,
23
+ "60": 436.8,
24
+ "70": 486.59999999999997,
25
+ "80": 536.4,
26
+ "90": 586.2
27
+ },
28
+ "percentage_largest_length": 33.33333333333333
29
+ }
inference_offline/EMMA_mini/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
inference_offline/EMMA_mini/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 400,
3
+ "average_score": 27.250000000000004,
4
+ "majority_vote_score": 27.250000000000004,
5
+ "best_of_n_score": 27.250000000000004,
6
+ "score_per_run": [
7
+ 27.250000000000004
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 100.0
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 100.0
14
+ ],
15
+ "time_use_in_second": 76.05133080482483,
16
+ "time_use_in_minite": "1:16",
17
+ "length_percentiles": {
18
+ "10": 579.0,
19
+ "20": 768.8,
20
+ "30": 853.5,
21
+ "40": 1170.6000000000001,
22
+ "50": 3185.5,
23
+ "60": 5000.0,
24
+ "70": 5000.0,
25
+ "80": 5000.0,
26
+ "90": 5000.0
27
+ },
28
+ "percentage_largest_length": 50.0
29
+ }
inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89c0ffdae6381cc21ae09b7ec6d06e5dc2b846e59bb35ac16342935b35492e1b
3
+ size 100330598
inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 12032,
3
+ "average_score": 52.584773936170215,
4
+ "majority_vote_score": 52.584773936170215,
5
+ "best_of_n_score": 52.584773936170215,
6
+ "score_per_run": [
7
+ 52.584773936170215
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 98.50398936170212
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 98.50398936170212
14
+ ],
15
+ "time_use_in_second": 91.4344892501831,
16
+ "time_use_in_minite": "1:31",
17
+ "length_percentiles": {
18
+ "10": 3147.4,
19
+ "20": 4096.0,
20
+ "30": 4096.0,
21
+ "40": 4096.0,
22
+ "50": 4096.0,
23
+ "60": 4096.0,
24
+ "70": 4096.0,
25
+ "80": 4096.0,
26
+ "90": 4096.0
27
+ },
28
+ "percentage_largest_length": 87.5
29
+ }
inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04e34618ad7b1f08ff72b0c3ebf271b662ff34c9c629649655be3be4048724df
3
+ size 24651615
inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 12032,
3
+ "average_score": 39.93683510638298,
4
+ "majority_vote_score": 40.34242021276596,
5
+ "best_of_n_score": 48.819813829787236,
6
+ "score_per_run": [
7
+ 39.885305851063826,
8
+ 39.95179521276596,
9
+ 40.226063829787236,
10
+ 39.96010638297872,
11
+ 39.66090425531915
12
+ ],
13
+ "valid_fmt_per_run": [
14
+ 0.0,
15
+ 0.0,
16
+ 0.0,
17
+ 0.0,
18
+ 0.0
19
+ ],
20
+ "valid_ans_fmt_per_run": [
21
+ 28.939494680851062,
22
+ 28.956117021276594,
23
+ 28.590425531914892,
24
+ 28.96442819148936,
25
+ 29.089095744680847
26
+ ],
27
+ "time_use_in_second": 6.405738592147827,
28
+ "time_use_in_minite": "0:06",
29
+ "length_percentiles": {
30
+ "10": 2.0,
31
+ "20": 2.0,
32
+ "30": 7.0,
33
+ "40": 7.0,
34
+ "50": 7.0,
35
+ "60": 7.0,
36
+ "70": 7.0,
37
+ "80": 14.0,
38
+ "90": 20.69999999999996
39
+ },
40
+ "percentage_largest_length": 0.625
41
+ }
inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1d9ce0f28a018e74a9aa2ba5f7a9e093ac85a8079b99967d9267da9accf06f5
3
+ size 30733786
inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 12032,
3
+ "average_score": 36.78357712765957,
4
+ "majority_vote_score": 36.85172872340425,
5
+ "best_of_n_score": 43.16821808510639,
6
+ "score_per_run": [
7
+ 36.81848404255319,
8
+ 36.80186170212766,
9
+ 36.90990691489361,
10
+ 36.58577127659575,
11
+ 36.80186170212766
12
+ ],
13
+ "valid_fmt_per_run": [
14
+ 0.0,
15
+ 0.0,
16
+ 0.0,
17
+ 0.0,
18
+ 0.0
19
+ ],
20
+ "valid_ans_fmt_per_run": [
21
+ 100.0,
22
+ 100.0,
23
+ 100.0,
24
+ 100.0,
25
+ 100.0
26
+ ],
27
+ "time_use_in_second": 5.885719060897827,
28
+ "time_use_in_minite": "0:05",
29
+ "length_percentiles": {
30
+ "10": 7.0,
31
+ "20": 7.0,
32
+ "30": 7.0,
33
+ "40": 7.0,
34
+ "50": 7.0,
35
+ "60": 7.0,
36
+ "70": 7.0,
37
+ "80": 7.0,
38
+ "90": 7.0
39
+ },
40
+ "percentage_largest_length": 0.625
41
+ }
inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8b3d4b89503327fe2806ec0ccc1cdd4f7c879646c9682a53c6ca277f8410c05
3
+ size 11339208
inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 900,
3
+ "average_score": 53.888888888888886,
4
+ "majority_vote_score": 53.888888888888886,
5
+ "best_of_n_score": 53.888888888888886,
6
+ "score_per_run": [
7
+ 53.888888888888886
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 99.8888888888889
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 99.8888888888889
14
+ ],
15
+ "time_use_in_second": 64.5319185256958,
16
+ "time_use_in_minite": "1:04",
17
+ "length_percentiles": {
18
+ "10": 301.1,
19
+ "20": 385.40000000000003,
20
+ "30": 426.5,
21
+ "40": 541.8000000000001,
22
+ "50": 588.0,
23
+ "60": 658.0,
24
+ "70": 704.5999999999999,
25
+ "80": 1429.6000000000035,
26
+ "90": 4096.0
27
+ },
28
+ "percentage_largest_length": 20.0
29
+ }
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdd408cdd006efe6aaf9220fd8165a5f763fe14bd334f8b7eb81159ccfd5cefa
3
+ size 24978076
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 900,
3
+ "average_score": 53.11111111111111,
4
+ "majority_vote_score": 53.11111111111111,
5
+ "best_of_n_score": 53.11111111111111,
6
+ "score_per_run": [
7
+ 53.11111111111111
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 99.77777777777777
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 99.77777777777777
14
+ ],
15
+ "time_use_in_second": 18.07580852508545,
16
+ "time_use_in_minite": "0:18",
17
+ "length_percentiles": {
18
+ "10": 388.8,
19
+ "20": 480.2,
20
+ "30": 545.8,
21
+ "40": 587.2,
22
+ "50": 605.5,
23
+ "60": 628.6,
24
+ "70": 661.3,
25
+ "80": 707.8000000000001,
26
+ "90": 900.4999999999999
27
+ },
28
+ "percentage_largest_length": 10.0
29
+ }
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eea6f5892c018e6a9da5d9dcd5b26ea3573bbf7c3765a387a5c73acf5876ea1
3
+ size 11531821
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 900,
3
+ "average_score": 56.44444444444444,
4
+ "majority_vote_score": 56.44444444444444,
5
+ "best_of_n_score": 56.44444444444444,
6
+ "score_per_run": [
7
+ 56.44444444444444
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 100.0
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 100.0
14
+ ],
15
+ "time_use_in_second": 62.402525901794434,
16
+ "time_use_in_minite": "1:02",
17
+ "length_percentiles": {
18
+ "10": 400.1,
19
+ "20": 427.0,
20
+ "30": 437.8,
21
+ "40": 480.6,
22
+ "50": 528.5,
23
+ "60": 576.8,
24
+ "70": 604.9,
25
+ "80": 649.6000000000001,
26
+ "90": 829.0000000000002
27
+ },
28
+ "percentage_largest_length": 5.0
29
+ }
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 900,
3
+ "average_score": 55.222222222222214,
4
+ "majority_vote_score": 55.222222222222214,
5
+ "best_of_n_score": 55.222222222222214,
6
+ "score_per_run": [
7
+ 55.222222222222214
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 97.77777777777777
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 99.8888888888889
14
+ ],
15
+ "time_use_in_second": 33.06883645057678,
16
+ "time_use_in_minite": "0:33",
17
+ "length_percentiles": {
18
+ "10": 437.2,
19
+ "20": 509.6,
20
+ "30": 557.1,
21
+ "40": 629.6,
22
+ "50": 710.0,
23
+ "60": 801.4,
24
+ "70": 953.2,
25
+ "80": 1413.2,
26
+ "90": 2298.7000000000003
27
+ },
28
+ "percentage_largest_length": 1.0
29
+ }
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 150,
3
+ "average_score": 62.66666666666667,
4
+ "majority_vote_score": 62.66666666666667,
5
+ "best_of_n_score": 62.66666666666667,
6
+ "score_per_run": [
7
+ 62.66666666666667
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 100.0
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 100.0
14
+ ],
15
+ "time_use_in_second": 161.83210015296936,
16
+ "time_use_in_minite": "2:41",
17
+ "length_percentiles": {
18
+ "10": 461.6,
19
+ "20": 532.6,
20
+ "30": 827.3999999999999,
21
+ "40": 964.2,
22
+ "50": 1113.0,
23
+ "60": 4750.599999999995,
24
+ "70": 10000.0,
25
+ "80": 10000.0,
26
+ "90": 10000.0
27
+ },
28
+ "percentage_largest_length": 40.0
29
+ }
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 150,
3
+ "average_score": 60.0,
4
+ "majority_vote_score": 60.0,
5
+ "best_of_n_score": 60.0,
6
+ "score_per_run": [
7
+ 60.0
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 100.0
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 100.0
14
+ ],
15
+ "time_use_in_second": 58.788227558135986,
16
+ "time_use_in_minite": "0:58",
17
+ "length_percentiles": {
18
+ "10": 436.6,
19
+ "20": 563.8,
20
+ "30": 679.6999999999999,
21
+ "40": 771.8,
22
+ "50": 882.0,
23
+ "60": 1182.1999999999998,
24
+ "70": 2294.8999999999996,
25
+ "80": 4096.0,
26
+ "90": 4096.0
27
+ },
28
+ "percentage_largest_length": 30.0
29
+ }
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 150,
3
+ "average_score": 56.666666666666664,
4
+ "majority_vote_score": 56.666666666666664,
5
+ "best_of_n_score": 56.666666666666664,
6
+ "score_per_run": [
7
+ 56.666666666666664
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 100.0
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 100.0
14
+ ],
15
+ "time_use_in_second": 46.403769969940186,
16
+ "time_use_in_minite": "0:46",
17
+ "length_percentiles": {
18
+ "10": 649.3,
19
+ "20": 692.4,
20
+ "30": 770.6,
21
+ "40": 807.2,
22
+ "50": 1136.5,
23
+ "60": 1495.0,
24
+ "70": 1911.6999999999998,
25
+ "80": 2803.2,
26
+ "90": 2990.9
27
+ },
28
+ "percentage_largest_length": 10.0
29
+ }
inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b83946fdac6b143bb592a2bacc8c09afb0f053ae78fa86c4bf2834ce4b0db9fa
3
+ size 47135861
inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1730,
3
+ "average_score": 41.67630057803468,
4
+ "majority_vote_score": 41.67630057803468,
5
+ "best_of_n_score": 41.67630057803468,
6
+ "score_per_run": [
7
+ 41.67630057803468
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 96.41618497109826
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 96.41618497109826
14
+ ],
15
+ "time_use_in_second": 150.29381346702576,
16
+ "time_use_in_minite": "2:30",
17
+ "length_percentiles": {
18
+ "10": 276.1,
19
+ "20": 321.8,
20
+ "30": 378.9,
21
+ "40": 402.6,
22
+ "50": 473.0,
23
+ "60": 555.4,
24
+ "70": 593.5,
25
+ "80": 2505.6000000000017,
26
+ "90": 10000.0
27
+ },
28
+ "percentage_largest_length": 20.0
29
+ }
inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71c9d328e89f8a93728008d9a75574a432521c5a90090e846af68c602c736539
3
+ size 15567701
inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1730,
3
+ "average_score": 41.84971098265896,
4
+ "majority_vote_score": 41.84971098265896,
5
+ "best_of_n_score": 41.84971098265896,
6
+ "score_per_run": [
7
+ 41.84971098265896
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 99.65317919075144
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 99.82658959537572
14
+ ],
15
+ "time_use_in_second": 10.944640636444092,
16
+ "time_use_in_minite": "0:10",
17
+ "length_percentiles": {
18
+ "10": 376.3,
19
+ "20": 417.6,
20
+ "30": 454.0,
21
+ "40": 525.6,
22
+ "50": 560.0,
23
+ "60": 649.9999999999999,
24
+ "70": 729.9,
25
+ "80": 879.8000000000003,
26
+ "90": 1024.2000000000005
27
+ },
28
+ "percentage_largest_length": 3.3333333333333335
29
+ }
inference_offline/MMMU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
inference_offline/MMMU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1730,
3
+ "average_score": 35.826589595375715,
4
+ "majority_vote_score": 35.78034682080925,
5
+ "best_of_n_score": 45.14450867052023,
6
+ "score_per_run": [
7
+ 35.895953757225435,
8
+ 35.664739884393065,
9
+ 35.72254335260116,
10
+ 35.895953757225435,
11
+ 35.95375722543353
12
+ ],
13
+ "valid_fmt_per_run": [
14
+ 0.0,
15
+ 0.0,
16
+ 0.0,
17
+ 0.0,
18
+ 0.0
19
+ ],
20
+ "valid_ans_fmt_per_run": [
21
+ 72.42774566473989,
22
+ 74.27745664739885,
23
+ 72.54335260115607,
24
+ 73.757225433526,
25
+ 72.89017341040463
26
+ ],
27
+ "time_use_in_second": 1.450103521347046,
28
+ "time_use_in_minite": "0:01",
29
+ "length_percentiles": {
30
+ "10": 7.0,
31
+ "20": 7.0,
32
+ "30": 7.0,
33
+ "40": 7.0,
34
+ "50": 9.0,
35
+ "60": 11.0,
36
+ "70": 11.0,
37
+ "80": 11.0,
38
+ "90": 11.0
39
+ },
40
+ "percentage_largest_length": 50.0
41
+ }
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 286,
3
+ "average_score": 40.55944055944056,
4
+ "majority_vote_score": 40.55944055944056,
5
+ "best_of_n_score": 40.55944055944056,
6
+ "score_per_run": [
7
+ 40.55944055944056
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 100.0
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 100.0
14
+ ],
15
+ "time_use_in_second": 23.286169052124023,
16
+ "time_use_in_minite": "0:23",
17
+ "length_percentiles": {
18
+ "10": 880.0,
19
+ "20": 1084.0,
20
+ "30": 1128.5,
21
+ "40": 1173.0,
22
+ "50": 1233.5,
23
+ "60": 1294.0,
24
+ "70": 1392.0,
25
+ "80": 1490.0,
26
+ "90": 1558.5
27
+ },
28
+ "percentage_largest_length": 16.666666666666664
29
+ }
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 286,
3
+ "average_score": 38.46153846153847,
4
+ "majority_vote_score": 38.46153846153847,
5
+ "best_of_n_score": 38.46153846153847,
6
+ "score_per_run": [
7
+ 38.46153846153847
8
+ ],
9
+ "valid_fmt_per_run": [
10
+ 99.65034965034964
11
+ ],
12
+ "valid_ans_fmt_per_run": [
13
+ 100.0
14
+ ],
15
+ "time_use_in_second": 95.06414794921875,
16
+ "time_use_in_minite": "1:35",
17
+ "length_percentiles": {
18
+ "10": 529.5,
19
+ "20": 630.0,
20
+ "30": 707.5,
21
+ "40": 773.0,
22
+ "50": 868.5,
23
+ "60": 960.0,
24
+ "70": 1200.5,
25
+ "80": 1431.0,
26
+ "90": 2782.0
27
+ },
28
+ "percentage_largest_length": 1.1627906976744187
29
+ }
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl ADDED
The diff for this file is too large to render. See raw diff