Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +43 -0
- added_tokens.json +24 -0
- chat_template.json +3 -0
- config.json +65 -0
- generation_config.json +6 -0
- inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
- inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
- inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e100_forcestoppingwiththink.jsonl +0 -0
- inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e100_forcestoppingwiththink_metrics.json +29 -0
- inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e100_forcestoppingwiththink.jsonl +0 -0
- inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e100_forcestoppingwiththink_metrics.json +29 -0
- inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
- inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
- inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl +3 -0
- inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_metrics.json +29 -0
- inference_offline/BLINK_reasoning/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- inference_offline/BLINK_reasoning/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_metrics.json +29 -0
- inference_offline/EMMA_mini/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- inference_offline/EMMA_mini/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_metrics.json +29 -0
- inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
- inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
- inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl +3 -0
- inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json +41 -0
- inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl +3 -0
- inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json +41 -0
- inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
- inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
- inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl +3 -0
- inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json +29 -0
- inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
- inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
- inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl +0 -0
- inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json +29 -0
- inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl +0 -0
- inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json +29 -0
- inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +0 -0
- inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
- inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl +0 -0
- inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1_metrics.json +29 -0
- inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl +3 -0
- inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json +29 -0
- inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl +3 -0
- inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json +29 -0
- inference_offline/MMMU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl +0 -0
- inference_offline/MMMU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json +41 -0
- inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl +0 -0
- inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.0_s0_e-1_metrics.json +29 -0
- inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl +0 -0
- inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json +29 -0
- inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,46 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
inference_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
inference_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
inference_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
inference_offline/MathVista/testmini_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
inference_offline/Vlms_are_blind/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e1000.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
inference_submit/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
inference_submit/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
inference_submit/MMLU_PRO_HEALTH/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
inference_submit/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
inference_submit/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.1_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
inference_submit/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
inference_submit/MMMU/test_chat_mc-math-cot_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
inference_submit/MMMU/test_chat_mc-openthought_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
inference_submit/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
inference_submit/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
inference_submit/MMMU_PRO/test_chat_mc-openthought_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
inference_submit/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
inference_submit/MMStar/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
inference_submit/MMStar/test_chat_mc-openthought_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
inference_submit/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
inference_submit/MathVision/test_chat_mc-math-cot_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
inference_submit/MathVista/testmini_chat_mc-math-cot_-1_seed0_t0.1_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
inference_submit/MathVista/testmini_chat_mc-math-cot_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
inference_submit/MathVista/testmini_chat_mc-openthought_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 71 |
+
inference_submit/MedQA_4choices/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 72 |
+
inference_submit/Med_QA/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 73 |
+
inference_submit/NEJM_Challenge/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 74 |
+
inference_submit/OmniMedVQA/testmini_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 75 |
+
inference_submit/gsm8k_main/test_chat_mc-math-cot_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 76 |
+
inference_submit/medxpertqa/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 77 |
+
inference_submit/medxpertqa/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 78 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
added_tokens.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</tool_call>": 151658,
|
| 3 |
+
"<tool_call>": 151657,
|
| 4 |
+
"<|box_end|>": 151649,
|
| 5 |
+
"<|box_start|>": 151648,
|
| 6 |
+
"<|endoftext|>": 151643,
|
| 7 |
+
"<|file_sep|>": 151664,
|
| 8 |
+
"<|fim_middle|>": 151660,
|
| 9 |
+
"<|fim_pad|>": 151662,
|
| 10 |
+
"<|fim_prefix|>": 151659,
|
| 11 |
+
"<|fim_suffix|>": 151661,
|
| 12 |
+
"<|im_end|>": 151645,
|
| 13 |
+
"<|im_start|>": 151644,
|
| 14 |
+
"<|image_pad|>": 151655,
|
| 15 |
+
"<|object_ref_end|>": 151647,
|
| 16 |
+
"<|object_ref_start|>": 151646,
|
| 17 |
+
"<|quad_end|>": 151651,
|
| 18 |
+
"<|quad_start|>": 151650,
|
| 19 |
+
"<|repo_name|>": 151663,
|
| 20 |
+
"<|video_pad|>": 151656,
|
| 21 |
+
"<|vision_end|>": 151653,
|
| 22 |
+
"<|vision_pad|>": 151654,
|
| 23 |
+
"<|vision_start|>": 151652
|
| 24 |
+
}
|
chat_template.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
|
| 3 |
+
}
|
config.json
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen2_5_VLForConditionalGeneration"
|
| 4 |
+
],
|
| 5 |
+
"attention_dropout": 0.0,
|
| 6 |
+
"eos_token_id": 151645,
|
| 7 |
+
"hidden_act": "silu",
|
| 8 |
+
"hidden_size": 3584,
|
| 9 |
+
"image_token_id": 151655,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 18944,
|
| 12 |
+
"max_position_embeddings": 128000,
|
| 13 |
+
"max_window_layers": 28,
|
| 14 |
+
"model_type": "qwen2_5_vl",
|
| 15 |
+
"num_attention_heads": 28,
|
| 16 |
+
"num_hidden_layers": 28,
|
| 17 |
+
"num_key_value_heads": 4,
|
| 18 |
+
"pad_token_id": 151643,
|
| 19 |
+
"rms_norm_eps": 1e-06,
|
| 20 |
+
"rope_scaling": {
|
| 21 |
+
"mrope_section": [
|
| 22 |
+
16,
|
| 23 |
+
24,
|
| 24 |
+
24
|
| 25 |
+
],
|
| 26 |
+
"rope_type": "default",
|
| 27 |
+
"type": "default"
|
| 28 |
+
},
|
| 29 |
+
"rope_theta": 1000000.0,
|
| 30 |
+
"sliding_window": 32768,
|
| 31 |
+
"tie_word_embeddings": false,
|
| 32 |
+
"torch_dtype": "bfloat16",
|
| 33 |
+
"transformers_version": "4.51.2",
|
| 34 |
+
"use_cache": true,
|
| 35 |
+
"use_sliding_window": false,
|
| 36 |
+
"video_token_id": 151656,
|
| 37 |
+
"vision_config": {
|
| 38 |
+
"depth": 32,
|
| 39 |
+
"fullatt_block_indexes": [
|
| 40 |
+
7,
|
| 41 |
+
15,
|
| 42 |
+
23,
|
| 43 |
+
31
|
| 44 |
+
],
|
| 45 |
+
"hidden_act": "silu",
|
| 46 |
+
"hidden_size": 1280,
|
| 47 |
+
"in_channels": 3,
|
| 48 |
+
"in_chans": 3,
|
| 49 |
+
"intermediate_size": 3420,
|
| 50 |
+
"model_type": "qwen2_5_vl",
|
| 51 |
+
"num_heads": 16,
|
| 52 |
+
"out_hidden_size": 3584,
|
| 53 |
+
"patch_size": 14,
|
| 54 |
+
"spatial_merge_size": 2,
|
| 55 |
+
"spatial_patch_size": 14,
|
| 56 |
+
"temporal_patch_size": 2,
|
| 57 |
+
"tokens_per_second": 2,
|
| 58 |
+
"torch_dtype": "bfloat16",
|
| 59 |
+
"window_size": 112
|
| 60 |
+
},
|
| 61 |
+
"vision_end_token_id": 151653,
|
| 62 |
+
"vision_start_token_id": 151652,
|
| 63 |
+
"vision_token_id": 151654,
|
| 64 |
+
"vocab_size": 152064
|
| 65 |
+
}
|
generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"eos_token_id": 151645,
|
| 4 |
+
"pad_token_id": 151643,
|
| 5 |
+
"transformers_version": "4.51.2"
|
| 6 |
+
}
|
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64564c716c1470cb7698cb4c49aac3e016849ca095fc8df079e5865468f194c3
|
| 3 |
+
size 12280132
|
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 900,
|
| 3 |
+
"average_score": 56.111111111111114,
|
| 4 |
+
"majority_vote_score": 56.111111111111114,
|
| 5 |
+
"best_of_n_score": 56.111111111111114,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
56.111111111111114
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
99.8888888888889
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
99.8888888888889
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 42.90982174873352,
|
| 16 |
+
"time_use_in_minite": "0:42",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 377.9,
|
| 19 |
+
"20": 426.8,
|
| 20 |
+
"30": 441.7,
|
| 21 |
+
"40": 474.8,
|
| 22 |
+
"50": 720.0,
|
| 23 |
+
"60": 835.4,
|
| 24 |
+
"70": 866.8,
|
| 25 |
+
"80": 931.0,
|
| 26 |
+
"90": 5000.0
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 15.0
|
| 29 |
+
}
|
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e100_forcestoppingwiththink.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e100_forcestoppingwiththink_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 100,
|
| 3 |
+
"average_score": 52.0,
|
| 4 |
+
"majority_vote_score": 52.0,
|
| 5 |
+
"best_of_n_score": 52.0,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
52.0
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
100.0
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
100.0
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 45.84051179885864,
|
| 16 |
+
"time_use_in_minite": "0:45",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 337.6,
|
| 19 |
+
"20": 349.8,
|
| 20 |
+
"30": 408.40000000000003,
|
| 21 |
+
"40": 513.6,
|
| 22 |
+
"50": 644.5,
|
| 23 |
+
"60": 1987.6000000000004,
|
| 24 |
+
"70": 5000.0,
|
| 25 |
+
"80": 5000.0,
|
| 26 |
+
"90": 5000.0
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 35.0
|
| 29 |
+
}
|
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e100_forcestoppingwiththink.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e100_forcestoppingwiththink_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 100,
|
| 3 |
+
"average_score": 48.0,
|
| 4 |
+
"majority_vote_score": 48.0,
|
| 5 |
+
"best_of_n_score": 48.0,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
48.0
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
100.0
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
100.0
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 50.82205653190613,
|
| 16 |
+
"time_use_in_minite": "0:50",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 316.1,
|
| 19 |
+
"20": 344.6,
|
| 20 |
+
"30": 390.6,
|
| 21 |
+
"40": 481.40000000000003,
|
| 22 |
+
"50": 2769.5,
|
| 23 |
+
"60": 5000.0,
|
| 24 |
+
"70": 5000.0,
|
| 25 |
+
"80": 5000.0,
|
| 26 |
+
"90": 5000.0
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 50.0
|
| 29 |
+
}
|
inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e088e3932bdfb443ca54a1a6919a9a8ad6b7d1a724b8644857f72e38d089395
|
| 3 |
+
size 80145254
|
inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3040,
|
| 3 |
+
"average_score": 28.815789473684212,
|
| 4 |
+
"majority_vote_score": 28.815789473684212,
|
| 5 |
+
"best_of_n_score": 28.815789473684212,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
28.815789473684212
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
99.57236842105263
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
99.83552631578947
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 83.65901041030884,
|
| 16 |
+
"time_use_in_minite": "1:23",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 640.5,
|
| 19 |
+
"20": 702.6,
|
| 20 |
+
"30": 972.6999999999999,
|
| 21 |
+
"40": 3433.6000000000004,
|
| 22 |
+
"50": 5000.0,
|
| 23 |
+
"60": 5000.0,
|
| 24 |
+
"70": 5000.0,
|
| 25 |
+
"80": 5000.0,
|
| 26 |
+
"90": 5000.0
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 60.0
|
| 29 |
+
}
|
inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1fe0e93f0916bb82c0dcbc885adebdca19d98b1e3de3507a8a9fe484fde1544f
|
| 3 |
+
size 17858690
|
inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1901,
|
| 3 |
+
"average_score": 52.23566543924251,
|
| 4 |
+
"majority_vote_score": 52.23566543924251,
|
| 5 |
+
"best_of_n_score": 52.23566543924251,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
52.23566543924251
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
100.0
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
100.0
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 4.516808748245239,
|
| 16 |
+
"time_use_in_minite": "0:04",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 305.0,
|
| 19 |
+
"20": 305.0,
|
| 20 |
+
"30": 305.0,
|
| 21 |
+
"40": 305.0,
|
| 22 |
+
"50": 305.0,
|
| 23 |
+
"60": 305.0,
|
| 24 |
+
"70": 305.0,
|
| 25 |
+
"80": 305.0,
|
| 26 |
+
"90": 305.0
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 100.0
|
| 29 |
+
}
|
inference_offline/BLINK_reasoning/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
inference_offline/BLINK_reasoning/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 133,
|
| 3 |
+
"average_score": 45.86466165413533,
|
| 4 |
+
"majority_vote_score": 45.86466165413533,
|
| 5 |
+
"best_of_n_score": 45.86466165413533,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
45.86466165413533
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
100.0
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
100.0
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 9.463228225708008,
|
| 16 |
+
"time_use_in_minite": "0:09",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 359.0,
|
| 19 |
+
"20": 366.0,
|
| 20 |
+
"30": 373.0,
|
| 21 |
+
"40": 380.0,
|
| 22 |
+
"50": 387.0,
|
| 23 |
+
"60": 436.8,
|
| 24 |
+
"70": 486.59999999999997,
|
| 25 |
+
"80": 536.4,
|
| 26 |
+
"90": 586.2
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 33.33333333333333
|
| 29 |
+
}
|
inference_offline/EMMA_mini/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
inference_offline/EMMA_mini/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 400,
|
| 3 |
+
"average_score": 27.250000000000004,
|
| 4 |
+
"majority_vote_score": 27.250000000000004,
|
| 5 |
+
"best_of_n_score": 27.250000000000004,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
27.250000000000004
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
100.0
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
100.0
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 76.05133080482483,
|
| 16 |
+
"time_use_in_minite": "1:16",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 579.0,
|
| 19 |
+
"20": 768.8,
|
| 20 |
+
"30": 853.5,
|
| 21 |
+
"40": 1170.6000000000001,
|
| 22 |
+
"50": 3185.5,
|
| 23 |
+
"60": 5000.0,
|
| 24 |
+
"70": 5000.0,
|
| 25 |
+
"80": 5000.0,
|
| 26 |
+
"90": 5000.0
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 50.0
|
| 29 |
+
}
|
inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:89c0ffdae6381cc21ae09b7ec6d06e5dc2b846e59bb35ac16342935b35492e1b
|
| 3 |
+
size 100330598
|
inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 12032,
|
| 3 |
+
"average_score": 52.584773936170215,
|
| 4 |
+
"majority_vote_score": 52.584773936170215,
|
| 5 |
+
"best_of_n_score": 52.584773936170215,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
52.584773936170215
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
98.50398936170212
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
98.50398936170212
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 91.4344892501831,
|
| 16 |
+
"time_use_in_minite": "1:31",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 3147.4,
|
| 19 |
+
"20": 4096.0,
|
| 20 |
+
"30": 4096.0,
|
| 21 |
+
"40": 4096.0,
|
| 22 |
+
"50": 4096.0,
|
| 23 |
+
"60": 4096.0,
|
| 24 |
+
"70": 4096.0,
|
| 25 |
+
"80": 4096.0,
|
| 26 |
+
"90": 4096.0
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 87.5
|
| 29 |
+
}
|
inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:04e34618ad7b1f08ff72b0c3ebf271b662ff34c9c629649655be3be4048724df
|
| 3 |
+
size 24651615
|
inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 12032,
|
| 3 |
+
"average_score": 39.93683510638298,
|
| 4 |
+
"majority_vote_score": 40.34242021276596,
|
| 5 |
+
"best_of_n_score": 48.819813829787236,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
39.885305851063826,
|
| 8 |
+
39.95179521276596,
|
| 9 |
+
40.226063829787236,
|
| 10 |
+
39.96010638297872,
|
| 11 |
+
39.66090425531915
|
| 12 |
+
],
|
| 13 |
+
"valid_fmt_per_run": [
|
| 14 |
+
0.0,
|
| 15 |
+
0.0,
|
| 16 |
+
0.0,
|
| 17 |
+
0.0,
|
| 18 |
+
0.0
|
| 19 |
+
],
|
| 20 |
+
"valid_ans_fmt_per_run": [
|
| 21 |
+
28.939494680851062,
|
| 22 |
+
28.956117021276594,
|
| 23 |
+
28.590425531914892,
|
| 24 |
+
28.96442819148936,
|
| 25 |
+
29.089095744680847
|
| 26 |
+
],
|
| 27 |
+
"time_use_in_second": 6.405738592147827,
|
| 28 |
+
"time_use_in_minite": "0:06",
|
| 29 |
+
"length_percentiles": {
|
| 30 |
+
"10": 2.0,
|
| 31 |
+
"20": 2.0,
|
| 32 |
+
"30": 7.0,
|
| 33 |
+
"40": 7.0,
|
| 34 |
+
"50": 7.0,
|
| 35 |
+
"60": 7.0,
|
| 36 |
+
"70": 7.0,
|
| 37 |
+
"80": 14.0,
|
| 38 |
+
"90": 20.69999999999996
|
| 39 |
+
},
|
| 40 |
+
"percentage_largest_length": 0.625
|
| 41 |
+
}
|
inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d1d9ce0f28a018e74a9aa2ba5f7a9e093ac85a8079b99967d9267da9accf06f5
|
| 3 |
+
size 30733786
|
inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 12032,
|
| 3 |
+
"average_score": 36.78357712765957,
|
| 4 |
+
"majority_vote_score": 36.85172872340425,
|
| 5 |
+
"best_of_n_score": 43.16821808510639,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
36.81848404255319,
|
| 8 |
+
36.80186170212766,
|
| 9 |
+
36.90990691489361,
|
| 10 |
+
36.58577127659575,
|
| 11 |
+
36.80186170212766
|
| 12 |
+
],
|
| 13 |
+
"valid_fmt_per_run": [
|
| 14 |
+
0.0,
|
| 15 |
+
0.0,
|
| 16 |
+
0.0,
|
| 17 |
+
0.0,
|
| 18 |
+
0.0
|
| 19 |
+
],
|
| 20 |
+
"valid_ans_fmt_per_run": [
|
| 21 |
+
100.0,
|
| 22 |
+
100.0,
|
| 23 |
+
100.0,
|
| 24 |
+
100.0,
|
| 25 |
+
100.0
|
| 26 |
+
],
|
| 27 |
+
"time_use_in_second": 5.885719060897827,
|
| 28 |
+
"time_use_in_minite": "0:05",
|
| 29 |
+
"length_percentiles": {
|
| 30 |
+
"10": 7.0,
|
| 31 |
+
"20": 7.0,
|
| 32 |
+
"30": 7.0,
|
| 33 |
+
"40": 7.0,
|
| 34 |
+
"50": 7.0,
|
| 35 |
+
"60": 7.0,
|
| 36 |
+
"70": 7.0,
|
| 37 |
+
"80": 7.0,
|
| 38 |
+
"90": 7.0
|
| 39 |
+
},
|
| 40 |
+
"percentage_largest_length": 0.625
|
| 41 |
+
}
|
inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a8b3d4b89503327fe2806ec0ccc1cdd4f7c879646c9682a53c6ca277f8410c05
|
| 3 |
+
size 11339208
|
inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 900,
|
| 3 |
+
"average_score": 53.888888888888886,
|
| 4 |
+
"majority_vote_score": 53.888888888888886,
|
| 5 |
+
"best_of_n_score": 53.888888888888886,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
53.888888888888886
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
99.8888888888889
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
99.8888888888889
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 64.5319185256958,
|
| 16 |
+
"time_use_in_minite": "1:04",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 301.1,
|
| 19 |
+
"20": 385.40000000000003,
|
| 20 |
+
"30": 426.5,
|
| 21 |
+
"40": 541.8000000000001,
|
| 22 |
+
"50": 588.0,
|
| 23 |
+
"60": 658.0,
|
| 24 |
+
"70": 704.5999999999999,
|
| 25 |
+
"80": 1429.6000000000035,
|
| 26 |
+
"90": 4096.0
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 20.0
|
| 29 |
+
}
|
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bdd408cdd006efe6aaf9220fd8165a5f763fe14bd334f8b7eb81159ccfd5cefa
|
| 3 |
+
size 24978076
|
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 900,
|
| 3 |
+
"average_score": 53.11111111111111,
|
| 4 |
+
"majority_vote_score": 53.11111111111111,
|
| 5 |
+
"best_of_n_score": 53.11111111111111,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
53.11111111111111
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
99.77777777777777
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
99.77777777777777
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 18.07580852508545,
|
| 16 |
+
"time_use_in_minite": "0:18",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 388.8,
|
| 19 |
+
"20": 480.2,
|
| 20 |
+
"30": 545.8,
|
| 21 |
+
"40": 587.2,
|
| 22 |
+
"50": 605.5,
|
| 23 |
+
"60": 628.6,
|
| 24 |
+
"70": 661.3,
|
| 25 |
+
"80": 707.8000000000001,
|
| 26 |
+
"90": 900.4999999999999
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 10.0
|
| 29 |
+
}
|
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5eea6f5892c018e6a9da5d9dcd5b26ea3573bbf7c3765a387a5c73acf5876ea1
|
| 3 |
+
size 11531821
|
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 900,
|
| 3 |
+
"average_score": 56.44444444444444,
|
| 4 |
+
"majority_vote_score": 56.44444444444444,
|
| 5 |
+
"best_of_n_score": 56.44444444444444,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
56.44444444444444
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
100.0
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
100.0
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 62.402525901794434,
|
| 16 |
+
"time_use_in_minite": "1:02",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 400.1,
|
| 19 |
+
"20": 427.0,
|
| 20 |
+
"30": 437.8,
|
| 21 |
+
"40": 480.6,
|
| 22 |
+
"50": 528.5,
|
| 23 |
+
"60": 576.8,
|
| 24 |
+
"70": 604.9,
|
| 25 |
+
"80": 649.6000000000001,
|
| 26 |
+
"90": 829.0000000000002
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 5.0
|
| 29 |
+
}
|
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 900,
|
| 3 |
+
"average_score": 55.222222222222214,
|
| 4 |
+
"majority_vote_score": 55.222222222222214,
|
| 5 |
+
"best_of_n_score": 55.222222222222214,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
55.222222222222214
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
97.77777777777777
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
99.8888888888889
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 33.06883645057678,
|
| 16 |
+
"time_use_in_minite": "0:33",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 437.2,
|
| 19 |
+
"20": 509.6,
|
| 20 |
+
"30": 557.1,
|
| 21 |
+
"40": 629.6,
|
| 22 |
+
"50": 710.0,
|
| 23 |
+
"60": 801.4,
|
| 24 |
+
"70": 953.2,
|
| 25 |
+
"80": 1413.2,
|
| 26 |
+
"90": 2298.7000000000003
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 1.0
|
| 29 |
+
}
|
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 150,
|
| 3 |
+
"average_score": 62.66666666666667,
|
| 4 |
+
"majority_vote_score": 62.66666666666667,
|
| 5 |
+
"best_of_n_score": 62.66666666666667,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
62.66666666666667
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
100.0
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
100.0
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 161.83210015296936,
|
| 16 |
+
"time_use_in_minite": "2:41",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 461.6,
|
| 19 |
+
"20": 532.6,
|
| 20 |
+
"30": 827.3999999999999,
|
| 21 |
+
"40": 964.2,
|
| 22 |
+
"50": 1113.0,
|
| 23 |
+
"60": 4750.599999999995,
|
| 24 |
+
"70": 10000.0,
|
| 25 |
+
"80": 10000.0,
|
| 26 |
+
"90": 10000.0
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 40.0
|
| 29 |
+
}
|
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 150,
|
| 3 |
+
"average_score": 60.0,
|
| 4 |
+
"majority_vote_score": 60.0,
|
| 5 |
+
"best_of_n_score": 60.0,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
60.0
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
100.0
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
100.0
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 58.788227558135986,
|
| 16 |
+
"time_use_in_minite": "0:58",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 436.6,
|
| 19 |
+
"20": 563.8,
|
| 20 |
+
"30": 679.6999999999999,
|
| 21 |
+
"40": 771.8,
|
| 22 |
+
"50": 882.0,
|
| 23 |
+
"60": 1182.1999999999998,
|
| 24 |
+
"70": 2294.8999999999996,
|
| 25 |
+
"80": 4096.0,
|
| 26 |
+
"90": 4096.0
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 30.0
|
| 29 |
+
}
|
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 150,
|
| 3 |
+
"average_score": 56.666666666666664,
|
| 4 |
+
"majority_vote_score": 56.666666666666664,
|
| 5 |
+
"best_of_n_score": 56.666666666666664,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
56.666666666666664
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
100.0
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
100.0
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 46.403769969940186,
|
| 16 |
+
"time_use_in_minite": "0:46",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 649.3,
|
| 19 |
+
"20": 692.4,
|
| 20 |
+
"30": 770.6,
|
| 21 |
+
"40": 807.2,
|
| 22 |
+
"50": 1136.5,
|
| 23 |
+
"60": 1495.0,
|
| 24 |
+
"70": 1911.6999999999998,
|
| 25 |
+
"80": 2803.2,
|
| 26 |
+
"90": 2990.9
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 10.0
|
| 29 |
+
}
|
inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b83946fdac6b143bb592a2bacc8c09afb0f053ae78fa86c4bf2834ce4b0db9fa
|
| 3 |
+
size 47135861
|
inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1730,
|
| 3 |
+
"average_score": 41.67630057803468,
|
| 4 |
+
"majority_vote_score": 41.67630057803468,
|
| 5 |
+
"best_of_n_score": 41.67630057803468,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
41.67630057803468
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
96.41618497109826
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
96.41618497109826
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 150.29381346702576,
|
| 16 |
+
"time_use_in_minite": "2:30",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 276.1,
|
| 19 |
+
"20": 321.8,
|
| 20 |
+
"30": 378.9,
|
| 21 |
+
"40": 402.6,
|
| 22 |
+
"50": 473.0,
|
| 23 |
+
"60": 555.4,
|
| 24 |
+
"70": 593.5,
|
| 25 |
+
"80": 2505.6000000000017,
|
| 26 |
+
"90": 10000.0
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 20.0
|
| 29 |
+
}
|
inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:71c9d328e89f8a93728008d9a75574a432521c5a90090e846af68c602c736539
|
| 3 |
+
size 15567701
|
inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1730,
|
| 3 |
+
"average_score": 41.84971098265896,
|
| 4 |
+
"majority_vote_score": 41.84971098265896,
|
| 5 |
+
"best_of_n_score": 41.84971098265896,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
41.84971098265896
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
99.65317919075144
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
99.82658959537572
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 10.944640636444092,
|
| 16 |
+
"time_use_in_minite": "0:10",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 376.3,
|
| 19 |
+
"20": 417.6,
|
| 20 |
+
"30": 454.0,
|
| 21 |
+
"40": 525.6,
|
| 22 |
+
"50": 560.0,
|
| 23 |
+
"60": 649.9999999999999,
|
| 24 |
+
"70": 729.9,
|
| 25 |
+
"80": 879.8000000000003,
|
| 26 |
+
"90": 1024.2000000000005
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 3.3333333333333335
|
| 29 |
+
}
|
inference_offline/MMMU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
inference_offline/MMMU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1730,
|
| 3 |
+
"average_score": 35.826589595375715,
|
| 4 |
+
"majority_vote_score": 35.78034682080925,
|
| 5 |
+
"best_of_n_score": 45.14450867052023,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
35.895953757225435,
|
| 8 |
+
35.664739884393065,
|
| 9 |
+
35.72254335260116,
|
| 10 |
+
35.895953757225435,
|
| 11 |
+
35.95375722543353
|
| 12 |
+
],
|
| 13 |
+
"valid_fmt_per_run": [
|
| 14 |
+
0.0,
|
| 15 |
+
0.0,
|
| 16 |
+
0.0,
|
| 17 |
+
0.0,
|
| 18 |
+
0.0
|
| 19 |
+
],
|
| 20 |
+
"valid_ans_fmt_per_run": [
|
| 21 |
+
72.42774566473989,
|
| 22 |
+
74.27745664739885,
|
| 23 |
+
72.54335260115607,
|
| 24 |
+
73.757225433526,
|
| 25 |
+
72.89017341040463
|
| 26 |
+
],
|
| 27 |
+
"time_use_in_second": 1.450103521347046,
|
| 28 |
+
"time_use_in_minite": "0:01",
|
| 29 |
+
"length_percentiles": {
|
| 30 |
+
"10": 7.0,
|
| 31 |
+
"20": 7.0,
|
| 32 |
+
"30": 7.0,
|
| 33 |
+
"40": 7.0,
|
| 34 |
+
"50": 9.0,
|
| 35 |
+
"60": 11.0,
|
| 36 |
+
"70": 11.0,
|
| 37 |
+
"80": 11.0,
|
| 38 |
+
"90": 11.0
|
| 39 |
+
},
|
| 40 |
+
"percentage_largest_length": 50.0
|
| 41 |
+
}
|
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 286,
|
| 3 |
+
"average_score": 40.55944055944056,
|
| 4 |
+
"majority_vote_score": 40.55944055944056,
|
| 5 |
+
"best_of_n_score": 40.55944055944056,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
40.55944055944056
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
100.0
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
100.0
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 23.286169052124023,
|
| 16 |
+
"time_use_in_minite": "0:23",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 880.0,
|
| 19 |
+
"20": 1084.0,
|
| 20 |
+
"30": 1128.5,
|
| 21 |
+
"40": 1173.0,
|
| 22 |
+
"50": 1233.5,
|
| 23 |
+
"60": 1294.0,
|
| 24 |
+
"70": 1392.0,
|
| 25 |
+
"80": 1490.0,
|
| 26 |
+
"90": 1558.5
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 16.666666666666664
|
| 29 |
+
}
|
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 286,
|
| 3 |
+
"average_score": 38.46153846153847,
|
| 4 |
+
"majority_vote_score": 38.46153846153847,
|
| 5 |
+
"best_of_n_score": 38.46153846153847,
|
| 6 |
+
"score_per_run": [
|
| 7 |
+
38.46153846153847
|
| 8 |
+
],
|
| 9 |
+
"valid_fmt_per_run": [
|
| 10 |
+
99.65034965034964
|
| 11 |
+
],
|
| 12 |
+
"valid_ans_fmt_per_run": [
|
| 13 |
+
100.0
|
| 14 |
+
],
|
| 15 |
+
"time_use_in_second": 95.06414794921875,
|
| 16 |
+
"time_use_in_minite": "1:35",
|
| 17 |
+
"length_percentiles": {
|
| 18 |
+
"10": 529.5,
|
| 19 |
+
"20": 630.0,
|
| 20 |
+
"30": 707.5,
|
| 21 |
+
"40": 773.0,
|
| 22 |
+
"50": 868.5,
|
| 23 |
+
"60": 960.0,
|
| 24 |
+
"70": 1200.5,
|
| 25 |
+
"80": 1431.0,
|
| 26 |
+
"90": 2782.0
|
| 27 |
+
},
|
| 28 |
+
"percentage_largest_length": 1.1627906976744187
|
| 29 |
+
}
|
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|