qianchu committed on Feb 3

Commit

77a6bdb

verified ·

1 Parent(s): 4ca7420

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +43 -0
added_tokens.json +24 -0
chat_template.json +3 -0
config.json +65 -0
generation_config.json +6 -0
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e100_forcestoppingwiththink.jsonl +0 -0
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e100_forcestoppingwiththink_metrics.json +29 -0
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e100_forcestoppingwiththink.jsonl +0 -0
inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e100_forcestoppingwiththink_metrics.json +29 -0
inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl +3 -0
inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_metrics.json +29 -0
inference_offline/BLINK_reasoning/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
inference_offline/BLINK_reasoning/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_metrics.json +29 -0
inference_offline/EMMA_mini/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
inference_offline/EMMA_mini/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_metrics.json +29 -0
inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl +3 -0
inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json +41 -0
inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl +3 -0
inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json +41 -0
inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl +3 -0
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json +29 -0
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +3 -0
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl +0 -0
inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json +29 -0
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl +0 -0
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json +29 -0
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl +0 -0
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json +29 -0
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl +0 -0
inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1_metrics.json +29 -0
inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl +3 -0
inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json +29 -0
inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl +3 -0
inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json +29 -0
inference_offline/MMMU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl +0 -0
inference_offline/MMMU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json +41 -0
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl +0 -0
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.0_s0_e-1_metrics.json +29 -0
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl +0 -0
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json +29 -0
inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,46 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_offline/MathVista/testmini_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_offline/Vlms_are_blind/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e1000.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MMLU_PRO_HEALTH/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.1_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MMMU/test_chat_mc-math-cot_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MMMU/test_chat_mc-openthought_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MMMU_PRO/test_chat_mc-openthought_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MMStar/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MMStar/test_chat_mc-openthought_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MathVision/test_chat_mc-math-cot_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MathVista/testmini_chat_mc-math-cot_-1_seed0_t0.1_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MathVista/testmini_chat_mc-math-cot_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MathVista/testmini_chat_mc-openthought_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/MedQA_4choices/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/Med_QA/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/NEJM_Challenge/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/OmniMedVQA/testmini_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/gsm8k_main/test_chat_mc-math-cot_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/medxpertqa/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl filter=lfs diff=lfs merge=lfs -text
+inference_submit/medxpertqa/test_chat_mc-cot-force-format_-1_seed0_t0.3_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+}

config.json ADDED Viewed

	@@ -0,0 +1,65 @@

+{
+  "architectures": [
+    "Qwen2_5_VLForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "image_token_id": 151655,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "max_position_embeddings": 128000,
+  "max_window_layers": 28,
+  "model_type": "qwen2_5_vl",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "mrope_section": [
+      16,
+      24,
+      24
+    ],
+    "rope_type": "default",
+    "type": "default"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.2",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "video_token_id": 151656,
+  "vision_config": {
+    "depth": 32,
+    "fullatt_block_indexes": [
+      7,
+      15,
+      23,
+      31
+    ],
+    "hidden_act": "silu",
+    "hidden_size": 1280,
+    "in_channels": 3,
+    "in_chans": 3,
+    "intermediate_size": 3420,
+    "model_type": "qwen2_5_vl",
+    "num_heads": 16,
+    "out_hidden_size": 3584,
+    "patch_size": 14,
+    "spatial_merge_size": 2,
+    "spatial_patch_size": 14,
+    "temporal_patch_size": 2,
+    "tokens_per_second": 2,
+    "torch_dtype": "bfloat16",
+    "window_size": 112
+  },
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652,
+  "vision_token_id": 151654,
+  "vocab_size": 152064
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "eos_token_id": 151645,
+  "pad_token_id": 151643,
+  "transformers_version": "4.51.2"
+}

inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:64564c716c1470cb7698cb4c49aac3e016849ca095fc8df079e5865468f194c3
+size 12280132

inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 900,
+    "average_score": 56.111111111111114,
+    "majority_vote_score": 56.111111111111114,
+    "best_of_n_score": 56.111111111111114,
+    "score_per_run": [
+        56.111111111111114
+    ],
+    "valid_fmt_per_run": [
+        99.8888888888889
+    ],
+    "valid_ans_fmt_per_run": [
+        99.8888888888889
+    ],
+    "time_use_in_second": 42.90982174873352,
+    "time_use_in_minite": "0:42",
+    "length_percentiles": {
+        "10": 377.9,
+        "20": 426.8,
+        "30": 441.7,
+        "40": 474.8,
+        "50": 720.0,
+        "60": 835.4,
+        "70": 866.8,
+        "80": 931.0,
+        "90": 5000.0
+    },
+    "percentage_largest_length": 15.0
+}

inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e100_forcestoppingwiththink.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format-image_-1_seed0_t0.0_s0_e100_forcestoppingwiththink_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 100,
+    "average_score": 52.0,
+    "majority_vote_score": 52.0,
+    "best_of_n_score": 52.0,
+    "score_per_run": [
+        52.0
+    ],
+    "valid_fmt_per_run": [
+        100.0
+    ],
+    "valid_ans_fmt_per_run": [
+        100.0
+    ],
+    "time_use_in_second": 45.84051179885864,
+    "time_use_in_minite": "0:45",
+    "length_percentiles": {
+        "10": 337.6,
+        "20": 349.8,
+        "30": 408.40000000000003,
+        "40": 513.6,
+        "50": 644.5,
+        "60": 1987.6000000000004,
+        "70": 5000.0,
+        "80": 5000.0,
+        "90": 5000.0
+    },
+    "percentage_largest_length": 35.0
+}

inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e100_forcestoppingwiththink.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

inference_h100sandbox_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e100_forcestoppingwiththink_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 100,
+    "average_score": 48.0,
+    "majority_vote_score": 48.0,
+    "best_of_n_score": 48.0,
+    "score_per_run": [
+        48.0
+    ],
+    "valid_fmt_per_run": [
+        100.0
+    ],
+    "valid_ans_fmt_per_run": [
+        100.0
+    ],
+    "time_use_in_second": 50.82205653190613,
+    "time_use_in_minite": "0:50",
+    "length_percentiles": {
+        "10": 316.1,
+        "20": 344.6,
+        "30": 390.6,
+        "40": 481.40000000000003,
+        "50": 2769.5,
+        "60": 5000.0,
+        "70": 5000.0,
+        "80": 5000.0,
+        "90": 5000.0
+    },
+    "percentage_largest_length": 50.0
+}

inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e088e3932bdfb443ca54a1a6919a9a8ad6b7d1a724b8644857f72e38d089395
+size 80145254

inference_h100sandbox_offline/MathVision/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 3040,
+    "average_score": 28.815789473684212,
+    "majority_vote_score": 28.815789473684212,
+    "best_of_n_score": 28.815789473684212,
+    "score_per_run": [
+        28.815789473684212
+    ],
+    "valid_fmt_per_run": [
+        99.57236842105263
+    ],
+    "valid_ans_fmt_per_run": [
+        99.83552631578947
+    ],
+    "time_use_in_second": 83.65901041030884,
+    "time_use_in_minite": "1:23",
+    "length_percentiles": {
+        "10": 640.5,
+        "20": 702.6,
+        "30": 972.6999999999999,
+        "40": 3433.6000000000004,
+        "50": 5000.0,
+        "60": 5000.0,
+        "70": 5000.0,
+        "80": 5000.0,
+        "90": 5000.0
+    },
+    "percentage_largest_length": 60.0
+}

inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1fe0e93f0916bb82c0dcbc885adebdca19d98b1e3de3507a8a9fe484fde1544f
+size 17858690

inference_offline/BLINK/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 1901,
+    "average_score": 52.23566543924251,
+    "majority_vote_score": 52.23566543924251,
+    "best_of_n_score": 52.23566543924251,
+    "score_per_run": [
+        52.23566543924251
+    ],
+    "valid_fmt_per_run": [
+        100.0
+    ],
+    "valid_ans_fmt_per_run": [
+        100.0
+    ],
+    "time_use_in_second": 4.516808748245239,
+    "time_use_in_minite": "0:04",
+    "length_percentiles": {
+        "10": 305.0,
+        "20": 305.0,
+        "30": 305.0,
+        "40": 305.0,
+        "50": 305.0,
+        "60": 305.0,
+        "70": 305.0,
+        "80": 305.0,
+        "90": 305.0
+    },
+    "percentage_largest_length": 100.0
+}

inference_offline/BLINK_reasoning/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

inference_offline/BLINK_reasoning/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 133,
+    "average_score": 45.86466165413533,
+    "majority_vote_score": 45.86466165413533,
+    "best_of_n_score": 45.86466165413533,
+    "score_per_run": [
+        45.86466165413533
+    ],
+    "valid_fmt_per_run": [
+        100.0
+    ],
+    "valid_ans_fmt_per_run": [
+        100.0
+    ],
+    "time_use_in_second": 9.463228225708008,
+    "time_use_in_minite": "0:09",
+    "length_percentiles": {
+        "10": 359.0,
+        "20": 366.0,
+        "30": 373.0,
+        "40": 380.0,
+        "50": 387.0,
+        "60": 436.8,
+        "70": 486.59999999999997,
+        "80": 536.4,
+        "90": 586.2
+    },
+    "percentage_largest_length": 33.33333333333333
+}

inference_offline/EMMA_mini/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

inference_offline/EMMA_mini/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 400,
+    "average_score": 27.250000000000004,
+    "majority_vote_score": 27.250000000000004,
+    "best_of_n_score": 27.250000000000004,
+    "score_per_run": [
+        27.250000000000004
+    ],
+    "valid_fmt_per_run": [
+        100.0
+    ],
+    "valid_ans_fmt_per_run": [
+        100.0
+    ],
+    "time_use_in_second": 76.05133080482483,
+    "time_use_in_minite": "1:16",
+    "length_percentiles": {
+        "10": 579.0,
+        "20": 768.8,
+        "30": 853.5,
+        "40": 1170.6000000000001,
+        "50": 3185.5,
+        "60": 5000.0,
+        "70": 5000.0,
+        "80": 5000.0,
+        "90": 5000.0
+    },
+    "percentage_largest_length": 50.0
+}

inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89c0ffdae6381cc21ae09b7ec6d06e5dc2b846e59bb35ac16342935b35492e1b
+size 100330598

inference_offline/MMLU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 12032,
+    "average_score": 52.584773936170215,
+    "majority_vote_score": 52.584773936170215,
+    "best_of_n_score": 52.584773936170215,
+    "score_per_run": [
+        52.584773936170215
+    ],
+    "valid_fmt_per_run": [
+        98.50398936170212
+    ],
+    "valid_ans_fmt_per_run": [
+        98.50398936170212
+    ],
+    "time_use_in_second": 91.4344892501831,
+    "time_use_in_minite": "1:31",
+    "length_percentiles": {
+        "10": 3147.4,
+        "20": 4096.0,
+        "30": 4096.0,
+        "40": 4096.0,
+        "50": 4096.0,
+        "60": 4096.0,
+        "70": 4096.0,
+        "80": 4096.0,
+        "90": 4096.0
+    },
+    "percentage_largest_length": 87.5
+}

inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:04e34618ad7b1f08ff72b0c3ebf271b662ff34c9c629649655be3be4048724df
+size 24651615

inference_offline/MMLU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+    "num_samples": 12032,
+    "average_score": 39.93683510638298,
+    "majority_vote_score": 40.34242021276596,
+    "best_of_n_score": 48.819813829787236,
+    "score_per_run": [
+        39.885305851063826,
+        39.95179521276596,
+        40.226063829787236,
+        39.96010638297872,
+        39.66090425531915
+    ],
+    "valid_fmt_per_run": [
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+    ],
+    "valid_ans_fmt_per_run": [
+        28.939494680851062,
+        28.956117021276594,
+        28.590425531914892,
+        28.96442819148936,
+        29.089095744680847
+    ],
+    "time_use_in_second": 6.405738592147827,
+    "time_use_in_minite": "0:06",
+    "length_percentiles": {
+        "10": 2.0,
+        "20": 2.0,
+        "30": 7.0,
+        "40": 7.0,
+        "50": 7.0,
+        "60": 7.0,
+        "70": 7.0,
+        "80": 14.0,
+        "90": 20.69999999999996
+    },
+    "percentage_largest_length": 0.625
+}

inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d1d9ce0f28a018e74a9aa2ba5f7a9e093ac85a8079b99967d9267da9accf06f5
+size 30733786

inference_offline/MMLU_PRO/test_mc-cot-force-format-nothinking_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+    "num_samples": 12032,
+    "average_score": 36.78357712765957,
+    "majority_vote_score": 36.85172872340425,
+    "best_of_n_score": 43.16821808510639,
+    "score_per_run": [
+        36.81848404255319,
+        36.80186170212766,
+        36.90990691489361,
+        36.58577127659575,
+        36.80186170212766
+    ],
+    "valid_fmt_per_run": [
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+    ],
+    "valid_ans_fmt_per_run": [
+        100.0,
+        100.0,
+        100.0,
+        100.0,
+        100.0
+    ],
+    "time_use_in_second": 5.885719060897827,
+    "time_use_in_minite": "0:05",
+    "length_percentiles": {
+        "10": 7.0,
+        "20": 7.0,
+        "30": 7.0,
+        "40": 7.0,
+        "50": 7.0,
+        "60": 7.0,
+        "70": 7.0,
+        "80": 7.0,
+        "90": 7.0
+    },
+    "percentage_largest_length": 0.625
+}

inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8b3d4b89503327fe2806ec0ccc1cdd4f7c879646c9682a53c6ca277f8410c05
+size 11339208

inference_offline/MMMU/test_chat_mc-cot-force-format_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 900,
+    "average_score": 53.888888888888886,
+    "majority_vote_score": 53.888888888888886,
+    "best_of_n_score": 53.888888888888886,
+    "score_per_run": [
+        53.888888888888886
+    ],
+    "valid_fmt_per_run": [
+        99.8888888888889
+    ],
+    "valid_ans_fmt_per_run": [
+        99.8888888888889
+    ],
+    "time_use_in_second": 64.5319185256958,
+    "time_use_in_minite": "1:04",
+    "length_percentiles": {
+        "10": 301.1,
+        "20": 385.40000000000003,
+        "30": 426.5,
+        "40": 541.8000000000001,
+        "50": 588.0,
+        "60": 658.0,
+        "70": 704.5999999999999,
+        "80": 1429.6000000000035,
+        "90": 4096.0
+    },
+    "percentage_largest_length": 20.0
+}

inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bdd408cdd006efe6aaf9220fd8165a5f763fe14bd334f8b7eb81159ccfd5cefa
+size 24978076

inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 900,
+    "average_score": 53.11111111111111,
+    "majority_vote_score": 53.11111111111111,
+    "best_of_n_score": 53.11111111111111,
+    "score_per_run": [
+        53.11111111111111
+    ],
+    "valid_fmt_per_run": [
+        99.77777777777777
+    ],
+    "valid_ans_fmt_per_run": [
+        99.77777777777777
+    ],
+    "time_use_in_second": 18.07580852508545,
+    "time_use_in_minite": "0:18",
+    "length_percentiles": {
+        "10": 388.8,
+        "20": 480.2,
+        "30": 545.8,
+        "40": 587.2,
+        "50": 605.5,
+        "60": 628.6,
+        "70": 661.3,
+        "80": 707.8000000000001,
+        "90": 900.4999999999999
+    },
+    "percentage_largest_length": 10.0
+}

inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5eea6f5892c018e6a9da5d9dcd5b26ea3573bbf7c3765a387a5c73acf5876ea1
+size 11531821

inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 900,
+    "average_score": 56.44444444444444,
+    "majority_vote_score": 56.44444444444444,
+    "best_of_n_score": 56.44444444444444,
+    "score_per_run": [
+        56.44444444444444
+    ],
+    "valid_fmt_per_run": [
+        100.0
+    ],
+    "valid_ans_fmt_per_run": [
+        100.0
+    ],
+    "time_use_in_second": 62.402525901794434,
+    "time_use_in_minite": "1:02",
+    "length_percentiles": {
+        "10": 400.1,
+        "20": 427.0,
+        "30": 437.8,
+        "40": 480.6,
+        "50": 528.5,
+        "60": 576.8,
+        "70": 604.9,
+        "80": 649.6000000000001,
+        "90": 829.0000000000002
+    },
+    "percentage_largest_length": 5.0
+}

inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

inference_offline/MMMU/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 900,
+    "average_score": 55.222222222222214,
+    "majority_vote_score": 55.222222222222214,
+    "best_of_n_score": 55.222222222222214,
+    "score_per_run": [
+        55.222222222222214
+    ],
+    "valid_fmt_per_run": [
+        97.77777777777777
+    ],
+    "valid_ans_fmt_per_run": [
+        99.8888888888889
+    ],
+    "time_use_in_second": 33.06883645057678,
+    "time_use_in_minite": "0:33",
+    "length_percentiles": {
+        "10": 437.2,
+        "20": 509.6,
+        "30": 557.1,
+        "40": 629.6,
+        "50": 710.0,
+        "60": 801.4,
+        "70": 953.2,
+        "80": 1413.2,
+        "90": 2298.7000000000003
+    },
+    "percentage_largest_length": 1.0
+}

inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 150,
+    "average_score": 62.66666666666667,
+    "majority_vote_score": 62.66666666666667,
+    "best_of_n_score": 62.66666666666667,
+    "score_per_run": [
+        62.66666666666667
+    ],
+    "valid_fmt_per_run": [
+        100.0
+    ],
+    "valid_ans_fmt_per_run": [
+        100.0
+    ],
+    "time_use_in_second": 161.83210015296936,
+    "time_use_in_minite": "2:41",
+    "length_percentiles": {
+        "10": 461.6,
+        "20": 532.6,
+        "30": 827.3999999999999,
+        "40": 964.2,
+        "50": 1113.0,
+        "60": 4750.599999999995,
+        "70": 10000.0,
+        "80": 10000.0,
+        "90": 10000.0
+    },
+    "percentage_largest_length": 40.0
+}

inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.0_s0_e-1_forcestoppingwiththink_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 150,
+    "average_score": 60.0,
+    "majority_vote_score": 60.0,
+    "best_of_n_score": 60.0,
+    "score_per_run": [
+        60.0
+    ],
+    "valid_fmt_per_run": [
+        100.0
+    ],
+    "valid_ans_fmt_per_run": [
+        100.0
+    ],
+    "time_use_in_second": 58.788227558135986,
+    "time_use_in_minite": "0:58",
+    "length_percentiles": {
+        "10": 436.6,
+        "20": 563.8,
+        "30": 679.6999999999999,
+        "40": 771.8,
+        "50": 882.0,
+        "60": 1182.1999999999998,
+        "70": 2294.8999999999996,
+        "80": 4096.0,
+        "90": 4096.0
+    },
+    "percentage_largest_length": 30.0
+}

inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

inference_offline/MMMU_HEALTH/test_chat_mc-math-cot_-1_seed0_t0.6_rp1.0_s0_e-1_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 150,
+    "average_score": 56.666666666666664,
+    "majority_vote_score": 56.666666666666664,
+    "best_of_n_score": 56.666666666666664,
+    "score_per_run": [
+        56.666666666666664
+    ],
+    "valid_fmt_per_run": [
+        100.0
+    ],
+    "valid_ans_fmt_per_run": [
+        100.0
+    ],
+    "time_use_in_second": 46.403769969940186,
+    "time_use_in_minite": "0:46",
+    "length_percentiles": {
+        "10": 649.3,
+        "20": 692.4,
+        "30": 770.6,
+        "40": 807.2,
+        "50": 1136.5,
+        "60": 1495.0,
+        "70": 1911.6999999999998,
+        "80": 2803.2,
+        "90": 2990.9
+    },
+    "percentage_largest_length": 10.0
+}

inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b83946fdac6b143bb592a2bacc8c09afb0f053ae78fa86c4bf2834ce4b0db9fa
+size 47135861

inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.0_rp1.0_s0_e-1_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 1730,
+    "average_score": 41.67630057803468,
+    "majority_vote_score": 41.67630057803468,
+    "best_of_n_score": 41.67630057803468,
+    "score_per_run": [
+        41.67630057803468
+    ],
+    "valid_fmt_per_run": [
+        96.41618497109826
+    ],
+    "valid_ans_fmt_per_run": [
+        96.41618497109826
+    ],
+    "time_use_in_second": 150.29381346702576,
+    "time_use_in_minite": "2:30",
+    "length_percentiles": {
+        "10": 276.1,
+        "20": 321.8,
+        "30": 378.9,
+        "40": 402.6,
+        "50": 473.0,
+        "60": 555.4,
+        "70": 593.5,
+        "80": 2505.6000000000017,
+        "90": 10000.0
+    },
+    "percentage_largest_length": 20.0
+}

inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71c9d328e89f8a93728008d9a75574a432521c5a90090e846af68c602c736539
+size 15567701

inference_offline/MMMU_PRO/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 1730,
+    "average_score": 41.84971098265896,
+    "majority_vote_score": 41.84971098265896,
+    "best_of_n_score": 41.84971098265896,
+    "score_per_run": [
+        41.84971098265896
+    ],
+    "valid_fmt_per_run": [
+        99.65317919075144
+    ],
+    "valid_ans_fmt_per_run": [
+        99.82658959537572
+    ],
+    "time_use_in_second": 10.944640636444092,
+    "time_use_in_minite": "0:10",
+    "length_percentiles": {
+        "10": 376.3,
+        "20": 417.6,
+        "30": 454.0,
+        "40": 525.6,
+        "50": 560.0,
+        "60": 649.9999999999999,
+        "70": 729.9,
+        "80": 879.8000000000003,
+        "90": 1024.2000000000005
+    },
+    "percentage_largest_length": 3.3333333333333335
+}

inference_offline/MMMU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

inference_offline/MMMU_PRO/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink_metrics.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+    "num_samples": 1730,
+    "average_score": 35.826589595375715,
+    "majority_vote_score": 35.78034682080925,
+    "best_of_n_score": 45.14450867052023,
+    "score_per_run": [
+        35.895953757225435,
+        35.664739884393065,
+        35.72254335260116,
+        35.895953757225435,
+        35.95375722543353
+    ],
+    "valid_fmt_per_run": [
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+    ],
+    "valid_ans_fmt_per_run": [
+        72.42774566473989,
+        74.27745664739885,
+        72.54335260115607,
+        73.757225433526,
+        72.89017341040463
+    ],
+    "time_use_in_second": 1.450103521347046,
+    "time_use_in_minite": "0:01",
+    "length_percentiles": {
+        "10": 7.0,
+        "20": 7.0,
+        "30": 7.0,
+        "40": 7.0,
+        "50": 9.0,
+        "60": 11.0,
+        "70": 11.0,
+        "80": 11.0,
+        "90": 11.0
+    },
+    "percentage_largest_length": 50.0
+}

inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.0_s0_e-1.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.0_s0_e-1_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 286,
+    "average_score": 40.55944055944056,
+    "majority_vote_score": 40.55944055944056,
+    "best_of_n_score": 40.55944055944056,
+    "score_per_run": [
+        40.55944055944056
+    ],
+    "valid_fmt_per_run": [
+        100.0
+    ],
+    "valid_ans_fmt_per_run": [
+        100.0
+    ],
+    "time_use_in_second": 23.286169052124023,
+    "time_use_in_minite": "0:23",
+    "length_percentiles": {
+        "10": 880.0,
+        "20": 1084.0,
+        "30": 1128.5,
+        "40": 1173.0,
+        "50": 1233.5,
+        "60": 1294.0,
+        "70": 1392.0,
+        "80": 1490.0,
+        "90": 1558.5
+    },
+    "percentage_largest_length": 16.666666666666664
+}

inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-cot-force-format_-1_seed0_t0.6_rp1.1_s0_e-1_metrics.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "num_samples": 286,
+    "average_score": 38.46153846153847,
+    "majority_vote_score": 38.46153846153847,
+    "best_of_n_score": 38.46153846153847,
+    "score_per_run": [
+        38.46153846153847
+    ],
+    "valid_fmt_per_run": [
+        99.65034965034964
+    ],
+    "valid_ans_fmt_per_run": [
+        100.0
+    ],
+    "time_use_in_second": 95.06414794921875,
+    "time_use_in_minite": "1:35",
+    "length_percentiles": {
+        "10": 529.5,
+        "20": 630.0,
+        "30": 707.5,
+        "40": 773.0,
+        "50": 868.5,
+        "60": 960.0,
+        "70": 1200.5,
+        "80": 1431.0,
+        "90": 2782.0
+    },
+    "percentage_largest_length": 1.1627906976744187
+}

inference_offline/MMMU_PRO_MEDICINE_ALL_IMAGES/test_chat_mc-direct-force-format_-1_seed0_t0.3_s0_e-1_forcestoppingwiththink.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff