Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- arena_hard/20251124_074215/configs/task_config_2131fa.yaml +76 -0
- arena_hard/20251124_074215/logs/eval_log.log +105 -0
- arena_hard/20251124_074215/predictions/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard_default.jsonl +0 -0
- arena_hard/20251124_074215/reports/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard.json +34 -0
- arena_hard/20251124_074215/reviews/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard_default.jsonl +3 -0
.gitattributes
CHANGED
|
@@ -61,3 +61,4 @@ arena_hard/20251119_191645/reviews/gpt-5/arena_hard_default.jsonl filter=lfs dif
|
|
| 61 |
mnpo_iter3_armo_dpo_abl_scored.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 62 |
output_pre_mnpo.log filter=lfs diff=lfs merge=lfs -text
|
| 63 |
arena_hard/20251123_220114/reviews/olmo-2-0325-32b-instruct/arena_hard_default.jsonl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 61 |
mnpo_iter3_armo_dpo_abl_scored.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 62 |
output_pre_mnpo.log filter=lfs diff=lfs merge=lfs -text
|
| 63 |
arena_hard/20251123_220114/reviews/olmo-2-0325-32b-instruct/arena_hard_default.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
arena_hard/20251124_074215/reviews/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard_default.jsonl filter=lfs diff=lfs merge=lfs -text
|
arena_hard/20251124_074215/configs/task_config_2131fa.yaml
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
analysis_report: false
|
| 2 |
+
api_url: http://127.0.0.1:8002/v1
|
| 3 |
+
chat_template: null
|
| 4 |
+
dataset_args:
|
| 5 |
+
arena_hard:
|
| 6 |
+
aggregation: mean
|
| 7 |
+
dataset_id: AI-ModelScope/arena-hard-auto-v0.1
|
| 8 |
+
default_subset: default
|
| 9 |
+
description: ArenaHard is a benchmark designed to evaluate the performance of
|
| 10 |
+
large language models in a competitive setting, where models are pitted against
|
| 11 |
+
each other in a series of tasks to determine their relative strengths and weaknesses.
|
| 12 |
+
It includes a set of challenging tasks that require reasoning, understanding,
|
| 13 |
+
and generation capabilities. Currently not support `style-controlled winrate`;
|
| 14 |
+
the official Judge model is `gpt-4-1106-preview`, while the baseline model is
|
| 15 |
+
`gpt-4-0314`.
|
| 16 |
+
eval_split: test
|
| 17 |
+
extra_params: {}
|
| 18 |
+
few_shot_num: 0
|
| 19 |
+
few_shot_prompt_template: null
|
| 20 |
+
few_shot_random: false
|
| 21 |
+
filters: null
|
| 22 |
+
metric_list:
|
| 23 |
+
- winrate
|
| 24 |
+
name: arena_hard
|
| 25 |
+
output_types:
|
| 26 |
+
- generation
|
| 27 |
+
pretty_name: ArenaHard
|
| 28 |
+
prompt_template: '{question}'
|
| 29 |
+
query_template: null
|
| 30 |
+
review_timeout: null
|
| 31 |
+
shuffle: false
|
| 32 |
+
shuffle_choices: false
|
| 33 |
+
subset_list:
|
| 34 |
+
- default
|
| 35 |
+
system_prompt: null
|
| 36 |
+
tags:
|
| 37 |
+
- InstructionFollowing
|
| 38 |
+
- Arena
|
| 39 |
+
train_split: null
|
| 40 |
+
dataset_dir: /afs/.ir/users/f/a/fangwu97/.cache/modelscope/hub/datasets
|
| 41 |
+
dataset_hub: modelscope
|
| 42 |
+
datasets:
|
| 43 |
+
- arena_hard
|
| 44 |
+
debug: false
|
| 45 |
+
eval_backend: Native
|
| 46 |
+
eval_batch_size: 12
|
| 47 |
+
eval_config: null
|
| 48 |
+
eval_type: openai_api
|
| 49 |
+
generation_config:
|
| 50 |
+
batch_size: 12
|
| 51 |
+
max_tokens: 4096
|
| 52 |
+
ignore_errors: false
|
| 53 |
+
judge_model_args:
|
| 54 |
+
api_key: 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
|
| 55 |
+
api_url: https://openrouter.ai/api/v1
|
| 56 |
+
generation_config:
|
| 57 |
+
reasoning_effort: minimal
|
| 58 |
+
model_id: gpt-5-mini
|
| 59 |
+
judge_strategy: auto
|
| 60 |
+
judge_worker_num: 12
|
| 61 |
+
limit: null
|
| 62 |
+
model: gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25
|
| 63 |
+
model_args: {}
|
| 64 |
+
model_id: gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25
|
| 65 |
+
model_task: text_generation
|
| 66 |
+
repeats: 1
|
| 67 |
+
rerun_review: false
|
| 68 |
+
sandbox_config: {}
|
| 69 |
+
sandbox_manager_config: {}
|
| 70 |
+
sandbox_type: docker
|
| 71 |
+
seed: 42
|
| 72 |
+
stream: null
|
| 73 |
+
timeout: null
|
| 74 |
+
use_cache: null
|
| 75 |
+
use_sandbox: false
|
| 76 |
+
work_dir: ./outputs/20251124_074215
|
arena_hard/20251124_074215/logs/eval_log.log
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-11-24 07:42:15 - evalscope - INFO: Creating model gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25 with eval_type=openai_api base_url=http://127.0.0.1:8002/v1, config={'batch_size': 12, 'max_tokens': 4096}, model_args={}
|
| 2 |
+
2025-11-24 07:42:16 - evalscope - INFO: Dump task config to ./outputs/20251124_074215/configs/task_config_2131fa.yaml
|
| 3 |
+
2025-11-24 07:42:16 - evalscope - INFO: {
|
| 4 |
+
"model": "gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25",
|
| 5 |
+
"model_id": "gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25",
|
| 6 |
+
"model_args": {},
|
| 7 |
+
"model_task": "text_generation",
|
| 8 |
+
"chat_template": null,
|
| 9 |
+
"datasets": [
|
| 10 |
+
"arena_hard"
|
| 11 |
+
],
|
| 12 |
+
"dataset_args": {
|
| 13 |
+
"arena_hard": {
|
| 14 |
+
"name": "arena_hard",
|
| 15 |
+
"dataset_id": "AI-ModelScope/arena-hard-auto-v0.1",
|
| 16 |
+
"output_types": [
|
| 17 |
+
"generation"
|
| 18 |
+
],
|
| 19 |
+
"subset_list": [
|
| 20 |
+
"default"
|
| 21 |
+
],
|
| 22 |
+
"default_subset": "default",
|
| 23 |
+
"few_shot_num": 0,
|
| 24 |
+
"few_shot_random": false,
|
| 25 |
+
"train_split": null,
|
| 26 |
+
"eval_split": "test",
|
| 27 |
+
"prompt_template": "{question}",
|
| 28 |
+
"few_shot_prompt_template": null,
|
| 29 |
+
"system_prompt": null,
|
| 30 |
+
"query_template": null,
|
| 31 |
+
"pretty_name": "ArenaHard",
|
| 32 |
+
"description": "ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. It includes a set of challenging tasks that require reasoning, understanding, and generation capabilities. Currently not support `style-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-0314`.",
|
| 33 |
+
"tags": [
|
| 34 |
+
"InstructionFollowing",
|
| 35 |
+
"Arena"
|
| 36 |
+
],
|
| 37 |
+
"filters": null,
|
| 38 |
+
"metric_list": [
|
| 39 |
+
"winrate"
|
| 40 |
+
],
|
| 41 |
+
"aggregation": "mean",
|
| 42 |
+
"shuffle": false,
|
| 43 |
+
"shuffle_choices": false,
|
| 44 |
+
"review_timeout": null,
|
| 45 |
+
"extra_params": {}
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"dataset_dir": "/afs/.ir/users/f/a/fangwu97/.cache/modelscope/hub/datasets",
|
| 49 |
+
"dataset_hub": "modelscope",
|
| 50 |
+
"repeats": 1,
|
| 51 |
+
"generation_config": {
|
| 52 |
+
"batch_size": 12,
|
| 53 |
+
"max_tokens": 4096
|
| 54 |
+
},
|
| 55 |
+
"eval_type": "openai_api",
|
| 56 |
+
"eval_backend": "Native",
|
| 57 |
+
"eval_config": null,
|
| 58 |
+
"limit": null,
|
| 59 |
+
"eval_batch_size": 12,
|
| 60 |
+
"use_cache": null,
|
| 61 |
+
"rerun_review": false,
|
| 62 |
+
"work_dir": "./outputs/20251124_074215",
|
| 63 |
+
"ignore_errors": false,
|
| 64 |
+
"debug": false,
|
| 65 |
+
"seed": 42,
|
| 66 |
+
"api_url": "http://127.0.0.1:8002/v1",
|
| 67 |
+
"timeout": null,
|
| 68 |
+
"stream": null,
|
| 69 |
+
"judge_strategy": "auto",
|
| 70 |
+
"judge_worker_num": 12,
|
| 71 |
+
"judge_model_args": {
|
| 72 |
+
"model_id": "gpt-5-mini",
|
| 73 |
+
"generation_config": {
|
| 74 |
+
"reasoning_effort": "minimal"
|
| 75 |
+
},
|
| 76 |
+
"api_url": "https://openrouter.ai/api/v1",
|
| 77 |
+
"api_key": "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"
|
| 78 |
+
},
|
| 79 |
+
"analysis_report": false,
|
| 80 |
+
"use_sandbox": false,
|
| 81 |
+
"sandbox_type": "docker",
|
| 82 |
+
"sandbox_manager_config": {},
|
| 83 |
+
"sandbox_config": {}
|
| 84 |
+
}
|
| 85 |
+
2025-11-24 07:45:59 - evalscope - INFO: Creating model gpt-5-mini with eval_type=openai_api base_url=https://openrouter.ai/api/v1, config={'reasoning_effort': 'minimal'}, model_args={}
|
| 86 |
+
2025-11-24 08:15:49 - evalscope - INFO:
|
| 87 |
+
arena_hard report table:
|
| 88 |
+
+-----------------------------------------------------------------------------+------------+----------+----------+-------+---------+---------+
|
| 89 |
+
| Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
|
| 90 |
+
+=============================================================================+============+==========+==========+=======+=========+=========+
|
| 91 |
+
| gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25 | arena_hard | winrate | default | 500 | 0.3873 | default |
|
| 92 |
+
+-----------------------------------------------------------------------------+------------+----------+----------+-------+---------+---------+
|
| 93 |
+
|
| 94 |
+
2025-11-24 08:15:49 - evalscope - INFO: Skipping report analysis (`analysis_report=False`).
|
| 95 |
+
2025-11-24 08:15:49 - evalscope - INFO: Dump report to: ./outputs/20251124_074215/reports/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard.json
|
| 96 |
+
|
| 97 |
+
2025-11-24 08:15:49 - evalscope - INFO: Overall report table:
|
| 98 |
+
+-----------------------------------------------------------------------------+------------+----------+----------+-------+---------+---------+
|
| 99 |
+
| Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
|
| 100 |
+
+=============================================================================+============+==========+==========+=======+=========+=========+
|
| 101 |
+
| gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25 | arena_hard | winrate | default | 500 | 0.3873 | default |
|
| 102 |
+
+-----------------------------------------------------------------------------+------------+----------+----------+-------+---------+---------+
|
| 103 |
+
|
| 104 |
+
2025-11-24 08:15:50 - evalscope - INFO: Finished evaluation for gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25 on ['arena_hard']
|
| 105 |
+
2025-11-24 08:15:50 - evalscope - INFO: Output directory: ./outputs/20251124_074215
|
arena_hard/20251124_074215/predictions/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard_default.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
arena_hard/20251124_074215/reports/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25@arena_hard",
|
| 3 |
+
"dataset_name": "arena_hard",
|
| 4 |
+
"dataset_pretty_name": "ArenaHard",
|
| 5 |
+
"dataset_description": "ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. It includes a set of challenging tasks that require reasoning, understanding, and generation capabilities. Currently not support `style-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-0314`.",
|
| 6 |
+
"model_name": "gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25",
|
| 7 |
+
"score": 0.3873,
|
| 8 |
+
"metrics": [
|
| 9 |
+
{
|
| 10 |
+
"name": "winrate",
|
| 11 |
+
"num": 500,
|
| 12 |
+
"score": 0.3873,
|
| 13 |
+
"macro_score": 0.3873,
|
| 14 |
+
"categories": [
|
| 15 |
+
{
|
| 16 |
+
"name": [
|
| 17 |
+
"default"
|
| 18 |
+
],
|
| 19 |
+
"num": 500,
|
| 20 |
+
"score": 0.3873,
|
| 21 |
+
"macro_score": 0.3873,
|
| 22 |
+
"subsets": [
|
| 23 |
+
{
|
| 24 |
+
"name": "default",
|
| 25 |
+
"score": 0.3873,
|
| 26 |
+
"num": 500
|
| 27 |
+
}
|
| 28 |
+
]
|
| 29 |
+
}
|
| 30 |
+
]
|
| 31 |
+
}
|
| 32 |
+
],
|
| 33 |
+
"analysis": "N/A"
|
| 34 |
+
}
|
arena_hard/20251124_074215/reviews/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard_default.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8b2e7ec803b6368df0780cf8939b3046b97fcecbbad5b990ad635aa3d0dcaa9
|
| 3 |
+
size 15526049
|