XuHuang committed on
Commit
a05d5b6
·
verified ·
1 Parent(s): 215737a

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -61,3 +61,4 @@ arena_hard/20251119_191645/reviews/gpt-5/arena_hard_default.jsonl filter=lfs dif
61
  mnpo_iter3_armo_dpo_abl_scored.jsonl filter=lfs diff=lfs merge=lfs -text
62
  output_pre_mnpo.log filter=lfs diff=lfs merge=lfs -text
63
  arena_hard/20251123_220114/reviews/olmo-2-0325-32b-instruct/arena_hard_default.jsonl filter=lfs diff=lfs merge=lfs -text
 
 
61
  mnpo_iter3_armo_dpo_abl_scored.jsonl filter=lfs diff=lfs merge=lfs -text
62
  output_pre_mnpo.log filter=lfs diff=lfs merge=lfs -text
63
  arena_hard/20251123_220114/reviews/olmo-2-0325-32b-instruct/arena_hard_default.jsonl filter=lfs diff=lfs merge=lfs -text
64
+ arena_hard/20251124_074215/reviews/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard_default.jsonl filter=lfs diff=lfs merge=lfs -text
arena_hard/20251124_074215/configs/task_config_2131fa.yaml ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ analysis_report: false
2
+ api_url: http://127.0.0.1:8002/v1
3
+ chat_template: null
4
+ dataset_args:
5
+ arena_hard:
6
+ aggregation: mean
7
+ dataset_id: AI-ModelScope/arena-hard-auto-v0.1
8
+ default_subset: default
9
+ description: ArenaHard is a benchmark designed to evaluate the performance of
10
+ large language models in a competitive setting, where models are pitted against
11
+ each other in a series of tasks to determine their relative strengths and weaknesses.
12
+ It includes a set of challenging tasks that require reasoning, understanding,
13
+ and generation capabilities. Currently not support `style-controlled winrate`;
14
+ the official Judge model is `gpt-4-1106-preview`, while the baseline model is
15
+ `gpt-4-0314`.
16
+ eval_split: test
17
+ extra_params: {}
18
+ few_shot_num: 0
19
+ few_shot_prompt_template: null
20
+ few_shot_random: false
21
+ filters: null
22
+ metric_list:
23
+ - winrate
24
+ name: arena_hard
25
+ output_types:
26
+ - generation
27
+ pretty_name: ArenaHard
28
+ prompt_template: '{question}'
29
+ query_template: null
30
+ review_timeout: null
31
+ shuffle: false
32
+ shuffle_choices: false
33
+ subset_list:
34
+ - default
35
+ system_prompt: null
36
+ tags:
37
+ - InstructionFollowing
38
+ - Arena
39
+ train_split: null
40
+ dataset_dir: /afs/.ir/users/f/a/fangwu97/.cache/modelscope/hub/datasets
41
+ dataset_hub: modelscope
42
+ datasets:
43
+ - arena_hard
44
+ debug: false
45
+ eval_backend: Native
46
+ eval_batch_size: 12
47
+ eval_config: null
48
+ eval_type: openai_api
49
+ generation_config:
50
+ batch_size: 12
51
+ max_tokens: 4096
52
+ ignore_errors: false
53
+ judge_model_args:
54
+ api_key: REDACTED
55
+ api_url: https://openrouter.ai/api/v1
56
+ generation_config:
57
+ reasoning_effort: minimal
58
+ model_id: gpt-5-mini
59
+ judge_strategy: auto
60
+ judge_worker_num: 12
61
+ limit: null
62
+ model: gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25
63
+ model_args: {}
64
+ model_id: gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25
65
+ model_task: text_generation
66
+ repeats: 1
67
+ rerun_review: false
68
+ sandbox_config: {}
69
+ sandbox_manager_config: {}
70
+ sandbox_type: docker
71
+ seed: 42
72
+ stream: null
73
+ timeout: null
74
+ use_cache: null
75
+ use_sandbox: false
76
+ work_dir: ./outputs/20251124_074215
arena_hard/20251124_074215/logs/eval_log.log ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-11-24 07:42:15 - evalscope - INFO: Creating model gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25 with eval_type=openai_api base_url=http://127.0.0.1:8002/v1, config={'batch_size': 12, 'max_tokens': 4096}, model_args={}
2
+ 2025-11-24 07:42:16 - evalscope - INFO: Dump task config to ./outputs/20251124_074215/configs/task_config_2131fa.yaml
3
+ 2025-11-24 07:42:16 - evalscope - INFO: {
4
+ "model": "gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25",
5
+ "model_id": "gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25",
6
+ "model_args": {},
7
+ "model_task": "text_generation",
8
+ "chat_template": null,
9
+ "datasets": [
10
+ "arena_hard"
11
+ ],
12
+ "dataset_args": {
13
+ "arena_hard": {
14
+ "name": "arena_hard",
15
+ "dataset_id": "AI-ModelScope/arena-hard-auto-v0.1",
16
+ "output_types": [
17
+ "generation"
18
+ ],
19
+ "subset_list": [
20
+ "default"
21
+ ],
22
+ "default_subset": "default",
23
+ "few_shot_num": 0,
24
+ "few_shot_random": false,
25
+ "train_split": null,
26
+ "eval_split": "test",
27
+ "prompt_template": "{question}",
28
+ "few_shot_prompt_template": null,
29
+ "system_prompt": null,
30
+ "query_template": null,
31
+ "pretty_name": "ArenaHard",
32
+ "description": "ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. It includes a set of challenging tasks that require reasoning, understanding, and generation capabilities. Currently not support `style-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-0314`.",
33
+ "tags": [
34
+ "InstructionFollowing",
35
+ "Arena"
36
+ ],
37
+ "filters": null,
38
+ "metric_list": [
39
+ "winrate"
40
+ ],
41
+ "aggregation": "mean",
42
+ "shuffle": false,
43
+ "shuffle_choices": false,
44
+ "review_timeout": null,
45
+ "extra_params": {}
46
+ }
47
+ },
48
+ "dataset_dir": "/afs/.ir/users/f/a/fangwu97/.cache/modelscope/hub/datasets",
49
+ "dataset_hub": "modelscope",
50
+ "repeats": 1,
51
+ "generation_config": {
52
+ "batch_size": 12,
53
+ "max_tokens": 4096
54
+ },
55
+ "eval_type": "openai_api",
56
+ "eval_backend": "Native",
57
+ "eval_config": null,
58
+ "limit": null,
59
+ "eval_batch_size": 12,
60
+ "use_cache": null,
61
+ "rerun_review": false,
62
+ "work_dir": "./outputs/20251124_074215",
63
+ "ignore_errors": false,
64
+ "debug": false,
65
+ "seed": 42,
66
+ "api_url": "http://127.0.0.1:8002/v1",
67
+ "timeout": null,
68
+ "stream": null,
69
+ "judge_strategy": "auto",
70
+ "judge_worker_num": 12,
71
+ "judge_model_args": {
72
+ "model_id": "gpt-5-mini",
73
+ "generation_config": {
74
+ "reasoning_effort": "minimal"
75
+ },
76
+ "api_url": "https://openrouter.ai/api/v1",
77
+ "api_key": "REDACTED"
78
+ },
79
+ "analysis_report": false,
80
+ "use_sandbox": false,
81
+ "sandbox_type": "docker",
82
+ "sandbox_manager_config": {},
83
+ "sandbox_config": {}
84
+ }
85
+ 2025-11-24 07:45:59 - evalscope - INFO: Creating model gpt-5-mini with eval_type=openai_api base_url=https://openrouter.ai/api/v1, config={'reasoning_effort': 'minimal'}, model_args={}
86
+ 2025-11-24 08:15:49 - evalscope - INFO:
87
+ arena_hard report table:
88
+ +-----------------------------------------------------------------------------+------------+----------+----------+-------+---------+---------+
89
+ | Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
90
+ +=============================================================================+============+==========+==========+=======+=========+=========+
91
+ | gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25 | arena_hard | winrate | default | 500 | 0.3873 | default |
92
+ +-----------------------------------------------------------------------------+------------+----------+----------+-------+---------+---------+
93
+
94
+ 2025-11-24 08:15:49 - evalscope - INFO: Skipping report analysis (`analysis_report=False`).
95
+ 2025-11-24 08:15:49 - evalscope - INFO: Dump report to: ./outputs/20251124_074215/reports/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard.json
96
+
97
+ 2025-11-24 08:15:49 - evalscope - INFO: Overall report table:
98
+ +-----------------------------------------------------------------------------+------------+----------+----------+-------+---------+---------+
99
+ | Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
100
+ +=============================================================================+============+==========+==========+=======+=========+=========+
101
+ | gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25 | arena_hard | winrate | default | 500 | 0.3873 | default |
102
+ +-----------------------------------------------------------------------------+------------+----------+----------+-------+---------+---------+
103
+
104
+ 2025-11-24 08:15:50 - evalscope - INFO: Finished evaluation for gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25 on ['arena_hard']
105
+ 2025-11-24 08:15:50 - evalscope - INFO: Output directory: ./outputs/20251124_074215
arena_hard/20251124_074215/predictions/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard_default.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
arena_hard/20251124_074215/reports/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25@arena_hard",
3
+ "dataset_name": "arena_hard",
4
+ "dataset_pretty_name": "ArenaHard",
5
+ "dataset_description": "ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. It includes a set of challenging tasks that require reasoning, understanding, and generation capabilities. Currently not support `style-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-0314`.",
6
+ "model_name": "gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25",
7
+ "score": 0.3873,
8
+ "metrics": [
9
+ {
10
+ "name": "winrate",
11
+ "num": 500,
12
+ "score": 0.3873,
13
+ "macro_score": 0.3873,
14
+ "categories": [
15
+ {
16
+ "name": [
17
+ "default"
18
+ ],
19
+ "num": 500,
20
+ "score": 0.3873,
21
+ "macro_score": 0.3873,
22
+ "subsets": [
23
+ {
24
+ "name": "default",
25
+ "score": 0.3873,
26
+ "num": 500
27
+ }
28
+ ]
29
+ }
30
+ ]
31
+ }
32
+ ],
33
+ "analysis": "N/A"
34
+ }
arena_hard/20251124_074215/reviews/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard_default.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8b2e7ec803b6368df0780cf8939b3046b97fcecbbad5b990ad635aa3d0dcaa9
3
+ size 15526049