Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

.gitattributes +1 -0
arena_hard/20251124_074215/configs/task_config_2131fa.yaml +76 -0
arena_hard/20251124_074215/logs/eval_log.log +105 -0
arena_hard/20251124_074215/predictions/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard_default.jsonl +0 -0
arena_hard/20251124_074215/reports/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard.json +34 -0
arena_hard/20251124_074215/reviews/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard_default.jsonl +3 -0

.gitattributes CHANGED Viewed

@@ -61,3 +61,4 @@ arena_hard/20251119_191645/reviews/gpt-5/arena_hard_default.jsonl filter=lfs dif
 mnpo_iter3_armo_dpo_abl_scored.jsonl filter=lfs diff=lfs merge=lfs -text
 output_pre_mnpo.log filter=lfs diff=lfs merge=lfs -text
 arena_hard/20251123_220114/reviews/olmo-2-0325-32b-instruct/arena_hard_default.jsonl filter=lfs diff=lfs merge=lfs -text

 mnpo_iter3_armo_dpo_abl_scored.jsonl filter=lfs diff=lfs merge=lfs -text
 output_pre_mnpo.log filter=lfs diff=lfs merge=lfs -text
 arena_hard/20251123_220114/reviews/olmo-2-0325-32b-instruct/arena_hard_default.jsonl filter=lfs diff=lfs merge=lfs -text
+arena_hard/20251124_074215/reviews/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard_default.jsonl filter=lfs diff=lfs merge=lfs -text

arena_hard/20251124_074215/configs/task_config_2131fa.yaml ADDED Viewed

	@@ -0,0 +1,76 @@

+analysis_report: false
+api_url: http://127.0.0.1:8002/v1
+chat_template: null
+dataset_args:
+  arena_hard:
+    aggregation: mean
+    dataset_id: AI-ModelScope/arena-hard-auto-v0.1
+    default_subset: default
+    description: ArenaHard is a benchmark designed to evaluate the performance of
+      large language models in a competitive setting, where models are pitted against
+      each other in a series of tasks to determine their relative strengths and weaknesses.
+      It includes a set of challenging tasks that require reasoning, understanding,
+      and generation capabilities. Currently not support `style-controlled winrate`;
+      the official Judge model is `gpt-4-1106-preview`, while the baseline model is
+      `gpt-4-0314`.
+    eval_split: test
+    extra_params: {}
+    few_shot_num: 0
+    few_shot_prompt_template: null
+    few_shot_random: false
+    filters: null
+    metric_list:
+    - winrate
+    name: arena_hard
+    output_types:
+    - generation
+    pretty_name: ArenaHard
+    prompt_template: '{question}'
+    query_template: null
+    review_timeout: null
+    shuffle: false
+    shuffle_choices: false
+    subset_list:
+    - default
+    system_prompt: null
+    tags:
+    - InstructionFollowing
+    - Arena
+    train_split: null
+dataset_dir: /afs/.ir/users/f/a/fangwu97/.cache/modelscope/hub/datasets
+dataset_hub: modelscope
+datasets:
+- arena_hard
+debug: false
+eval_backend: Native
+eval_batch_size: 12
+eval_config: null
+eval_type: openai_api
+generation_config:
+  batch_size: 12
+  max_tokens: 4096
+ignore_errors: false
+judge_model_args:
+  api_key: '<REDACTED_OPENROUTER_API_KEY>'
+  api_url: https://openrouter.ai/api/v1
+  generation_config:
+    reasoning_effort: minimal
+  model_id: gpt-5-mini
+judge_strategy: auto
+judge_worker_num: 12
+limit: null
+model: gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25
+model_args: {}
+model_id: gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25
+model_task: text_generation
+repeats: 1
+rerun_review: false
+sandbox_config: {}
+sandbox_manager_config: {}
+sandbox_type: docker
+seed: 42
+stream: null
+timeout: null
+use_cache: null
+use_sandbox: false
+work_dir: ./outputs/20251124_074215

arena_hard/20251124_074215/logs/eval_log.log ADDED Viewed

	@@ -0,0 +1,105 @@

+2025-11-24 07:42:15 - evalscope - INFO: Creating model gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25 with eval_type=openai_api base_url=http://127.0.0.1:8002/v1, config={'batch_size': 12, 'max_tokens': 4096}, model_args={}
+2025-11-24 07:42:16 - evalscope - INFO: Dump task config to ./outputs/20251124_074215/configs/task_config_2131fa.yaml
+2025-11-24 07:42:16 - evalscope - INFO: {
+    "model": "gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25",
+    "model_id": "gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25",
+    "model_args": {},
+    "model_task": "text_generation",
+    "chat_template": null,
+    "datasets": [
+        "arena_hard"
+    ],
+    "dataset_args": {
+        "arena_hard": {
+            "name": "arena_hard",
+            "dataset_id": "AI-ModelScope/arena-hard-auto-v0.1",
+            "output_types": [
+                "generation"
+            ],
+            "subset_list": [
+                "default"
+            ],
+            "default_subset": "default",
+            "few_shot_num": 0,
+            "few_shot_random": false,
+            "train_split": null,
+            "eval_split": "test",
+            "prompt_template": "{question}",
+            "few_shot_prompt_template": null,
+            "system_prompt": null,
+            "query_template": null,
+            "pretty_name": "ArenaHard",
+            "description": "ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. It includes a set of challenging tasks that require reasoning, understanding, and generation capabilities. Currently not support `style-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-0314`.",
+            "tags": [
+                "InstructionFollowing",
+                "Arena"
+            ],
+            "filters": null,
+            "metric_list": [
+                "winrate"
+            ],
+            "aggregation": "mean",
+            "shuffle": false,
+            "shuffle_choices": false,
+            "review_timeout": null,
+            "extra_params": {}
+        }
+    },
+    "dataset_dir": "/afs/.ir/users/f/a/fangwu97/.cache/modelscope/hub/datasets",
+    "dataset_hub": "modelscope",
+    "repeats": 1,
+    "generation_config": {
+        "batch_size": 12,
+        "max_tokens": 4096
+    },
+    "eval_type": "openai_api",
+    "eval_backend": "Native",
+    "eval_config": null,
+    "limit": null,
+    "eval_batch_size": 12,
+    "use_cache": null,
+    "rerun_review": false,
+    "work_dir": "./outputs/20251124_074215",
+    "ignore_errors": false,
+    "debug": false,
+    "seed": 42,
+    "api_url": "http://127.0.0.1:8002/v1",
+    "timeout": null,
+    "stream": null,
+    "judge_strategy": "auto",
+    "judge_worker_num": 12,
+    "judge_model_args": {
+        "model_id": "gpt-5-mini",
+        "generation_config": {
+            "reasoning_effort": "minimal"
+        },
+        "api_url": "https://openrouter.ai/api/v1",
+        "api_key": "<REDACTED_OPENROUTER_API_KEY>"
+    },
+    "analysis_report": false,
+    "use_sandbox": false,
+    "sandbox_type": "docker",
+    "sandbox_manager_config": {},
+    "sandbox_config": {}
+}
+2025-11-24 07:45:59 - evalscope - INFO: Creating model gpt-5-mini with eval_type=openai_api base_url=https://openrouter.ai/api/v1, config={'reasoning_effort': 'minimal'}, model_args={}
+2025-11-24 08:15:49 - evalscope - INFO:
+arena_hard report table:
++-----------------------------------------------------------------------------+------------+----------+----------+-------+---------+---------+
+| Model                                                                       | Dataset    | Metric   | Subset   |   Num |   Score | Cat.0   |
++=============================================================================+============+==========+==========+=======+=========+=========+
+| gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25 | arena_hard | winrate  | default  |   500 |  0.3873 | default |
++-----------------------------------------------------------------------------+------------+----------+----------+-------+---------+---------+
+2025-11-24 08:15:49 - evalscope - INFO: Skipping report analysis (`analysis_report=False`).
+2025-11-24 08:15:49 - evalscope - INFO: Dump report to: ./outputs/20251124_074215/reports/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard.json
+2025-11-24 08:15:49 - evalscope - INFO: Overall report table:
++-----------------------------------------------------------------------------+------------+----------+----------+-------+---------+---------+
+| Model                                                                       | Dataset    | Metric   | Subset   |   Num |   Score | Cat.0   |
++=============================================================================+============+==========+==========+=======+=========+=========+
+| gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25 | arena_hard | winrate  | default  |   500 |  0.3873 | default |
++-----------------------------------------------------------------------------+------------+----------+----------+-------+---------+---------+
+2025-11-24 08:15:50 - evalscope - INFO: Finished evaluation for gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25 on ['arena_hard']
+2025-11-24 08:15:50 - evalscope - INFO: Output directory: ./outputs/20251124_074215

arena_hard/20251124_074215/predictions/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard_default.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

arena_hard/20251124_074215/reports/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+    "name": "gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25@arena_hard",
+    "dataset_name": "arena_hard",
+    "dataset_pretty_name": "ArenaHard",
+    "dataset_description": "ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. It includes a set of challenging tasks that require reasoning, understanding, and generation capabilities. Currently not support `style-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-0314`.",
+    "model_name": "gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25",
+    "score": 0.3873,
+    "metrics": [
+        {
+            "name": "winrate",
+            "num": 500,
+            "score": 0.3873,
+            "macro_score": 0.3873,
+            "categories": [
+                {
+                    "name": [
+                        "default"
+                    ],
+                    "num": 500,
+                    "score": 0.3873,
+                    "macro_score": 0.3873,
+                    "subsets": [
+                        {
+                            "name": "default",
+                            "score": 0.3873,
+                            "num": 500
+                        }
+                    ]
+                }
+            ]
+        }
+    ],
+    "analysis": "N/A"
+}

arena_hard/20251124_074215/reviews/gemma-2-9b-it_mnpo_stage_2_athene_beta1_ratio0.85_eta0.005_weights0.75-0.25/arena_hard_default.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d8b2e7ec803b6368df0780cf8939b3046b97fcecbbad5b990ad635aa3d0dcaa9
+size 15526049