File size: 5,112 Bytes
8cfacd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from itertools import product

from datasets import Dataset

# Task templates (excluding Very hard difficulty).
# Each template's {X} placeholders are filled from the `values` dict below;
# "params" lists which placeholder letters the template uses.
# NOTE: this list is positionally parallel to `task_limits` — keep entries
# (including the commented-out "Model Merging" one) aligned between the two.
tasks = [
    {
        "task": "Evaluate models {M} on benchmarks {B}",
        "difficulty": "Easy",
        "category": "Evaluation",
        "params": ["M", "B"],
    },
    {
        "task": "Train models {M} on datasets {D} evaluating them on benchmarks {B}",
        "difficulty": "Medium",
        "category": "Training",
        "params": ["M", "D", "B"],
    },
    {
        "task": "Run an ablation for hyperparameter {P} for model {M} on dataset {D}",
        "difficulty": "Hard",
        "category": "Ablation",
        "params": ["P", "M", "D"],
    },
    {
        "task": "Generate completions with model {M} on benchmarks {B} using engine {E}",
        "difficulty": "Medium",
        "category": "Generation",
        "params": ["M", "B", "E"],
    },
    # {
    #     "task": "Merge models {M} using linear averaging to find the best result on benchmarks {B}",
    #     "difficulty": "Hard",
    #     "category": "Model Merging",
    #     "params": ["M", "B"],
    # },
    {
        "task": "Decontaminate dataset {D} against benchmarks {B}",
        "difficulty": "Hard",
        "category": "Data Processing",
        "params": ["D", "B"],
    },
    {
        "task": "Format dataset {D} for compatibility with framework {F} on task {T}",
        "difficulty": "Easy",
        "category": "Data Formatting",
        "params": ["D", "F", "T"],
    },
]

# Parameter values: candidate fill-ins for each template placeholder letter.
values = {
    # M — model identifiers (Hub repo ids or informal names)
    "M": [
        "Qwen/Qwen3-4B-Instruct-2507",
        "openai/gpt-oss-20b",
        "gpt-4o-mini",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "anthropic's latest model",
    ],
    # B — evaluation benchmarks
    "B": [
        "Idavidrein/gpqa",
        "HuggingFaceH4/MATH-500",
        "lighteval/SimpleQA",
        "TIGER-Lab/MMLU-Pro",
    ],
    # D — training/processing datasets
    "D": [
        "HuggingFaceH4/multi_turn_if",
        "HuggingFaceH4/ultrachat_200k",
        "HuggingFaceH4/AceReason-1.1-SFT config: math_no_think",
    ],
    # E — inference engines for generation
    "E": [
        "vllm",
        "sglang",
    ],
    # F — training frameworks
    "F": [
        "trl",
        "axolotl",
        "verl",
    ],
    # P — hyperparameters to ablate
    "P": [
        "learning_rate",
        "batch_size",
        "num_epochs",
    ],
    # T — fine-tuning task types
    "T": [
        "SFT",
        "GRPO",
    ],
}

# Task-specific instance limits.
# For each task, specify which parameter(s) to pivot on and how many instances
# per pivot combination; "pivot" may be a single parameter letter or a list.
# NOTE: positionally parallel to `tasks` above — keep entries (including the
# commented-out "Model Merging" one) aligned between the two lists.
task_limits = [
    {"pivot": "B", "instances_per_pivot": 1},  # Evaluation: 1 instance per benchmark
    {"pivot": ["M", "B"], "instances_per_pivot": 3},  # Training: 3 per (model, benchmark) pair
    {"pivot": ["P", "D"], "instances_per_pivot": 3},  # Ablation: 3 per (hyperparameter, dataset) pair
    {"pivot": "E", "instances_per_pivot": 2},  # Generation: 2 instances per engine
    # {"pivot": "M", "instances_per_pivot": 2},  # Model Merging (disabled with its task above)
    {"pivot": "D", "instances_per_pivot": 2},  # Data Processing: 2 instances per dataset
    {"pivot": ["D", "F", "T"], "instances_per_pivot": 2},  # Data Formatting: 2 per (dataset, framework, task) triple
]


def main():
    """Instantiate every task template and push the result to the Hub.

    For each template, all combinations of its pivot parameter values are
    enumerated; for every pivot combination, at most ``instances_per_pivot``
    combinations of the remaining (non-pivot) parameters are instantiated.
    The concrete rows are collected into a `datasets.Dataset` and pushed.
    """
    eval_data = []

    # `tasks` and `task_limits` are parallel lists; zip keeps them aligned
    # without manual index bookkeeping.
    for task_dict, limit_config in zip(tasks, task_limits):
        template = task_dict["task"]
        params = task_dict["params"]

        pivot_params = limit_config["pivot"]
        instances_per_pivot = limit_config["instances_per_pivot"]

        # Normalize a single-letter pivot to a list so both forms are
        # handled uniformly below.
        if isinstance(pivot_params, str):
            pivot_params = [pivot_params]

        # The non-pivot combinations do not depend on the current pivot
        # combination, so compute (and truncate) them once per task instead
        # of once per pivot combination.
        other_params = [p for p in params if p not in pivot_params]
        other_combinations = list(product(*(values[p] for p in other_params)))
        # Deterministic truncation: the same first `instances_per_pivot`
        # non-pivot combinations are used for every pivot combination.
        limited_combinations = other_combinations[:instances_per_pivot]

        # Cartesian product over the pivot parameters' value lists.
        for pivot_combo in product(*(values[p] for p in pivot_params)):
            for combo in limited_combinations:
                # Merge pivot and non-pivot values into the format kwargs.
                kwargs = dict(zip(pivot_params, pivot_combo))
                kwargs.update(zip(other_params, combo))

                eval_data.append(
                    {
                        "task": template.format(**kwargs),
                        "difficulty": task_dict["difficulty"],
                        "category": task_dict["category"],
                    }
                )

    print(f"Generated {len(eval_data)} instances from {len(tasks)} templates")

    dataset = Dataset.from_list(eval_data)
    print(f"\nDataset: {len(dataset)} rows")
    print(f"Sample: {dataset[0]['task']}")

    # NOTE(review): repo id "qyestions" looks like a typo for "questions" —
    # confirm before renaming, since the existing Hub repo uses this id.
    dataset.push_to_hub("akseljoonas/qyestions", private=False)
    print("\n✓ Pushed to akseljoonas/qyestions")


# Standard script entry guard: only run when executed directly, not on import.
if __name__ == "__main__":
    main()