# NOTE: web-viewer artifacts (UI labels, line-number gutter) removed from this
# scraped copy; original file: 5,112 bytes, commit 8cfacd3.
from itertools import product
from datasets import Dataset
# Task templates (excluding Very hard difficulty).
# Each entry holds a format-string template whose {placeholders} are filled
# from `values`; "params" lists the placeholder names the template uses.
tasks = [
    {
        "task": "Evaluate models {M} on benchmarks {B}",
        "difficulty": "Easy",
        "category": "Evaluation",
        "params": ["M", "B"],
    },
    {
        "task": "Train models {M} on datasets {D} evaluating them on benchmarks {B}",
        "difficulty": "Medium",
        "category": "Training",
        "params": ["M", "D", "B"],
    },
    {
        "task": "Run an ablation for hyperparameter {P} for model {M} on dataset {D}",
        "difficulty": "Hard",
        "category": "Ablation",
        "params": ["P", "M", "D"],
    },
    {
        "task": "Generate completions with model {M} on benchmarks {B} using engine {E}",
        "difficulty": "Medium",
        "category": "Generation",
        "params": ["M", "B", "E"],
    },
    # Disabled template — its matching entry in `task_limits` is also
    # commented out, so the index alignment between the two lists is kept.
    # {
    #     "task": "Merge models {M} using linear averaging to find the best result on benchmarks {B}",
    #     "difficulty": "Hard",
    #     "category": "Model Merging",
    #     "params": ["M", "B"],
    # },
    {
        "task": "Decontaminate dataset {D} against benchmarks {B}",
        "difficulty": "Hard",
        "category": "Data Processing",
        "params": ["D", "B"],
    },
    {
        "task": "Format dataset {D} for compatibility with framework {F} on task {T}",
        "difficulty": "Easy",
        "category": "Data Formatting",
        "params": ["D", "F", "T"],
    },
]
# Parameter values: candidate substitutions for each template placeholder.
values = {
    # M — model identifiers (Hub repo ids or informal names).
    "M": [
        "Qwen/Qwen3-4B-Instruct-2507",
        "openai/gpt-oss-20b",
        "gpt-4o-mini",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "anthropic's latest model",
    ],
    # B — evaluation benchmark dataset ids.
    "B": [
        "Idavidrein/gpqa",
        "HuggingFaceH4/MATH-500",
        "lighteval/SimpleQA",
        "TIGER-Lab/MMLU-Pro",
    ],
    # D — training / processing dataset ids.
    "D": [
        "HuggingFaceH4/multi_turn_if",
        "HuggingFaceH4/ultrachat_200k",
        "HuggingFaceH4/AceReason-1.1-SFT config: math_no_think",
    ],
    # E — inference engines.
    "E": [
        "vllm",
        "sglang",
    ],
    # F — training frameworks.
    "F": [
        "trl",
        "axolotl",
        "verl",
    ],
    # P — hyperparameters to ablate.
    "P": [
        "learning_rate",
        "batch_size",
        "num_epochs",
    ],
    # T — training task types.
    "T": [
        "SFT",
        "GRPO",
    ],
}
# Task-specific instance limits, index-aligned with `tasks`.
# For each task, "pivot" names the parameter(s) to enumerate exhaustively
# (a single parameter string or a list); "instances_per_pivot" caps how many
# combinations of the remaining parameters are emitted per pivot combination.
task_limits = [
    {"pivot": "B", "instances_per_pivot": 1},  # Task 0 (Evaluation): 1 per benchmark
    {"pivot": ["M", "B"], "instances_per_pivot": 3},  # Task 1 (Training): 3 per (model, benchmark)
    {"pivot": ["P", "D"], "instances_per_pivot": 3},  # Task 2 (Ablation): 3 per (hyperparam, dataset)
    {"pivot": "E", "instances_per_pivot": 2},  # Task 3 (Generation): 2 per engine
    # {"pivot": "M", "instances_per_pivot": 2},  # disabled with the Model Merging task above
    {"pivot": "D", "instances_per_pivot": 2},  # Task 4 (Data Processing): 2 per dataset
    # Task 5 (Data Formatting): all params are pivots, so there is exactly one
    # (empty) non-pivot combination — the limit of 2 has no effect here.
    {"pivot": ["D", "F", "T"], "instances_per_pivot": 2},
]
def main():
eval_data = []
for task_idx, task_dict in enumerate(tasks):
template = task_dict["task"]
params = task_dict["params"]
limit_config = task_limits[task_idx]
pivot_params = limit_config["pivot"]
instances_per_pivot = limit_config["instances_per_pivot"]
# Normalize pivot to list
if isinstance(pivot_params, str):
pivot_params = [pivot_params]
# Get all combinations of pivot values
pivot_param_values = [values[p] for p in pivot_params]
pivot_combinations = product(*pivot_param_values)
# For each pivot combination, generate limited instances
for pivot_combo in pivot_combinations:
# Get combinations of other (non-pivot) parameters
other_params = [p for p in params if p not in pivot_params]
other_param_values = [values[p] for p in other_params]
other_combinations = list(product(*other_param_values))
# Limit to specified number of instances per pivot combination
limited_combinations = other_combinations[:instances_per_pivot]
# Generate instances
for combo in limited_combinations:
# Build kwargs with pivot values and other values
kwargs = dict(zip(pivot_params, pivot_combo))
kwargs.update(dict(zip(other_params, combo)))
concrete_task = template.format(**kwargs)
eval_data.append(
{
"task": concrete_task,
"difficulty": task_dict["difficulty"],
"category": task_dict["category"],
}
)
print(f"Generated {len(eval_data)} instances from {len(tasks)} templates")
dataset = Dataset.from_list(eval_data)
print(f"\nDataset: {len(dataset)} rows")
print(f"Sample: {dataset[0]['task']}")
dataset.push_to_hub("akseljoonas/qyestions", private=False)
print("\n✓ Pushed to akseljoonas/qyestions")
if __name__ == "__main__":
main()