"""Generate a small evaluation set of concrete ML-engineering task prompts.

Each entry of ``tasks`` is a prompt template with ``{X}`` placeholders. The
placeholders are filled from ``values``; ``task_limits`` controls, per
template, which placeholders are enumerated exhaustively ("pivots") and how
many combinations of the remaining placeholders are emitted for each pivot
combination. Running the file as a script pushes the result to the Hub.
"""

from itertools import islice, product

# Hub repository the generated dataset is pushed to.
HUB_REPO_ID = "akseljoonas/qyestions"

# Task templates (excluding Very hard difficulty)
tasks = [
    {
        "task": "Evaluate models {M} on benchmarks {B}",
        "difficulty": "Easy",
        "category": "Evaluation",
        "params": ["M", "B"],
    },
    {
        "task": "Train models {M} on datasets {D} evaluating them on benchmarks {B}",
        "difficulty": "Medium",
        "category": "Training",
        "params": ["M", "D", "B"],
    },
    {
        "task": "Run an ablation for hyperparameter {P} for model {M} on dataset {D}",
        "difficulty": "Hard",
        "category": "Ablation",
        "params": ["P", "M", "D"],
    },
    {
        "task": "Generate completions with model {M} on benchmarks {B} using engine {E}",
        "difficulty": "Medium",
        "category": "Generation",
        "params": ["M", "B", "E"],
    },
    # Disabled; re-enable together with the matching entry in `task_limits`.
    # {
    #     "task": "Merge models {M} using linear averaging to find the best result on benchmarks {B}",
    #     "difficulty": "Hard",
    #     "category": "Model Merging",
    #     "params": ["M", "B"],
    # },
    {
        "task": "Decontaminate dataset {D} against benchmarks {B}",
        "difficulty": "Hard",
        "category": "Data Processing",
        "params": ["D", "B"],
    },
    {
        "task": "Format dataset {D} for compatibility with framework {F} on task {T}",
        "difficulty": "Easy",
        "category": "Data Formatting",
        "params": ["D", "F", "T"],
    },
]

# Parameter values
values = {
    "M": [
        "Qwen/Qwen3-4B-Instruct-2507",
        "openai/gpt-oss-20b",
        "gpt-4o-mini",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "anthropic's latest model",
    ],
    "B": [
        "Idavidrein/gpqa",
        "HuggingFaceH4/MATH-500",
        "lighteval/SimpleQA",
        "TIGER-Lab/MMLU-Pro",
    ],
    "D": [
        "HuggingFaceH4/multi_turn_if",
        "HuggingFaceH4/ultrachat_200k",
        "HuggingFaceH4/AceReason-1.1-SFT config: math_no_think",
    ],
    "E": [
        "vllm",
        "sglang",
    ],
    "F": [
        "trl",
        "axolotl",
        "verl",
    ],
    "P": [
        "learning_rate",
        "batch_size",
        "num_epochs",
    ],
    "T": [
        "SFT",
        "GRPO",
    ],
}

# Task-specific instance limits, paired with `tasks` BY POSITION (entry i
# configures tasks[i]), so both lists must be kept in the same order and a
# disabled task must be disabled in both places.
#
# "pivot" is a parameter name or a list of names; every combination of the
# pivot values is generated. "instances_per_pivot" caps how many combinations
# of the remaining (non-pivot) parameters are emitted per pivot combination.
task_limits = [
    # Evaluation: 1 instance per benchmark
    {"pivot": "B", "instances_per_pivot": 1},
    # Training: up to 3 instances per (model, benchmark) pair
    {"pivot": ["M", "B"], "instances_per_pivot": 3},
    # Ablation: up to 3 instances per (hyperparameter, dataset) pair
    {"pivot": ["P", "D"], "instances_per_pivot": 3},
    # Generation: 2 instances per engine
    {"pivot": "E", "instances_per_pivot": 2},
    # Model Merging (disabled together with its template in `tasks`)
    # {"pivot": "M", "instances_per_pivot": 2},
    # Data Processing: 2 instances per dataset
    {"pivot": "D", "instances_per_pivot": 2},
    # Data Formatting: every parameter is a pivot, so exactly one instance is
    # produced per (dataset, framework, task) combination.
    {"pivot": ["D", "F", "T"], "instances_per_pivot": 2},
]


def build_eval_data(
    task_templates: list[dict],
    limits: list[dict],
    param_values: dict[str, list[str]],
) -> list[dict[str, str]]:
    """Expand task templates into concrete task instances.

    Args:
        task_templates: Dicts with the keys "task" (a ``str.format`` template),
            "difficulty", "category" and "params" (placeholder names used by
            the template).
        limits: One dict per template, in the same order, with the keys
            "pivot" (a parameter name or list of names) and
            "instances_per_pivot".
        param_values: Maps each parameter name to its candidate values.

    Returns:
        A list of dicts with the keys "task", "difficulty" and "category",
        ordered by template, then pivot combination, then non-pivot
        combination.

    Raises:
        ValueError: If the two lists differ in length, or a pivot parameter is
            not one of its template's params.
        KeyError: If a parameter has no entry in ``param_values``.
    """
    if len(task_templates) != len(limits):
        raise ValueError(
            f"Expected one limit per task template, got {len(task_templates)} "
            f"templates and {len(limits)} limits"
        )

    eval_data = []
    for task_dict, limit_config in zip(task_templates, limits):
        template = task_dict["task"]
        params = task_dict["params"]

        # Normalize pivot to list
        pivot_params = limit_config["pivot"]
        if isinstance(pivot_params, str):
            pivot_params = [pivot_params]

        # A pivot the template does not use would only multiply identical rows.
        unknown = [p for p in pivot_params if p not in params]
        if unknown:
            raise ValueError(
                f"Pivot parameter(s) {unknown} are not params of task {template!r}"
            )

        # The non-pivot combinations do not depend on the pivot values, so
        # compute the capped prefix once per template. With no non-pivot
        # parameters, product() yields a single empty tuple, i.e. one instance
        # per pivot combination.
        other_params = [p for p in params if p not in pivot_params]
        limited_combinations = list(
            islice(
                product(*(param_values[p] for p in other_params)),
                limit_config["instances_per_pivot"],
            )
        )

        for pivot_combo in product(*(param_values[p] for p in pivot_params)):
            for combo in limited_combinations:
                # Build kwargs with pivot values and other values
                kwargs = dict(zip(pivot_params, pivot_combo))
                kwargs.update(zip(other_params, combo))
                eval_data.append(
                    {
                        "task": template.format(**kwargs),
                        "difficulty": task_dict["difficulty"],
                        "category": task_dict["category"],
                    }
                )
    return eval_data


def main():
    """Generate the task instances, print a summary and push them to the Hub."""
    # Imported here (first, so a missing dependency fails before any work) to
    # keep the generation logic importable without the `datasets` package.
    from datasets import Dataset

    eval_data = build_eval_data(tasks, task_limits, values)
    print(f"Generated {len(eval_data)} instances from {len(tasks)} templates")

    dataset = Dataset.from_list(eval_data)
    print(f"\nDataset: {len(dataset)} rows")
    print(f"Sample: {dataset[0]['task']}")

    dataset.push_to_hub(HUB_REPO_ID, private=False)
    print(f"\n✓ Pushed to {HUB_REPO_ID}")


if __name__ == "__main__":
    main()