File size: 5,112 Bytes
8cfacd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from itertools import product

from datasets import Dataset

# Task templates (excluding Very hard difficulty).
# Each template's {X} placeholders are filled from the `values` dict below;
# "params" lists which placeholder letters the template uses.
# NOTE: this list is positionally parallel to `task_limits` — keep entries
# (including the commented-out "Model Merging" one) aligned between the two.
tasks = [
    {
        "task": "Evaluate models {M} on benchmarks {B}",
        "difficulty": "Easy",
        "category": "Evaluation",
        "params": ["M", "B"],
    },
    {
        "task": "Train models {M} on datasets {D} evaluating them on benchmarks {B}",
        "difficulty": "Medium",
        "category": "Training",
        "params": ["M", "D", "B"],
    },
    {
        "task": "Run an ablation for hyperparameter {P} for model {M} on dataset {D}",
        "difficulty": "Hard",
        "category": "Ablation",
        "params": ["P", "M", "D"],
    },
    {
        "task": "Generate completions with model {M} on benchmarks {B} using engine {E}",
        "difficulty": "Medium",
        "category": "Generation",
        "params": ["M", "B", "E"],
    },
    # {
    #     "task": "Merge models {M} using linear averaging to find the best result on benchmarks {B}",
    #     "difficulty": "Hard",
    #     "category": "Model Merging",
    #     "params": ["M", "B"],
    # },
    {
        "task": "Decontaminate dataset {D} against benchmarks {B}",
        "difficulty": "Hard",
        "category": "Data Processing",
        "params": ["D", "B"],
    },
    {
        "task": "Format dataset {D} for compatibility with framework {F} on task {T}",
        "difficulty": "Easy",
        "category": "Data Formatting",
        "params": ["D", "F", "T"],
    },
]

# Parameter values: candidate fill-ins for each template placeholder letter.
values = {
    # M — model identifiers (Hub repo ids or informal names)
    "M": [
        "Qwen/Qwen3-4B-Instruct-2507",
        "openai/gpt-oss-20b",
        "gpt-4o-mini",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "anthropic's latest model",
    ],
    # B — evaluation benchmarks
    "B": [
        "Idavidrein/gpqa",
        "HuggingFaceH4/MATH-500",
        "lighteval/SimpleQA",
        "TIGER-Lab/MMLU-Pro",
    ],
    # D — training/processing datasets
    "D": [
        "HuggingFaceH4/multi_turn_if",
        "HuggingFaceH4/ultrachat_200k",
        "HuggingFaceH4/AceReason-1.1-SFT config: math_no_think",
    ],
    # E — inference engines for generation
    "E": [
        "vllm",
        "sglang",
    ],
    # F — training frameworks
    "F": [
        "trl",
        "axolotl",
        "verl",
    ],
    # P — hyperparameters to ablate
    "P": [
        "learning_rate",
        "batch_size",
        "num_epochs",
    ],
    # T — fine-tuning task types
    "T": [
        "SFT",
        "GRPO",
    ],
}

# Task-specific instance limits.
# For each task, specify which parameter(s) to pivot on and how many instances
# per pivot combination; "pivot" may be a single parameter letter or a list.
# NOTE: positionally parallel to `tasks` above — keep entries (including the
# commented-out "Model Merging" one) aligned between the two lists.
task_limits = [
    {"pivot": "B", "instances_per_pivot": 1},  # Evaluation: 1 instance per benchmark
    {"pivot": ["M", "B"], "instances_per_pivot": 3},  # Training: 3 per (model, benchmark) pair
    {"pivot": ["P", "D"], "instances_per_pivot": 3},  # Ablation: 3 per (hyperparameter, dataset) pair
    {"pivot": "E", "instances_per_pivot": 2},  # Generation: 2 instances per engine
    # {"pivot": "M", "instances_per_pivot": 2},  # Model Merging (disabled with its task above)
    {"pivot": "D", "instances_per_pivot": 2},  # Data Processing: 2 instances per dataset
    {"pivot": ["D", "F", "T"], "instances_per_pivot": 2},  # Data Formatting: 2 per (dataset, framework, task) triple
]


def main():
    """Instantiate every task template and push the result to the Hub.

    For each template, all combinations of its pivot parameter values are
    enumerated; for every pivot combination, at most ``instances_per_pivot``
    combinations of the remaining (non-pivot) parameters are instantiated.
    The concrete rows are collected into a `datasets.Dataset` and pushed.
    """
    eval_data = []

    # `tasks` and `task_limits` are parallel lists; zip keeps them aligned
    # without manual index bookkeeping.
    for task_dict, limit_config in zip(tasks, task_limits):
        template = task_dict["task"]
        params = task_dict["params"]

        pivot_params = limit_config["pivot"]
        instances_per_pivot = limit_config["instances_per_pivot"]

        # Normalize a single-letter pivot to a list so both forms are
        # handled uniformly below.
        if isinstance(pivot_params, str):
            pivot_params = [pivot_params]

        # The non-pivot combinations do not depend on the current pivot
        # combination, so compute (and truncate) them once per task instead
        # of once per pivot combination.
        other_params = [p for p in params if p not in pivot_params]
        other_combinations = list(product(*(values[p] for p in other_params)))
        # Deterministic truncation: the same first `instances_per_pivot`
        # non-pivot combinations are used for every pivot combination.
        limited_combinations = other_combinations[:instances_per_pivot]

        # Cartesian product over the pivot parameters' value lists.
        for pivot_combo in product(*(values[p] for p in pivot_params)):
            for combo in limited_combinations:
                # Merge pivot and non-pivot values into the format kwargs.
                kwargs = dict(zip(pivot_params, pivot_combo))
                kwargs.update(zip(other_params, combo))

                eval_data.append(
                    {
                        "task": template.format(**kwargs),
                        "difficulty": task_dict["difficulty"],
                        "category": task_dict["category"],
                    }
                )

    print(f"Generated {len(eval_data)} instances from {len(tasks)} templates")

    dataset = Dataset.from_list(eval_data)
    print(f"\nDataset: {len(dataset)} rows")
    print(f"Sample: {dataset[0]['task']}")

    # NOTE(review): repo id "qyestions" looks like a typo for "questions" —
    # confirm before renaming, since the existing Hub repo uses this id.
    dataset.push_to_hub("akseljoonas/qyestions", private=False)
    print("\n✓ Pushed to akseljoonas/qyestions")


# Standard script entry guard: only run when executed directly, not on import.
if __name__ == "__main__":
    main()