File size: 1,005 Bytes
d31c88f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
"""Inspect eval for MIDI caps benchmark."""
from typing import Any
from inspect_ai import Task, task
from inspect_ai.dataset import Sample, hf_dataset
from inspect_ai.model import ChatMessageUser, ContentText
from inspect_ai.scorer import model_graded_fact
from inspect_ai.solver import generate


@task
def midicaps_eval():
    """Assemble the Inspect task for the MIDI caps benchmark.

    The task reads the ``small_test`` split of the Hugging Face dataset
    ``dvilasuero/midicaps_benchmark`` in shuffled order, turning every
    record into a sample via ``record_to_sample``. Samples are solved with
    a plain ``generate()`` step and scored by ``model_graded_fact`` with
    partial credit enabled, using the grader model named below.

    Returns:
        The configured ``Task``.
    """
    # Built in the same order the pieces are consumed: data, solver, scorer.
    benchmark_data = hf_dataset(
        path="dvilasuero/midicaps_benchmark",
        split="small_test",
        sample_fields=record_to_sample,
        shuffle=True,
    )
    solve_step = generate()
    grader = model_graded_fact(
        partial_credit=True,
        model="hf-inference-providers/Qwen/Qwen3-32B:fastest",
    )
    return Task(dataset=benchmark_data, solver=solve_step, scorer=grader)


def record_to_sample(record: dict[str, Any]) -> Sample:
    """Turn one dataset record into a ``Sample``.

    Args:
        record: A dataset row; its ``"caption"`` and
            ``"condensed_sequence"`` entries are read.

    Returns:
        A ``Sample`` whose input is a single user message holding the
        caption as its only text part, and whose target is the record's
        ``"condensed_sequence"`` value, passed through unchanged.

    Raises:
        KeyError: If either entry is missing from ``record``.
    """
    caption_part = ContentText(text=record["caption"])
    user_turn = ChatMessageUser(content=[caption_part])
    return Sample(input=[user_turn], target=record["condensed_sequence"])