#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "inspect-ai @ git+https://github.com/dvsrepo/inspect_ai.git@fallback-to-modified-for-hf-fs",
#     "datasets",
#     "openai",
#     "transformers",
#     "accelerate",
#     "huggingface_hub",
#     "inspect-evals",
#     "pandas",
#     "pyarrow",
# ]
# ///
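"""Run an Inspect AI eval and publish its logs to a Hugging Face dataset repo.

Given an eval reference (a URL to an eval script, or an inspect_evals task
name when --inspect-evals is passed), the script runs it with `inspect eval`
(or `inspect eval-set` for a comma-separated model list), writes logs to an
hf:// log directory inside the dataset repo, and then exports the logs as
parquet files uploaded to the same repo.

Example invocation (hypothetical eval URL and repo name):

    HF_TOKEN=... uv run eval_runner.py https://example.com/my_eval.py openai/gpt-4o-mini username/my-eval-logs
"""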

import os
import sys
import subprocess
import tempfile
import urllib.request
from pathlib import Path

from inspect_ai.analysis import evals_df, samples_df


def export_logs_to_parquet(log_dir: str, dataset_repo: str) -> None:
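    """Flatten the Inspect logs in log_dir and upload them to dataset_repo.

    Writes two parquet files to the dataset repo: evals.parquet (one row per
    eval run) and samples.parquet (one row per sample). Requires the HF_TOKEN
    environment variable to be set.
    """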
    from huggingface_hub import HfApi

    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("HF_TOKEN environment variable not set")

    api = HfApi(token=hf_token)

    # upload_file() expects a bare repo id with repo_type="dataset", so strip
    # any leading "datasets/" namespace from the repo reference. removeprefix
    # only touches the prefix rather than every occurrence of the substring.
    repo_id = dataset_repo.removeprefix("datasets/")

    # Flatten the logs into dataframes: one row per eval run, one per sample.
    evals = evals_df(logs=log_dir)
    samples = samples_df(logs=log_dir)

    with tempfile.TemporaryDirectory() as tmpdir:
        evals_path = Path(tmpdir) / "evals.parquet"
        samples_path = Path(tmpdir) / "samples.parquet"

        evals.to_parquet(evals_path, index=False, engine="pyarrow")
        samples.to_parquet(samples_path, index=False, engine="pyarrow")

        api.upload_file(
            path_or_fileobj=str(evals_path),
            path_in_repo="evals.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            token=hf_token,
        )

        api.upload_file(
            path_or_fileobj=str(samples_path),
            path_in_repo="samples.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            token=hf_token,
        )


if __name__ == "__main__":
    if len(sys.argv) < 4:
        print(
            "Usage: eval_runner.py <eval_ref> <model> <dataset_repo> [--inspect-evals] [extra_args...]",
            file=sys.stderr,
        )
        sys.exit(1)

    eval_ref = sys.argv[1]
    model = sys.argv[2]
    dataset_repo = sys.argv[3]

    is_inspect_evals = "--inspect-evals" in sys.argv
    extra_args = [arg for arg in sys.argv[4:] if arg != "--inspect-evals"]

    if not dataset_repo.startswith("datasets/"):
        dataset_repo = f"datasets/{dataset_repo}"
    log_dir = f"hf://{dataset_repo}/logs"
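    # Note: Inspect resolves log dirs through fsspec, so the hf:// URL above
    # writes logs directly into the dataset repo; the pinned inspect-ai fork
    # in the script metadata appears to exist to support that scheme.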

    # A comma-separated --model value means multiple models, handled below by
    # `inspect eval-set` rather than `inspect eval`.
    is_eval_set = "," in model

    if is_inspect_evals:
        # eval_ref names a task from the inspect_evals package; nothing to download.
        eval_target = eval_ref
        cleanup_file = None
    else:
        # Otherwise eval_ref is a URL to a standalone eval script; fetch it to
        # a local file that is removed in the finally block below.
        print("Downloading eval script...")
        with urllib.request.urlopen(eval_ref) as response:
            eval_code = response.read().decode("utf-8")

        eval_filename = "downloaded_eval.py"
        with open(eval_filename, "w") as f:
            f.write(eval_code)

        eval_target = eval_filename
        cleanup_file = eval_filename

    try:
        # The two subcommands take identical flags here; only the verb differs.
        subcommand = "eval-set" if is_eval_set else "eval"
        print("Running evaluation set..." if is_eval_set else "Running evaluation...")
        cmd = [
            "inspect",
            subcommand,
            eval_target,
            "--model",
            model,
            "--log-dir",
            log_dir,
            "--log-shared",
            "--log-buffer",
            "100",
        ]
        cmd.extend(extra_args)

        subprocess.run(cmd, check=True)

        print("Exporting logs to parquet...")
        try:
            export_logs_to_parquet(log_dir, dataset_repo)
        except Exception as e:
            print(f"Warning: Could not export to parquet: {e}")

    finally:
        if cleanup_file and os.path.exists(cleanup_file):
            os.unlink(cleanup_file)