File size: 2,290 Bytes
8026e0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python3
"""
Fix HF dataset README metadata for the 11 test300 selected-tools datasets.
The push_to_hub() updated the parquet (13 cols) but not the README (still 10 cols).
This script updates the YAML front matter in each README to include the 3 new columns.
"""
import os, sys

# Redirect the Hugging Face cache directory. The assignment sits above the
# huggingface_hub import -- presumably so the library picks up HF_HOME when
# it is imported; TODO confirm before reordering these lines.
os.environ["HF_HOME"] = "/scratch/hc3337/.cache/huggingface"

from huggingface_hub import HfApi, DatasetCard

# Hub repo ids of the datasets whose README metadata gets patched. Every id
# has the form "<prefix>-<variant>-v1"; only the variant part differs.
_REPO_PREFIX = "timchen0618/browsecomp-plus-sel-tools-test300"
_REPO_VARIANTS = (
    "gpt-oss-120b",
    "gpt-oss-120b-less-chars",
    "gemini-2p5-pro",
    "gemini-3p1-pro",
    # NOTE(review): there is no seed2 entry -- confirm that is intentional.
    *(f"random-seed{seed}" for seed in (0, 1, 3, 4, 5, 6, 7)),
)
REPOS = [f"{_REPO_PREFIX}-{variant}-v1" for variant in _REPO_VARIANTS]

# Feature entries (column name + dtype) that are appended to a card's
# dataset_info features when a column of that name is not already listed.
_NEW_COLUMN_DTYPES = (
    ("question", "string"),
    ("correct_answer", "string"),
    ("correct", "bool"),
)
NEW_COLUMNS = [
    {"name": column_name, "dtype": column_dtype}
    for column_name, column_dtype in _NEW_COLUMN_DTYPES
]

# NOTE(review): `api` is not referenced by the loop below; it is kept so the
# module-level name stays defined.
api = HfApi()


def _feature_lists(dataset_info):
    """Return the non-empty `features` lists found in *dataset_info*.

    *dataset_info* may be a single mapping (one config) or a list of mappings
    (one per config); any other value yields an empty result. Entries whose
    `features` value is missing, empty, or not a list are skipped.
    """
    if isinstance(dataset_info, dict):
        entries = [dataset_info]
    elif isinstance(dataset_info, list):
        entries = dataset_info
    else:
        entries = []
    return [
        entry["features"]
        for entry in entries
        if isinstance(entry, dict)
        and isinstance(entry.get("features"), list)
        and entry["features"]
    ]


def _add_missing_columns(features, new_columns):
    """Append to *features* every entry of *new_columns* whose name is absent.

    *features* is modified in place. Returns True when at least one column
    was appended, False when every column was already present.
    """
    # .get() tolerates malformed feature entries that carry no "name" key.
    existing_names = {f.get("name") for f in features if isinstance(f, dict)}
    changed = False
    for col in new_columns:
        if col["name"] in existing_names:
            print(f"  Column already present: {col['name']}")
            continue
        # Copy so the shared column dicts are never aliased into a card.
        features.append(dict(col))
        existing_names.add(col["name"])
        changed = True
        print(f"  Added column: {col['name']}")
    return changed


# Repos whose update raised; reported after the loop and reflected in the
# exit status so a failed run is not mistaken for a successful one.
failed_repos = []

for repo in REPOS:
    print(f"Fixing {repo}...")
    try:
        card = DatasetCard.load(repo)
        dataset_info = card.data.get("dataset_info")
        feature_lists = _feature_lists(dataset_info)
        if not feature_lists:
            print(f"  WARNING: no features found in card data for {repo}")
            # Go through to_dict(): keys() is not part of the documented
            # CardData API, so calling it here would raise instead of
            # printing the diagnostic.
            print(f"  card.data keys: {list(card.data.to_dict().keys())}")
            continue

        # Build the full list first (no short-circuit) so every config's
        # feature list is patched, not just the first one that changes.
        changed = [_add_missing_columns(fl, NEW_COLUMNS) for fl in feature_lists]
        if not any(changed):
            # Nothing to write; avoid pushing a no-op README commit.
            print(f"  No changes needed for {repo}")
            continue

        # Write the patched structure back explicitly rather than relying on
        # the in-place mutation being visible through card.data.
        card.data["dataset_info"] = dataset_info
        card.push_to_hub(repo)
        print(f"  Updated README for {repo}")
    except Exception as e:
        # Top-level boundary: report, remember the failure, and keep going so
        # one bad repo does not block the rest.
        print(f"  ERROR: {e}")
        import traceback
        traceback.print_exc()
        failed_repos.append(repo)

print("Done!")
if failed_repos:
    print(f"{len(failed_repos)} repo(s) failed: {', '.join(failed_repos)}")
    raise SystemExit(1)