#!/usr/bin/env python3
"""Fix HF dataset README metadata for the 11 test300 selected-tools datasets.

The push_to_hub() updated the parquet (13 cols) but not the README (still
10 cols). This script updates the YAML front matter in each README to include
the 3 new columns.

Every repo in REPOS is attempted even if an earlier one fails. The process
exits with status 1 if any repo raised an error and 0 otherwise. Re-running
the script is safe: a README is only pushed when a column was actually added.
"""

import os
import sys
import traceback

# Set at import time, i.e. before this module ever imports huggingface_hub
# (the import is deferred into fix_repo), so the library sees this location.
os.environ["HF_HOME"] = "/scratch/hc3337/.cache/huggingface"

REPOS = [
    "timchen0618/browsecomp-plus-sel-tools-test300-gpt-oss-120b-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-gpt-oss-120b-less-chars-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-gemini-2p5-pro-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-gemini-3p1-pro-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed0-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed1-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed3-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed4-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed5-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed6-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed7-v1",
]

NEW_COLUMNS = [
    {"name": "question", "dtype": "string"},
    {"name": "correct_answer", "dtype": "string"},
    {"name": "correct", "dtype": "bool"},
]


def collect_feature_lists(dataset_info):
    """Return every non-empty ``features`` list found in ``dataset_info``.

    Args:
        dataset_info: The ``dataset_info`` value from the card metadata.
            Handled shapes: a single dict, a list of dicts (one per config),
            or anything else (e.g. ``None``), which yields no feature lists.

    Returns:
        The ``features`` list objects themselves (not copies), so appending
        to them updates the card metadata in place.
    """
    if isinstance(dataset_info, dict):
        infos = [dataset_info]
    elif isinstance(dataset_info, list):
        infos = dataset_info
    else:
        infos = []

    feature_lists = []
    for info in infos:
        if not isinstance(info, dict):
            continue
        features = info.get("features")
        # Empty or missing feature lists are skipped, matching the original
        # "no features found" behaviour.
        if isinstance(features, list) and features:
            feature_lists.append(features)
    return feature_lists


def add_missing_columns(features, new_columns):
    """Append to ``features`` every column whose name is not yet present.

    Args:
        features: List of feature dicts; mutated in place.
        new_columns: Column dicts, each with at least a ``"name"`` key.

    Returns:
        The names of the columns that were appended, in order.
    """
    existing_names = {
        feature.get("name") for feature in features if isinstance(feature, dict)
    }
    added = []
    for column in new_columns:
        name = column["name"]
        if name in existing_names:
            continue
        # Append a copy so the module-level constant is never shared between
        # feature lists (and cannot be mutated through the card).
        features.append(dict(column))
        existing_names.add(name)
        added.append(name)
    return added


def fix_repo(repo):
    """Add the NEW_COLUMNS entries to one dataset card and push it if changed.

    Args:
        repo: Hub repo id of the dataset whose README should be fixed.

    Returns:
        True if an updated README was pushed, False if the card had no
        features to extend or already listed every column.

    Raises:
        Exception: Whatever loading or pushing the card raises; the caller
            decides how to report it.
    """
    # Deferred import: keeps HF_HOME assignment ahead of the library import.
    from huggingface_hub import DatasetCard

    card = DatasetCard.load(repo)
    feature_lists = collect_feature_lists(card.data.get("dataset_info"))
    if not feature_lists:
        print(f" WARNING: no features found in card data for {repo}")
        # Card data is not a plain dict; list its keys via to_dict().
        print(f" card.data keys: {list(card.data.to_dict().keys())}")
        return False

    changed = False
    for features in feature_lists:
        added = add_missing_columns(features, NEW_COLUMNS)
        for col in NEW_COLUMNS:
            if col["name"] in added:
                print(f" Added column: {col['name']}")
            else:
                print(f" Column already present: {col['name']}")
        changed = changed or bool(added)

    if not changed:
        # Nothing to write: skip the push so re-runs do not touch the repo.
        print(f" No changes needed for {repo}")
        return False

    card.push_to_hub(repo)
    print(f" Updated README for {repo}")
    return True


def main():
    """Fix every repo in REPOS; return 0 on success, 1 if any repo errored."""
    failed = []
    for repo in REPOS:
        print(f"Fixing {repo}...")
        try:
            fix_repo(repo)
        except Exception as e:
            # Best effort: report the failure and carry on with the next repo.
            print(f" ERROR: {e}")
            traceback.print_exc()
            failed.append(repo)

    print("Done!")
    if failed:
        print(f"Failed repos ({len(failed)}): {', '.join(failed)}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())