Spaces:
Running
Running
#!/usr/bin/env python3
"""
Fix HF dataset README metadata for the 11 test300 selected-tools datasets.

The push_to_hub() updated the parquet (13 cols) but not the README (still 10 cols).
This script updates the YAML front matter in each README to include the 3 new columns.

The script is idempotent: a README that already lists all three columns is left
untouched (no commit is created). The process exits with status 1 if any repo
could not be fixed, 0 otherwise.
"""
import os
import sys
import traceback

# Set before huggingface_hub is imported (same ordering as the original script);
# presumably the library resolves its cache location from HF_HOME at import time.
os.environ["HF_HOME"] = "/scratch/hc3337/.cache/huggingface"

REPOS = [
    "timchen0618/browsecomp-plus-sel-tools-test300-gpt-oss-120b-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-gpt-oss-120b-less-chars-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-gemini-2p5-pro-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-gemini-3p1-pro-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed0-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed1-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed3-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed4-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed5-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed6-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed7-v1",
]

NEW_COLUMNS = [
    {"name": "question", "dtype": "string"},
    {"name": "correct_answer", "dtype": "string"},
    {"name": "correct", "dtype": "bool"},
]


def add_missing_columns(features, new_columns):
    """Return ``(merged, added_names)`` for a card's feature list.

    Args:
        features: Existing feature entries; each is a dict with a ``"name"`` key.
        new_columns: Feature entries that must be present, in the desired order.

    Returns:
        A tuple ``(merged, added_names)``. ``merged`` is a new list holding the
        existing entries followed by copies of the entries from ``new_columns``
        whose name was not already present. ``added_names`` lists the names
        that were appended, in order. ``features`` itself is not modified.
    """
    known_names = {feature["name"] for feature in features}
    merged = list(features)
    added_names = []
    for column in new_columns:
        name = column["name"]
        if name in known_names:
            continue
        # Copy so that cards never share dict objects with ``new_columns``.
        merged.append(dict(column))
        known_names.add(name)
        added_names.append(name)
    return merged, added_names


def fix_repo(repo):
    """Add the missing columns to one dataset card and push it if it changed.

    Args:
        repo: Hub repo id of the dataset whose README should be fixed.

    Returns:
        True if the README now lists every column in ``NEW_COLUMNS`` (either
        because it was updated or because nothing was missing); False if the
        card has no usable ``dataset_info.features`` list and was skipped.
    """
    # Imported here so that HF_HOME above is already set, and so the pure
    # helper in this module stays importable without huggingface_hub installed.
    from huggingface_hub import DatasetCard

    card = DatasetCard.load(repo)
    dataset_info = card.data.get("dataset_info")
    # A non-dict ``dataset_info`` (e.g. a list of per-config entries) is not
    # handled by this script; it is reported through the same warning path.
    features = dataset_info.get("features") if isinstance(dataset_info, dict) else None
    if not features:
        print(f" WARNING: no features found in card data for {repo}")
        # The card data object is not a dict; go through to_dict() for its keys.
        print(f" card.data keys: {list(card.data.to_dict())}")
        return False

    merged, added_names = add_missing_columns(features, NEW_COLUMNS)
    for column in NEW_COLUMNS:
        if column["name"] in added_names:
            print(f" Added column: {column['name']}")
        else:
            print(f" Column already present: {column['name']}")

    if not added_names:
        # Nothing to change: avoid creating an empty commit on the Hub.
        print(f" No changes needed for {repo}")
        return True

    card.data["dataset_info"]["features"] = merged
    card.push_to_hub(
        repo,
        commit_message="Add question/correct_answer/correct to dataset_info features",
    )
    print(f" Updated README for {repo}")
    return True


def main():
    """Fix every repo in ``REPOS``; return 0 if all succeeded, 1 otherwise."""
    failed = []
    for repo in REPOS:
        print(f"Fixing {repo}...")
        try:
            ok = fix_repo(repo)
        except Exception as e:
            # Best-effort batch: report the failure and keep going with the
            # remaining repos instead of aborting the whole run.
            print(f" ERROR: {e}")
            traceback.print_exc()
            ok = False
        if not ok:
            failed.append(repo)

    if failed:
        print(f"{len(failed)} repo(s) not fixed: {', '.join(failed)}")
    print("Done!")
    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())