#!/usr/bin/env python3
"""Patch the selected-tools test300 HF datasets to fill missing question fields.

The eval files for some queries omit the 'question' field; this script fills
them using query_id -> query from the BrowseComp JSONL and pushes the patched
dataset back to the same Hub repo.

Python env: /scratch/hc3337/envs/raca-py312/bin/python
"""
from __future__ import annotations

import json
import os
import sys
from collections.abc import Iterable
from pathlib import Path

# Set before `datasets` is imported (the import is deferred to patch_repo).
# NOTE(review): presumably this is why the import is function-local - confirm.
os.environ.setdefault("HF_HOME", "/scratch/hc3337/.cache/huggingface")

BC_JSONL = Path("/scratch/hc3337/projects/BrowseComp-Plus/data/browsecomp_plus_decrypted_test300.jsonl")

REPOS = [
    "timchen0618/browsecomp-plus-sel-tools-test300-gpt-oss-120b-less-chars-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-gpt-oss-120b-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-gemini-2p5-pro-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-gemini-3p1-pro-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed0-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed1-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed3-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed4-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed5-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed6-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed7-v1",
]


def load_bc_questions(path: Path) -> dict[int, str]:
    """Read a BrowseComp JSONL file into a ``{query_id: question}`` mapping.

    Blank lines are skipped. For each record the question text is taken from
    ``query``, falling back to ``question``. Records without a ``query_id`` or
    with empty question text are ignored. Ids are coerced with ``int`` so they
    match the coercion applied when looking rows up.

    Args:
        path: Location of the JSONL file (one JSON object per line).

    Returns:
        Mapping from integer query id to non-empty question text.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If a ``query_id`` cannot be converted to ``int``.
    """
    qmap: dict[int, str] = {}
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            d = json.loads(line)
            qid = d.get("query_id")
            q = d.get("query") or d.get("question") or ""
            if qid is not None and q:
                qmap[int(qid)] = q
    print(f"Loaded {len(qmap)} questions from {path}", file=sys.stderr)
    return qmap


def fill_missing_questions(
    rows: Iterable[dict], bc_questions: dict[int, str]
) -> tuple[list[dict], int]:
    """Return copies of *rows* with empty ``question`` fields filled in.

    A row is patched when its ``question`` is missing or falsy and its
    ``query_id`` (coerced to ``int``) has a non-empty entry in *bc_questions*.
    Input rows are never mutated.

    Args:
        rows: Iterable of mapping-like rows; each row that needs a fill must
            carry a ``query_id`` convertible to ``int``.
        bc_questions: Mapping from integer query id to question text.

    Returns:
        ``(patched_rows, filled)`` where ``filled`` counts the rows that
        received a question.

    Raises:
        KeyError: If a row needing a fill has no ``query_id``.
        ValueError: If such a row's ``query_id`` is not integer-like.
    """
    patched: list[dict] = []
    filled = 0
    for row in rows:
        record = dict(row)
        if not record.get("question"):
            # Coerce the id only when a lookup is needed, so rows that already
            # have a question are never rejected because of their id.
            question = bc_questions.get(int(record["query_id"]), "")
            if question:
                record["question"] = question
                filled += 1
        # Keep the key present on every row so the rebuilt dataset's column
        # set does not depend on which rows happened to be filled.
        record.setdefault("question", None)
        patched.append(record)
    return patched, filled


def patch_repo(repo: str, bc_questions: dict) -> None:
    """Fill missing questions in one Hub dataset and push it back if changed.

    Loads the ``train`` split of *repo*, patches rows via
    :func:`fill_missing_questions`, and pushes the rebuilt dataset to the same
    repo. Nothing is pushed when no row was filled.

    Args:
        repo: Hub dataset repo id, e.g. ``"user/name"``.
        bc_questions: Mapping from integer query id to question text.
    """
    # Imported lazily so HF_HOME is already set when `datasets` is loaded.
    from datasets import Dataset, load_dataset

    print(f"\nLoading {repo}...", file=sys.stderr)
    ds = load_dataset(repo, split="train")
    print(f" {len(ds)} rows, columns: {ds.column_names}", file=sys.stderr)

    rows, filled = fill_missing_questions(ds, bc_questions)
    print(f" Filled {filled} missing questions from BrowseComp JSONL", file=sys.stderr)
    if filled == 0:
        print(" No changes needed — skipping push.", file=sys.stderr)
        return

    ds_new = Dataset.from_list(rows)
    ds_new.push_to_hub(
        repo,
        split="train",
        commit_message="Fill missing question fields from BrowseComp JSONL",
    )
    print(f" Pushed {len(rows)} rows to {repo}.", file=sys.stderr)


def main() -> int:
    """Patch every repo in ``REPOS``; return a process exit status.

    Each repo is attempted independently so one failure does not block the
    rest, but failures are remembered and reflected in the return value.

    Returns:
        ``0`` when every repo was processed without error, ``1`` otherwise.
    """
    bc_questions = load_bc_questions(BC_JSONL)
    failed: list[str] = []
    for repo in REPOS:
        try:
            patch_repo(repo, bc_questions)
        except Exception as e:  # top-level boundary: report and keep going
            print(f"ERROR patching {repo}: {e}", file=sys.stderr)
            failed.append(repo)
    if failed:
        print(
            f"\n{len(failed)} repo(s) failed: {', '.join(failed)}",
            file=sys.stderr,
        )
    print("\nALL DONE", file=sys.stderr)
    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())