Spaces:

timchen0618
/

dashboard

Running

File size: 3,183 Bytes

d14bce3

#!/usr/bin/env python3
"""
Patch the selected-tools test300 HF datasets to fill missing question fields
from the BrowseComp JSONL. The eval files for some queries omit the 'question'
field; this script fills them using query_id -> query from BrowseComp.

Python env: /scratch/hc3337/envs/raca-py312/bin/python
"""
from __future__ import annotations
import json, sys, os
from pathlib import Path

# Use this Hugging Face cache directory unless the caller already set HF_HOME.
os.environ.setdefault("HF_HOME", "/scratch/hc3337/.cache/huggingface")

# BrowseComp-Plus test300 JSONL that supplies the question text.
BC_JSONL = Path("/scratch/hc3337/projects/BrowseComp-Plus/data/browsecomp_plus_decrypted_test300.jsonl")

# Every target dataset id is "<prefix><variant>-v1"; only the variant differs.
_REPO_PREFIX = "timchen0618/browsecomp-plus-sel-tools-test300-"
_MODEL_VARIANTS = (
    "gpt-oss-120b-less-chars",
    "gpt-oss-120b",
    "gemini-2p5-pro",
    "gemini-3p1-pro",
)
# NOTE(review): seed 2 is not in the list — confirm that is intentional.
_RANDOM_SEEDS = (0, 1, 3, 4, 5, 6, 7)

# Hub dataset ids to patch, model-selected variants first, then random seeds.
REPOS = [
    f"{_REPO_PREFIX}{variant}-v1"
    for variant in (
        *_MODEL_VARIANTS,
        *(f"random-seed{seed}" for seed in _RANDOM_SEEDS),
    )
]


def load_bc_questions(path: Path) -> dict:
    """Build a ``query_id -> question text`` lookup from a JSONL file.

    Every non-blank line is parsed as one JSON object. The text comes from
    the object's ``"query"`` field, falling back to ``"question"``. Records
    without a ``"query_id"`` or without any text are ignored; when an id
    repeats, the last record wins. The number of entries loaded is reported
    on stderr.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Returns:
        A dict keyed by ``int(query_id)`` with the question text as value.
    """
    questions: dict = {}
    with path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # filter(None, ...) discards blank lines; parsing stays lazy so a
        # malformed line surfaces at the same point as a line-by-line loop.
        for record in map(json.loads, filter(None, stripped_lines)):
            text = record.get("query") or record.get("question") or ""
            query_id = record.get("query_id")
            if query_id is None or not text:
                continue
            questions[int(query_id)] = text
    print(f"Loaded {len(questions)} questions from {path}", file=sys.stderr)
    return questions


def patch_repo(repo: str, bc_questions: dict) -> None:
    """Fill empty ``question`` fields in one Hub dataset and push the result.

    Loads the ``train`` split of *repo*. For every row whose ``question`` is
    missing or empty, ``int(row["query_id"])`` is looked up in
    *bc_questions* and the text found there is written into the row. The
    dataset is pushed back to the Hub only when at least one row was filled.
    Progress, and the number of rows that could not be filled, go to stderr.

    Args:
        repo: Hugging Face Hub dataset id to load and overwrite.
        bc_questions: Mapping of integer query id to question text.

    Raises:
        KeyError: A row that needs a question has no ``query_id`` field.
        ValueError, TypeError: Such a row's ``query_id`` cannot be converted
            to ``int``.
        Exception: Anything raised by ``load_dataset`` or ``push_to_hub``
            propagates unchanged.
    """
    # Imported at call time rather than at module import time.
    from datasets import Dataset, load_dataset

    print(f"\nLoading {repo}...", file=sys.stderr)
    ds = load_dataset(repo, split="train")
    print(f"  {len(ds)} rows, columns: {ds.column_names}", file=sys.stderr)

    rows = []
    filled = 0
    unfilled_ids = []
    for row in ds:
        r = dict(row)
        if not r.get("question"):
            # Convert the id only when a lookup is needed, so an unusual id
            # on an already-complete row cannot abort the whole repo.
            qid = int(r["query_id"])
            q = bc_questions.get(qid, "")
            if q:
                r["question"] = q
                filled += 1
            else:
                unfilled_ids.append(qid)
                # Dataset.from_list takes its columns from the FIRST row
                # only. Keep the key on every row so an unfilled first row
                # cannot drop the questions filled further down.
                r.setdefault("question", None)
        rows.append(r)

    print(f"  Filled {filled} missing questions from BrowseComp JSONL", file=sys.stderr)
    if unfilled_ids:
        print(
            f"  WARNING: {len(unfilled_ids)} rows still have no question "
            f"(no BrowseComp match), e.g. query_id {unfilled_ids[:5]}",
            file=sys.stderr,
        )
    if filled == 0:
        print("  No changes needed — skipping push.", file=sys.stderr)
        return

    # NOTE(review): from_list re-infers the feature schema from the Python
    # values; if the source dataset relies on non-default feature types,
    # confirm they survive this round trip.
    ds_new = Dataset.from_list(rows)
    ds_new.push_to_hub(repo, split="train",
                       commit_message="Fill missing question fields from BrowseComp JSONL")
    print(f"  Pushed {len(rows)} rows to {repo}.", file=sys.stderr)


def main() -> int:
    """Patch every dataset in ``REPOS`` and report which ones failed.

    The BrowseComp questions are loaded once and reused for every repo. A
    failure in one repo is logged and does not stop the remaining repos.

    Returns:
        ``0`` when every repo was processed without raising, ``1`` when at
        least one repo failed, so callers and shells can detect a partial
        failure.
    """
    bc_questions = load_bc_questions(BC_JSONL)
    failed = []
    for repo in REPOS:
        try:
            patch_repo(repo, bc_questions)
        except Exception as e:  # best-effort: keep going with the other repos
            print(f"ERROR patching {repo}: {e}", file=sys.stderr)
            failed.append(repo)

    if failed:
        print(f"\n{len(failed)} of {len(REPOS)} repos failed:", file=sys.stderr)
        for repo in failed:
            print(f"  {repo}", file=sys.stderr)
    print("\nALL DONE", file=sys.stderr)
    return 1 if failed else 0


if __name__ == "__main__":
    # Propagate the failure status as the process exit code.
    sys.exit(main())