# dash-jsp-trainer / scripts / push_benchmarks_to_hf.py
# Vittal-M: "Trainer Space: download -> train -> push -> sleep" (commit 52c82e4)
"""Pre-parse local benchmark instances to Parquet and push to HF dataset hub.
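
Output layout in `Vittal-M/jsp-benchmarks-cached` (the default `--repo-id`):

    taillard.parquet
    lawrence.parquet
    brandimarte.parquet
    dmu.parquet
    README.md

Families whose loader raises `FileNotFoundError` are skipped with a `[skip]`
message, so their Parquet file is not written. Nothing is uploaded unless
`--push` is given.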
"""
from __future__ import annotations
import argparse
import json
import os
from pathlib import Path
import pandas as pd
from dash_jsp.benchmarks import taillard, lawrence, brandimarte, dmu
def _jsp_to_df(instances) -> pd.DataFrame:
    """Flatten JSP benchmark instances into a one-row-per-instance DataFrame.

    Args:
        instances: Iterable of instance objects exposing the attributes
            ``name``, ``family``, ``n_jobs``, ``n_machines``, ``optimum``,
            ``ops``, ``due_dates``, ``weights`` and ``source_url``.

    Returns:
        A DataFrame whose columns are, in order: ``name``, ``family``,
        ``n_jobs``, ``n_machines``, ``optimum``, ``ops_json``,
        ``due_dates_json``, ``weights_json``, ``source_url``. ``ops`` is
        stored JSON-encoded; ``due_dates`` and ``weights`` are stored
        JSON-encoded, or as ``None`` when the attribute is falsy
        (``None`` or empty).
    """
    # The schema is spelled out so that an empty ``instances`` still yields a
    # frame (and therefore a Parquet file) with the full column set; a
    # DataFrame built from an empty row list alone has no columns at all.
    columns = [
        "name",
        "family",
        "n_jobs",
        "n_machines",
        "optimum",
        "ops_json",
        "due_dates_json",
        "weights_json",
        "source_url",
    ]
    rows = [
        {
            "name": inst.name,
            "family": inst.family,
            "n_jobs": inst.n_jobs,
            "n_machines": inst.n_machines,
            "optimum": inst.optimum,
            "ops_json": json.dumps(inst.ops),
            "due_dates_json": json.dumps(inst.due_dates) if inst.due_dates else None,
            "weights_json": json.dumps(inst.weights) if inst.weights else None,
            "source_url": inst.source_url,
        }
        for inst in instances
    ]
    return pd.DataFrame(rows, columns=columns)
def _fjsp_to_df(instances) -> pd.DataFrame:
    """Flatten FJSP benchmark instances into a one-row-per-instance DataFrame.

    Args:
        instances: Iterable of instance objects exposing the attributes
            ``name``, ``family``, ``n_jobs``, ``n_machines``, ``optimum``,
            ``ops`` and ``source_url``.

    Returns:
        A DataFrame whose columns are, in order: ``name``, ``family``,
        ``n_jobs``, ``n_machines``, ``optimum``, ``ops_json``,
        ``source_url``. ``ops`` is stored JSON-encoded in ``ops_json``.
    """
    # The schema is spelled out so that an empty ``instances`` still yields a
    # frame (and therefore a Parquet file) with the full column set; a
    # DataFrame built from an empty row list alone has no columns at all.
    columns = [
        "name",
        "family",
        "n_jobs",
        "n_machines",
        "optimum",
        "ops_json",
        "source_url",
    ]
    rows = [
        {
            "name": inst.name,
            "family": inst.family,
            "n_jobs": inst.n_jobs,
            "n_machines": inst.n_machines,
            "optimum": inst.optimum,
            "ops_json": json.dumps(inst.ops),
            "source_url": inst.source_url,
        }
        for inst in instances
    ]
    return pd.DataFrame(rows, columns=columns)
def main() -> None:
    """Convert local benchmark files to Parquet and optionally upload them.

    Command-line options:
        --data-dir: Root directory holding one sub-directory per benchmark
            family (default ``data``).
        --out-dir: Directory that receives the Parquet files and
            ``README.md`` (default ``data/cached``); created if missing.
        --push: Also upload ``--out-dir`` to the Hugging Face dataset hub.
        --repo-id: Target dataset repository. Defaults to the
            ``HF_DATASET_REPO_ID`` environment variable, falling back to
            ``Vittal-M/jsp-benchmarks-cached``.

    Raises:
        SystemExit: With ``--push``, when ``huggingface_hub`` cannot be
            imported or the ``HF_TOKEN`` environment variable is unset.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-dir", default="data")
    parser.add_argument("--out-dir", default="data/cached")
    parser.add_argument("--push", action="store_true",
                        help="Also push to HF dataset hub")
    parser.add_argument(
        "--repo-id",
        default=os.environ.get("HF_DATASET_REPO_ID", "Vittal-M/jsp-benchmarks-cached"),
    )
    args = parser.parse_args()

    data_dir = Path(args.data_dir)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # (family name, loader module, row converter). The family name doubles as
    # the sub-directory under --data-dir and as the Parquet file stem.
    loaders = (
        ("taillard", taillard, _jsp_to_df),
        ("lawrence", lawrence, _jsp_to_df),
        ("dmu", dmu, _jsp_to_df),
        ("brandimarte", brandimarte, _fjsp_to_df),
    )

    families = {}
    for family, module, to_df in loaders:
        try:
            # The conversion stays inside the ``try`` so that a
            # FileNotFoundError raised while the loader's result is being
            # consumed is handled the same way as one raised by the call.
            families[family] = to_df(module.load_all(data_dir / family))
        except FileNotFoundError as e:
            # A missing family is not fatal: report it and carry on.
            print(f"[skip] {family}: {e}")

    for fam, df in families.items():
        path = out_dir / f"{fam}.parquet"
        df.to_parquet(path, index=False)
        print(f" wrote {path} ({len(df)} instances)")

    # Dataset card. The em dash is written as the escape \u2014 so the source
    # stays pure ASCII and cannot be mangled by a mis-decoded save.
    card = (
        "# JSP / FJSP Benchmark Cache\n\n"
        "Pre-parsed canonical JSP and FJSP benchmark instances:\n\n"
        "- **Taillard** (1993) \u2014 80 JSP instances\n"
        "- **Lawrence** (1984) \u2014 40 JSP instances\n"
        "- **Brandimarte** (1993) \u2014 10 FJSP instances\n"
        "- **DMU** (Demirkol-Mehta-Uzsoy 1998) \u2014 up to 80 JSP instances\n\n"
        "Schema: `name, family, n_jobs, n_machines, optimum, ops_json, "
        "due_dates_json, weights_json, source_url`. The `ops_json` field "
        "contains a JSON-encoded list of `[machine_id, processing_time]` pairs "
        "per operation per job.\n\n"
        "Used by [DASH-JSP](https://huggingface.co/spaces/Vittal-M/dash-jsp-demo).\n"
    )
    # Explicit UTF-8: the card contains non-ASCII characters and must not
    # depend on the platform's locale encoding.
    (out_dir / "README.md").write_text(card, encoding="utf-8")

    if args.push:
        # Imported lazily so the conversion step works without huggingface_hub.
        try:
            from huggingface_hub import HfApi, create_repo
        except ImportError:
            raise SystemExit("pip install huggingface_hub")
        token = os.environ.get("HF_TOKEN")
        if not token:
            raise SystemExit("Set HF_TOKEN")
        api = HfApi(token=token)
        create_repo(args.repo_id, token=token, exist_ok=True, repo_type="dataset")
        api.upload_folder(
            folder_path=str(out_dir),
            repo_id=args.repo_id,
            repo_type="dataset",
            commit_message="Update benchmark cache",
        )
        print(f"Pushed {out_dir} \u2192 https://huggingface.co/datasets/{args.repo_id}")


if __name__ == "__main__":
    main()