| | """
|
| | Upload MMB-style dataset (CSV + images) to Hugging Face Hub.
|
| |
|
| | Usage:
|
| | pip install datasets pillow
|
| | huggingface-cli login # or set HF_TOKEN
|
| |
|
| | python scripts/upload_to_huggingface.py hf_dataset/image_mapping_with_questions.csv
|
| | python scripts/upload_to_huggingface.py hf_dataset/image_mapping_with_questions.csv --repo-id scholo/MMB_dataset
|
| | """
|
| | from __future__ import annotations
|
| |
|
| | import argparse
|
| | import json
|
| | import sys
|
| | from pathlib import Path
|
| |
|
| |
|
| | SCRIPT_DIR = Path(__file__).resolve().parent
|
| | PROJECT_ROOT = SCRIPT_DIR.parent
|
| | sys.path.insert(0, str(PROJECT_ROOT))
|
| |
|
| | import pandas as pd
|
| | from datasets import Dataset, Image
|
| |
|
| |
|
| | def resolve_image_path(csv_path: Path, fname: str) -> Path | None:
|
| | """Try multiple locations for image file."""
|
| | base = csv_path.resolve().parent
|
| | for candidate in (
|
| | base / "images" / fname,
|
| | base / fname,
|
| | base.parent / "images" / fname,
|
| | base.parent / fname,
|
| | ):
|
| | if candidate.exists() and candidate.is_file():
|
| | return candidate
|
| | return None
|
| |
|
| |
|
| | def load_cf_type_from_scene(csv_path: Path, scene_id: str, variant: str) -> str | None:
|
| | """Load cf_type from scenes/{scene_id}_{variant}.json."""
|
| | base = csv_path.resolve().parent
|
| | for scenes_dir in (base / "scenes", base.parent / "scenes"):
|
| | if not scenes_dir.is_dir():
|
| | continue
|
| | path = scenes_dir / f"{scene_id.lower()}_{variant}.json"
|
| | if not path.exists():
|
| | continue
|
| | try:
|
| | data = json.loads(path.read_text(encoding="utf-8"))
|
| | meta = data.get("cf_metadata") or {}
|
| | t = meta.get("cf_type")
|
| | return str(t) if t is not None else None
|
| | except Exception:
|
| | pass
|
| | return None
|
| |
|
| |
|
def scene_id_from_image_name(fname: str) -> str:
    """Derive a scene ID by stripping a known variant/extension suffix.

    Removes the first case-insensitive match among ``_original[.png]``,
    ``_cf1[.png]``, ``_cf2[.png]`` and bare ``.png``. Falls back to the
    original *fname* when stripping would leave an empty string.
    """
    name = str(fname).strip()
    lowered = name.lower()
    suffixes = ("_original.png", "_original", "_cf1.png", "_cf1",
                "_cf2.png", "_cf2", ".png")
    for suffix in suffixes:
        if lowered.endswith(suffix):
            name = name[: -len(suffix)]
            break
    return name.strip() or fname
|
| |
|
| |
|
def build_dataset(csv_path: Path) -> tuple[list[dict], list[str]]:
    """Build list of row dicts from CSV + images. Returns (rows, image_cols).

    Every CSV cell is stringified (NaN -> ""); each image column's value is
    replaced with ``{"bytes": ...}`` holding the raw file contents, ready
    for a ``datasets.Image`` cast. Rows with a missing/unresolvable image
    are skipped (with a warning on stderr). When the matching scene JSON is
    found, ``counterfactual{1,2}_type`` fields are attached.

    Args:
        csv_path: Path to the mapping CSV; image/scene dirs are resolved
            relative to it (see resolve_image_path / load_cf_type_from_scene).

    Returns:
        (rows, image_cols) where image_cols lists only the image columns
        actually present in the CSV.
    """
    df = pd.read_csv(csv_path)
    candidate_cols = ["original_image", "counterfactual1_image", "counterfactual2_image"]
    image_cols = [c for c in candidate_cols if c in df.columns]

    rows: list[dict] = []
    for i, row in df.iterrows():
        rec = {col: "" if pd.isna(row[col]) else str(row[col]) for col in df.columns}

        missing = False
        for col in image_cols:
            fname = str(row.get(col, "") or "").strip()
            if not fname:
                missing = True
                break
            fp = resolve_image_path(csv_path, fname)
            if fp is None:
                print(f"Warning: missing image {fname} for row {i}", file=sys.stderr)
                missing = True
                break
            rec[col] = {"bytes": fp.read_bytes()}
        if missing:
            continue

        # Attach cf types by COLUMN NAME, not by position: the old code
        # indexed image_cols[1]/[2] unconditionally, which raised IndexError
        # when the CSV lacked the counterfactual columns and could even pick
        # the wrong column when only some image columns were present.
        for col, variant, out_key in (
            ("counterfactual1_image", "cf1", "counterfactual1_type"),
            ("counterfactual2_image", "cf2", "counterfactual2_type"),
        ):
            if col not in df.columns:
                continue
            sid = scene_id_from_image_name(str(row.get(col, "")))
            cf_type = load_cf_type_from_scene(csv_path, sid, variant)
            if cf_type is not None:
                rec[out_key] = cf_type

        rows.append(rec)

    return rows, image_cols
|
| |
|
| |
|
def main():
    """CLI entry point: build the dataset from a CSV and push it to the Hub."""
    parser = argparse.ArgumentParser(description="Upload MMB dataset to Hugging Face Hub")
    parser.add_argument("csv_path", type=Path, help="Path to image_mapping_with_questions.csv")
    parser.add_argument("--repo-id", default=None, help="Hub repo ID (e.g. username/dataset-name)")
    parser.add_argument("--private", action="store_true", help="Create private dataset")
    parser.add_argument("--dry-run", action="store_true", help="Build dataset but don't push")
    args = parser.parse_args()

    csv_path = args.csv_path.resolve()
    if not csv_path.exists():
        print(f"Error: {csv_path} not found", file=sys.stderr)
        sys.exit(1)

    # Import lazily so a missing dependency yields an actionable message.
    try:
        from datasets import Dataset, Image
    except ImportError:
        print("Install datasets and pillow: pip install datasets pillow", file=sys.stderr)
        sys.exit(1)

    print("Building dataset from", csv_path)
    rows, image_cols = build_dataset(csv_path)
    if not rows:
        print("Error: no valid rows (check image paths)", file=sys.stderr)
        sys.exit(1)
    print(f"Loaded {len(rows)} rows")

    dataset = Dataset.from_list(rows)
    # Cast the raw-bytes columns to the Image feature so the Hub viewer
    # renders them as pictures.
    for image_col in (c for c in image_cols if c in dataset.column_names):
        dataset = dataset.cast_column(image_col, Image())

    if args.dry_run:
        print("Dry run: dataset built, not pushing.")
        print("Columns:", dataset.column_names)
        return

    repo_id = args.repo_id
    if not repo_id:
        # Derive a slug-like default from the CSV's parent directory name.
        repo_id = "mmb-" + csv_path.parent.name.replace(" ", "-").lower()
        print(f"No --repo-id given, using: {repo_id}")

    print(f"Pushing to Hugging Face Hub: {repo_id}")
    dataset.push_to_hub(repo_id, private=args.private)
    print("Done. View at: https://huggingface.co/datasets/" + repo_id)


if __name__ == "__main__":
    main()
|
| |
|