"""
Upload an MMB-style dataset (CSV + images) to the Hugging Face Hub.

Usage:
    pip install datasets pillow
    huggingface-cli login  # or set HF_TOKEN

    python scripts/upload_to_huggingface.py hf_dataset/image_mapping_with_questions.csv
    python scripts/upload_to_huggingface.py hf_dataset/image_mapping_with_questions.csv --repo-id scholo/MMB_dataset
"""
| | from __future__ import annotations |
| |
|
| | import argparse |
| | import json |
| | import sys |
| | from pathlib import Path |
| |
|
| | |
# Make project-root imports resolvable when this script is run directly.
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parent
sys.path.insert(0, str(PROJECT_ROOT))
| |
|
| | import pandas as pd |
| | from datasets import Dataset, Image |
| |
|
| |
|
| | def resolve_image_path(csv_path: Path, fname: str) -> Path | None: |
| | """Try multiple locations for image file.""" |
| | base = csv_path.resolve().parent |
| | for candidate in ( |
| | base / "images" / fname, |
| | base / fname, |
| | base.parent / "images" / fname, |
| | base.parent / fname, |
| | ): |
| | if candidate.exists() and candidate.is_file(): |
| | return candidate |
| | return None |
| |
|
| |
|
| | def load_cf_type_from_scene(csv_path: Path, scene_id: str, variant: str) -> str | None: |
| | """Load cf_type from scenes/{scene_id}_{variant}.json.""" |
| | base = csv_path.resolve().parent |
| | for scenes_dir in (base / "scenes", base.parent / "scenes"): |
| | if not scenes_dir.is_dir(): |
| | continue |
| | path = scenes_dir / f"{scene_id.lower()}_{variant}.json" |
| | if not path.exists(): |
| | continue |
| | try: |
| | data = json.loads(path.read_text(encoding="utf-8")) |
| | meta = data.get("cf_metadata") or {} |
| | t = meta.get("cf_type") |
| | return str(t) if t is not None else None |
| | except Exception: |
| | pass |
| | return None |
| |
|
| |
|
def scene_id_from_image_name(fname: str) -> str:
    """Recover the scene id by stripping one known variant/extension suffix.

    Falls back to the original name when stripping would leave an empty string.
    """
    name = str(fname).strip()
    lowered = name.lower()
    known_suffixes = ("_original.png", "_original", "_cf1.png", "_cf1", "_cf2.png", "_cf2", ".png")
    for suffix in known_suffixes:
        if lowered.endswith(suffix):
            name = name[: -len(suffix)]
            break
    return name.strip() or fname
| |
|
| |
|
def build_dataset(csv_path: Path) -> tuple[list[dict], list[str]]:
    """Build a list of row dicts from the CSV plus resolved image paths.

    Args:
        csv_path: Path to image_mapping_with_questions.csv.

    Returns:
        (rows, image_cols) where each row maps column name -> string value,
        image columns hold absolute file paths, and rows with any missing
        image are dropped (with a warning on stderr). ``image_cols`` lists
        which of the expected image columns were actually present.
    """
    df = pd.read_csv(csv_path)
    expected = ["original_image", "counterfactual1_image", "counterfactual2_image"]
    image_cols = [c for c in expected if c in df.columns]

    rows: list[dict] = []
    for i, row in df.iterrows():
        # Stringify every cell; NaN becomes "" so downstream schemas stay str.
        rec = {col: ("" if pd.isna(row[col]) else str(row[col])) for col in df.columns}

        # Resolve image filenames to absolute paths; skip the row if any is missing.
        missing = False
        for col in image_cols:
            fname = str(row.get(col, "") or "").strip()
            if not fname:
                missing = True
                break
            fp = resolve_image_path(csv_path, fname)
            if fp is None:
                print(f"Warning: missing image {fname} for row {i}", file=sys.stderr)
                missing = True
                break
            rec[col] = str(fp.resolve())
        if missing:
            continue

        # Attach cf_type metadata. Reference the counterfactual columns by
        # NAME — indexing image_cols positionally crashes when the CSV lacks
        # one of the expected image columns.
        for col, variant, out_key in (
            ("counterfactual1_image", "cf1", "counterfactual1_type"),
            ("counterfactual2_image", "cf2", "counterfactual2_type"),
        ):
            if col not in df.columns:
                continue
            sid = scene_id_from_image_name(str(row.get(col, "")))
            cf_type = load_cf_type_from_scene(csv_path, sid, variant)
            if cf_type is not None:
                rec[out_key] = cf_type

        rows.append(rec)

    return rows, image_cols
| |
|
| |
|
def main():
    """CLI entry point: validate inputs, build the dataset, optionally push."""
    ap = argparse.ArgumentParser(description="Upload MMB dataset to Hugging Face Hub")
    ap.add_argument("csv_path", type=Path, help="Path to image_mapping_with_questions.csv")
    ap.add_argument("--repo-id", default=None, help="Hub repo ID (e.g. username/dataset-name)")
    ap.add_argument("--private", action="store_true", help="Create private dataset")
    ap.add_argument("--dry-run", action="store_true", help="Build dataset but don't push")
    args = ap.parse_args()

    csv_path = args.csv_path.resolve()
    if not csv_path.exists():
        print(f"Error: {csv_path} not found", file=sys.stderr)
        sys.exit(1)

    # Fail with an actionable hint when the optional deps are absent.
    try:
        from datasets import Dataset, Image
    except ImportError:
        print("Install datasets and pillow: pip install datasets pillow", file=sys.stderr)
        sys.exit(1)

    print("Building dataset from", csv_path)
    rows, image_cols = build_dataset(csv_path)
    if not rows:
        print("Error: no valid rows (check image paths)", file=sys.stderr)
        sys.exit(1)
    print(f"Loaded {len(rows)} rows")

    ds = Dataset.from_list(rows)
    # Cast path-string columns to the Image feature so the Hub renders previews.
    for col in image_cols:
        if col in ds.column_names:
            ds = ds.cast_column(col, Image())

    if args.dry_run:
        print("Dry run: dataset built, not pushing.")
        print("Columns:", ds.column_names)
        return

    repo_id = args.repo_id
    if not repo_id:
        # Derive a slug from the CSV's parent directory name.
        repo_id = f"mmb-{csv_path.parent.name.replace(' ', '-').lower()}"
        print(f"No --repo-id given, using: {repo_id}")

    print(f"Pushing to Hugging Face Hub: {repo_id}")
    ds.push_to_hub(repo_id, private=args.private)
    print("Done. View at: https://huggingface.co/datasets/" + repo_id)


if __name__ == "__main__":
    main()
| |
|