""" Upload MMB-style dataset (CSV + images) to Hugging Face Hub. Usage: pip install datasets pillow huggingface-cli login # or set HF_TOKEN python scripts/upload_to_huggingface.py hf_dataset/image_mapping_with_questions.csv python scripts/upload_to_huggingface.py hf_dataset/image_mapping_with_questions.csv --repo-id scholo/MMB_dataset """ from __future__ import annotations import argparse import json import sys from pathlib import Path # Add project root for imports SCRIPT_DIR = Path(__file__).resolve().parent PROJECT_ROOT = SCRIPT_DIR.parent sys.path.insert(0, str(PROJECT_ROOT)) import pandas as pd from datasets import Dataset, Image def resolve_image_path(csv_path: Path, fname: str) -> Path | None: """Try multiple locations for image file.""" base = csv_path.resolve().parent for candidate in ( base / "images" / fname, base / fname, base.parent / "images" / fname, base.parent / fname, ): if candidate.exists() and candidate.is_file(): return candidate return None def load_cf_type_from_scene(csv_path: Path, scene_id: str, variant: str) -> str | None: """Load cf_type from scenes/{scene_id}_{variant}.json.""" base = csv_path.resolve().parent for scenes_dir in (base / "scenes", base.parent / "scenes"): if not scenes_dir.is_dir(): continue path = scenes_dir / f"{scene_id.lower()}_{variant}.json" if not path.exists(): continue try: data = json.loads(path.read_text(encoding="utf-8")) meta = data.get("cf_metadata") or {} t = meta.get("cf_type") return str(t) if t is not None else None except Exception: pass return None def scene_id_from_image_name(fname: str) -> str: s = str(fname).strip() for suf in ("_original.png", "_original", "_cf1.png", "_cf1", "_cf2.png", "_cf2", ".png"): if s.lower().endswith(suf.lower()): s = s[: -len(suf)] break return s.strip() or fname def build_dataset(csv_path: Path) -> tuple[list[dict], list[str]]: """Build list of row dicts from CSV + images. Returns (rows, image_cols).""" df = pd.read_csv(csv_path) image_cols = ["original_image", "counterfactual1_image", "counterfactual2_image"] image_cols = [c for c in image_cols if c in df.columns] rows = [] for i, row in df.iterrows(): rec = {} missing = False for col in df.columns: v = row[col] if pd.isna(v): v = "" rec[col] = str(v) for col in image_cols: fname = str(row.get(col, "") or "").strip() if not fname: missing = True break fp = resolve_image_path(csv_path, fname) if fp is None: print(f"Warning: missing image {fname} for row {i}", file=sys.stderr) missing = True break rec[col] = str(fp.resolve()) if missing: continue sid_cf1 = scene_id_from_image_name(str(row.get(image_cols[1], ""))) sid_cf2 = scene_id_from_image_name(str(row.get(image_cols[2], ""))) cf1_type = load_cf_type_from_scene(csv_path, sid_cf1, "cf1") cf2_type = load_cf_type_from_scene(csv_path, sid_cf2, "cf2") if cf1_type is not None: rec["counterfactual1_type"] = cf1_type if cf2_type is not None: rec["counterfactual2_type"] = cf2_type rows.append(rec) return rows, image_cols def main(): parser = argparse.ArgumentParser(description="Upload MMB dataset to Hugging Face Hub") parser.add_argument("csv_path", type=Path, help="Path to image_mapping_with_questions.csv") parser.add_argument("--repo-id", default=None, help="Hub repo ID (e.g. username/dataset-name)") parser.add_argument("--private", action="store_true", help="Create private dataset") parser.add_argument("--dry-run", action="store_true", help="Build dataset but don't push") args = parser.parse_args() csv_path = args.csv_path.resolve() if not csv_path.exists(): print(f"Error: {csv_path} not found", file=sys.stderr) sys.exit(1) try: from datasets import Dataset, Image except ImportError: print("Install datasets and pillow: pip install datasets pillow", file=sys.stderr) sys.exit(1) print("Building dataset from", csv_path) rows, image_cols = build_dataset(csv_path) if not rows: print("Error: no valid rows (check image paths)", file=sys.stderr) sys.exit(1) print(f"Loaded {len(rows)} rows") ds = Dataset.from_list(rows) for col in image_cols: if col in ds.column_names: ds = ds.cast_column(col, Image()) if args.dry_run: print("Dry run: dataset built, not pushing.") print("Columns:", ds.column_names) return repo_id = args.repo_id if not repo_id: repo_id = csv_path.parent.name.replace(" ", "-").lower() repo_id = f"mmb-{repo_id}" print(f"No --repo-id given, using: {repo_id}") print(f"Pushing to Hugging Face Hub: {repo_id}") ds.push_to_hub(repo_id, private=args.private) print("Done. View at: https://huggingface.co/datasets/" + repo_id) if __name__ == "__main__": main()