# MMIB_Dataset_Analysis_Tool / scripts / upload_to_huggingface.py
# (Hub page residue, preserved as comments so the file parses:
#  AnonymousECCV15285's picture — Upload 143 files — 51c36ad verified)
"""
Upload MMB-style dataset (CSV + images) to Hugging Face Hub.
Usage:
pip install datasets pillow
huggingface-cli login # or set HF_TOKEN
python scripts/upload_to_huggingface.py hf_dataset/image_mapping_with_questions.csv
python scripts/upload_to_huggingface.py hf_dataset/image_mapping_with_questions.csv --repo-id scholo/MMB_dataset
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
# Add the project root (parent of this scripts/ directory) to sys.path so
# sibling project modules remain importable when the script is run directly
# (e.g. `python scripts/upload_to_huggingface.py ...`).
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parent
sys.path.insert(0, str(PROJECT_ROOT))
import pandas as pd
from datasets import Dataset, Image
def resolve_image_path(csv_path: Path, fname: str) -> Path | None:
    """Locate *fname* relative to the CSV's directory.

    Searches, in order: <csv dir>/images, <csv dir>, <csv parent>/images,
    and <csv parent>. Returns the first match that is a regular file,
    or None when the image cannot be found in any of those locations.
    """
    root = csv_path.resolve().parent
    search_dirs = (root / "images", root, root.parent / "images", root.parent)
    for directory in search_dirs:
        candidate = directory / fname
        # Path.is_file() already implies existence, so no separate exists() check.
        if candidate.is_file():
            return candidate
    return None
def load_cf_type_from_scene(csv_path: Path, scene_id: str, variant: str) -> str | None:
    """Read the counterfactual type for a scene from its JSON sidecar.

    Looks for scenes/{scene_id}_{variant}.json beside the CSV and one level
    up. Returns str(cf_metadata["cf_type"]) from the first readable match,
    or None when no file exists, parsing fails, or the field is absent.
    """
    root = csv_path.resolve().parent
    for scenes_dir in (root / "scenes", root.parent / "scenes"):
        scene_file = scenes_dir / f"{scene_id.lower()}_{variant}.json"
        if not scenes_dir.is_dir() or not scene_file.exists():
            continue
        try:
            payload = json.loads(scene_file.read_text(encoding="utf-8"))
        except Exception:
            # Best-effort: a corrupt sidecar just means "no metadata here".
            continue
        cf_type = (payload.get("cf_metadata") or {}).get("cf_type")
        return str(cf_type) if cf_type is not None else None
    return None
def scene_id_from_image_name(fname: str) -> str:
    """Recover the scene id by stripping a known variant/extension suffix.

    Suffix matching is case-insensitive; at most one suffix is removed.
    Falls back to the original *fname* when stripping yields an empty id.
    """
    known_suffixes = (
        "_original.png", "_original",
        "_cf1.png", "_cf1",
        "_cf2.png", "_cf2",
        ".png",
    )
    name = str(fname).strip()
    lowered = name.lower()
    for suffix in known_suffixes:
        if lowered.endswith(suffix):
            name = name[: -len(suffix)]
            break
    return name.strip() or fname
def build_dataset(csv_path: Path) -> tuple[list[dict], list[str]]:
    """Build Hub-ready row dicts from the mapping CSV plus image files.

    Every CSV cell is stringified (NaN -> ""); image columns are replaced
    with {"bytes": ...} dicts so the Hub viewer can render them inline.
    Rows with a missing or unresolvable image are skipped with a warning
    on stderr. When a scene JSON sidecar provides cf_metadata, per-variant
    "counterfactual{1,2}_type" columns are added.

    Returns (rows, image_cols) where image_cols lists the image columns
    actually present in the CSV.
    """
    df = pd.read_csv(csv_path)
    candidate_cols = ["original_image", "counterfactual1_image", "counterfactual2_image"]
    image_cols = [c for c in candidate_cols if c in df.columns]
    rows: list[dict] = []
    for i, row in df.iterrows():
        rec = {}
        missing = False
        for col in df.columns:
            v = row[col]
            if pd.isna(v):
                v = ""
            rec[col] = str(v)
        for col in image_cols:
            fname = str(row.get(col, "") or "").strip()
            if not fname:
                missing = True
                break
            fp = resolve_image_path(csv_path, fname)
            if fp is None:
                print(f"Warning: missing image {fname} for row {i}", file=sys.stderr)
                missing = True
                break
            # Store image bytes so Hub viewer can display inline (paths don't work on server)
            rec[col] = {"bytes": fp.read_bytes()}
        if missing:
            continue
        # Attach cf_type metadata per counterfactual variant. Guard on the
        # named column being present: the previous code indexed
        # image_cols[1]/image_cols[2] positionally, which raised IndexError
        # whenever the CSV had fewer than three image columns.
        for col, out_key, variant in (
            ("counterfactual1_image", "counterfactual1_type", "cf1"),
            ("counterfactual2_image", "counterfactual2_type", "cf2"),
        ):
            if col not in image_cols:
                continue
            sid = scene_id_from_image_name(str(row.get(col, "")))
            cf_type = load_cf_type_from_scene(csv_path, sid, variant)
            if cf_type is not None:
                rec[out_key] = cf_type
        rows.append(rec)
    return rows, image_cols
def main():
    """CLI entry point: build the dataset from a CSV and push it to the Hub."""
    parser = argparse.ArgumentParser(description="Upload MMB dataset to Hugging Face Hub")
    parser.add_argument("csv_path", type=Path, help="Path to image_mapping_with_questions.csv")
    parser.add_argument("--repo-id", default=None, help="Hub repo ID (e.g. username/dataset-name)")
    parser.add_argument("--private", action="store_true", help="Create private dataset")
    parser.add_argument("--dry-run", action="store_true", help="Build dataset but don't push")
    args = parser.parse_args()

    csv_path = args.csv_path.resolve()
    if not csv_path.exists():
        print(f"Error: {csv_path} not found", file=sys.stderr)
        sys.exit(1)

    # NOTE: Dataset/Image come from the module-level import. The previous
    # local `try: from datasets import ...` fallback here was dead code —
    # a missing `datasets` package already fails at module import time,
    # before main() can run.
    print("Building dataset from", csv_path)
    rows, image_cols = build_dataset(csv_path)
    if not rows:
        print("Error: no valid rows (check image paths)", file=sys.stderr)
        sys.exit(1)
    print(f"Loaded {len(rows)} rows")

    ds = Dataset.from_list(rows)
    # Cast byte-dict columns to the Image feature so the Hub viewer renders them.
    for col in image_cols:
        if col in ds.column_names:
            ds = ds.cast_column(col, Image())

    if args.dry_run:
        print("Dry run: dataset built, not pushing.")
        print("Columns:", ds.column_names)
        return

    repo_id = args.repo_id
    if not repo_id:
        # Derive a default repo id from the CSV's directory name, e.g. "mmb-hf-dataset".
        repo_id = csv_path.parent.name.replace(" ", "-").lower()
        repo_id = f"mmb-{repo_id}"
        print(f"No --repo-id given, using: {repo_id}")

    print(f"Pushing to Hugging Face Hub: {repo_id}")
    ds.push_to_hub(repo_id, private=args.private)
    print("Done. View at: https://huggingface.co/datasets/" + repo_id)
if __name__ == "__main__":
main()