# pitvqa-training-scripts / create_image_dataset.py
# mmrech's picture
# Upload create_image_dataset.py with huggingface_hub
# 69375af verified
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "torch",
# "datasets>=2.18.0",
# "pillow",
# "opencv-python-headless",
# "huggingface_hub>=0.21.0",
# "av",
# "tqdm",
# ]
# ///
"""
Create dataset with embedded images from pitvqa-comprehensive-spatial.
Extracts video frames and embeds them directly in the dataset.
This eliminates the need for video extraction during training/inference.
Run with: hf jobs uv run --flavor cpu-xlarge --secrets HF_TOKEN create_image_dataset.py
"""
import os
import shutil
from io import BytesIO
from pathlib import Path

import cv2
from PIL import Image
from tqdm import tqdm
# ============================================================
# Config
# ============================================================
SOURCE_DATASET = "mmrech/pitvqa-comprehensive-spatial"  # QA samples that reference video frames
VIDEO_DATASET = "UCL-WEISS/PitVis-2023"  # dataset repo holding the raw videos/<id>.mp4 files
OUTPUT_DATASET = "mmrech/pitvqa-spatial-with-images"  # destination repo (images embedded)
VIDEO_CACHE = Path("/tmp/videos")  # local scratch directory for downloaded videos
VIDEO_CACHE.mkdir(exist_ok=True)
MAX_SAMPLES = 1000 # Start with subset for testing
# ============================================================
# Setup
# ============================================================
from huggingface_hub import login, HfApi, hf_hub_download
from datasets import load_dataset, Dataset, Features, Value, Image as ImageFeature
# Authenticate if a token is available — needed both to read the source
# repos (if gated) and to push the output dataset at the end.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
    print("โœ“ Logged in to HuggingFace")
api = HfApi()
# ============================================================
# Load Source Dataset
# ============================================================
print("\n๐Ÿ“ฆ Loading source dataset...")
ds = load_dataset(SOURCE_DATASET, split="train")
print(f"โœ“ Loaded {len(ds)} samples")
# ============================================================
# Video Helpers
# ============================================================
# video_id -> cv2.VideoCapture; memoises open captures so each video
# file is opened at most once across all samples.
video_cache = {}
def download_video(video_id: str) -> Path | None:
    """Download a video into the local cache, if not already present.

    Args:
        video_id: Stem of the video file, i.e. ``videos/{video_id}.mp4``
            inside the ``VIDEO_DATASET`` repo.

    Returns:
        Path to the cached ``.mp4`` file, or ``None`` if the download
        failed (file missing on the hub, network/auth error, ...).
        The original annotation claimed ``Path`` but ``None`` is a
        possible return; callers already check for it.
    """
    video_path = VIDEO_CACHE / f"{video_id}.mp4"
    if not video_path.exists():
        try:
            # hf_hub_download stores the file inside the HF cache tree;
            # copy it into our flat cache dir so the path is predictable.
            downloaded = hf_hub_download(
                repo_id=VIDEO_DATASET,
                filename=f"videos/{video_id}.mp4",
                repo_type="dataset"
            )
            shutil.copy(downloaded, video_path)
        except Exception as e:
            # Best-effort: log and signal failure to the caller rather
            # than aborting the whole extraction job.
            print(f" โš  Could not download {video_id}: {e}")
            return None
    return video_path
def get_video_capture(video_id: str):
    """Return a memoised ``cv2.VideoCapture`` for *video_id*, or ``None``.

    Captures are stored in the module-level ``video_cache`` dict so each
    video file is opened at most once; a failed download yields ``None``
    without poisoning the cache (so a later call may retry).
    """
    cached = video_cache.get(video_id)
    if cached is not None:
        return cached
    path = download_video(video_id)
    if not path:
        return None
    capture = cv2.VideoCapture(str(path))
    video_cache[video_id] = capture
    return capture
def extract_frame(video_id: str, frame_idx: int) -> Image.Image | None:
    """Extract a single RGB frame from a video as a PIL image.

    Args:
        video_id: Identifier of the video (resolved via the capture cache).
        frame_idx: Zero-based frame number to seek to.

    Returns:
        PIL ``Image`` in RGB mode, or ``None`` when the video could not
        be opened or the frame could not be decoded (e.g. index past end
        of stream). The original annotation claimed ``Image.Image`` but
        ``None`` is a possible return; callers already check for it.
    """
    cap = get_video_capture(video_id)
    if cap is None:
        return None
    # Seek directly to the requested frame, then decode it.
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
    ret, frame = cap.read()
    if not ret:
        return None
    # OpenCV decodes to BGR; convert to RGB for PIL / datasets.
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return Image.fromarray(frame_rgb)
# ============================================================
# Process Dataset
# ============================================================
print("\n๐Ÿ”„ Processing samples and extracting frames...")
# Get unique video IDs first
video_ids = set()
for ex in ds:
    video_ids.add(ex['video_id'])
print(f"Found {len(video_ids)} unique videos")
# Download videos first (sequentially, before frame extraction) so that
# extraction below only hits the local cache.
print("\n๐Ÿ“ฅ Downloading videos...")
for vid in tqdm(list(video_ids), desc="Videos"):
    download_video(vid)
# Process samples: decode one frame per sample and embed it alongside
# the original QA fields. Capped at MAX_SAMPLES for this test run.
print("\n๐Ÿ–ผ๏ธ Extracting frames...")
processed_samples = []
failed = 0  # samples whose frame could not be extracted
for i, ex in enumerate(tqdm(ds, desc="Samples")):
    if i >= MAX_SAMPLES:
        break
    video_id = ex['video_id']
    # Default to frame 0 if the sample lacks a frame_index field.
    frame_idx = ex.get('frame_index', 0)
    # Extract frame; None means the video/frame was unavailable.
    frame = extract_frame(video_id, frame_idx)
    if frame is None:
        failed += 1
        continue
    # Create new sample with the decoded image embedded directly.
    sample = {
        "image": frame,
        "video_id": video_id,
        "frame_index": frame_idx,
        "messages": ex['messages'],
    }
    processed_samples.append(sample)
print(f"\nโœ“ Processed {len(processed_samples)} samples ({failed} failed)")
# Close video captures to release file handles / decoder resources.
for cap in video_cache.values():
    cap.release()
# ============================================================
# Create Dataset
# ============================================================
print("\n๐Ÿ“Š Creating dataset...")
# Create dataset with Image feature; datasets infers the Image() feature
# from the PIL objects in the "image" column.
new_ds = Dataset.from_list(processed_samples)
print(f"โœ“ Created dataset with {len(new_ds)} samples")
# Check features (sanity check that "image" was inferred as Image()).
print(f"Features: {new_ds.features}")
# ============================================================
# Upload
# ============================================================
print(f"\n๐Ÿ“ค Uploading to {OUTPUT_DATASET}...")
try:
    new_ds.push_to_hub(OUTPUT_DATASET, private=False)
    print(f"โœ“ Uploaded to https://huggingface.co/datasets/{OUTPUT_DATASET}")
except Exception as e:
    # Best-effort: report the failure but still print the summary below.
    print(f"โš  Upload error: {e}")
# ============================================================
# Summary
# ============================================================
print("\n" + "=" * 60)
print("โœ… DONE!")
print("=" * 60)
print(f"""
Dataset created: {OUTPUT_DATASET}
Samples: {len(processed_samples)}
Failed: {failed}
To use:
```python
from datasets import load_dataset
ds = load_dataset("{OUTPUT_DATASET}")
# Images are directly available - no video extraction needed!
image = ds['train'][0]['image']
```
""")