Spaces:

Lucifer9907
/

SDK-Docker

Sleeping

App Files Files Community

SDK-Docker / prepare_dataset.py

Lucifer9907

Prepare Hugging Face Docker Space

ff0c419 about 1 month ago

raw

history blame contribute delete

6.84 kB

	from __future__ import annotations

	import argparse
	import base64
	import io
	import time
	from pathlib import Path

	import requests
	from PIL import Image

	from src.ai_image_detector.config import PROCESSED_DATA_DIR


	def save_image(image_bytes: bytes, destination: Path) -> None:
	destination.parent.mkdir(parents=True, exist_ok=True)
	image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
	image.save(destination, format="PNG")


	def save_pil_image(image: Image.Image, destination: Path) -> None:
	destination.parent.mkdir(parents=True, exist_ok=True)
	image.convert("RGB").save(destination, format="PNG")


	def download_image_url(image_url: str, destination: Path, retries: int = 5) -> None:
	last_error: Exception \| None = None
	for attempt in range(retries):
	try:
	response = requests.get(image_url, timeout=120)
	response.raise_for_status()
	save_image(response.content, destination)
	return
	except requests.RequestException as exc:
	last_error = exc
	time.sleep(2 * (attempt + 1))

	raise RuntimeError(f"Failed to download image after {retries} attempts") from last_error


	def fetch_rows(
	dataset_name: str,
	offset: int,
	length: int,
	retries: int = 5,
	) -> list[dict]:
	rows_api = "https://datasets-server.huggingface.co/rows"
	last_error: Exception \| None = None

	for attempt in range(retries):
	try:
	response = requests.get(
	rows_api,
	params={
	"dataset": dataset_name,
	"config": "default",
	"split": "train",
	"offset": offset,
	"length": length,
	},
	timeout=120,
	)
	response.raise_for_status()
	return response.json().get("rows", [])
	except requests.RequestException as exc:
	last_error = exc
	if isinstance(exc, requests.HTTPError) and exc.response is not None and exc.response.status_code == 429:
	time.sleep(30 * (attempt + 1))
	else:
	time.sleep(2 * (attempt + 1))

	raise RuntimeError(f"Failed to fetch dataset rows after {retries} attempts") from last_error


	def download_subset(
	dataset_name: str,
	real_target: int,
	fake_target: int,
	seed: int,
	start_offset: int,
	) -> tuple[int, int]:
	real_dir = PROCESSED_DATA_DIR / "real"
	fake_dir = PROCESSED_DATA_DIR / "fake"
	real_dir.mkdir(parents=True, exist_ok=True)
	fake_dir.mkdir(parents=True, exist_ok=True)

	real_count = len(list(real_dir.glob("*.png")))
	fake_count = len(list(fake_dir.glob("*.png")))

	if dataset_name == "OwensLab/CommunityForensics-Small":
	offset = start_offset
	page_size = 10

	while real_count < real_target or fake_count < fake_target:
	rows = fetch_rows(dataset_name, offset, page_size)
	if not rows:
	break

	for item in rows:
	row = item["row"]
	label = int(row["label"])
	image_name = Path(row["image_name"]).stem
	image_bytes = base64.b64decode(row["image_data"])

	if label == 0 and real_count < real_target:
	destination = real_dir / f"communityforensics_real_{real_count:05d}_{image_name}.png"
	save_image(image_bytes, destination)
	real_count += 1
	elif label == 1 and fake_count < fake_target:
	destination = fake_dir / f"communityforensics_fake_{fake_count:05d}_{image_name}.png"
	save_image(image_bytes, destination)
	fake_count += 1

	if real_count >= real_target and fake_count >= fake_target:
	break

	offset += page_size

	elif dataset_name == "Parveshiiii/AI-vs-Real":
	offset = start_offset
	page_size = 20

	while real_count < real_target or fake_count < fake_target:
	rows = fetch_rows(dataset_name, offset, page_size)
	if not rows:
	break

	for item in rows:
	row = item["row"]
	label = int(row["binary_label"])
	image_url = row["image"]["src"]
	row_idx = item["row_idx"]

	if label == 1 and real_count < real_target:
	destination = real_dir / f"aivsreal_real_{real_count:05d}_{row_idx}.png"
	download_image_url(image_url, destination)
	real_count += 1
	elif label == 0 and fake_count < fake_target:
	destination = fake_dir / f"aivsreal_fake_{fake_count:05d}_{row_idx}.png"
	download_image_url(image_url, destination)
	fake_count += 1

	if real_count >= real_target and fake_count >= fake_target:
	break

	offset += page_size
	else:
	raise ValueError(f"Unsupported dataset source: {dataset_name}")

	return real_count, fake_count


	def main() -> None:
	parser = argparse.ArgumentParser(
	description="Download a balanced starter subset of CommunityForensics-Small."
	)
	parser.add_argument(
	"--per-class",
	type=int,
	default=2000,
	help="Number of real and fake images to download",
	)
	parser.add_argument(
	"--real-count",
	type=int,
	default=None,
	help="Override the real-image target count",
	)
	parser.add_argument(
	"--fake-count",
	type=int,
	default=None,
	help="Override the fake-image target count",
	)
	parser.add_argument(
	"--seed",
	type=int,
	default=42,
	help="Random seed for stream shuffling",
	)
	parser.add_argument(
	"--dataset",
	type=str,
	default="OwensLab/CommunityForensics-Small",
	help="Hugging Face dataset id",
	)
	parser.add_argument(
	"--start-offset",
	type=int,
	default=0,
	help="Starting row offset for resumable API downloads",
	)
	args = parser.parse_args()

	real_target = args.per_class if args.real_count is None else args.real_count
	fake_target = args.per_class if args.fake_count is None else args.fake_count

	real_count, fake_count = download_subset(
	dataset_name=args.dataset,
	real_target=real_target,
	fake_target=fake_target,
	seed=args.seed,
	start_offset=args.start_offset,
	)

	print(f"Downloaded {real_count} real images to {PROCESSED_DATA_DIR / 'real'}")
	print(f"Downloaded {fake_count} fake images to {PROCESSED_DATA_DIR / 'fake'}")


	if __name__ == "__main__":
	main()