Spaces:

akagtag
/

deepdetection

Paused

App Files Files Community

deepdetection / training /verify_downloads.py

akagtag

Initial commit

4e75170 about 1 month ago

raw

history blame contribute delete

2.59 kB

	"""
	training/verify_downloads.py

	Verifies all training datasets are present and reports file counts and sizes.

	Usage:
	python training/verify_downloads.py
	python training/verify_downloads.py --kaggle # checks /kaggle/input/ paths
	"""
	from __future__ import annotations

	import argparse
	import sys
	from pathlib import Path

	# Dataset name -> directory checked when running from a local checkout.
	# Insertion order is the order in which the datasets are reported.
	DATASETS_LOCAL: dict[str, str] = {
	    "ff++": "data/raw/ff++",
	    "140k_faces": "data/raw/140k_faces",
	    "dfdc": "data/raw/dfdc",
	    "celebdf": "data/raw/celebdf",
	    "deepfake_faces": "data/raw/deepfake_faces",
	    # The only entry whose directory name differs from its key.
	    "deepfake_real": "data/raw/deepfake_real_images",
	    "ai_vs_real": "data/raw/ai_vs_real",
	}

	# Dataset name -> directory checked when the script is run with --kaggle.
	# Uses the same keys as DATASETS_LOCAL, listed in a different order.
	DATASETS_KAGGLE: dict[str, str] = {
	    "140k_faces": "/kaggle/input/140k-real-and-fake-faces",
	    # NOTE(review): "datasaet" looks like a typo, but it may be the real
	    # dataset slug -- confirm against the Kaggle input before changing it.
	    "ai_vs_real": "/kaggle/input/ai-generated-vs-real-images-datasaet",
	    "ff++": "/kaggle/input/faceforensics-in-compressed-videos",
	    "celebdf": "/kaggle/input/celeb-df",
	    "dfdc": "/kaggle/input/deepfake-detection-challenge",
	    "deepfake_faces": "/kaggle/input/deepfake-faces",
	    "deepfake_real": "/kaggle/input/deepfake-and-real-images",
	}

	# Lower-case file suffixes counted as media (images and videos).
	MEDIA_EXTS = {".jpg", ".jpeg", ".png", ".mp4", ".avi", ".mov"}


	def check_dataset(name: str, path: str) -> bool:
	    """Print a one-line status for a dataset and report whether it exists.

	    When ``path`` exists, the tree below it is walked once, counting the
	    regular files whose suffix (case-insensitive) is in ``MEDIA_EXTS`` and
	    summing the size of every regular file, media or not.

	    Args:
	        name: Label printed in the status line.
	        path: Location of the dataset on disk.

	    Returns:
	        ``False`` if ``path`` does not exist, ``True`` otherwise. An existing
	        path with no media files below it still returns ``True``.
	    """
	    root = Path(path)
	    if not root.exists():
	        print(f" MISSING {name:20s} → {path}")
	        return False

	    media_count = 0
	    total_bytes = 0
	    # Stream the walk instead of materialising every entry: the datasets can
	    # hold a very large number of files.
	    for entry in root.rglob("*"):
	        try:
	            if not entry.is_file():
	                continue
	            total_bytes += entry.stat().st_size
	        except OSError:
	            # The entry vanished or became unreadable mid-scan; leave it out
	            # of the totals rather than aborting the whole report.
	            continue
	        if entry.suffix.lower() in MEDIA_EXTS:
	            media_count += 1

	    size_gb = total_bytes / 1e9
	    print(f" OK {name:20s} → {media_count:>7,} media files {size_gb:>6.2f} GB {path}")
	    return True


	def main(argv: list[str] | None = None) -> None:
	    """Verify every configured dataset and exit non-zero if any is missing.

	    Args:
	        argv: Command-line arguments to parse. ``None`` (the default) lets
	            argparse read them from ``sys.argv``.

	    Raises:
	        SystemExit: With status 1 when at least one dataset check fails.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--kaggle", action="store_true", help="Check Kaggle input paths")
	    args = parser.parse_args(argv)

	    datasets = DATASETS_KAGGLE if args.kaggle else DATASETS_LOCAL
	    label = "Kaggle" if args.kaggle else "local"

	    print(f"\n=== Dataset verification ({label}) ===\n")
	    # Check every dataset (no early exit) so the report covers all of them,
	    # and record failures from the same call that printed the status line so
	    # the summary below cannot disagree with it.
	    missing = []
	    for name, path in datasets.items():
	        if not check_dataset(name, path):
	            missing.append(name)

	    print()
	    if not missing:
	        print("All datasets present. Ready to train.")
	        return

	    print(f"Missing: {', '.join(missing)}")
	    if not args.kaggle:
	        # The download hint is only printed for local runs.
	        print("Run: bash training/download_datasets.sh")
	    sys.exit(1)


	if __name__ == "__main__":
	    main()