# deepdetection/training/verify_downloads.py
# Initial commit 4e75170 (akagtag)
"""
training/verify_downloads.py

Verifies all training datasets are present and reports media-file counts and sizes.

Usage:
    python training/verify_downloads.py
    python training/verify_downloads.py --kaggle  # checks /kaggle/input/ paths
"""
from __future__ import annotations

import argparse
import stat
import sys
from pathlib import Path
# Dataset name -> directory checked by default (no --kaggle flag).
# The paths are relative, so they resolve against the current working
# directory of the process.
DATASETS_LOCAL = {
    "ff++": "data/raw/ff++",
    "140k_faces": "data/raw/140k_faces",
    "dfdc": "data/raw/dfdc",
    "celebdf": "data/raw/celebdf",
    "deepfake_faces": "data/raw/deepfake_faces",
    "deepfake_real": "data/raw/deepfake_real_images",
    "ai_vs_real": "data/raw/ai_vs_real",
}

# Dataset name -> directory checked when --kaggle is passed.
# Same set of dataset names as DATASETS_LOCAL, pointing at absolute
# /kaggle/input/ paths instead.
DATASETS_KAGGLE = {
    "140k_faces": "/kaggle/input/140k-real-and-fake-faces",
    # NOTE(review): "datasaet" looks like a typo, but it may be the dataset's
    # actual Kaggle slug -- confirm before changing.
    "ai_vs_real": "/kaggle/input/ai-generated-vs-real-images-datasaet",
    "ff++": "/kaggle/input/faceforensics-in-compressed-videos",
    "celebdf": "/kaggle/input/celeb-df",
    "dfdc": "/kaggle/input/deepfake-detection-challenge",
    "deepfake_faces": "/kaggle/input/deepfake-faces",
    "deepfake_real": "/kaggle/input/deepfake-and-real-images",
}

# File suffixes (compared lower-cased) that check_dataset counts as media.
# Files with other suffixes are not counted but still add to the reported size.
MEDIA_EXTS = {".jpg", ".jpeg", ".png", ".mp4", ".avi", ".mov"}
def check_dataset(name: str, path: str) -> bool:
    """Print a one-line status report for a single dataset location.

    If ``path`` does not exist, a ``MISSING`` line is printed. Otherwise the
    tree below ``path`` is walked recursively and an ``OK`` line is printed
    with the number of media files (suffix in ``MEDIA_EXTS``, matched
    case-insensitively) and the combined size of *all* regular files in
    decimal gigabytes.

    Args:
        name: Short dataset label shown in the report.
        path: Location to inspect.

    Returns:
        ``True`` if ``path`` exists, ``False`` otherwise. Only existence is
        checked: an existing but empty location is still reported as OK
        with zero media files.
    """
    root = Path(path)
    if not root.exists():
        print(f" MISSING {name:20s}{path}")
        return False

    media_count = 0
    total_bytes = 0
    # Stream the walk and stat each entry exactly once, rather than building
    # a list of every path and touching each file three times.
    for entry in root.rglob("*"):
        try:
            info = entry.stat()  # follows symlinks, like Path.is_file()
        except OSError:
            # Broken symlink, or a file that vanished or is unreadable:
            # leave it out of the totals instead of aborting the report.
            continue
        if not stat.S_ISREG(info.st_mode):
            continue
        total_bytes += info.st_size
        if entry.suffix.lower() in MEDIA_EXTS:
            media_count += 1

    size_gb = total_bytes / 1e9
    print(f" OK {name:20s}{media_count:>7,} media files {size_gb:>6.2f} GB {path}")
    return True
def main(argv: list[str] | None = None) -> None:
    """Check every configured dataset and print a summary.

    Uses ``DATASETS_KAGGLE`` when ``--kaggle`` is given and
    ``DATASETS_LOCAL`` otherwise. Each dataset is reported via
    ``check_dataset``.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: With exit code 1 when at least one dataset is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--kaggle", action="store_true", help="Check Kaggle input paths")
    args = parser.parse_args(argv)

    datasets = DATASETS_KAGGLE if args.kaggle else DATASETS_LOCAL
    label = "Kaggle" if args.kaggle else "local"
    print(f"\n=== Dataset verification ({label}) ===\n")

    # Check every dataset (no short-circuiting) so each one gets a report
    # line, and remember the failures so the summary agrees with those lines
    # without probing the filesystem a second time.
    missing = []
    for name, path in datasets.items():
        if not check_dataset(name, path):
            missing.append(name)
    print()

    if not missing:
        print("All datasets present. Ready to train.")
        return

    print(f"Missing: {', '.join(missing)}")
    if not args.kaggle:
        # The download hint is only printed for the local layout.
        print("Run: bash training/download_datasets.sh")
    sys.exit(1)
# Run the verification only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()