"""
training/verify_downloads.py

Verifies all training datasets are present and reports file counts and sizes.

Usage:
    python training/verify_downloads.py
    python training/verify_downloads.py --kaggle   # checks /kaggle/input/ paths
"""
| from __future__ import annotations | |
| import argparse | |
| import sys | |
| from pathlib import Path | |
# Dataset name -> directory on the local filesystem (relative to the repo root).
# Iteration order here is the order datasets are printed by main().
DATASETS_LOCAL = {
    "ff++": "data/raw/ff++",
    "140k_faces": "data/raw/140k_faces",
    "dfdc": "data/raw/dfdc",
    "celebdf": "data/raw/celebdf",
    "deepfake_faces": "data/raw/deepfake_faces",
    "deepfake_real": "data/raw/deepfake_real_images",
    "ai_vs_real": "data/raw/ai_vs_real",
}
# Dataset name -> read-only input path when running inside a Kaggle notebook.
# NOTE(review): "datasaet" in the ai_vs_real slug looks misspelled, but Kaggle
# slugs are exact identifiers -- confirm against the dataset URL before changing.
DATASETS_KAGGLE = {
    "140k_faces": "/kaggle/input/140k-real-and-fake-faces",
    "ai_vs_real": "/kaggle/input/ai-generated-vs-real-images-datasaet",
    "ff++": "/kaggle/input/faceforensics-in-compressed-videos",
    "celebdf": "/kaggle/input/celeb-df",
    "dfdc": "/kaggle/input/deepfake-detection-challenge",
    "deepfake_faces": "/kaggle/input/deepfake-faces",
    "deepfake_real": "/kaggle/input/deepfake-and-real-images",
}
# File extensions counted as dataset media (images and videos).
MEDIA_EXTS = {".jpg", ".jpeg", ".png", ".mp4", ".avi", ".mov"}


def check_dataset(name: str, path: str) -> bool:
    """Print a one-line presence/size report for one dataset directory.

    Args:
        name: Display name of the dataset (used for alignment in output).
        path: Directory expected to contain the dataset.

    Returns:
        True if the directory exists (a summary line is printed),
        False if it is missing (a MISSING line is printed).
    """
    root = Path(path)
    if not root.exists():
        print(f" MISSING {name:20s} → {path}")
        return False
    # Stream the tree in a single pass instead of materializing every Path
    # (some of these datasets contain millions of files) and iterating twice.
    media_count = 0
    total_bytes = 0
    for f in root.rglob("*"):
        try:
            if not f.is_file():
                continue
            total_bytes += f.stat().st_size
        except OSError:
            # Permission error, broken symlink, or file removed mid-scan:
            # skip it rather than abort the whole verification.
            continue
        if f.suffix.lower() in MEDIA_EXTS:
            media_count += 1
    size_gb = total_bytes / 1e9  # decimal GB, matching typical disk-size reporting
    print(f" OK {name:20s} → {media_count:>7,} media files {size_gb:>6.2f} GB {path}")
    return True
def main() -> None:
    """Verify every configured dataset path; exit with status 1 if any is missing.

    With --kaggle, checks the read-only /kaggle/input/ mount points instead of
    the local data/raw/ directories.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--kaggle", action="store_true", help="Check Kaggle input paths")
    args = parser.parse_args()

    datasets = DATASETS_KAGGLE if args.kaggle else DATASETS_LOCAL
    label = "Kaggle" if args.kaggle else "local"
    print(f"\n=== Dataset verification ({label}) ===\n")

    # Collect missing names from check_dataset's own results rather than
    # re-stat-ing every path afterwards: a second filesystem pass could
    # disagree with the lines just printed if anything changed in between.
    missing = [name for name, path in datasets.items() if not check_dataset(name, path)]
    print()
    if not missing:
        print("All datasets present. Ready to train.")
    else:
        print(f"Missing: {', '.join(missing)}")
        if not args.kaggle:
            print("Run: bash training/download_datasets.sh")
        sys.exit(1)
# Script entry point: only run verification when executed directly,
# not when imported.
if __name__ == "__main__":
    main()