Spaces:
Sleeping
Sleeping
"""Rebuild data/metadata.csv from all .npz files in data/processed/.

Each ``*.npz`` archive must provide a ``label`` entry (converted with
``int``) and a ``video_id`` entry (converted with ``str``).  Archives that
cannot be read are reported on stderr and skipped.  Rows are then assigned
to ``train`` / ``val`` / ``test`` so that all rows sharing a ``video_id``
land in the same split, and the result is written as a CSV with the columns
``npz_path, label, video_id, split``.
"""
import csv
import sys
from collections import Counter
from pathlib import Path
from typing import Optional

import numpy as np

ROOT = Path(__file__).parent.parent
processed = ROOT / "data" / "processed"
out_csv = ROOT / "data" / "metadata.csv"

# Column order of the generated CSV.
FIELDNAMES = ["npz_path", "label", "video_id", "split"]

# Split parameters.  The cut-offs are cumulative fractions of the shuffled
# list of unique video ids: [0, 0.7) -> train, [0.7, 0.85) -> val, rest -> test.
SPLIT_SEED = 42
TRAIN_CUTOFF = 0.7
VAL_CUTOFF = 0.85


def load_row(npz_path: Path) -> Optional[dict]:
    """Read one archive and return its metadata row.

    Args:
        npz_path: Path of the ``.npz`` file to read.

    Returns:
        A dict with the keys ``npz_path`` (resolved path as a string),
        ``label`` (int) and ``video_id`` (str), or ``None`` when the archive
        could not be loaded or converted.  Failures are reported on stderr
        instead of being raised so that one bad file does not abort the
        whole rebuild.
    """
    try:
        # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
        # which can execute arbitrary code.  Only run this on trusted
        # archives; switch to allow_pickle=False if no archive stores its
        # fields as object arrays.
        #
        # The context manager closes the underlying file handle; without it
        # every processed archive would stay open until garbage collection.
        with np.load(npz_path, allow_pickle=True) as archive:
            return {
                "npz_path": str(npz_path.resolve()),
                "label": int(archive["label"]),
                "video_id": str(archive["video_id"]),
            }
    except Exception as exc:  # deliberate best-effort: log and skip
        print(f"skipping {npz_path.name}: {exc}", file=sys.stderr)
        return None


def collect_rows(directory: Path) -> list:
    """Return the metadata rows for every readable ``*.npz`` in *directory*.

    Files are visited in sorted path order so the output is deterministic.
    Unreadable archives are skipped (see ``load_row``).
    """
    rows = []
    for npz_path in sorted(directory.glob("*.npz")):
        row = load_row(npz_path)
        if row is not None:
            rows.append(row)
    return rows


def assign_splits(
    rows: list,
    seed: int = SPLIT_SEED,
    train_cutoff: float = TRAIN_CUTOFF,
    val_cutoff: float = VAL_CUTOFF,
) -> list:
    """Add a ``split`` key to every row using an identity-disjoint split.

    The unique ``video_id`` values are sorted, shuffled with a generator
    seeded by *seed*, and cut at ``int(train_cutoff * n)`` and
    ``int(val_cutoff * n)``.  Every row inherits the split of its
    ``video_id``, so no id appears in more than one split.

    Args:
        rows: Row dicts containing at least ``video_id``; modified in place.
        seed: Seed for ``np.random.default_rng``.
        train_cutoff: Cumulative fraction of ids that ends the train range.
        val_cutoff: Cumulative fraction of ids that ends the val range.

    Returns:
        The same *rows* list, for convenience.
    """
    rng = np.random.default_rng(seed)
    # Sort before shuffling so the result does not depend on set ordering.
    unique_ids = sorted({row["video_id"] for row in rows})
    rng.shuffle(unique_ids)

    n = len(unique_ids)
    train_end = int(train_cutoff * n)
    val_end = int(val_cutoff * n)
    train_ids = set(unique_ids[:train_end])
    val_ids = set(unique_ids[train_end:val_end])

    for row in rows:
        if row["video_id"] in train_ids:
            row["split"] = "train"
        elif row["video_id"] in val_ids:
            row["split"] = "val"
        else:
            row["split"] = "test"
    return rows


def write_metadata(rows: list, destination: Path) -> None:
    """Write *rows* to *destination* as CSV, creating parent directories."""
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(rows)


def main() -> None:
    """Rebuild the metadata CSV and print a short summary."""
    rows = collect_rows(processed)
    if not rows:
        # The CSV is still (re)written with only a header, as before; the
        # warning makes an empty or missing input directory visible.
        print(f"warning: no usable .npz files found in {processed}", file=sys.stderr)

    assign_splits(rows)
    write_metadata(rows, out_csv)

    label_counts = Counter(r["label"] for r in rows)
    split_counts = Counter(r["split"] for r in rows)
    print(f"Rebuilt {len(rows)} sequences")
    print(f" real (0): {label_counts[0]}, fake (1): {label_counts[1]}")
    print(f" train: {split_counts['train']}, val: {split_counts['val']}, test: {split_counts['test']}")
    print(f" written to: {out_csv}")


if __name__ == "__main__":
    main()