Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import argparse | |
| from pathlib import Path | |
| from typing import Dict, List | |
| import pandas as pd | |
# File suffixes (lower-case, with leading dot) that are treated as videos.
VIDEO_EXTS = {".mp4", ".avi", ".mov", ".mkv"}


def discover_dataset(root: Path, dataset_name: str) -> List[Dict[str, str]]:
    """Collect one metadata row per video file found recursively under ``root``.

    A file counts as a video when its lower-cased suffix is in ``VIDEO_EXTS``.

    Args:
        root: Directory to scan recursively.
        dataset_name: Value stored verbatim in each row's ``"dataset"`` field.

    Returns:
        A list of dicts with the keys ``"dataset"``, ``"video_path"``
        (resolved absolute path as a string), ``"label"`` (int ``1`` when
        ``"fake"`` occurs, case-insensitively, in the root directory's own
        name or anywhere in the path below it, else ``0``) and ``"identity"``
        (name of the directory that directly contains the video). Rows are
        ordered by sorted path.
    """
    # NOTE(review): "identity" is simply the parent folder name; if a dataset
    # keeps many subjects in one folder they all share one identity -- confirm
    # against the actual dataset layout.
    rows: List[Dict[str, str]] = []
    # Sort so the row order does not depend on filesystem enumeration order.
    for path in sorted(root.rglob("*")):
        if path.suffix.lower() not in VIDEO_EXTS:
            continue
        # rglob also yields directories; a folder named "clip.mp4" is no video.
        if not path.is_file():
            continue
        # Only look at the root's own name and what lies below it. Using the
        # full path would label every video as fake whenever some ancestor
        # directory (e.g. ".../deepfake-project/data") contains "fake".
        scoped = "/".join((root.name, *path.relative_to(root).parts)).lower()
        label = 1 if "fake" in scoped else 0
        rows.append(
            {
                "dataset": dataset_name,
                "video_path": str(path.resolve()),
                "label": label,
                "identity": path.parent.name,
            }
        )
    return rows
def identity_disjoint_split(df: pd.DataFrame, train: float, val: float) -> pd.DataFrame:
    """Assign each row to train/val/test so that no identity spans two splits.

    The unique values of ``df["identity"]`` are sorted; the first
    ``floor(n * train)`` identities become ``"train"``, the next
    ``floor(n * val)`` become ``"val"`` and the remainder ``"test"``.

    Args:
        df: Frame with an ``"identity"`` column. It is not modified.
        train: Fraction of identities assigned to the training split.
        val: Fraction of identities assigned to the validation split.

    Returns:
        A copy of ``df`` with an added ``"split"`` column.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
    """
    tolerance = 1e-9
    if train < 0 or val < 0 or train + val > 1 + tolerance:
        raise ValueError(
            "train and val ratios must be non-negative and sum to at most 1 "
            f"(got train={train}, val={val})"
        )

    identities = sorted(df["identity"].unique())
    n = len(identities)
    # The tolerance guards against float artefacts before truncation, e.g.
    # 0.7 * 90 evaluates to 62.99999999999999 and would otherwise become 62.
    n_train = min(n, int(n * train + tolerance))
    n_val = min(n - n_train, int(n * val + tolerance))

    train_ids = set(identities[:n_train])
    val_ids = set(identities[n_train : n_train + n_val])

    def split_for_identity(identity: str) -> str:
        if identity in train_ids:
            return "train"
        if identity in val_ids:
            return "val"
        return "test"

    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()
    result["split"] = result["identity"].map(split_for_identity)
    return result
def main(argv: List[str] | None = None) -> None:
    """Build the video metadata CSV from the command line.

    Scans the ``FaceForensics++`` and ``Celeb-DF`` sub-directories of
    ``--data-root`` (each one only if it exists), assigns an
    identity-disjoint train/val/test split and writes the table to ``--out``.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Raises:
        FileNotFoundError: If no video was found in either dataset directory.
        SystemExit: Raised by argparse for invalid arguments, including
            ratios that are negative or sum to more than 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-root", type=Path, required=True)
    parser.add_argument("--out", type=Path, required=True)
    parser.add_argument("--train-ratio", type=float, default=0.7)
    parser.add_argument("--val-ratio", type=float, default=0.15)
    args = parser.parse_args(argv)

    # Reject impossible ratios up front, before any filesystem work, instead
    # of silently producing a split with e.g. no test identities.
    if (
        args.train_ratio < 0
        or args.val_ratio < 0
        or args.train_ratio + args.val_ratio > 1 + 1e-9
    ):
        parser.error(
            "--train-ratio and --val-ratio must be non-negative and sum to at most 1"
        )

    rows: List[Dict[str, str]] = []
    # The sub-directory name doubles as the dataset name stored in each row.
    for dataset_name in ("FaceForensics++", "Celeb-DF"):
        dataset_root = args.data_root / dataset_name
        if dataset_root.exists():
            rows.extend(discover_dataset(dataset_root, dataset_name))
    if not rows:
        raise FileNotFoundError("No supported videos found under data root.")

    df = pd.DataFrame(rows)
    df = identity_disjoint_split(df, args.train_ratio, args.val_ratio)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(args.out, index=False)
    print(f"Saved metadata: {args.out} ({len(df)} videos)")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()