"""Hugging Face ``datasets`` loading script for a local video test split.

The script reads a CSV manifest (``test_manifest.csv``) located in the same
directory as this file. Each manifest row must provide an ``id`` and a
``path`` column; the file at ``path`` is read in full and exposed as raw
bytes under the nested ``video`` feature.
"""

import csv
from pathlib import Path

import datasets

_CITATION = ""
_DESCRIPTION = "Local video dataset with nested 'video' field (id, path, bytes)."

# Columns every manifest row is expected to carry.
_REQUIRED_COLUMNS = ("id", "path")


class LocalVideoConfig(datasets.BuilderConfig):
    """Builder configuration that defaults the dataset version to 1.0.0."""

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``BuilderConfig``, defaulting ``version``.

        ``setdefault`` is used so that a caller who passes an explicit
        ``version`` overrides the default instead of triggering a
        duplicate-keyword ``TypeError``.
        """
        kwargs.setdefault("version", datasets.Version("1.0.0"))
        super().__init__(**kwargs)


class LocalVideoDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder exposing local video files listed in a CSV manifest."""

    BUILDER_CONFIGS = [LocalVideoConfig(name="default", description=_DESCRIPTION)]

    def _info(self):
        """Return the dataset metadata and the nested ``video`` feature schema."""
        features = datasets.Features(
            {
                "video": {
                    "id": datasets.Value("string"),
                    "path": datasets.Value("string"),
                    "bytes": datasets.Value("binary"),
                }
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            citation=_CITATION,
            homepage="",
        )

    def _split_generators(self, dl_manager):
        """Return the single TEST split backed by ``test_manifest.csv``.

        ``dl_manager`` is unused: nothing is downloaded, the manifest is
        looked up next to this script.

        Raises:
            FileNotFoundError: if the manifest CSV does not exist.
        """
        # NOTE(review): assumes the manifest sits beside this file at load
        # time -- confirm this holds if the script is executed from a copy.
        base = Path(__file__).parent.resolve()
        manifest = base / "test_manifest.csv"
        if not manifest.exists():
            raise FileNotFoundError(f"Missing manifest CSV: {manifest}")
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"manifest_path": str(manifest)},
            )
        ]

    def _generate_examples(self, manifest_path):
        """Yield ``(key, example)`` pairs, one per manifest row.

        Args:
            manifest_path: path to the CSV manifest with ``id`` and ``path``
                columns.

        Yields:
            The row's ``id`` as key and a dict with a nested ``video`` entry
            holding the ``id``, the ``path`` string exactly as written in the
            manifest, and the file contents as ``bytes``.

        Raises:
            KeyError: if the manifest header lacks a required column.
        """
        # "utf-8-sig" decodes plain UTF-8 identically but also strips a
        # leading BOM, which would otherwise corrupt the first column name.
        # An explicit encoding also avoids depending on the platform locale.
        with open(manifest_path, newline="", encoding="utf-8-sig") as f:
            reader = csv.DictReader(f)
            header = reader.fieldnames
            if header is None:
                # Completely empty manifest: there are no rows to yield.
                return
            missing = [col for col in _REQUIRED_COLUMNS if col not in header]
            if missing:
                # Fail once, up front, with a message naming the file and
                # the absent columns rather than a bare KeyError per row.
                raise KeyError(
                    f"Manifest {manifest_path} is missing required column(s): "
                    f"{', '.join(missing)}"
                )
            for row in reader:
                vid_id = row["id"]
                path = row["path"]
                # The whole file is loaded into memory for this example.
                data = Path(path).read_bytes()
                yield vid_id, {"video": {"id": vid_id, "path": path, "bytes": data}}