File size: 1,646 Bytes
8720f73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import csv
from pathlib import Path
import datasets

# Citation text handed to DatasetInfo; currently an empty string.
_CITATION = ""
# Short summary reused as the builder-config description and the DatasetInfo description.
_DESCRIPTION = "Local video dataset with nested 'video' field (id, path, bytes)."

class LocalVideoConfig(datasets.BuilderConfig):
    """Builder configuration for the local video dataset.

    The config version defaults to ``1.0.0`` but may be overridden by passing
    an explicit ``version`` keyword argument.
    """

    def __init__(self, **kwargs):
        """Create the config, forwarding keyword arguments to ``BuilderConfig``.

        Args:
            **kwargs: Keyword arguments passed through unchanged to
                ``datasets.BuilderConfig`` (this file passes ``name`` and
                ``description``).
        """
        # Use setdefault instead of a hard-coded keyword: passing ``version=``
        # alongside a fixed ``version=...`` argument would raise
        # "got multiple values for keyword argument 'version'".
        kwargs.setdefault("version", datasets.Version("1.0.0"))
        super().__init__(**kwargs)

class LocalVideoDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that yields one example per row of a local CSV manifest.

    Every example carries a single nested ``video`` field holding the row's
    ``id``, its ``path`` string exactly as written in the manifest, and the raw
    bytes of the referenced file.
    """

    BUILDER_CONFIGS = [LocalVideoConfig(name="default", description=_DESCRIPTION)]

    def _info(self):
        """Return the dataset metadata, including the nested ``video`` schema."""
        features = datasets.Features({
            "video": {
                "id": datasets.Value("string"),
                "path": datasets.Value("string"),
                "bytes": datasets.Value("binary"),
            }
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            citation=_CITATION,
            homepage="",
        )

    def _split_generators(self, dl_manager):
        """Locate ``test_manifest.csv`` beside this script and declare a TEST split.

        Args:
            dl_manager: Download manager supplied by ``datasets``; not used here
                because the manifest is read from the local filesystem.

        Returns:
            A one-element list with the TEST split generator, whose
            ``gen_kwargs`` carry the manifest path as a string.

        Raises:
            FileNotFoundError: If the manifest is not next to this script.
        """
        # NOTE(review): this relies on ``__file__`` sitting in the same directory
        # as the manifest; confirm that holds for however the script is loaded.
        base = Path(__file__).parent.resolve()
        manifest = base / "test_manifest.csv"
        if not manifest.exists():
            raise FileNotFoundError(f"Missing manifest CSV: {manifest}")
        return [datasets.SplitGenerator(name=datasets.Split.TEST,
                                        gen_kwargs={"manifest_path": str(manifest)})]

    def _generate_examples(self, manifest_path):
        """Yield ``(key, example)`` pairs for every row of the manifest.

        Args:
            manifest_path: Path to a CSV file with ``id`` and ``path`` columns.

        Yields:
            Tuples of the row's ``id`` and a dict of the form
            ``{"video": {"id": ..., "path": ..., "bytes": ...}}``.

        Raises:
            KeyError: If the manifest has no ``id`` or ``path`` column.
            FileNotFoundError: If a listed video file cannot be found.
        """
        manifest_dir = Path(manifest_path).parent
        # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM; without
        # it a BOM turns the first header into "\ufeffid" and row["id"] fails,
        # and the platform-default encoding may not be UTF-8 at all.
        with open(manifest_path, newline="", encoding="utf-8-sig") as f:
            for row in csv.DictReader(f):
                vid_id = row["id"]
                path = row["path"]
                video_file = Path(path)
                # Relative entries are tried as written first (relative to the
                # working directory, as before). If that misses, look next to
                # the manifest so loading does not depend on where the process
                # was started.
                if not video_file.is_absolute() and not video_file.exists():
                    video_file = manifest_dir / video_file
                data = video_file.read_bytes()
                # The emitted "path" stays the manifest's original string.
                yield vid_id, {"video": {"id": vid_id, "path": path, "bytes": data}}