File size: 1,230 Bytes
6835659
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import json
from pathlib import Path


class WikimediaSampleLoader:
    """Read Wikimedia sample records from ``<base_dir>/samples.json``.

    Every record is converted into a dict whose image and audio file names
    are joined onto ``base_dir``.
    """

    def __init__(self, base_dir: str = "data/wikimedia"):
        """Remember the data directory and the manifest located inside it.

        Args:
            base_dir: Directory that holds ``samples.json``; media file names
                found in the manifest are joined onto this directory.
        """
        root = Path(base_dir)
        self.base_dir = root
        self.samples_file = root / "samples.json"

    def load(self):
        """Parse the manifest and return the converted samples.

        The JSON document may take one of three shapes:

        * a list of records, used as-is;
        * an object with a ``"samples"`` key, whose value is used;
        * any other object, whose values (in document order) are used.

        Progress messages are written to stdout with ``print``.

        Returns:
            A list of dicts with the keys ``id``, ``caption``, ``image_path``
            and ``audio_path``. ``audio_path`` is ``None`` when the record
            has no truthy ``"audio"`` entry.

        Raises:
            FileNotFoundError: If the manifest file does not exist.
            KeyError: If a record lacks ``"id"``, ``"caption"`` or
                ``"image"``.
        """
        print("Wikimedia loader starting")

        manifest = self.samples_file
        if not manifest.exists():
            raise FileNotFoundError(f"Missing {manifest}")

        payload = json.loads(manifest.read_text(encoding="utf-8"))

        # Reduce the three accepted document shapes to one iterable.
        if isinstance(payload, dict):
            if "samples" in payload:
                records = payload["samples"]
            else:
                records = list(payload.values())
        else:
            records = payload

        loaded = [self._build_sample(record) for record in records]

        print("Wikimedia loader finished. Samples:", len(loaded))
        return loaded

    def _build_sample(self, record):
        """Convert one raw manifest record into the dict returned by load."""
        print("Checking item:", record.get("id"))

        # Required fields are read in a fixed order so that the first missing
        # one determines which KeyError is raised.
        sample_id = record["id"]
        caption = record["caption"]
        image_path = self.base_dir / record["image"]

        # Audio is optional: an absent or empty entry yields None.
        audio_name = record.get("audio")
        audio_path = self.base_dir / audio_name if audio_name else None

        return {
            "id": sample_id,
            "caption": caption,
            "image_path": image_path,
            "audio_path": audio_path,
        }