File size: 705 Bytes
a5fd608
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
"""Wiki dataset runner.

Usage:
    python data/wiki/runner.py test_dataset
    ENV=production python data/wiki/runner.py test_dataset
"""

import pathlib
import sys

sys.path.insert(0, str(pathlib.Path(__file__).parent.parent.parent))

from data.runner import DatasetRunner
from data.wiki.dataset import WikiDataset
from env.resolve import resolve_path, resolve_env


# Environment-dependent settings are resolved up front so the constructor call
# below reads as plain configuration.
# NOTE(review): resolve_env presumably returns its first argument in the
# default (development) environment and its second when ENV=production, as
# suggested by the usage lines in the module docstring - confirm in env.resolve.
_data_dir = str(
    resolve_env(
        resolve_path("data/dev/mini_c4"),
        resolve_path("~/data/wiki/mini_c4"),
    )
)
_tokenizer_type = resolve_env("character", "sentence_piece")

# Module-level dataset instance consumed by the runner defined below.
dataset = WikiDataset(
    data_dir=_data_dir,
    tokenizer_type=_tokenizer_type,
    sequence_length=256,
)

# Wrap the dataset in the shared DatasetRunner, registered under the name "wiki".
runner = DatasetRunner(dataset=dataset, name="wiki")

if __name__ == "__main__":
    # Invoke the runner only when this file is executed as a script, not when
    # it is imported.
    # NOTE(review): the runner presumably reads its command (e.g.
    # "test_dataset") from the command line - confirm in data.runner.
    runner()