Spaces:
Sleeping
Sleeping
File size: 874 Bytes
a5fd608 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | from data.wiki.loader import doc_load
from env.resolve import resolve_path
def test_dataset_load_mini_c4():
data_dir = resolve_path("data/dev/mini_c4")
ds = doc_load(data_dir=data_dir, glob_pattern="*.txt", cycle_length=1)
result = list(ds.as_numpy_iterator())
assert len(result) == 10
assert result[0] == b"first document of first file"
assert result[1] == b"second document of first file"
assert result[2] == b"third document of first file"
assert result[3] == b"first document of second file"
assert result[4] == b"second document of second file"
assert result[5] == b"third document of second file"
assert result[6] == b"fourth document of second file"
assert result[7] == b"first document of third file"
assert result[8] == b"second document of third file"
assert result[9] == b"third document of third file"
|