Spaces:
Sleeping
Sleeping
File size: 1,927 Bytes
from unittest.mock import patch
def test_build_allow_patterns_limits_chunked_remote_pack_downloads():
    """Check which repo files ``build_allow_patterns`` admits for a chunked pack.

    The repository listing and the per-pack metadata lookup are both mocked.
    The loader is created with ``max_documents_per_pack=5000`` and the
    ``warbler-pack-hf-arxiv`` pack is reported as chunked with 50000 documents
    per chunk. The test expects every file of the non-chunked core pack, the
    arxiv ``package.json`` and the first arxiv chunk to be allowed, and the
    second arxiv chunk to be left out.
    """
    from warbler_cda.remote_pack_loader import RemotePackLoader

    listed_files = [
        "packs/warbler-pack-core/package.json",
        "packs/warbler-pack-core/pack/templates.json",
        "packs/warbler-pack-hf-arxiv/package.json",
        "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-001.jsonl",
        "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-002.jsonl",
        "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-003.jsonl",
    ]

    def fake_pack_metadata(pack_name):
        # Only the arxiv pack is described as chunked; any other pack name
        # gets plain non-chunked metadata.
        if pack_name == "warbler-pack-hf-arxiv":
            return {
                "chunked": True,
                "docs_per_chunk": 50000,
            }
        return {"chunked": False}

    loader = RemotePackLoader(
        repo_id="Bellok/warbler-cda-corpus",
        max_documents_per_pack=5000,
    )

    listing_patch = patch.object(
        loader, "_list_repo_files", return_value=listed_files
    )
    metadata_patch = patch.object(
        loader, "_load_pack_metadata", side_effect=fake_pack_metadata
    )
    with listing_patch, metadata_patch:
        allow_patterns = loader.build_allow_patterns()

    expected_allowed = (
        "packs/warbler-pack-core/package.json",
        "packs/warbler-pack-core/pack/templates.json",
        "packs/warbler-pack-hf-arxiv/package.json",
        "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-001.jsonl",
    )
    for repo_path in expected_allowed:
        assert repo_path in allow_patterns

    # The second chunk must not be requested.
    assert (
        "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-002.jsonl"
        not in allow_patterns
    )
def test_from_environment_applies_hosted_defaults_for_remote_loader():
    """Check the loader that ``from_environment`` builds when ``SPACE_ID`` is set.

    ``os.environ`` is patched (without clearing other variables) so that
    ``SPACE_ID`` is present while the loader is constructed. The resulting
    loader is expected to cap documents per pack at 5000 and to list
    ``warbler-pack-hf-tinystories`` among its excluded packs.
    """
    from warbler_cda.remote_pack_loader import RemotePackLoader

    hosted_env = {"SPACE_ID": "Bellok/warbler-cda"}
    with patch.dict("os.environ", hosted_env, clear=False):
        hosted_loader = RemotePackLoader.from_environment(
            "Bellok/warbler-cda-corpus"
        )

    assert hosted_loader.max_documents_per_pack == 5000
    assert "warbler-pack-hf-tinystories" in hosted_loader.exclude_packs