warbler-cda / tests /test_remote_pack_loader.py
Bellok
feat: Add support for remote pack loading with environment configuration and enhanced metadata handling
1c68bde
from unittest.mock import patch
def test_build_allow_patterns_limits_chunked_remote_pack_downloads():
    """Verify which repo files ``build_allow_patterns`` selects under a document cap.

    The loader's ``_list_repo_files`` and ``_load_pack_metadata`` are patched
    with canned data describing one non-chunked pack and one chunked pack.
    With ``max_documents_per_pack=5000`` and ``docs_per_chunk=50000`` the
    result must keep every core-pack file, the arXiv ``package.json`` and the
    first arXiv chunk, and must leave out the second arXiv chunk.
    """
    from warbler_cda.remote_pack_loader import RemotePackLoader

    def fake_pack_metadata(pack_name):
        # Only the arXiv pack is reported as chunked; every other pack name
        # gets the plain non-chunked metadata.
        if pack_name == "warbler-pack-hf-arxiv":
            return {
                "chunked": True,
                "docs_per_chunk": 50000,
            }
        return {"chunked": False}

    listed_files = [
        "packs/warbler-pack-core/package.json",
        "packs/warbler-pack-core/pack/templates.json",
        "packs/warbler-pack-hf-arxiv/package.json",
        "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-001.jsonl",
        "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-002.jsonl",
        "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-003.jsonl",
    ]
    loader = RemotePackLoader(
        repo_id="Bellok/warbler-cda-corpus",
        max_documents_per_pack=5000,
    )

    files_patch = patch.object(loader, "_list_repo_files", return_value=listed_files)
    metadata_patch = patch.object(
        loader,
        "_load_pack_metadata",
        side_effect=fake_pack_metadata,
    )
    with files_patch, metadata_patch:
        allow_patterns = loader.build_allow_patterns()

    # Files that must survive the filtering.
    must_be_allowed = (
        "packs/warbler-pack-core/package.json",
        "packs/warbler-pack-core/pack/templates.json",
        "packs/warbler-pack-hf-arxiv/package.json",
        "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-001.jsonl",
    )
    for repo_path in must_be_allowed:
        assert repo_path in allow_patterns

    # The second chunk must not be selected.
    assert "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-002.jsonl" not in allow_patterns
def test_from_environment_applies_hosted_defaults_for_remote_loader():
    """Verify the settings ``from_environment`` produces when ``SPACE_ID`` is set.

    With ``SPACE_ID`` present in the environment, the loader built for the
    corpus repo must report ``max_documents_per_pack == 5000`` and must list
    ``warbler-pack-hf-tinystories`` among its ``exclude_packs``.
    """
    from warbler_cda.remote_pack_loader import RemotePackLoader

    hosted_env = {"SPACE_ID": "Bellok/warbler-cda"}

    # clear=False overlays SPACE_ID on the current environment instead of
    # replacing it; patch.dict restores the original mapping on exit.
    with patch.dict("os.environ", hosted_env, clear=False):
        loader = RemotePackLoader.from_environment("Bellok/warbler-cda-corpus")

    assert loader.max_documents_per_pack == 5000
    assert "warbler-pack-hf-tinystories" in loader.exclude_packs