from unittest.mock import patch


def test_build_allow_patterns_limits_chunked_remote_pack_downloads():
    """Check which repo files ``build_allow_patterns`` selects for download.

    The loader is created with ``max_documents_per_pack=5000`` and its repo
    listing and pack metadata lookups are stubbed. Files of the non-chunked
    pack, the chunked pack's ``package.json`` and its first chunk must be in
    the result; the second chunk must not be.
    """
    from warbler_cda.remote_pack_loader import RemotePackLoader

    core_package = "packs/warbler-pack-core/package.json"
    core_templates = "packs/warbler-pack-core/pack/templates.json"
    arxiv_package = "packs/warbler-pack-hf-arxiv/package.json"
    arxiv_chunk_1 = "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-001.jsonl"
    arxiv_chunk_2 = "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-002.jsonl"
    arxiv_chunk_3 = "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-003.jsonl"

    listed_files = [
        core_package,
        core_templates,
        arxiv_package,
        arxiv_chunk_1,
        arxiv_chunk_2,
        arxiv_chunk_3,
    ]

    def _stub_pack_metadata(pack_name):
        # Only the arxiv pack is reported as chunked; a fresh dict is
        # returned on every call, as the stubbed method's callers may mutate it.
        if pack_name == "warbler-pack-hf-arxiv":
            return {
                "chunked": True,
                "docs_per_chunk": 50000,
            }
        return {"chunked": False}

    loader = RemotePackLoader(
        repo_id="Bellok/warbler-cda-corpus",
        max_documents_per_pack=5000,
    )

    with patch.object(loader, "_list_repo_files", return_value=listed_files):
        with patch.object(
            loader,
            "_load_pack_metadata",
            side_effect=_stub_pack_metadata,
        ):
            allow_patterns = loader.build_allow_patterns()

    for wanted in (core_package, core_templates, arxiv_package, arxiv_chunk_1):
        assert wanted in allow_patterns
    assert arxiv_chunk_2 not in allow_patterns


def test_from_environment_applies_hosted_defaults_for_remote_loader():
    """Check the loader built by ``from_environment`` when ``SPACE_ID`` is set.

    With ``SPACE_ID`` added to ``os.environ`` (other variables untouched),
    the loader must report ``max_documents_per_pack == 5000`` and list
    ``warbler-pack-hf-tinystories`` in ``exclude_packs``.
    """
    from warbler_cda.remote_pack_loader import RemotePackLoader

    hosted_env = {"SPACE_ID": "Bellok/warbler-cda"}

    # Assertions stay inside the patched environment so they hold regardless
    # of when the loader reads os.environ.
    with patch.dict("os.environ", hosted_env, clear=False):
        loader = RemotePackLoader.from_environment("Bellok/warbler-cda-corpus")

        assert loader.max_documents_per_pack == 5000
        assert "warbler-pack-hf-tinystories" in loader.exclude_packs