File size: 1,927 Bytes
1c68bde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from unittest.mock import patch


def test_build_allow_patterns_limits_chunked_remote_pack_downloads():
    """Check which repo files ``build_allow_patterns`` allows for a chunked pack.

    The loader is built with ``max_documents_per_pack=5000`` and its repo
    listing and pack metadata lookups are replaced with in-memory stubs, so
    no remote access is attempted by those two methods. The non-chunked core
    pack's files, the arXiv pack's ``package.json`` and its first chunk must
    be allowed, while its second chunk must not be.
    """
    from warbler_cda.remote_pack_loader import RemotePackLoader

    def stub_pack_metadata(pack_name):
        # Only the arXiv pack is reported as chunked; every other pack is
        # treated as a plain, non-chunked pack. A fresh dict is returned on
        # each call.
        if pack_name == "warbler-pack-hf-arxiv":
            return {
                "chunked": True,
                "docs_per_chunk": 50000,
            }
        return {"chunked": False}

    loader = RemotePackLoader(
        repo_id="Bellok/warbler-cda-corpus",
        max_documents_per_pack=5000,
    )

    remote_listing = [
        "packs/warbler-pack-core/package.json",
        "packs/warbler-pack-core/pack/templates.json",
        "packs/warbler-pack-hf-arxiv/package.json",
        "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-001.jsonl",
        "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-002.jsonl",
        "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-003.jsonl",
    ]

    listing_patch = patch.object(
        loader, "_list_repo_files", return_value=remote_listing
    )
    metadata_patch = patch.object(
        loader, "_load_pack_metadata", side_effect=stub_pack_metadata
    )
    with listing_patch, metadata_patch:
        allow_patterns = loader.build_allow_patterns()

    must_be_allowed = (
        "packs/warbler-pack-core/package.json",
        "packs/warbler-pack-core/pack/templates.json",
        "packs/warbler-pack-hf-arxiv/package.json",
        "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-001.jsonl",
    )
    for repo_path in must_be_allowed:
        assert repo_path in allow_patterns

    # NOTE(review): presumably one 50000-document chunk already covers the
    # 5000-document cap, which is why later chunks are expected to be
    # skipped -- confirm against RemotePackLoader.build_allow_patterns.
    assert "packs/warbler-pack-hf-arxiv/warbler-pack-hf-arxiv-chunk-002.jsonl" not in allow_patterns


def test_from_environment_applies_hosted_defaults_for_remote_loader():
    """Check the loader defaults produced when ``SPACE_ID`` is set.

    ``SPACE_ID`` is overlaid onto ``os.environ`` (other variables are left in
    place) only while ``RemotePackLoader.from_environment`` runs. The
    resulting loader must cap documents per pack at 5000 and list
    ``warbler-pack-hf-tinystories`` among its excluded packs.
    """
    from warbler_cda.remote_pack_loader import RemotePackLoader

    hosted_env = {"SPACE_ID": "Bellok/warbler-cda"}
    env_patch = patch.dict("os.environ", hosted_env, clear=False)

    with env_patch:
        loader = RemotePackLoader.from_environment("Bellok/warbler-cda-corpus")

    # The assertions run after the environment has been restored, so they
    # check values held on the loader rather than anything read from the
    # environment at assertion time.
    document_cap = loader.max_documents_per_pack
    excluded = loader.exclude_packs
    assert document_cap == 5000
    assert "warbler-pack-hf-tinystories" in excluded