File size: 5,573 Bytes
50aa233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import json
import os
from datetime import datetime

import gradio
import pytest
import pyarrow as pa
from agenteval.models import SubmissionMetadata
from datasets import load_dataset, VerificationMode
from huggingface_hub import HfApi, hf_hub_download

from aliases import CANONICAL_TOOL_USAGE_STANDARD, CANONICAL_OPENNESS_CLOSED_UI_ONLY
from config import IS_INTERNAL, CONFIG_NAME, CONTACT_DATASET, SUBMISSION_DATASET
from submission import add_new_eval

# Shared Hugging Face Hub API client used by the dataset/commit checks below.
_hf = HfApi()


class TestSubmission:
    """End-to-end integration test for the leaderboard submission flow.

    Drives a real submission through ``add_new_eval`` and then verifies every
    artifact it should leave behind: the UI result tuple, the contact record
    in the (private) contact dataset, the linking commit on the submission
    dataset, the uploaded files, and the generated ``submission.json``.
    """

    @pytest.fixture(autouse=True)
    def setup(self):
        """Sanity-check the environment configuration.

        These settings need to be in place before module imports are
        evaluated, so all we can do here is verify they were set correctly.
        """
        # `is True` rather than truthiness: the flag must literally be bool True.
        assert IS_INTERNAL is True
        assert CONFIG_NAME == "continuous-integration"

    def test_add_new_eval(self, mocker):
        """Submit a canned archive and check all resulting records and files."""
        # Bypass account-age / rate-limit checks so the test can cover later
        # parts of the code.
        mocker.patch("submission._is_hf_acct_too_new", return_value=False)
        mocker.patch("submission._is_last_submission_too_recent", return_value=False)

        # Unique marker so we can find the records created by this run.
        agent_description = f"CI run at {datetime.now().isoformat()}"
        print(f"Using unique agent description: {agent_description}")

        print("Submitting test submission...")
        with open(os.path.join(os.path.dirname(__file__), "test-submission.tar.gz"), "rb") as f:
            result = add_new_eval(
                val_or_test="test",
                agent_name="TestSubmissionIntegration",
                agent_description=agent_description,
                agent_url="https://github.com/allenai/asta-bench-leaderboard/blob/main/tests/integration/test_submission.py",
                openness=CANONICAL_OPENNESS_CLOSED_UI_ONLY,
                degree_of_control=CANONICAL_TOOL_USAGE_STANDARD,
                path_to_file=f,
                username="test_user",
                role="Other",
                email="jasond+asta_testing@allenai.org",
                email_opt_in=True,
                profile=gradio.OAuthProfile({
                    "name": "Test User",
                    "preferred_username": "test_user",
                    "profile": "test_user_profile",
                    "picture": "https://placecats.com/150/150",
                }),
            )

        message, error_modal, success_modal, loading_modal = result
        assert message == ""  # An empty message signals success
        assert error_modal == {'__type__': 'update', 'visible': False}
        assert success_modal == {'__type__': 'update', 'visible': True}
        assert loading_modal == {'__type__': 'update', 'visible': False}

        print("Looking up contact record...")
        contacts = load_dataset(path=CONTACT_DATASET,
                                name=CONFIG_NAME,
                                download_mode="force_redownload",
                                verification_mode=VerificationMode.NO_CHECKS)
        # There should be a new entry with our unique description. Use a
        # default of None so a miss fails with a clear assertion message
        # instead of an opaque StopIteration from next().
        found_contact = next(
            (row for row in contacts['test'] if row['agent_description'] == agent_description),
            None,
        )
        assert found_contact is not None, \
            f"No contact record found with description {agent_description!r}"

        # This contains an attribute that should lead us to files in the
        # submissions dataset.
        dataset_url = found_contact['dataset_url']
        print(f"Found dataset URL: {dataset_url}")
        # Single definition of the hf:// base so the prefix assertion and the
        # relative-path extraction below cannot drift out of sync.
        base = "hf://datasets/allenai/asta-bench-internal-submissions/"
        assert dataset_url.startswith(base + "continuous-integration/test/")

        print("Checking submission dataset...")
        # The commit message itself should link this and the contact record
        # together unambiguously.
        recent_commits = _hf.list_repo_commits(repo_type="dataset", repo_id=SUBMISSION_DATASET)
        assert any(dataset_url in c.title for c in recent_commits)

        print("Checking that files are present...")
        rel_path = dataset_url[len(base):]
        ds_info = _hf.dataset_info(SUBMISSION_DATASET)
        # Build the filename set once rather than scanning siblings per check.
        uploaded = {sibling.rfilename for sibling in ds_info.siblings}
        # These are the files in our test-submission.tar.gz
        assert f"{rel_path}/eval_config.json" in uploaded
        assert f"{rel_path}/task_sqa_solver_openscilm.eval" in uploaded
        # This is the generated metadata put into the dataset itself.
        assert f"{rel_path}/submission.json" in uploaded

        print("Checking contact record against submission.json...")
        # Checks on the contact record, which is stored in a private dataset.
        local_path = hf_hub_download(repo_type="dataset",
                                     repo_id=SUBMISSION_DATASET,
                                     filename=f"{rel_path}/submission.json")
        with open(local_path) as f:
            contact_from_json = json.load(f)
        # Every key/value in submission.json must also appear in the contact record.
        for key, value_from_json in contact_from_json.items():
            value_from_dataset = found_contact[key]
            if isinstance(value_from_dataset, datetime):
                # The contact dataset stores datetimes; submission.json stores
                # ISO-8601 strings with a 'Z' suffix for UTC.
                value_from_dataset = value_from_dataset.isoformat().replace('+00:00', 'Z')
            assert value_from_dataset == value_from_json
        # submission.json should not contain sensitive PII, specifically, email.
        assert 'email' in found_contact
        assert 'email' not in contact_from_json
        # submission.json is defined by a specific data model.
        SubmissionMetadata.model_validate(contact_from_json)