File size: 6,160 Bytes
c34e3ac
 
 
adea8c3
c34e3ac
c90ac2d
c34e3ac
 
 
 
 
c90ac2d
c34e3ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c90ac2d
c34e3ac
 
 
 
 
 
 
 
 
 
 
 
 
 
4df824f
 
 
 
c34e3ac
c90ac2d
c34e3ac
 
 
 
 
f27b882
 
 
 
 
 
 
 
 
 
8c875ce
 
f27b882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9eb1b4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f27b882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0c19e3
 
 
 
 
 
 
 
 
 
 
 
8c875ce
 
 
 
 
 
 
b0c19e3
8c875ce
 
 
 
b0c19e3
8c875ce
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import pytest
from fastapi.testclient import TestClient
from app import app
from codelens_env.models import TaskId, ActionType, Category, Severity, Verdict

def test_api_health(client):
    """GET /health answers 200 with status "ok" and a ready environment."""
    health = client.get("/health")
    assert health.status_code == 200

    payload = health.json()
    assert payload["status"] == "ok"
    assert payload["env_ready"] is True

def test_api_workflow(client):
    """Walk one episode through reset, a single comment step and result lookup."""
    # Reset: open a bug_detection episode and keep its id for later calls.
    started = client.post("/reset", json={"task_id": "bug_detection", "seed": 1})
    assert started.status_code == 200
    started_payload = started.json()
    episode_id = started_payload["episode_id"]
    assert "observation" in started_payload["result"]

    # Step: send a plain comment action against that episode.
    comment_action = dict(action_type="comment", body="Starting review")
    stepped = client.post(f"/step/{episode_id}", json=comment_action)
    assert stepped.status_code == 200
    assert "observation" in stepped.json()

    # Result: the episode must expose a non-negative final score.
    outcome = client.get(f"/result/{episode_id}")
    assert outcome.status_code == 200
    assert outcome.json()["final_score"] >= 0

def test_api_leaderboard(client):
    """A submitted score is accepted and then listed first for its task."""
    # Submit one score for the bug_detection task.
    submitted = client.post(
        "/submit",
        json={
            "agent_name": "test_agent",
            "task_id": "bug_detection",
            "score": 0.95,
            "seed": 42,
        },
    )
    assert submitted.status_code == 200
    assert submitted.json()["status"] == "submitted"

    # Read the leaderboard back and look at the bug_detection entries.
    board = client.get("/leaderboard")
    assert board.status_code == 200
    entries = board.json()["bug_detection"]["entries"]
    assert len(entries) > 0
    assert entries[0]["agent_name"] == "test_agent"

def test_api_invalid_episode(client):
    """Stepping an episode id that does not exist yields 404."""
    valid_action = {"action_type": "comment", "body": "hello"}
    rejected = client.post("/step/nonexistent-id", json=valid_action)
    assert rejected.status_code == 404

def test_api_health_fields(client):
    """The /health payload carries the expected diagnostic keys."""
    health_payload = client.get("/health").json()
    for expected_key in ("active_episodes", "auth_enabled", "env"):
        assert expected_key in health_payload

def test_api_reset_invalid_task(client):
    """An unknown task_id is not rejected; /reset falls back to bug_detection."""
    bad_request = {"task_id": "invalid_task", "seed": 0}
    reply = client.post("/reset", json=bad_request)
    assert reply.status_code == 200

    # Fallback: the reported task is the default one, not the requested one.
    reported_task = reply.json()["result"]["task_id"]
    assert reported_task == "bug_detection"

def test_api_step_invalid_action_type(client):
    """A step whose action_type is not recognised is answered with 422."""
    started = client.post("/reset", json={"task_id": "bug_detection", "seed": 0})
    episode_id = started.json()["episode_id"]

    bogus_action = {"action_type": "not_valid", "body": "x"}
    rejected = client.post(f"/step/{episode_id}", json=bogus_action)
    assert rejected.status_code == 422

def test_api_result_after_completion(client):
    """Result endpoint should return persisted data for completed episodes.

    The episode is opened with /reset and finished with a single ``approve``
    action. Both setup calls are asserted so that a failure there is reported
    at its source rather than as a misleading /result failure.
    """
    reset_resp = client.post("/reset", json={"task_id": "bug_detection", "seed": 0})
    assert reset_resp.status_code == 200
    episode_id = reset_resp.json()["episode_id"]

    # Complete the episode and confirm it really finished; otherwise the
    # /result check below would not be exercising a completed episode.
    step_resp = client.post(f"/step/{episode_id}", json={
        "action_type": "approve", "body": "LGTM", "verdict": "lgtm"
    })
    assert step_resp.status_code == 200
    assert step_resp.json()["done"] is True

    # Result must be available
    result_resp = client.get(f"/result/{episode_id}")
    assert result_resp.status_code == 200
    assert result_resp.json()["final_score"] >= 0

def test_api_stats_endpoint(client):
    """GET /stats answers 200 and reports a total_episodes field."""
    stats = client.get("/stats")
    assert stats.status_code == 200

    stats_payload = stats.json()
    assert "total_episodes" in stats_payload

@pytest.mark.parametrize(
    "task_id",
    ["bug_detection", "security_audit", "architectural_review"],
)
def test_api_full_workflow_all_tasks(client, task_id):
    """For each task, a single approve action right after reset ends the episode."""
    started = client.post("/reset", json={"task_id": task_id, "seed": 1})
    assert started.status_code == 200
    episode_id = started.json()["episode_id"]

    approve_action = {"action_type": "approve", "body": "LGTM", "verdict": "lgtm"}
    finished = client.post(f"/step/{episode_id}", json=approve_action)
    assert finished.status_code == 200
    assert finished.json()["done"] is True

def test_api_state_endpoint(client):
    """GET /state/{id} returns the episode state, or 404 for an unknown id."""
    started = client.post("/reset", json={"task_id": "bug_detection", "seed": 1})
    episode_id = started.json()["episode_id"]

    # Known episode: the state comes back without an "observation" wrapper key.
    known = client.get(f"/state/{episode_id}")
    assert known.status_code == 200
    state = known.json()
    assert "observation" not in state  # Pydantic model unwrapped
    assert state["task_id"] == "bug_detection"
    assert "max_steps" in state

    # Unknown episode: the lookup must fail with 404.
    unknown = client.get("/state/invalid-id")
    assert unknown.status_code == 404

def test_api_leaderboard_pagination(client):
    """The per-task leaderboard honours ``limit`` and lists best scores first.

    Three scores are submitted for bug_detection. Each submission is asserted
    to succeed so that a broken /submit is reported directly rather than as a
    confusing length or total mismatch further down.
    """
    # Submit 3 entries
    for i, score in enumerate([0.9, 0.7, 0.5]):
        submit_resp = client.post("/submit", json={
            "agent_name": f"agent_{i}", "task_id": "bug_detection",
            "score": score, "seed": i
        })
        assert submit_resp.status_code == 200

    # Test limit
    resp = client.get("/leaderboard?task_id=bug_detection&limit=2")
    assert resp.status_code == 200
    data = resp.json()
    assert len(data["entries"]) == 2
    # ">=" rather than "==": the total is only required to cover the three
    # entries submitted above.
    assert data["total"] >= 3

    # Test ordering (best first)
    assert data["entries"][0]["score"] >= data["entries"][1]["score"]

def test_api_reset_robustness(client):
    """POST /reset tolerates missing or malformed bodies and honours query params."""
    # Keyword arguments for each POST that must fall back to the default task.
    fallback_requests = [
        # 1. No body at all
        {},
        # 2. Empty JSON body
        {"json": {}},
        # 3. Invalid JSON (should not trigger 422 now)
        {"content": "invalid json {", "headers": {"Content-Type": "application/json"}},
        # 4. Plain text body (unexpected header, should still pass)
        {"content": "just some text", "headers": {"Content-Type": "text/plain"}},
    ]
    for request_kwargs in fallback_requests:
        reply = client.post("/reset", **request_kwargs)
        assert reply.status_code == 200
        assert reply.json()["result"]["task_id"] == "bug_detection"

    # 5. Query params override
    overridden = client.post("/reset?task_id=security_audit&seed=100")
    assert overridden.status_code == 200
    result = overridden.json()["result"]
    assert result["task_id"] == "security_audit"
    assert result["seed"] == 100