CodeLens / tests /test_env.py
AIMLxDIV's picture
chore : added testcase
9eb1b4f
Raw
History Blame Contribute Delete
9.78 kB
import pytest
from codelens_env.env import CodeLensEnv
from codelens_env.models import (
TaskId, Action, ActionType, Category, Severity, Verdict
)
# ─────────────────────────────────────────────────────────────────────────────
# Reset tests
# ─────────────────────────────────────────────────────────────────────────────
def test_env_reset():
env = CodeLensEnv()
res = env.reset(TaskId.BUG_DETECTION, seed=0)
assert res.task_id == TaskId.BUG_DETECTION
assert res.seed == 0
assert res.observation.step_count == 0
assert res.observation.noise_budget == 5
def test_env_reset_populates_blast_radius():
"""Observation should carry blast-radius metadata from the scenario."""
env = CodeLensEnv()
res = env.reset(TaskId.SECURITY_AUDIT, seed=0)
obs = res.observation
# Note: New models have different fields or names, but the env should map them.
assert obs.step_count == 0
def test_env_state():
"""Test the python interface state method."""
env = CodeLensEnv()
res = env.reset(TaskId.BUG_DETECTION, seed=0)
state_obs = env.state()
assert state_obs.task_id == TaskId.BUG_DETECTION
assert state_obs.step_count == 0
assert state_obs.noise_budget == 5# ─────────────────────────────────────────────────────────────────────────────
# Step tests
# ─────────────────────────────────────────────────────────────────────────────
def test_env_step_bug_detection():
env = CodeLensEnv()
env.reset(TaskId.BUG_DETECTION, seed=1)
# seed=1 β†’ bug_003: None dereference in auth.py (per reordering)
action = Action(
action_type=ActionType.FLAG_ISSUE,
body="None dereference null check guard clause AttributeError",
filename="auth.py",
line_number=16,
category=Category.BUG,
severity=Severity.HIGH
)
step_res = env.step(action)
assert step_res.observation.step_count == 1
assert step_res.reward > 0, "Correct issue flag should give positive reward delta"
assert step_res.done == False
# Terminal action
step_term = env.step(Action(
action_type=ActionType.APPROVE,
body="LGTM",
verdict=Verdict.LGTM
))
assert step_term.done == True
final = env.get_final_result()
assert final.final_score > 0
def test_env_step_reward_is_incremental_not_cumulative():
"""Each step reward should be a delta (positive or zero or penalty), not a running total."""
env = CodeLensEnv()
# seed=1 selects bug_003: None dereference in auth.py at line 16
env.reset(TaskId.BUG_DETECTION, seed=1)
correct_action = Action(
action_type=ActionType.FLAG_ISSUE,
body="None dereference null check guard clause AttributeError",
filename="auth.py",
line_number=16,
category=Category.BUG,
severity=Severity.HIGH
)
step1 = env.step(correct_action)
# First correct flag β†’ positive incremental delta
assert step1.reward > 0, f"Correct issue flag should give positive reward delta, got {step1.reward}"
# Second identical flag on same file/line β€” already matched, counts as FP
step2 = env.step(correct_action)
# Already matched β†’ false positive β†’ -0.05 penalty
assert step2.reward == -0.05
def test_env_step_false_positive_penalty():
"""False positives should decrement noise_budget and return negative reward."""
env = CodeLensEnv()
env.reset(TaskId.BUG_DETECTION, seed=0)
fp_action = Action(
action_type=ActionType.FLAG_ISSUE,
body="completely wrong flag",
filename="nonexistent_file.py",
line_number=999,
category=Category.BUG,
severity=Severity.LOW
)
step_res = env.step(fp_action)
assert step_res.reward == -0.05
assert step_res.observation.noise_budget == 4
def test_env_noise_budget_exhaustion():
env = CodeLensEnv()
env.reset(TaskId.BUG_DETECTION, seed=0)
fp_action = Action(
action_type=ActionType.FLAG_ISSUE,
body="fp",
filename="nonexistent",
line_number=999,
category=Category.BUG,
severity=Severity.LOW
)
for i in range(4):
res = env.step(fp_action)
assert res.done == False
assert res.observation.noise_budget == 5 - (i + 1)
res_final = env.step(fp_action)
assert res_final.done == True
assert res_final.observation.noise_budget == 0
def test_env_max_steps():
env = CodeLensEnv()
env.reset(TaskId.BUG_DETECTION, seed=0)
action = Action(action_type=ActionType.ASK_QUESTION, body="what's this?")
for i in range(9):
res = env.step(action)
assert res.done == False
res_final = env.step(action)
assert res_final.done == True
assert res_final.observation.step_count == 10
# ─────────────────────────────────────────────────────────────────────────────
# Multi-task smoke tests
# ─────────────────────────────────────────────────────────────────────────────
def test_security_task_runs_to_completion():
env = CodeLensEnv()
# seed=1 selects sec_002: Hardcoded secret (if 0-indexed and order is preserved)
# Actually get_scenario(TaskId.SECURITY_AUDIT, 1) selects the second item.
env.reset(TaskId.SECURITY_AUDIT, seed=1)
# sec_002 is bug with sk_live_abc123XYZ in payments/webhook.py line 5
action = Action(
action_type=ActionType.FLAG_ISSUE,
body="hardcoded secret sk_live_abc123XYZ",
filename="payments/webhook.py",
line_number=5,
category=Category.SECURITY,
severity=Severity.CRITICAL
)
step_res = env.step(action)
assert step_res.reward >= 0
env.step(Action(
action_type=ActionType.REQUEST_CHANGES,
body="Hardcoded secret found.",
verdict=Verdict.REQUEST_CHANGES
))
final = env.get_final_result()
assert final.final_score > 0
def test_arch_task_runs_to_completion():
env = CodeLensEnv()
env.reset(TaskId.ARCHITECTURAL_REVIEW, seed=0)
# arch_001 is UserManager god class
action = Action(
action_type=ActionType.FLAG_ISSUE,
body="god class single responsibility violation",
filename="services/user_manager.py",
line_number=2,
category=Category.ARCHITECTURE,
severity=Severity.HIGH
)
env.step(action)
env.step(Action(
action_type=ActionType.REQUEST_CHANGES,
body="Must refactor out of god class.",
verdict=Verdict.REQUEST_CHANGES
))
final = env.get_final_result()
assert final.final_score > 0
@pytest.mark.parametrize("task_id", list(TaskId))
def test_env_reset_all_tasks(task_id, env):
"""Reset must work for all three task types."""
result = env.reset(task_id, seed=0)
assert result.task_id == task_id
assert result.observation.noise_budget == 5
@pytest.mark.parametrize("task_id,expected_max_steps", [
(TaskId.BUG_DETECTION, 10),
(TaskId.SECURITY_AUDIT, 15),
(TaskId.ARCHITECTURAL_REVIEW, 20),
])
def test_env_max_steps_per_task(task_id, expected_max_steps, env):
result = env.reset(task_id, seed=0)
assert result.observation.max_steps == expected_max_steps
def test_env_step_raises_when_done(env, approve_action):
"""Calling step on a done episode must raise ValueError."""
env.reset(TaskId.BUG_DETECTION, seed=0)
env.step(approve_action)
with pytest.raises(ValueError):
env.step(approve_action)
def test_env_history_recorded(env):
"""All steps should appear in final result history."""
env.reset(TaskId.BUG_DETECTION, seed=0)
from codelens_env.models import Action, ActionType
for _ in range(3):
env.step(Action(action_type=ActionType.ASK_QUESTION, body="question"))
env.step(Action(action_type=ActionType.APPROVE, body="LGTM", verdict=Verdict.LGTM))
result = env.get_final_result()
assert result.steps_taken == 4
assert len(result.history) == 4
def test_env_get_final_result_score_clamped(env, approve_action):
"""Final score must always be in [0, 1]."""
env.reset(TaskId.BUG_DETECTION, seed=0)
env.step(approve_action)
result = env.get_final_result()
# Check that score is a float and within [0, 1]
assert isinstance(result.final_score, float)
assert 0.0 <= result.final_score <= 1.0
@pytest.mark.parametrize("task_id", list(TaskId))
@pytest.mark.parametrize("seed", [0, 3, 7])
def test_env_full_episode_completes(task_id, seed, env):
"""Full episodes must always reach a terminal state."""
env.reset(task_id, seed=seed)
from codelens_env.models import Action, ActionType, Verdict
# Just skip to terminal
action = Action(action_type=ActionType.APPROVE, body="LGTM", verdict=Verdict.LGTM)
result = env.step(action)
assert result.done is True
final = env.get_final_result()
assert final.terminated_reason == "terminal_action"