Spaces:

ArshVerma
/

CodeLens

Sleeping

File size: 9,783 Bytes

e2e527d
adea8c3
 
fc6ff5a
d8ee465
 
 
 
 
 
e2e527d
 
adea8c3
e2e527d
 
 
 
 
 
d8ee465
 
 
adea8c3
d8ee465
 
fc6ff5a
 
d8ee465
 
9eb1b4f
 
 
 
 
 
 
 
d8ee465
 
 
e2e527d
adea8c3
d8ee465
fc6ff5a
d8ee465
e2e527d
 
d8ee465
e2e527d
 
 
 
 
 
 
d8ee465
e2e527d
 
 
d8ee465
e2e527d
 
 
d8ee465
e2e527d
d8ee465
e2e527d
 
 
d8ee465
 
 
adea8c3
d8ee465
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adea8c3
d8ee465
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e2e527d
adea8c3
e2e527d
d8ee465
 
e2e527d
 
 
 
 
 
 
d8ee465
e2e527d
d8ee465
e2e527d
 
d8ee465
 
e2e527d
 
 
d8ee465
e2e527d
adea8c3
e2e527d
d8ee465
 
e2e527d
d8ee465
e2e527d
d8ee465
 
e2e527d
 
d8ee465
 
 
 
 
 
 
adea8c3
fc6ff5a
 
d8ee465
 
fc6ff5a
d8ee465
 
fc6ff5a
 
 
d8ee465
 
 
 
fc6ff5a
d8ee465
 
 
fc6ff5a
d8ee465
 
 
 
 
 
 
adea8c3
d8ee465
 
fc6ff5a
d8ee465
 
fc6ff5a
 
 
d8ee465
fc6ff5a
d8ee465
 
 
 
 
fc6ff5a
d8ee465
 
 
 
f27b882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adea8c3
f27b882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adea8c3
f27b882

import pytest
from codelens_env.env import CodeLensEnv
from codelens_env.models import (
    TaskId, Action, ActionType, Category, Severity, Verdict
)


# ─────────────────────────────────────────────────────────────────────────────
# Reset tests
# ─────────────────────────────────────────────────────────────────────────────

def test_env_reset():
    env = CodeLensEnv()
    res = env.reset(TaskId.BUG_DETECTION, seed=0)
    assert res.task_id == TaskId.BUG_DETECTION
    assert res.seed == 0
    assert res.observation.step_count == 0
    assert res.observation.noise_budget == 5


def test_env_reset_populates_blast_radius():
    """Observation should carry blast-radius metadata from the scenario."""
    env = CodeLensEnv()
    res = env.reset(TaskId.SECURITY_AUDIT, seed=0)
    obs = res.observation
    # Note: New models have different fields or names, but the env should map them.
    assert obs.step_count == 0


def test_env_state():
    """Test the python interface state method."""
    env = CodeLensEnv()
    res = env.reset(TaskId.BUG_DETECTION, seed=0)
    state_obs = env.state()
    assert state_obs.task_id == TaskId.BUG_DETECTION
    assert state_obs.step_count == 0
    assert state_obs.noise_budget == 5# ─────────────────────────────────────────────────────────────────────────────
# Step tests
# ─────────────────────────────────────────────────────────────────────────────

def test_env_step_bug_detection():
    env = CodeLensEnv()
    env.reset(TaskId.BUG_DETECTION, seed=1)
    # seed=1 → bug_003: None dereference in auth.py (per reordering)

    action = Action(
        action_type=ActionType.FLAG_ISSUE,
        body="None dereference null check guard clause AttributeError",
        filename="auth.py",
        line_number=16,
        category=Category.BUG,
        severity=Severity.HIGH
    )
    step_res = env.step(action)
    assert step_res.observation.step_count == 1
    assert step_res.reward > 0, "Correct issue flag should give positive reward delta"
    assert step_res.done == False

    # Terminal action
    step_term = env.step(Action(
        action_type=ActionType.APPROVE,
        body="LGTM",
        verdict=Verdict.LGTM
    ))
    assert step_term.done == True

    final = env.get_final_result()
    assert final.final_score > 0


def test_env_step_reward_is_incremental_not_cumulative():
    """Each step reward should be a delta (positive or zero or penalty), not a running total."""
    env = CodeLensEnv()
    # seed=1 selects bug_003: None dereference in auth.py at line 16
    env.reset(TaskId.BUG_DETECTION, seed=1)

    correct_action = Action(
        action_type=ActionType.FLAG_ISSUE,
        body="None dereference null check guard clause AttributeError",
        filename="auth.py",
        line_number=16,
        category=Category.BUG,
        severity=Severity.HIGH
    )
    step1 = env.step(correct_action)
    # First correct flag → positive incremental delta
    assert step1.reward > 0, f"Correct issue flag should give positive reward delta, got {step1.reward}"

    # Second identical flag on same file/line — already matched, counts as FP
    step2 = env.step(correct_action)
    # Already matched → false positive → -0.05 penalty
    assert step2.reward == -0.05


def test_env_step_false_positive_penalty():
    """False positives should decrement noise_budget and return negative reward."""
    env = CodeLensEnv()
    env.reset(TaskId.BUG_DETECTION, seed=0)

    fp_action = Action(
        action_type=ActionType.FLAG_ISSUE,
        body="completely wrong flag",
        filename="nonexistent_file.py",
        line_number=999,
        category=Category.BUG,
        severity=Severity.LOW
    )
    step_res = env.step(fp_action)
    assert step_res.reward == -0.05
    assert step_res.observation.noise_budget == 4


def test_env_noise_budget_exhaustion():
    env = CodeLensEnv()
    env.reset(TaskId.BUG_DETECTION, seed=0)

    fp_action = Action(
        action_type=ActionType.FLAG_ISSUE,
        body="fp",
        filename="nonexistent",
        line_number=999,
        category=Category.BUG,
        severity=Severity.LOW
    )

    for i in range(4):
        res = env.step(fp_action)
        assert res.done == False
        assert res.observation.noise_budget == 5 - (i + 1)

    res_final = env.step(fp_action)
    assert res_final.done == True
    assert res_final.observation.noise_budget == 0


def test_env_max_steps():
    env = CodeLensEnv()
    env.reset(TaskId.BUG_DETECTION, seed=0)

    action = Action(action_type=ActionType.ASK_QUESTION, body="what's this?")
    for i in range(9):
        res = env.step(action)
        assert res.done == False

    res_final = env.step(action)
    assert res_final.done == True
    assert res_final.observation.step_count == 10


# ─────────────────────────────────────────────────────────────────────────────
# Multi-task smoke tests
# ─────────────────────────────────────────────────────────────────────────────

def test_security_task_runs_to_completion():
    env = CodeLensEnv()
    # seed=1 selects sec_002: Hardcoded secret (if 0-indexed and order is preserved)
    # Actually get_scenario(TaskId.SECURITY_AUDIT, 1) selects the second item.
    env.reset(TaskId.SECURITY_AUDIT, seed=1)

    # sec_002 is bug with sk_live_abc123XYZ in payments/webhook.py line 5
    action = Action(
        action_type=ActionType.FLAG_ISSUE,
        body="hardcoded secret sk_live_abc123XYZ",
        filename="payments/webhook.py",
        line_number=5,
        category=Category.SECURITY,
        severity=Severity.CRITICAL
    )
    step_res = env.step(action)
    assert step_res.reward >= 0

    env.step(Action(
        action_type=ActionType.REQUEST_CHANGES,
        body="Hardcoded secret found.",
        verdict=Verdict.REQUEST_CHANGES
    ))
    final = env.get_final_result()
    assert final.final_score > 0


def test_arch_task_runs_to_completion():
    env = CodeLensEnv()
    env.reset(TaskId.ARCHITECTURAL_REVIEW, seed=0)

    # arch_001 is UserManager god class
    action = Action(
        action_type=ActionType.FLAG_ISSUE,
        body="god class single responsibility violation",
        filename="services/user_manager.py",
        line_number=2,
        category=Category.ARCHITECTURE,
        severity=Severity.HIGH
    )
    env.step(action)

    env.step(Action(
        action_type=ActionType.REQUEST_CHANGES,
        body="Must refactor out of god class.",
        verdict=Verdict.REQUEST_CHANGES
    ))
    final = env.get_final_result()
    assert final.final_score > 0

@pytest.mark.parametrize("task_id", list(TaskId))
def test_env_reset_all_tasks(task_id, env):
    """Reset must work for all three task types."""
    result = env.reset(task_id, seed=0)
    assert result.task_id == task_id
    assert result.observation.noise_budget == 5

@pytest.mark.parametrize("task_id,expected_max_steps", [
    (TaskId.BUG_DETECTION, 10),
    (TaskId.SECURITY_AUDIT, 15),
    (TaskId.ARCHITECTURAL_REVIEW, 20),
])
def test_env_max_steps_per_task(task_id, expected_max_steps, env):
    result = env.reset(task_id, seed=0)
    assert result.observation.max_steps == expected_max_steps

def test_env_step_raises_when_done(env, approve_action):
    """Calling step on a done episode must raise ValueError."""
    env.reset(TaskId.BUG_DETECTION, seed=0)
    env.step(approve_action)
    with pytest.raises(ValueError):
        env.step(approve_action)

def test_env_history_recorded(env):
    """All steps should appear in final result history."""
    env.reset(TaskId.BUG_DETECTION, seed=0)
    from codelens_env.models import Action, ActionType
    for _ in range(3):
        env.step(Action(action_type=ActionType.ASK_QUESTION, body="question"))
    env.step(Action(action_type=ActionType.APPROVE, body="LGTM", verdict=Verdict.LGTM))
    result = env.get_final_result()
    assert result.steps_taken == 4
    assert len(result.history) == 4

def test_env_get_final_result_score_clamped(env, approve_action):
    """Final score must always be in [0, 1]."""
    env.reset(TaskId.BUG_DETECTION, seed=0)
    env.step(approve_action)
    result = env.get_final_result()
    # Check that score is a float and within [0, 1]
    assert isinstance(result.final_score, float)
    assert 0.0 <= result.final_score <= 1.0

@pytest.mark.parametrize("task_id", list(TaskId))
@pytest.mark.parametrize("seed", [0, 3, 7])
def test_env_full_episode_completes(task_id, seed, env):
    """Full episodes must always reach a terminal state."""
    env.reset(task_id, seed=seed)
    from codelens_env.models import Action, ActionType, Verdict
    # Just skip to terminal
    action = Action(action_type=ActionType.APPROVE, body="LGTM", verdict=Verdict.LGTM)
    result = env.step(action)
    assert result.done is True
    final = env.get_final_result()
    assert final.terminated_reason == "terminal_action"