# CodeLens/tests/test_database.py
# ArshVerma's picture
# feat: finalize CodeLens. rebranding and production environment polish
# adea8c3
# Raw
# History Blame Contribute Delete
# 2.37 kB
import json

from sqlmodel import Session

from codelens_env.database import save_episode, get_episode, submit_leaderboard, get_leaderboard_db, get_stats
from codelens_env.models import EpisodeResult, TaskId, ActionRecord, ActionType
def make_result(episode_id="test-ep-1", score=0.85):
    """Build an ``EpisodeResult`` to use as test input.

    Only ``episode_id`` and ``score`` (stored as ``final_score``) vary between
    calls; every other field is a fixed value, and the history always holds a
    single APPROVE action with the body "LGTM".
    """
    approve_action = ActionRecord(action_type=ActionType.APPROVE, body="LGTM")
    fields = dict(
        episode_id=episode_id,
        task_id=TaskId.BUG_DETECTION,
        scenario_hash="bug_001",
        seed=0,
        final_score=score,
        steps_taken=3,
        issues_found=1,
        issues_total=1,
        noise_penalties=0,
        terminated_reason="terminal_action",
        history=[approve_action],
    )
    return EpisodeResult(**fields)
def test_save_and_get_episode(session):
    """A saved episode can be read back by its id with its fields intact."""
    result = make_result()
    save_episode(session, result)

    # Look the record up by the id carried on the result itself, so this test
    # does not silently depend on the default id chosen by make_result().
    record = get_episode(session, result.episode_id)

    assert record is not None
    assert record.final_score == 0.85
    assert record.scenario_hash == "bug_001"
def test_get_nonexistent_episode(session):
    """Looking up an id that was never saved yields ``None``."""
    missing = get_episode(session, "does-not-exist")
    assert missing is None
def test_episode_history_serialized(session):
    """The stored record exposes the action history as JSON in ``history_json``.

    The decoded history must contain exactly one entry whose ``action_type``
    is the string "approve".
    """
    result = make_result()
    save_episode(session, result)

    record = get_episode(session, result.episode_id)
    # Fail with a clear assertion instead of an AttributeError on None if the
    # episode was not persisted.
    assert record is not None

    history = json.loads(record.history_json)
    assert len(history) == 1
    assert history[0]["action_type"] == "approve"
def test_leaderboard_submit_and_rank(session):
    """``submit_leaderboard`` returns rank 1 for the first entry and rank 2
    for a lower-scoring second entry on the same task."""
    first_rank = submit_leaderboard(session, "agent_a", "bug_detection", 0.9, 0)
    assert first_rank == 1

    second_rank = submit_leaderboard(session, "agent_b", "bug_detection", 0.7, 1)
    assert second_rank == 2
def test_leaderboard_ordering(session):
    """Leaderboard entries come back best score first.

    Three entries are submitted out of score order; the query must report all
    three in its total, put the highest scorer first, and return the scores in
    non-increasing order.
    """
    submit_leaderboard(session, "low", "security_audit", 0.3, 0)
    submit_leaderboard(session, "high", "security_audit", 0.95, 1)
    submit_leaderboard(session, "mid", "security_audit", 0.6, 2)

    entries, total = get_leaderboard_db(session, "security_audit")

    assert total == 3
    assert entries[0].agent_name == "high"
    assert entries[0].score == 0.95

    # Checking only the top entry would let a mis-sorted tail slip through;
    # verify the whole returned page is ordered by descending score.
    scores = [entry.score for entry in entries]
    assert scores == sorted(scores, reverse=True)
def test_get_stats_empty(session):
    """With nothing saved, the stats report zero total episodes."""
    episode_count = get_stats(session)["total_episodes"]
    assert episode_count == 0
def test_get_stats_populated(session):
    """Stats count every saved episode and average their final scores."""
    for episode_id, final_score in (("ep1", 0.9), ("ep2", 0.5)):
        save_episode(session, make_result(episode_id, final_score))

    stats = get_stats(session)

    assert stats["total_episodes"] == 2
    # Scores 0.9 and 0.5 average to 0.7; compare with a small tolerance
    # because the value is a float.
    deviation = abs(stats["avg_score"] - 0.7)
    assert deviation < 0.001