# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""Code Review environment implementation for OpenEnv."""
from uuid import uuid4
from openenv.core.env_server.interfaces import Environment
try:
from ..models import ReviewAction, ReviewObservation, ReviewState
from .graders import grade_review_with_breakdown
from .tasks import get_task
except ImportError:
from models import ReviewAction, ReviewObservation, ReviewState
from server.graders import grade_review_with_breakdown
from server.tasks import get_task
# Per-episode step budget: stored on each new ReviewState as max_steps and
# used by CodeReviewEnvironment.step to end the episode once reached.
MAX_STEPS = 3
class CodeReviewEnvironment(Environment):
    """Environment where an agent reviews code and tags planted issues.

    An episode loads one task via ``get_task`` and lets the agent submit up to
    ``MAX_STEPS`` reviews. Every call to :meth:`step` is graded with
    ``grade_review_with_breakdown``; the episode is flagged as done once a
    score of at least ``_PASS_SCORE`` is reached or the step budget is used up.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    # Score at or above which an episode ends early.
    _PASS_SCORE = 0.95

    # Coarse category named in hints for each known issue tag, so feedback can
    # point at the kind of problem that was missed without naming the tag.
    _HINT_CATEGORY_BY_TAG = {
        "null_pointer": "logic/type issue",
        "missing_return": "logic/type issue",
        "type_error": "logic/type issue",
        "index_out_of_bounds": "logic/type issue",
        "sql_injection": "security vulnerability",
        "hardcoded_secret": "security vulnerability",
        "path_traversal": "security vulnerability",
        "race_condition": "robustness/concurrency flaw",
        "timing_attack": "robustness/concurrency flaw",
        "improper_error_handling": "robustness/concurrency flaw",
        "integer_overflow": "input handling issue",
        "missing_input_validation": "input handling issue",
    }

    def __init__(self):
        """Start with a fresh state bound to the ``task_easy`` task."""
        default_task = get_task("task_easy")
        self._state = self._new_state(default_task)
        self._current_task = default_task

    @staticmethod
    def _new_state(task) -> ReviewState:
        """Build the initial state (new episode id, zero steps) for ``task``."""
        return ReviewState(
            episode_id=str(uuid4()),
            step_count=0,
            current_task_id=task.task_id,
            max_steps=MAX_STEPS,
        )

    @classmethod
    def _hint_categories(cls, missed_tags):
        """Return the sorted, de-duplicated hint categories for ``missed_tags``.

        Tags without an entry in ``_HINT_CATEGORY_BY_TAG`` contribute nothing,
        so the result may be empty.
        """
        lookup = cls._HINT_CATEGORY_BY_TAG
        return sorted({lookup[tag] for tag in missed_tags if tag in lookup})

    def reset(self, task_id: str = "task_easy", **kwargs) -> ReviewObservation:
        """Begin a new episode on the task identified by ``task_id``.

        Args:
            task_id: Identifier passed to ``get_task``; defaults to
                ``"task_easy"``. How unknown ids are handled is decided by
                ``get_task``, not here.
            **kwargs: Accepted and ignored.

        Returns:
            The initial observation (step 0, reward 0.0, not done) carrying the
            task's code snippet and description.
        """
        _ = kwargs  # extra reset options are intentionally unused
        task = get_task(task_id)
        self._current_task = task
        self._state = self._new_state(task)
        return ReviewObservation(
            task_id=task.task_id,
            file_name=task.file_name,
            task_description=task.description,
            code_snippet=task.code,
            feedback="Environment reset. Submit issues_found and review_comment.",
            step_number=0,
            reward=0.0,
            done=False,
            metadata={
                "difficulty": task.difficulty,
                "planted_issue_count": len(task.planted_issues),
            },
        )

    def step(self, action: ReviewAction) -> ReviewObservation:  # type: ignore[override]
        """Grade one review action and return an observation with feedback.

        Args:
            action: The review to grade; its ``issues_found``,
                ``review_comment`` and ``severity`` are passed to the grader.

        Returns:
            An observation whose ``reward`` is this step's score, whose
            ``done`` flag is set when the score reaches ``_PASS_SCORE`` or the
            step budget is exhausted, and whose ``feedback`` summarises what
            was found, how many issues were missed, and (while the episode is
            still running) a category-level hint.
        """
        self._state.step_count += 1
        breakdown = grade_review_with_breakdown(
            action_issues=action.issues_found,
            action_comment=action.review_comment,
            task=self._current_task,
            action_severity=action.severity,
        )
        score = breakdown.score

        # Track best score across steps (iterative refinement).
        self._state.best_score = max(self._state.best_score, score)

        done = (score >= self._PASS_SCORE) or (self._state.step_count >= MAX_STEPS)

        correctly_found = sorted(breakdown.correctly_found)
        missed_tags = sorted(breakdown.missed)
        missed_count = len(missed_tags)
        false_positive_count = len(breakdown.false_positives)

        # Iterative refinement feedback: tell the agent what to improve.
        feedback_parts = [
            f"Score: {score:.3f}",
            f"Found: {correctly_found}",
            f"Missed: {missed_count} remaining",
            f"False positives: {false_positive_count}",
        ]

        if not done and missed_count > 0:
            # Hint at missed categories without revealing exact tags. Skip the
            # hint entirely when no missed tag maps to a known category, so
            # the feedback never ends in a dangling "Hint: look for ".
            unique_hints = self._hint_categories(missed_tags)
            if unique_hints:
                feedback_parts.append(f"Hint: look for {', '.join(unique_hints)}")

        if not breakdown.severity_correct:
            feedback_parts.append("Severity assessment could be improved")

        feedback = " | ".join(feedback_parts)

        # NOTE(review): the feedback text hides the exact missed tags, but
        # metadata["missed"] lists them verbatim - confirm that metadata is
        # not exposed to the agent being evaluated.
        return ReviewObservation(
            task_id=self._current_task.task_id,
            file_name=self._current_task.file_name,
            task_description=self._current_task.description,
            code_snippet=self._current_task.code,
            feedback=feedback,
            step_number=self._state.step_count,
            reward=score,
            done=done,
            metadata={
                "correctly_found": correctly_found,
                "missed": missed_tags,
                "false_positives": sorted(breakdown.false_positives),
                "submitted_severity": action.severity,
                "severity_correct": breakdown.severity_correct,
                "best_score": self._state.best_score,
                "max_achievable_score": 1.0,
                # Clamp so extra step() calls past the budget never report a
                # negative number of remaining steps.
                "steps_remaining": max(0, MAX_STEPS - self._state.step_count),
            },
        )

    @property
    def state(self) -> ReviewState:
        """Return current episode state."""
        return self._state
|