| |
| |
| |
| |
| |
|
|
| """ |
| Code Output Assessment Environment Implementation. |
| |
| An RL environment that tests an agent's ability to solve coding problems |
| across three difficulty levels with automated grading and reward shaping. |
| """ |
|
|
| import random |
| from uuid import uuid4 |
| from typing import Dict, List, Tuple, Literal |
|
|
| from openenv.core.env_server.interfaces import Environment |
| from openenv.core.env_server.types import State |
|
|
| try: |
| from ..models import CodeAssessmentAction, CodeAssessmentObservation |
| except ImportError: |
| from models import CodeAssessmentAction, CodeAssessmentObservation |
|
|
|
|
| |
| PROBLEMS = { |
| "easy": [ |
| { |
| "description": "Add two numbers. Given input 'a,b', output a+b.", |
| "test_cases": [("3,5", "8"), ("10,20", "30"), ("0,0", "0"), ("-5,5", "0")], |
| }, |
| { |
| "description": "Reverse a string. Given input 'hello', output 'olleh'.", |
| "test_cases": [("hello", "olleh"), ("world", "dlrow"), ("a", "a"), ("12345", "54321")], |
| }, |
| { |
| "description": "Count vowels in a string (a,e,i,o,u). Return the count.", |
| "test_cases": [("hello", "2"), ("aeiou", "5"), ("xyz", "0"), ("programming", "3")], |
| }, |
| { |
| "description": "Find maximum of two numbers. Given input 'a,b', output the larger number.", |
| "test_cases": [("5,10", "10"), ("20,15", "20"), ("7,7", "7"), ("-5,3", "3")], |
| }, |
| ], |
| "medium": [ |
| { |
| "description": "Check if a string is a palindrome. Output 'true' or 'false'.", |
| "test_cases": [("racecar", "true"), ("hello", "false"), ("a", "true"), ("abba", "true")], |
| }, |
| { |
| "description": "Find the sum of all numbers in a comma-separated list. Input: '1,2,3', Output: '6'.", |
| "test_cases": [("1,2,3", "6"), ("10,20,30", "60"), ("5", "5"), ("-1,1", "0")], |
| }, |
| { |
| "description": "Count occurrences of a character in a string. Input format: 'string,char'. Output: count.", |
| "test_cases": [("hello,l", "2"), ("programming,m", "2"), ("test,x", "0"), ("aaa,a", "3")], |
| }, |
| { |
| "description": "Remove duplicates from a comma-separated list, keep order. Input: '1,2,2,3', Output: '1,2,3'.", |
| "test_cases": [("1,2,2,3", "1,2,3"), ("a,b,a,c", "a,b,c"), ("1,1,1", "1"), ("1,2,3", "1,2,3")], |
| }, |
| ], |
| "hard": [ |
| { |
| "description": "Find the longest word in a sentence. Input: sentence. Output: longest word.", |
| "test_cases": [ |
| ("the quick brown fox", "quick"), |
| ("hello world", "hello"), |
| ("a bb ccc", "ccc"), |
| ("programming is fun", "programming"), |
| ], |
| }, |
| { |
| "description": "Find the nth Fibonacci number (0-indexed). Input: n. Output: fibonacci(n).", |
| "test_cases": [("0", "0"), ("1", "1"), ("5", "5"), ("10", "55")], |
| }, |
| { |
| "description": "Check if parentheses are balanced. Input: string with (){}[]. Output: 'true' or 'false'.", |
| "test_cases": [("()", "true"), ("({[]})", "true"), ("(]", "false"), ("(()", "false")], |
| }, |
| { |
| "description": "Find prime numbers up to n (comma-separated). Input: n. Output: primes.", |
| "test_cases": [("10", "2,3,5,7"), ("20", "2,3,5,7,11,13,17,19"), ("2", "2"), ("1", "")], |
| }, |
| ], |
| } |
|
|
|
|
| class CodeAssessmentEnvironment(Environment): |
| """ |
| Code Output Assessment Environment. |
| |
| Tests an agent's ability to solve coding problems across three difficulty levels. |
| Features automated grading with normalized scores (0.0-1.0) and shaped rewards. |
| |
| Difficulty Levels: |
| - Easy: Basic operations (addition, string reversal, simple counting) |
| - Medium: String/list processing, basic algorithms |
| - Hard: Advanced algorithms, recursion, complex logic |
| |
| Grading System: |
| All graders produce normalized scores between 0.0-1.0: |
| - 1.0: Perfect answer |
| - 0.5-0.9: High partial credit (very close) |
| - 0.2-0.4: Low partial credit (some correct elements) |
| - 0.0: Completely incorrect |
| |
| Reward Structure (grader score × difficulty multiplier): |
| - Easy: score × 1.0 (max +1.0 for correct, +0.5 partial, 0.0 wrong) |
| - Medium: score × 2.0 (max +2.0 for correct, +1.0 partial, 0.0 wrong) |
| - Hard: score × 5.0 (max +5.0 for correct, +2.5 partial, -0.3 wrong) |
| - Streak bonus: +0.5 for 3+ consecutive correct answers |
| """ |
|
|
| SUPPORTS_CONCURRENT_SESSIONS: bool = True |
| MAX_STEPS: int = 15 |
|
|
| def __init__(self): |
| """Initialize the code assessment environment.""" |
| self._state = State(episode_id=str(uuid4()), step_count=0) |
| self._current_problem: Dict = {} |
| self._current_test_case_idx: int = 0 |
| self._difficulty: Literal["easy", "medium", "hard"] = "easy" |
| self._problems_solved: int = 0 |
| self._current_streak: int = 0 |
| self._total_reward: float = 0.0 |
|
|
| def reset(self) -> CodeAssessmentObservation: |
| """ |
| Reset the environment and present the first problem. |
| |
| Returns: |
| CodeAssessmentObservation with the first problem description |
| """ |
| self._state = State(episode_id=str(uuid4()), step_count=0) |
| self._problems_solved = 0 |
| self._current_streak = 0 |
| self._total_reward = 0.0 |
| self._difficulty = "easy" |
|
|
| |
| self._current_problem = random.choice(PROBLEMS["easy"]) |
| self._current_test_case_idx = 0 |
|
|
| test_input, _ = self._current_problem["test_cases"][0] |
|
|
| return CodeAssessmentObservation( |
| problem_description=self._current_problem["description"], |
| difficulty=self._difficulty, |
| test_case_input=test_input, |
| expected_output=None, |
| feedback="Welcome! Solve the problem and submit your answer.", |
| is_correct=False, |
| partial_credit=0.0, |
| problems_solved=0, |
| current_streak=0, |
| done=False, |
| reward=0.0, |
| ) |
|
|
| def step(self, action: CodeAssessmentAction) -> CodeAssessmentObservation: |
| """ |
| Evaluate the submitted answer and provide feedback. |
| |
| Args: |
| action: CodeAssessmentAction containing the agent's answer |
| |
| Returns: |
| CodeAssessmentObservation with grading results and next problem |
| """ |
| self._state.step_count += 1 |
|
|
| |
| test_input, expected_output = self._current_problem["test_cases"][self._current_test_case_idx] |
| |
| |
| is_correct, partial_credit, feedback = self._grade_answer(action.answer, expected_output) |
| |
| |
| reward = self._calculate_reward(is_correct, partial_credit) |
| self._total_reward += reward |
|
|
| |
| if is_correct: |
| self._problems_solved += 1 |
| self._current_streak += 1 |
| else: |
| self._current_streak = 0 |
|
|
| |
| done = self._state.step_count >= self.MAX_STEPS |
|
|
| |
| if is_correct: |
| self._advance_to_next_problem() |
|
|
| |
| test_input, _ = self._current_problem["test_cases"][self._current_test_case_idx] |
|
|
| return CodeAssessmentObservation( |
| problem_description=self._current_problem["description"], |
| difficulty=self._difficulty, |
| test_case_input=test_input, |
| expected_output=expected_output if not is_correct else None, |
| feedback=feedback, |
| is_correct=is_correct, |
| partial_credit=partial_credit, |
| problems_solved=self._problems_solved, |
| current_streak=self._current_streak, |
| done=done, |
| reward=reward, |
| metadata={ |
| "total_reward": self._total_reward, |
| "step": self._state.step_count, |
| "difficulty": self._difficulty, |
| }, |
| ) |
|
|
| def _grade_answer(self, answer: str, expected: str) -> Tuple[bool, float, str]: |
| """ |
| Grade the submitted answer and return normalized score (0.0-1.0). |
| |
| This grader produces scores between 0.0-1.0 regardless of difficulty: |
| - 1.0: Perfect answer |
| - 0.5-0.9: Partial credit (close, some correct elements) |
| - 0.1-0.4: Format correct but values wrong |
| - 0.0: Completely incorrect |
| |
| Args: |
| answer: The agent's submitted answer |
| expected: The expected correct answer |
| |
| Returns: |
| Tuple of (is_correct, normalized_score, feedback) |
| """ |
| answer_clean = answer.strip().lower() |
| expected_clean = expected.strip().lower() |
|
|
| |
| if answer_clean == expected_clean: |
| return True, 1.0, "✓ Correct! Well done." |
|
|
| |
| score = 0.0 |
| feedback = "✗ Incorrect." |
|
|
| |
| try: |
| if ',' in expected_clean or expected_clean.replace('-', '').isdigit(): |
| expected_nums = [int(x.strip()) for x in expected_clean.split(',') if x.strip()] |
| answer_nums = [int(x.strip()) for x in answer_clean.split(',') if x.strip()] |
| |
| if len(expected_nums) == len(answer_nums): |
| |
| correct_count = sum(1 for e, a in zip(expected_nums, answer_nums) if e == a) |
| score = correct_count / len(expected_nums) |
| if score >= 0.8: |
| feedback = f"⚡ Very close! {int(score*100)}% correct values." |
| elif score >= 0.5: |
| feedback = f"⚡ Partial credit: {int(score*100)}% correct values." |
| elif score > 0: |
| feedback = f"⚡ Some correct: {int(score*100)}%. Review the problem." |
| elif len(answer_nums) > 0: |
| |
| score = 0.2 |
| feedback = "⚡ Format is numeric, but count/values are wrong." |
| except (ValueError, AttributeError): |
| |
| pass |
|
|
| |
| if score == 0.0: |
| |
| len_ratio = min(len(answer_clean), len(expected_clean)) / max(len(answer_clean), len(expected_clean), 1) |
| |
| |
| set_overlap = len(set(answer_clean) & set(expected_clean)) / max(len(set(expected_clean)), 1) |
| |
| |
| similarity = (len_ratio * 0.3 + set_overlap * 0.7) |
| |
| if similarity >= 0.7: |
| score = 0.6 |
| feedback = f"⚡ Close! Similar to expected answer ({int(similarity*100)}% match)." |
| elif similarity >= 0.4: |
| score = 0.3 |
| feedback = f"⚡ Some similarity ({int(similarity*100)}%). Review requirements." |
| elif ',' in expected and ',' in answer_clean: |
| |
| score = 0.1 |
| feedback = "⚡ Correct format style, but content is incorrect." |
|
|
| return False, score, feedback |
|
|
| def _calculate_reward(self, is_correct: bool, normalized_score: float) -> float: |
| """ |
| Calculate reward by applying difficulty multipliers to normalized grader scores. |
| |
| The grader produces normalized scores (0.0-1.0), which are then scaled by difficulty: |
| - Easy: 1x multiplier |
| - Medium: 2x multiplier |
| - Hard: 5x multiplier |
| |
| Args: |
| is_correct: Whether the answer was fully correct (score = 1.0) |
| normalized_score: Grader score between 0.0-1.0 |
| |
| Returns: |
| The calculated reward (scaled by difficulty and bonuses) |
| """ |
| |
| multipliers = { |
| "easy": 1.0, |
| "medium": 2.0, |
| "hard": 5.0, |
| } |
|
|
| base_multiplier = multipliers[self._difficulty] |
|
|
| if is_correct: |
| |
| reward = base_multiplier * 1.0 |
| |
| |
| if self._current_streak >= 3: |
| reward += 0.5 |
| elif normalized_score > 0: |
| |
| reward = base_multiplier * normalized_score |
| |
| |
| if self._difficulty == "easy": |
| reward *= 0.5 |
| else: |
| |
| |
| reward = -0.3 if self._difficulty == "hard" else 0.0 |
|
|
| return reward |
|
|
| def _advance_to_next_problem(self): |
| """Advance to the next problem, increasing difficulty as needed.""" |
| |
| self._current_test_case_idx += 1 |
| |
| |
| if self._current_test_case_idx >= len(self._current_problem["test_cases"]): |
| self._current_test_case_idx = 0 |
| |
| |
| if self._problems_solved >= 8 and self._difficulty != "hard": |
| self._difficulty = "hard" |
| elif self._problems_solved >= 4 and self._difficulty == "easy": |
| self._difficulty = "medium" |
| |
| |
| self._current_problem = random.choice(PROBLEMS[self._difficulty]) |
|
|
| @property |
| def state(self) -> State: |
| """ |
| Get the current environment state. |
| |
| Returns: |
| Current State with episode_id and step_count |
| """ |
| return self._state |
|
|