# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Code Output Assessment Environment Implementation.

An RL environment that tests an agent's ability to solve coding problems
across three difficulty levels with automated grading and reward shaping.
"""

import random
from uuid import uuid4
from typing import Dict, List, Tuple, Literal

from openenv.core.env_server.interfaces import Environment
from openenv.core.env_server.types import State

try:
    from ..models import CodeAssessmentAction, CodeAssessmentObservation
except ImportError:
    from models import CodeAssessmentAction, CodeAssessmentObservation


# Problem sets for each difficulty level
PROBLEMS = {
    "easy": [
        {
            "description": "Add two numbers. Given input 'a,b', output a+b.",
            "test_cases": [("3,5", "8"), ("10,20", "30"), ("0,0", "0"), ("-5,5", "0")],
        },
        {
            "description": "Reverse a string. Given input 'hello', output 'olleh'.",
            "test_cases": [("hello", "olleh"), ("world", "dlrow"), ("a", "a"), ("12345", "54321")],
        },
        {
            "description": "Count vowels in a string (a,e,i,o,u). Return the count.",
            "test_cases": [("hello", "2"), ("aeiou", "5"), ("xyz", "0"), ("programming", "3")],
        },
        {
            "description": "Find maximum of two numbers. Given input 'a,b', output the larger number.",
            "test_cases": [("5,10", "10"), ("20,15", "20"), ("7,7", "7"), ("-5,3", "3")],
        },
    ],
    "medium": [
        {
            "description": "Check if a string is a palindrome. Output 'true' or 'false'.",
            "test_cases": [("racecar", "true"), ("hello", "false"), ("a", "true"), ("abba", "true")],
        },
        {
            "description": "Find the sum of all numbers in a comma-separated list. Input: '1,2,3', Output: '6'.",
            "test_cases": [("1,2,3", "6"), ("10,20,30", "60"), ("5", "5"), ("-1,1", "0")],
        },
        {
            "description": "Count occurrences of a character in a string. Input format: 'string,char'. Output: count.",
            "test_cases": [("hello,l", "2"), ("programming,m", "2"), ("test,x", "0"), ("aaa,a", "3")],
        },
        {
            "description": "Remove duplicates from a comma-separated list, keep order. Input: '1,2,2,3', Output: '1,2,3'.",
            "test_cases": [("1,2,2,3", "1,2,3"), ("a,b,a,c", "a,b,c"), ("1,1,1", "1"), ("1,2,3", "1,2,3")],
        },
    ],
    "hard": [
        {
            "description": "Find the longest word in a sentence. Input: sentence. Output: longest word.",
            "test_cases": [
                ("the quick brown fox", "quick"),
                ("hello world", "hello"),
                ("a bb ccc", "ccc"),
                ("programming is fun", "programming"),
            ],
        },
        {
            "description": "Find the nth Fibonacci number (0-indexed). Input: n. Output: fibonacci(n).",
            "test_cases": [("0", "0"), ("1", "1"), ("5", "5"), ("10", "55")],
        },
        {
            "description": "Check if parentheses are balanced. Input: string with (){}[]. Output: 'true' or 'false'.",
            "test_cases": [("()", "true"), ("({[]})", "true"), ("(]", "false"), ("(()", "false")],
        },
        {
            "description": "Find prime numbers up to n (comma-separated). Input: n. Output: primes.",
            "test_cases": [("10", "2,3,5,7"), ("20", "2,3,5,7,11,13,17,19"), ("2", "2"), ("1", "")],
        },
    ],
}


class CodeAssessmentEnvironment(Environment):
    """
    Code Output Assessment Environment.

    Tests an agent's ability to solve coding problems across three difficulty levels.
    Features automated grading with normalized scores (0.0-1.0) and shaped rewards.

    Difficulty Levels:
        - Easy: Basic operations (addition, string reversal, simple counting)
        - Medium: String/list processing, basic algorithms
        - Hard: Advanced algorithms, recursion, complex logic

    Grading System:
        All graders produce normalized scores between 0.0-1.0:
        - 1.0: Perfect answer
        - 0.5-0.9: High partial credit (very close)
        - 0.2-0.4: Low partial credit (some correct elements)
        - 0.0: Completely incorrect

    Reward Structure (grader score × difficulty multiplier):
        - Easy: score × 1.0 (max +1.0 for correct, +0.5 partial, 0.0 wrong)
        - Medium: score × 2.0 (max +2.0 for correct, +1.0 partial, 0.0 wrong)
        - Hard: score × 5.0 (max +5.0 for correct, +2.5 partial, -0.3 wrong)
        - Streak bonus: +0.5 for 3+ consecutive correct answers
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True
    MAX_STEPS: int = 15  # Maximum steps per episode

    def __init__(self):
        """Initialize the code assessment environment."""
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._current_problem: Dict = {}
        self._current_test_case_idx: int = 0
        self._difficulty: Literal["easy", "medium", "hard"] = "easy"
        self._problems_solved: int = 0
        self._current_streak: int = 0
        self._total_reward: float = 0.0

    def reset(self) -> CodeAssessmentObservation:
        """
        Reset the environment and present the first problem.

        Returns:
            CodeAssessmentObservation with the first problem description
        """
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._problems_solved = 0
        self._current_streak = 0
        self._total_reward = 0.0
        self._difficulty = "easy"

        # Select a random problem from the easy category
        self._current_problem = random.choice(PROBLEMS["easy"])
        self._current_test_case_idx = 0

        test_input, _ = self._current_problem["test_cases"][0]

        return CodeAssessmentObservation(
            problem_description=self._current_problem["description"],
            difficulty=self._difficulty,
            test_case_input=test_input,
            expected_output=None,
            feedback="Welcome! Solve the problem and submit your answer.",
            is_correct=False,
            partial_credit=0.0,
            problems_solved=0,
            current_streak=0,
            done=False,
            reward=0.0,
        )

    def step(self, action: CodeAssessmentAction) -> CodeAssessmentObservation:  # type: ignore[override]
        """
        Evaluate the submitted answer and provide feedback.

        Args:
            action: CodeAssessmentAction containing the agent's answer

        Returns:
            CodeAssessmentObservation with grading results and next problem
        """
        self._state.step_count += 1

        # Get current test case
        test_input, expected_output = self._current_problem["test_cases"][self._current_test_case_idx]
        
        # Grade the answer
        is_correct, partial_credit, feedback = self._grade_answer(action.answer, expected_output)
        
        # Calculate reward
        reward = self._calculate_reward(is_correct, partial_credit)
        self._total_reward += reward

        # Update statistics
        if is_correct:
            self._problems_solved += 1
            self._current_streak += 1
        else:
            self._current_streak = 0

        # Check if episode should end
        done = self._state.step_count >= self.MAX_STEPS

        # Move to next problem if current one is solved
        if is_correct:
            self._advance_to_next_problem()

        # Get next test case
        test_input, _ = self._current_problem["test_cases"][self._current_test_case_idx]

        return CodeAssessmentObservation(
            problem_description=self._current_problem["description"],
            difficulty=self._difficulty,
            test_case_input=test_input,
            expected_output=expected_output if not is_correct else None,
            feedback=feedback,
            is_correct=is_correct,
            partial_credit=partial_credit,
            problems_solved=self._problems_solved,
            current_streak=self._current_streak,
            done=done,
            reward=reward,
            metadata={
                "total_reward": self._total_reward,
                "step": self._state.step_count,
                "difficulty": self._difficulty,
            },
        )

    def _grade_answer(self, answer: str, expected: str) -> Tuple[bool, float, str]:
        """
        Grade the submitted answer and return normalized score (0.0-1.0).

        This grader produces scores between 0.0-1.0 regardless of difficulty:
        - 1.0: Perfect answer
        - 0.5-0.9: Partial credit (close, some correct elements)
        - 0.1-0.4: Format correct but values wrong
        - 0.0: Completely incorrect

        Args:
            answer: The agent's submitted answer
            expected: The expected correct answer

        Returns:
            Tuple of (is_correct, normalized_score, feedback)
        """
        answer_clean = answer.strip().lower()
        expected_clean = expected.strip().lower()

        # Exact match = 1.0
        if answer_clean == expected_clean:
            return True, 1.0, "✓ Correct! Well done."

        # Start evaluating partial credit
        score = 0.0
        feedback = "✗ Incorrect."

        # Check for numeric list answers (comma-separated numbers)
        try:
            if ',' in expected_clean or expected_clean.replace('-', '').isdigit():
                expected_nums = [int(x.strip()) for x in expected_clean.split(',') if x.strip()]
                answer_nums = [int(x.strip()) for x in answer_clean.split(',') if x.strip()]
                
                if len(expected_nums) == len(answer_nums):
                    # Calculate percentage of correct values
                    correct_count = sum(1 for e, a in zip(expected_nums, answer_nums) if e == a)
                    score = correct_count / len(expected_nums)
                    if score >= 0.8:
                        feedback = f"⚡ Very close! {int(score*100)}% correct values."
                    elif score >= 0.5:
                        feedback = f"⚡ Partial credit: {int(score*100)}% correct values."
                    elif score > 0:
                        feedback = f"⚡ Some correct: {int(score*100)}%. Review the problem."
                elif len(answer_nums) > 0:
                    # Wrong length but has numbers - give format credit
                    score = 0.2
                    feedback = "⚡ Format is numeric, but count/values are wrong."
        except (ValueError, AttributeError):
            # Not a numeric answer, try string-based grading
            pass

        # String similarity for non-numeric answers
        if score == 0.0:
            # Check length similarity
            len_ratio = min(len(answer_clean), len(expected_clean)) / max(len(answer_clean), len(expected_clean), 1)
            
            # Character overlap
            set_overlap = len(set(answer_clean) & set(expected_clean)) / max(len(set(expected_clean)), 1)
            
            # Combine metrics
            similarity = (len_ratio * 0.3 + set_overlap * 0.7)
            
            if similarity >= 0.7:
                score = 0.6
                feedback = f"⚡ Close! Similar to expected answer ({int(similarity*100)}% match)."
            elif similarity >= 0.4:
                score = 0.3
                feedback = f"⚡ Some similarity ({int(similarity*100)}%). Review requirements."
            elif ',' in expected and ',' in answer_clean:
                # Has comma format like expected
                score = 0.1
                feedback = "⚡ Correct format style, but content is incorrect."

        return False, score, feedback

    def _calculate_reward(self, is_correct: bool, normalized_score: float) -> float:
        """
        Calculate reward by applying difficulty multipliers to normalized grader scores.

        The grader produces normalized scores (0.0-1.0), which are then scaled by difficulty:
        - Easy: 1x multiplier
        - Medium: 2x multiplier  
        - Hard: 5x multiplier

        Args:
            is_correct: Whether the answer was fully correct (score = 1.0)
            normalized_score: Grader score between 0.0-1.0

        Returns:
            The calculated reward (scaled by difficulty and bonuses)
        """
        # Difficulty multipliers
        multipliers = {
            "easy": 1.0,
            "medium": 2.0,
            "hard": 5.0,
        }

        base_multiplier = multipliers[self._difficulty]

        if is_correct:
            # Perfect score: full multiplier
            reward = base_multiplier * 1.0
            
            # Streak bonus for 3+ consecutive correct answers
            if self._current_streak >= 3:
                reward += 0.5
        elif normalized_score > 0:
            # Partial credit: scale the normalized score by difficulty
            reward = base_multiplier * normalized_score
            
            # Reduce partial rewards slightly for easy problems
            if self._difficulty == "easy":
                reward *= 0.5
        else:
            # Complete failure
            # Small penalty on hard problems to discourage random guessing
            reward = -0.3 if self._difficulty == "hard" else 0.0

        return reward

    def _advance_to_next_problem(self):
        """Advance to the next problem, increasing difficulty as needed."""
        # Move to next test case in current problem
        self._current_test_case_idx += 1
        
        # If completed all test cases, select new problem
        if self._current_test_case_idx >= len(self._current_problem["test_cases"]):
            self._current_test_case_idx = 0
            
            # Increase difficulty based on problems solved
            if self._problems_solved >= 8 and self._difficulty != "hard":
                self._difficulty = "hard"
            elif self._problems_solved >= 4 and self._difficulty == "easy":
                self._difficulty = "medium"
            
            # Select new random problem from current difficulty
            self._current_problem = random.choice(PROBLEMS[self._difficulty])

    @property
    def state(self) -> State:
        """
        Get the current environment state.

        Returns:
            Current State with episode_id and step_count
        """
        return self._state