"""
Code Mutation Engine — makes each episode surface unique.

Each call to ``mutate_task()`` returns a deep copy of a task with three
deterministic transforms applied (the first and third draw from a seeded RNG):

  1. Variable renaming   — one identifier swapped for a drop-in synonym so
                           the agent cannot memorise exact token strings.
  2. Line shifting       — one blank line inserted above the first issue,
                           shifting all issue line_ranges down by 1.
  3. Constant variance   — one numeric literal nudged ±1 so the agent
                           sees a fresh surface without changing the bug.

Mutations are fully deterministic given a seed, making training runs
reproducible while still presenting a different surface each episode.

Design constraint
-----------------
Mutations must NEVER change *whether* a bug exists or *which line category*
it belongs to.  Only surface tokens and line positions may change.
"""

from __future__ import annotations

import copy
import random
import re
from typing import Any


# ── Variable synonym table ───────────────────────────────────────────────────────────────────
# Maps original identifier → list of semantically equivalent drop-in synonyms.
# Only single-token renames that do not affect runtime behaviour are listed.

_IDENTIFIER_SYNONYMS: dict[str, list[str]] = {
    "total":       ["acc", "running_total", "summed"],
    "numbers":     ["values", "nums", "items"],
    "result":      ["output", "response", "ret"],
    "data":        ["payload", "records", "entries"],
    "item":        ["record", "entry", "obj"],
    "items":       ["records", "entries", "objects"],
    "user":        ["account", "principal", "member"],
    "users":       ["accounts", "principals", "members"],
    "password":    ["passwd", "secret", "credential"],
    "username":    ["user_name", "login", "uname"],
    "command":     ["cmd", "instruction", "directive"],
    "filename":    ["file_name", "fname", "path_name"],
    "url":         ["endpoint", "uri", "address"],
    "attempt":     ["try_num", "iteration", "retry_idx"],
    "counter":     ["count", "tally", "n"],
    "session":     ["conn", "http_session", "client"],
    "results":     ["findings", "collected", "gathered"],
    "cache":       ["store", "lookup", "memo"],
    "transformed": ["processed", "mapped", "converted"],
}

# Minimum numeric literal value after variance nudge (avoids nonsensical 0 or 1).
_MIN_CONSTANT_VALUE: int = 2


def mutate_task(base_task: dict[str, Any], seed: int) -> dict[str, Any]:
    """
    Build a mutated deep copy of *base_task*, driven by *seed*.

    The input task is left untouched.  On the copy, the ``"code"`` and
    ``"issues"`` entries are passed through the rename, line-shift and
    constant-variance helpers (in that order) and written back, and the seed
    is recorded under ``"_mutation_seed"``.

    Raises ``KeyError`` if the task has no ``"code"`` or ``"issues"`` entry.
    """
    task_copy: dict[str, Any] = copy.deepcopy(base_task)
    # A single generator is shared by the helpers that need randomness, so
    # the same seed always yields the same sequence of choices.
    generator = random.Random(seed)

    code, issue_list = _apply_variable_rename(
        task_copy["code"], task_copy["issues"], generator
    )
    code, issue_list = _apply_line_shift(code, issue_list)
    code = _apply_constant_variance(code, generator)

    task_copy.update(code=code, issues=issue_list, _mutation_seed=seed)
    return task_copy


# ── Private mutation helpers ───────────────────────────────────────────────────────────

def _apply_variable_rename(
    source_code: str,
    issues: list[dict[str, Any]],
    rng: random.Random,
) -> tuple[str, list[dict[str, Any]]]:
    """
    Swap one identifier in the source for a synonym from _IDENTIFIER_SYNONYMS.

    An identifier is only eligible when the rename cannot alter behaviour:

    * it must not be used as an attribute or method name anywhere in the
      source (``d.items()`` must never become ``d.records()``), and
    * the synonym must not already occur in the source as a whole word,
      otherwise two distinct variables would be merged into one.

    Also updates each issue's keyword list so the grader continues to match
    after the rename.  The ``issues`` list is modified in place and returned.

    Args:
        source_code: Source text to rewrite.
        issues: Issue dicts; each must carry a ``"keywords"`` list.
        rng: Seeded generator used to pick the identifier and its synonym.

    Returns:
        ``(new_source, issues)``.  The source is returned unchanged when no
        identifier can be renamed safely.
    """

    def _appears(name: str) -> bool:
        # \b word-boundary anchors prevent partial hits such as 'data' inside
        # 'database' or 'user' inside 'username'.
        return re.search(rf"\b{re.escape(name)}\b", source_code) is not None

    # original identifier -> synonyms that are safe to use in this source.
    # Built in table order so seeded choices stay reproducible.
    safe_synonyms: dict[str, list[str]] = {}
    for original, synonyms in _IDENTIFIER_SYNONYMS.items():
        if not _appears(original):
            continue
        # Used after a dot: the name is (also) an attribute or method, which a
        # plain textual rename would break.  Leave such identifiers alone.
        if re.search(rf"\.\s*{re.escape(original)}\b", source_code):
            continue
        unused = [synonym for synonym in synonyms if not _appears(synonym)]
        if unused:
            safe_synonyms[original] = unused

    if not safe_synonyms:
        return source_code, issues

    original_identifier = rng.choice(list(safe_synonyms))
    replacement_identifier = rng.choice(safe_synonyms[original_identifier])

    # A callable replacement is inserted verbatim (no template/escape parsing).
    source_code = re.sub(
        rf"\b{re.escape(original_identifier)}\b",
        lambda _match: replacement_identifier,
        source_code,
    )

    # Keep issue keywords in sync so the grader still matches post-rename.
    for issue in issues:
        issue["keywords"] = [
            replacement_identifier if kw == original_identifier else kw
            for kw in issue["keywords"]
        ]
    return source_code, issues


def _apply_line_shift(
    source_code: str,
    issues: list[dict[str, Any]],
) -> tuple[str, list[dict[str, Any]]]:
    """
    Insert one blank line above the first issue, shifting all line_ranges down by 1.

    Forces the agent to re-read the code each episode rather than relying on
    memorised line numbers.
    """
    if not issues:
        return source_code, issues

    first_issue_line = min(iss["line_range"][0] for iss in issues)
    # Convert 1-based line number to 0-based list index.
    # first_issue_line is 1-based; subtract 2 to get the 0-based index of the
    # line immediately above it (where the blank line will be inserted).
    insert_position = max(0, first_issue_line - 2)

    lines = source_code.split("\n")
    lines.insert(insert_position, "")
    source_code = "\n".join(lines)

    for issue in issues:
        start, end = issue["line_range"]
        issue["line_range"] = (start + 1, end + 1)

    return source_code, issues


def _apply_constant_variance(source_code: str, rng: random.Random) -> str:
    """
    Nudge one numeric literal by ±1 to vary the code surface without changing
    which bug is present.

    A number is skipped when a ``#`` occurs earlier on the *same line*, so
    annotated line references in inline comments (e.g. ``# line 42``) are
    never shifted.  Comments on other lines have no influence on whether a
    literal is eligible.

    Args:
        source_code: Source text to rewrite.
        rng: Seeded generator used to pick the literal and the nudge direction.

    Returns:
        The source with exactly one literal replaced, or the unchanged source
        when no eligible literal exists.  The nudged value is clamped to
        ``_MIN_CONSTANT_VALUE``, so a literal already at the floor may come
        back unchanged.
    """
    # Match literals >= 2 only — nudging 0 or 1 could produce 0 or a negative
    # value, breaking constructs like range(1) or timeout=1.
    numeric_matches = []
    for match in re.finditer(r"\b([2-9]|[1-9]\d+)\b", source_code):
        # Look only at the text between the start of this line and the
        # literal; a '#' there means the number sits inside a comment.
        line_start = source_code.rfind("\n", 0, match.start()) + 1
        if "#" in source_code[line_start : match.start()]:
            continue
        numeric_matches.append(match)

    if not numeric_matches:
        return source_code

    chosen_match = rng.choice(numeric_matches)
    original_value = int(chosen_match.group())
    nudge = rng.choice([-1, 1])
    new_value = max(_MIN_CONSTANT_VALUE, original_value + nudge)

    return source_code[: chosen_match.start()] + str(new_value) + source_code[chosen_match.end() :]


# Public API: only mutate_task is exported; the _apply_* helpers are internal.
__all__ = ["mutate_task"]