File size: 5,132 Bytes

53f0cc2

"""
Component 6 evaluation helpers.
"""

from __future__ import annotations

import ast
import json
import re
from pathlib import Path
from typing import Dict, List


def python_syntax_ok(code: str) -> bool:
    """Report whether ``code`` can be parsed by ``ast.parse``.

    Any exception raised while parsing (not only ``SyntaxError``) is treated
    as "not parseable" and results in ``False``.
    """
    try:
        ast.parse(code)
    except Exception:
        return False
    else:
        return True


def save_json(path: str, payload: Dict) -> None:
    """Serialize ``payload`` as indented JSON and write it to ``path``.

    Missing parent directories are created first. The text is written as
    UTF-8 and non-ASCII characters are emitted as-is rather than escaped.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    with target.open("w", encoding="utf-8") as handle:
        handle.write(serialized)


def _normalize_punctuation_spacing(text: str) -> str:
    """Normalize whitespace around punctuation and operators in ``text``.

    Steps, in order:
      1. Remove whitespace before ``, . : ; ) ] }``.
      2. Remove whitespace after ``( [ {``.
      3. Surround assignment/arithmetic operators with exactly one space.
      4. Collapse runs of spaces/tabs and strip the ends.

    Multi-character operators (``==``, ``!=``, ``<=``, ``>=``, ``:=``, ``->``,
    ``**``, ``//`` and the augmented assignments) are matched as a unit so
    they are not torn apart into e.g. ``= =`` or ``+ =``.

    Returns:
        The normalized string.
    """
    text = re.sub(r"\s+([,.:;\)\]\}])", r"\1", text)
    text = re.sub(r"([\(\[\{])\s+", r"\1", text)

    # Alternatives are ordered longest-first: the regex engine takes the first
    # alternative that matches, so "**=" must be tried before "**" and "*".
    operator_pattern = (
        r"\s*("
        r"\*\*=|//=|<<=|>>=|"
        r"==|!=|<=|>=|:=|->|\+=|-=|\*=|/=|%=|&=|\|=|\^=|@=|"
        r"\*\*|//|"
        r"[=+\-*/%]"
        r")\s*"
    )
    text = re.sub(operator_pattern, r" \1 ", text)

    # Adjacent operators leave doubled spaces behind; squeeze them here.
    return re.sub(r"[ \t]+", " ", text).strip()


def _remove_non_python_noise(line: str) -> str:
    """Strip characters this module treats as noise from ``line``.

    ``<UNK>`` is replaced by the literal ``1`` first; afterwards every NUL
    character, ``{``, ``}`` and ``;`` is deleted.
    """
    unwanted = ("\u0000", "{", "}", ";")
    substituted = line.replace("<UNK>", "1")
    return "".join(ch for ch in substituted if ch not in unwanted)


def _fix_identifier_spacing(line: str) -> str:
    """Join identifiers that were split by whitespace using underscores.

    Three line shapes are repaired:
      * ``def some name (...)``       -> ``def some_name(...)``
      * ``class Some Name:`` or
        ``class Some Name (Base):``   -> ``class Some_Name...``
      * ``some name = value``         -> ``some_name = value``

    For assignments the leading indentation is kept, the ``=`` is rendered
    as `` = `` and the right-hand side is stripped. Comparison (``==``) and
    augmented-assignment lines are not treated as assignments, but a plain
    assignment whose right-hand side contains ``==`` is.

    Returns:
        The repaired line, or ``line`` unchanged when no shape matches.
    """
    # def name with spaces -> def name_with_spaces
    m = re.match(r"^(\s*def\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*\(.*)$", line)
    if m:
        fn = re.sub(r"\s+", "_", m.group(2).strip())
        line = f"{m.group(1)}{fn}{m.group(3)}"

    # class name with spaces -> class Name_With_Spaces
    # The name may be followed by ":" or by a "(" opening the base list.
    m = re.match(r"^(\s*class\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*[:(].*)$", line)
    if m:
        cn = re.sub(r"\s+", "_", m.group(2).strip())
        line = f"{m.group(1)}{cn}{m.group(3)}"

    # assignment lhs spaces -> underscore.
    # Only identifier characters/whitespace may precede the "=", so "+=",
    # "<=", "f(x=1)" etc. never match; "(?!=)" rejects a leading "==".
    m = re.match(r"^(\s*)([A-Za-z_][A-Za-z0-9_\s]*?)\s*=(?!=)", line)
    if m:
        target = re.sub(r"\s+", "_", m.group(2).strip())
        value = line[m.end():].strip()
        line = f"{m.group(1)}{target} = {value}"

    return line


def _looks_like_python_line(line: str) -> bool:
    """Heuristically decide whether ``line`` is a Python statement.

    After stripping surrounding whitespace, a line qualifies when it
      * starts with one of a fixed set of statement/clause prefixes
        (``def``, ``if``, ``elif``, ``else:``, ``try:``, ``finally:`` ...),
      * is exactly one of the bare simple statements (``return``, ``pass``,
        ``break``, ``continue``, ``raise``), or
      * begins with an identifier followed by ``=``.

    Blank lines never qualify.
    """
    s = line.strip()
    if not s:
        return False

    starts = (
        "def ",
        "class ",
        "if ",
        "elif ",
        "else:",
        "for ",
        "while ",
        "try:",
        "except",
        "finally:",
        "with ",
        "return ",
        "raise ",
        "assert ",
        "import ",
        "from ",
        "print(",
    )
    # Simple statements that are complete on their own; they carry no
    # trailing space, so the prefix tuple above cannot catch them.
    bare_statements = {"return", "pass", "break", "continue", "raise"}

    if s.startswith(starts) or s in bare_statements:
        return True
    return re.match(r"^[A-Za-z_][A-Za-z0-9_]*\s*=", s) is not None


def _trim_to_code(lines: List[str]) -> List[str]:
    """Discard leading noise and keep only plausible code lines.

    Everything before the first line accepted by ``_looks_like_python_line``
    is dropped. From there on a line survives when it is blank, is accepted
    by ``_looks_like_python_line``, or starts with four spaces. If no line
    is accepted at all, the result is empty.
    """
    # Index of the first plausible Python line; len(lines) when none exists.
    first_code = next(
        (
            position
            for position, candidate in enumerate(lines)
            if _looks_like_python_line(candidate)
        ),
        len(lines),
    )
    return [
        entry
        for entry in lines[first_code:]
        if not entry.strip()
        or _looks_like_python_line(entry)
        or entry.startswith("    ")
    ]


def _best_effort_python_format(lines: List[str]) -> List[str]:
    """Re-indent ``lines`` with a simple block-tracking heuristic.

    Existing indentation is discarded (each line is stripped) and rebuilt
    with four spaces per level:

      * a line ending in ``:`` opens a block, so following lines go one
        level deeper;
      * ``return``/``pass``/``break``/``continue``/``raise`` (with or without
        an operand) are emitted inside the current block and close it for
        the lines that follow;
      * ``elif``/``else``/``except``/``finally`` are pulled back one level so
        they line up with the statement that opened the block, unless the
        previous statement already closed that block.

    Blank lines are emitted as empty strings and do not affect the state.

    Returns:
        A new list with the re-indented lines.
    """
    out: List[str] = []
    indent = 0
    # True when the previous non-blank line already dedented (a block
    # terminator); a following else/except must then not dedent again.
    block_closed = False

    for raw in lines:
        line = raw.strip()
        if not line:
            out.append("")
            continue

        is_clause = re.match(r"(?:elif|else|except|finally)\b", line) is not None
        if is_clause and not block_closed:
            indent = max(0, indent - 1)

        # Emit first: a terminator still belongs to the block it ends.
        out.append(("    " * indent) + line)
        block_closed = False

        if line.endswith(":"):
            indent += 1
        elif re.match(r"(?:return|pass|break|continue|raise)\b", line):
            indent = max(0, indent - 1)
            block_closed = True

    return out


def restore_code_from_structured(decoded: str) -> str:
    """Rebuild source text from a decoded, marker-annotated token string.

    Processing order:
      1. Remove the ``<BOS>``, ``<EOS>``, ``<PROMPT>``, ``<PYTHON>`` and
         ``<JAVASCRIPT>`` markers; if ``<CODE>`` occurs, keep only what
         follows its first occurrence.
      2. Replace every underscore with a space and split on whitespace.
      3. Walk the tokens: ``<INDENT>``/``<DEDENT>`` adjust the nesting depth,
         ``<NL>`` finishes the current line, anything else is appended to it.
         Each finished line is cleaned by ``_remove_non_python_noise``,
         ``_normalize_punctuation_spacing`` and ``_fix_identifier_spacing``.
      4. Pass the lines through ``_trim_to_code`` and
         ``_best_effort_python_format``, drop blank lines at both ends and
         join with newlines.

    Returns:
        The reconstructed text with surrounding whitespace stripped.
    """

    def _render(parts: List[str], depth: int) -> str:
        # Join and clean one line; an empty result stays "" (no indent).
        rendered = " ".join(parts).strip()
        rendered = _remove_non_python_noise(rendered)
        rendered = _normalize_punctuation_spacing(rendered)
        rendered = _fix_identifier_spacing(rendered)
        return ("    " * depth) + rendered if rendered else ""

    cleaned = decoded
    for marker in ("<BOS>", "<EOS>", "<PROMPT>", "<PYTHON>", "<JAVASCRIPT>"):
        cleaned = cleaned.replace(marker, "")

    _, code_marker, after_marker = cleaned.partition("<CODE>")
    if code_marker:
        cleaned = after_marker

    rebuilt: List[str] = []
    pending: List[str] = []
    depth = 0

    for token in cleaned.replace("_", " ").split():
        if token == "<NL>":
            # A line break always yields an entry, even an empty one.
            rebuilt.append(_render(pending, depth))
            pending = []
        elif token == "<INDENT>":
            depth += 1
        elif token == "<DEDENT>":
            depth = max(0, depth - 1)
        else:
            pending.append(token)

    # Tokens after the last <NL> form a final line; it is kept only if
    # something remains after cleaning.
    if pending:
        tail = _render(pending, depth)
        if tail:
            rebuilt.append(tail)

    rebuilt = _best_effort_python_format(_trim_to_code(rebuilt))

    # Skip blank lines at the top and bottom.
    first = 0
    last = len(rebuilt)
    while first < last and not rebuilt[first].strip():
        first += 1
    while last > first and not rebuilt[last - 1].strip():
        last -= 1

    return "\n".join(rebuilt[first:last]).strip()