| """ |
| Component 6 evaluation helpers. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import ast |
| import json |
| import re |
| from pathlib import Path |
| from typing import Dict, List |
|
|
|
|
def python_syntax_ok(code: str) -> bool:
    """Return True when *code* parses as syntactically valid Python."""
    try:
        ast.parse(code)
    except Exception:
        # Broad on purpose: ast.parse can raise SyntaxError, ValueError
        # (NUL bytes), RecursionError, ... — any failure means "not OK".
        return False
    return True
|
|
|
|
def save_json(path: str, payload: Dict) -> None:
    """Write *payload* to *path* as pretty-printed UTF-8 JSON.

    Parent directories are created as needed; non-ASCII characters are
    written verbatim rather than escaped.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    target.write_text(serialized, encoding="utf-8")
|
|
|
|
| def _normalize_punctuation_spacing(text: str) -> str: |
| text = re.sub(r"\s+([,.:;\)\]\}])", r"\1", text) |
| text = re.sub(r"([\(\[\{])\s+", r"\1", text) |
| text = re.sub(r"\s*=\s*", " = ", text) |
| text = re.sub(r"\s*\+\s*", " + ", text) |
| text = re.sub(r"\s*-\s*", " - ", text) |
| text = re.sub(r"\s*\*\s*", " * ", text) |
| text = re.sub(r"\s*/\s*", " / ", text) |
| text = re.sub(r"\s*%\s*", " % ", text) |
| return re.sub(r"[ \t]+", " ", text).strip() |
|
|
|
|
| def _remove_non_python_noise(line: str) -> str: |
| line = line.replace("<UNK>", "1") |
| line = line.replace("\u0000", "") |
| line = line.replace("{", "") |
| line = line.replace("}", "") |
| line = line.replace(";", "") |
| return line |
|
|
|
|
| def _fix_identifier_spacing(line: str) -> str: |
| |
| m = re.match(r"^(\s*def\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*\(.*)$", line) |
| if m: |
| fn = re.sub(r"\s+", "_", m.group(2).strip()) |
| line = f"{m.group(1)}{fn}{m.group(3)}" |
|
|
| |
| m = re.match(r"^(\s*class\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*:.*)$", line) |
| if m: |
| cn = re.sub(r"\s+", "_", m.group(2).strip()) |
| line = f"{m.group(1)}{cn}{m.group(3)}" |
|
|
| |
| if "=" in line and "==" not in line: |
| lhs, rhs = line.split("=", 1) |
| lhs_clean = lhs.strip() |
| if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_\s]*", lhs_clean): |
| lhs_clean = re.sub(r"\s+", "_", lhs_clean) |
| line = f"{lhs_clean} = {rhs.strip()}" |
|
|
| return line |
|
|
|
|
| def _looks_like_python_line(line: str) -> bool: |
| if not line.strip(): |
| return False |
| starts = ( |
| "def ", |
| "class ", |
| "if ", |
| "for ", |
| "while ", |
| "try:", |
| "except", |
| "with ", |
| "return ", |
| "import ", |
| "from ", |
| "print(", |
| ) |
| s = line.strip() |
| if s.startswith(starts): |
| return True |
| if re.match(r"^[A-Za-z_][A-Za-z0-9_]*\s*=", s): |
| return True |
| return False |
|
|
|
|
def _trim_to_code(lines: List[str]) -> List[str]:
    """Drop a leading run of non-code lines, then keep only plausible code.

    After the non-Python prefix is removed, a line survives when it is
    blank, indented, or passes :func:`_looks_like_python_line`.
    """
    start = 0
    while start < len(lines) and not _looks_like_python_line(lines[start]):
        start += 1
    remainder = lines[start:]

    return [
        candidate
        for candidate in remainder
        if not candidate.strip()
        or _looks_like_python_line(candidate)
        or candidate.startswith(" ")
    ]
|
|
|
|
| def _best_effort_python_format(lines: List[str]) -> List[str]: |
| out: List[str] = [] |
| indent = 0 |
| for raw in lines: |
| line = raw.strip() |
| if not line: |
| out.append("") |
| continue |
|
|
| if line in {"return", "pass", "break", "continue"}: |
| indent = max(0, indent - 1) |
|
|
| out.append((" " * indent) + line) |
|
|
| if line.endswith(":"): |
| indent += 1 |
|
|
| return out |
|
|
|
|
def restore_code_from_structured(decoded: str) -> str:
    """Rebuild readable Python source from a structured decoded token string.

    The decoder emits control tokens: ``<INDENT>``/``<DEDENT>`` adjust the
    indentation level, ``<NL>`` terminates a line, and framing tokens
    (``<BOS>``, ``<EOS>``, ``<PROMPT>``, language tags, ``<CODE>``) delimit
    the payload.  Underscores in the payload stand in for spaces (decoded
    identifiers are re-joined later by :func:`_fix_identifier_spacing`).
    The reconstructed lines are then noise-cleaned, trimmed to code-looking
    content and re-indented heuristically.

    Refactor: the four-step line-finalization sequence was duplicated
    verbatim for the ``<NL>`` flush and the end-of-stream flush; it now
    lives in one nested helper.
    """

    def _finalize(tokens: List[str]) -> str:
        # One line's tokens -> cleaned, normalized text.
        joined = " ".join(tokens).strip()
        joined = _remove_non_python_noise(joined)
        joined = _normalize_punctuation_spacing(joined)
        return _fix_identifier_spacing(joined)

    text = decoded
    for tok in ["<BOS>", "<EOS>", "<PROMPT>", "<PYTHON>", "<JAVASCRIPT>"]:
        text = text.replace(tok, "")

    # Only the payload after the <CODE> marker is code.
    if "<CODE>" in text:
        text = text.split("<CODE>", 1)[1]

    # The tokenizer encodes spaces as underscores; undo that first.
    text = text.replace("_", " ")
    tokens = text.strip().split()

    lines: List[str] = []
    current_tokens: List[str] = []
    indent = 0

    for tok in tokens:
        if tok == "<INDENT>":
            indent += 1
            continue
        if tok == "<DEDENT>":
            indent = max(0, indent - 1)
            continue
        if tok == "<NL>":
            line = _finalize(current_tokens)
            if line:
                lines.append((" " * indent) + line)
            else:
                # An <NL> with no content still produces a blank line.
                lines.append("")
            current_tokens = []
            continue
        current_tokens.append(tok)

    # Flush a trailing line that was not <NL>-terminated; unlike the <NL>
    # case, an empty remainder adds nothing.
    if current_tokens:
        line = _finalize(current_tokens)
        if line:
            lines.append((" " * indent) + line)

    lines = _trim_to_code(lines)
    lines = _best_effort_python_format(lines)

    # Drop leading and trailing blank lines.
    while lines and not lines[0].strip():
        lines.pop(0)
    while lines and not lines[-1].strip():
        lines.pop()

    return "\n".join(lines).strip()
|
|