"""
Component 6 evaluation helpers.
"""
from __future__ import annotations
import ast
import json
import re
from pathlib import Path
from typing import Dict, List
def python_syntax_ok(code: str) -> bool:
    """Report whether *code* can be parsed by ``ast.parse``.

    Returns ``True`` when parsing succeeds and ``False`` when ``ast.parse``
    raises any ``Exception``; the exception itself is discarded.
    """
    try:
        ast.parse(code)
    except Exception:
        # Any parse-time failure is treated as "not valid", not as an error.
        return False
    else:
        return True
def save_json(path: str, payload: Dict) -> None:
    """Write *payload* to *path* as UTF-8 JSON with a two-space indent.

    Missing parent directories are created first. Non-ASCII characters are
    written as-is rather than escaped (``ensure_ascii=False``).
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    with target.open("w", encoding="utf-8") as handle:
        handle.write(serialized)
def _normalize_punctuation_spacing(text: str) -> str:
    """Normalize whitespace around punctuation and operators in *text*.

    * Whitespace before ``, . : ; ) ] }`` is removed.
    * Whitespace after ``( [ {`` is removed.
    * Assignment, arithmetic and comparison-with-``=`` operators are
      surrounded by exactly one space.
    * Runs of spaces/tabs are collapsed and the result is stripped.

    Multi-character operators (``==``, ``!=``, ``<=``, ``+=``, ``**``,
    ``//``, ``->`` ...) are matched as a unit. Padding each character
    separately would turn ``a == b`` into ``a = = b`` and ``a ** b`` into
    ``a * * b``, which is no longer valid Python.

    The function is purely textual: operators inside string literals or
    comments are spaced as well, and a unary minus is padded like a binary
    one (``-1`` becomes ``- 1``).
    """
    # Closing punctuation hugs the token before it ...
    text = re.sub(r"\s+([,.:;\)\]\}])", r"\1", text)
    # ... and opening brackets hug the token after them.
    text = re.sub(r"([\(\[\{])\s+", r"\1", text)
    # Longest alternatives come first so that e.g. "**=" is not consumed as
    # "*" followed by "*=".
    text = re.sub(
        r"\s*("
        r"\*\*=|//=|<<=|>>="
        r"|==|!=|<=|>=|\+=|-=|\*=|/=|%=|&=|\|=|\^=|@=|:=|->"
        r"|\*\*|//"
        r"|[=+\-*/%]"
        r")\s*",
        r" \1 ",
        text,
    )
    return re.sub(r"[ \t]+", " ", text).strip()
def _remove_non_python_noise(line: str) -> str:
    """Strip characters and tokens from *line* that this module treats as noise.

    ``<UNK>`` is replaced by the literal ``1``; NUL characters, curly braces
    and semicolons are deleted.
    """
    # The placeholder substitution runs first, then one pass deletes the
    # unwanted single characters.
    with_placeholder = line.replace("<UNK>", "1")
    return with_placeholder.translate(str.maketrans("", "", "\u0000{};"))
def _fix_identifier_spacing(line: str) -> str:
    """Join identifiers that were split by spaces, using underscores.

    Three shapes are repaired:

    * ``def my func(...)``      -> ``def my_func(...)``
    * ``class My Class:`` / ``class My Class(Base):`` -> ``class My_Class...``
    * ``my var = value``        -> ``my_var = value`` (only when the line
      contains ``=`` but no ``==`` and everything left of the first ``=`` is
      made of identifier characters and whitespace)

    Leading indentation is preserved in all three cases. For assignments the
    spacing around ``=`` is normalized to a single space on each side.
    Lines that match none of the shapes are returned unchanged.
    """
    # def name with spaces -> def name_with_spaces
    m = re.match(r"^(\s*def\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*\(.*)$", line)
    if m:
        fn = re.sub(r"\s+", "_", m.group(2).strip())
        line = f"{m.group(1)}{fn}{m.group(3)}"

    # class name with spaces -> class Name_With_Spaces; the header may end
    # the name with ":" or open a base-class list with "(".
    m = re.match(r"^(\s*class\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*[:(].*)$", line)
    if m:
        cn = re.sub(r"\s+", "_", m.group(2).strip())
        line = f"{m.group(1)}{cn}{m.group(3)}"

    # assignment lhs spaces -> underscore.
    if "=" in line and "==" not in line:
        lhs, rhs = line.split("=", 1)
        lhs_clean = lhs.strip()
        if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_\s]*", lhs_clean):
            # Keep the original indentation, as the def/class branches do.
            leading = lhs[: len(lhs) - len(lhs.lstrip())]
            lhs_clean = re.sub(r"\s+", "_", lhs_clean)
            line = f"{leading}{lhs_clean} = {rhs.strip()}"
    return line
def _looks_like_python_line(line: str) -> bool:
    """Heuristically decide whether *line* is plausibly a Python statement.

    Leading/trailing whitespace is ignored. A line qualifies when it

    * is a bare keyword statement (``return``, ``pass``, ``break``,
      ``continue``, ``raise``, ``yield``),
    * starts with one of a fixed set of statement prefixes (``def``,
      ``class``, ``if``/``elif``/``else:``, loops, ``try``/``except``/
      ``finally``, ``with``, ``return``, ``raise``, ``assert``, ``yield``,
      imports, ``print(``, ``async``), or
    * is a plain or augmented assignment to a simple ASCII name.

    Blank lines return ``False``. This is a prefix test only; it does not
    parse the line.
    """
    s = line.strip()
    if not s:
        return False

    # Statements that are complete without any operand.
    if s in {"return", "pass", "break", "continue", "raise", "yield"}:
        return True

    starts = (
        "def ",
        "class ",
        "if ",
        "elif ",
        "else:",
        "for ",
        "while ",
        "try:",
        "except",
        "finally:",
        "with ",
        "return ",
        "raise ",
        "assert ",
        "yield ",
        "import ",
        "from ",
        "print(",
        "async ",
    )
    if s.startswith(starts):
        return True

    # name = ...   or   name += ... / -= / *= / **= / //= / ...
    assignment = r"^[A-Za-z_][A-Za-z0-9_]*\s*(?:\*\*|//|<<|>>|[-+*/%&|^@])?="
    return re.match(assignment, s) is not None
def _trim_to_code(lines: List[str]) -> List[str]:
    """Discard lines that do not look like code.

    Everything before the first line accepted by ``_looks_like_python_line``
    is dropped. From there on a line is kept when it is blank, is accepted
    by ``_looks_like_python_line``, or begins with a space (i.e. is
    indented). If no line qualifies as a start, the result is empty.
    """
    # Index of the first plausible Python line, or len(lines) if none exists.
    first_code = next(
        (
            idx
            for idx, candidate in enumerate(lines)
            if _looks_like_python_line(candidate)
        ),
        len(lines),
    )
    return [
        kept
        for kept in lines[first_code:]
        if not kept.strip()
        or _looks_like_python_line(kept)
        or kept.startswith(" ")
    ]
def _best_effort_python_format(lines: List[str]) -> List[str]:
    """Re-indent *lines* using simple, colon-driven block heuristics.

    Every input line is stripped, so any indentation it carried is ignored
    and rebuilt with one space per nesting level:

    * a line ending in ``:`` opens a block (the following lines are one
      level deeper);
    * a block-ending statement (bare ``return``/``pass``/``break``/
      ``continue``/``raise``, or ``return <value>`` / ``raise <exc>``) is
      emitted *inside* the current block and the level drops afterwards;
    * a continuation clause (``else:``, ``elif ...``, ``except ...``,
      ``finally:``) is aligned with the statement that opened the block.

    Blank lines are emitted as empty strings and do not affect nesting.
    This is a heuristic: it cannot recover dedents that are not signalled
    by one of the statements above.
    """
    block_enders = {"return", "pass", "break", "continue", "raise"}
    clause_heads = ("else:", "elif ", "except:", "except ", "finally:")

    out: List[str] = []
    indent = 0
    # True right after a block-ending statement already lowered `indent`, so
    # that a following else/except clause is not dedented a second time.
    block_just_closed = False

    for raw in lines:
        line = raw.strip()
        if not line:
            out.append("")
            continue

        if line.startswith(clause_heads) and not block_just_closed:
            indent = max(0, indent - 1)
        block_just_closed = False

        out.append((" " * indent) + line)

        if line.endswith(":"):
            indent += 1
        elif line in block_enders or line.startswith(("return ", "raise ")):
            # Dedent only after emitting, so the statement itself stays
            # inside the block it terminates.
            indent = max(0, indent - 1)
            block_just_closed = True
    return out
def restore_code_from_structured(decoded: str) -> str:
    """Rebuild source text from a decoded, token-structured string.

    Steps, in order:

    1. Remove the ``<BOS>``, ``<EOS>``, ``<PROMPT>``, ``<PYTHON>`` and
       ``<JAVASCRIPT>`` markers; if ``<CODE>`` is present keep only what
       follows its first occurrence.
    2. Replace every underscore with a space and split on whitespace.
    3. Walk the tokens: ``<INDENT>``/``<DEDENT>`` adjust the nesting depth
       (never below zero), ``<NL>`` ends the current line, anything else
       is appended to the current line.
    4. Each finished line is passed through ``_remove_non_python_noise``,
       ``_normalize_punctuation_spacing`` and ``_fix_identifier_spacing``
       and prefixed with one space per nesting level.
    5. The lines go through ``_trim_to_code`` and
       ``_best_effort_python_format``; blank lines at both ends are
       dropped and the rest is joined with newlines and stripped.
    """

    def _render(pending: List[str], depth: int) -> str:
        # Clean one logical line; an empty result stays an empty string.
        cleaned = " ".join(pending).strip()
        cleaned = _remove_non_python_noise(cleaned)
        cleaned = _normalize_punctuation_spacing(cleaned)
        cleaned = _fix_identifier_spacing(cleaned)
        return (" " * depth) + cleaned if cleaned else ""

    body = decoded
    for marker in ("<BOS>", "<EOS>", "<PROMPT>", "<PYTHON>", "<JAVASCRIPT>"):
        body = body.replace(marker, "")
    if "<CODE>" in body:
        body = body.split("<CODE>", 1)[1]
    body = body.replace("_", " ")

    rendered: List[str] = []
    pending: List[str] = []
    depth = 0
    for piece in body.strip().split():
        if piece == "<INDENT>":
            depth += 1
        elif piece == "<DEDENT>":
            depth = max(0, depth - 1)
        elif piece == "<NL>":
            # An explicit newline always yields a line, even an empty one.
            rendered.append(_render(pending, depth))
            pending = []
        else:
            pending.append(piece)

    # Trailing tokens without a closing <NL> only count if they survive cleaning.
    if pending:
        tail = _render(pending, depth)
        if tail:
            rendered.append(tail)

    rendered = _best_effort_python_format(_trim_to_code(rendered))

    # Trim blank lines from both ends before joining.
    start = 0
    while start < len(rendered) and not rendered[start].strip():
        start += 1
    stop = len(rendered)
    while stop > start and not rendered[stop - 1].strip():
        stop -= 1
    return "\n".join(rendered[start:stop]).strip()
|