Mindigenous
Initial full project backup with Git LFS
53f0cc2
"""
Component 6 evaluation helpers.
"""
from __future__ import annotations
import ast
import json
import re
from pathlib import Path
from typing import Dict, List
def python_syntax_ok(code: str) -> bool:
    """Report whether *code* can be parsed by ``ast.parse``.

    Any exception raised while parsing is treated as "not valid" and
    turned into ``False``; nothing is propagated to the caller.
    """
    try:
        ast.parse(code)
    except Exception:
        return False
    return True
def save_json(path: str, payload: Dict) -> None:
    """Write *payload* to *path* as UTF-8 JSON with a 2-space indent.

    Missing parent directories are created first. Non-ASCII characters
    are written as-is rather than escaped (``ensure_ascii=False``).
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    # Serialize fully before opening the file so a serialization error
    # never leaves a truncated file behind.
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    with target.open("w", encoding="utf-8") as handle:
        handle.write(serialized)
def _normalize_punctuation_spacing(text: str) -> str:
    """Normalize whitespace around punctuation and operators in *text*.

    * Whitespace before ``, . : ; ) ] }`` is removed.
    * Whitespace after ``( [ {`` is removed.
    * ``= + - * / %`` are surrounded by exactly one space.
    * Multi-character operators (``==``, ``!=``, ``<=``, ``>=``, ``:=``,
      augmented assignments such as ``+=`` / ``**=``, ``->``, ``**``,
      ``//``) are kept intact and spaced as a single unit. Handling them
      character by character would turn ``==`` into ``= =`` and ``**``
      into ``* *``, which is no longer valid Python.
    * Runs of spaces/tabs are collapsed and the result is stripped.

    The rewrite is purely textual: operators inside string literals are
    spaced as well.
    """
    # Closing punctuation hugs the token before it.
    text = re.sub(r"\s+([,.:;\)\]\}])", r"\1", text)
    # Opening brackets hug the token after them.
    text = re.sub(r"([\(\[\{])\s+", r"\1", text)
    # One pass, longest operators first, so that a multi-character
    # operator is never split into its individual characters.
    text = re.sub(
        r"\s*("
        r"(?:\*\*|//|<<|>>|[-+*/%&|^@<>!=:])="  # ==, !=, <=, :=, +=, **=, ...
        r"|->|\*\*|//"
        r"|[-+*/%=]"
        r")\s*",
        r" \1 ",
        text,
    )
    return re.sub(r"[ \t]+", " ", text).strip()
def _remove_non_python_noise(line: str) -> str:
    """Strip characters this pipeline treats as noise from *line*.

    ``<UNK>`` placeholders are replaced with the literal ``1``; NUL
    characters, curly braces and semicolons are deleted outright.
    """
    cleaned = line.replace("<UNK>", "1")
    for junk in ("\u0000", "{", "}", ";"):
        cleaned = cleaned.replace(junk, "")
    return cleaned
def _fix_identifier_spacing(line: str) -> str:
    """Join identifiers that were split by whitespace using underscores.

    Three shapes are repaired:

    * ``def name with spaces(...)``  -> ``def name_with_spaces(...)``
    * ``class Name With Spaces:`` / ``class Name With Spaces(Base):``
      -> ``class Name_With_Spaces...``
    * ``some name = value``          -> ``some_name = value``

    For assignments the target must consist only of identifier
    characters and whitespace, and the ``=`` must not be the start of
    ``==``. A comparison on the right-hand side
    (``is ok = a == b``) does not prevent the repair. Leading
    indentation is preserved in all three cases. Lines matching none of
    the shapes are returned unchanged.
    """
    # def name with spaces -> def name_with_spaces
    m = re.match(r"^(\s*def\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*\(.*)$", line)
    if m:
        fn = re.sub(r"\s+", "_", m.group(2).strip())
        line = f"{m.group(1)}{fn}{m.group(3)}"

    # class name with spaces -> class Name_With_Spaces (with or without bases)
    m = re.match(r"^(\s*class\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*[:(].*)$", line)
    if m:
        cn = re.sub(r"\s+", "_", m.group(2).strip())
        line = f"{m.group(1)}{cn}{m.group(3)}"

    # assignment lhs spaces -> underscore.
    # The target charset excludes operator characters, so augmented
    # assignments (``+=``) and comparisons (``!=``, ``<=``) never match;
    # the lookahead rejects ``==``.
    m = re.match(
        r"^(\s*)([A-Za-z_][A-Za-z0-9_\s]*?)\s*=(?!=)(.*)$",
        line,
        flags=re.DOTALL,
    )
    if m:
        target = re.sub(r"\s+", "_", m.group(2))
        line = f"{m.group(1)}{target} = {m.group(3).strip()}"
    return line
def _looks_like_python_line(line: str) -> bool:
    """Heuristically decide whether *line* is a Python statement.

    Surrounding whitespace is ignored. A line qualifies when it

    * starts with a recognised statement prefix (``def``, ``class``,
      ``if``/``elif``/``else:``, loops, ``try:``/``except``/``finally:``,
      ``with``, ``return``, ``raise``, ``assert``, imports, ``print(``),
    * is exactly one of the bare statements ``return``, ``pass``,
      ``break``, ``continue`` or ``raise``, or
    * begins with ``identifier =``.

    Blank lines never qualify. The continuation clauses (``elif``,
    ``else:``, ``finally:``) and bare statements are included so that an
    unindented ``else:`` is treated the same way as ``try:``/``except``.
    """
    s = line.strip()
    if not s:
        return False

    starts = (
        "def ",
        "class ",
        "if ",
        "elif ",
        "else:",
        "for ",
        "while ",
        "try:",
        "except",
        "finally:",
        "with ",
        "return ",
        "raise ",
        "assert ",
        "import ",
        "from ",
        "print(",
    )
    bare_statements = {"return", "pass", "break", "continue", "raise"}
    if s in bare_statements or s.startswith(starts):
        return True
    return re.match(r"^[A-Za-z_][A-Za-z0-9_]*\s*=", s) is not None
def _trim_to_code(lines: List[str]) -> List[str]:
    """Discard leading noise and keep only plausible code lines.

    Everything before the first line accepted by
    ``_looks_like_python_line`` is dropped. From that point on a line is
    kept when it is blank, is accepted by ``_looks_like_python_line``,
    or starts with a space.
    """
    # Index of the first plausible Python line; len(lines) when none exists.
    first_code = next(
        (idx for idx, text in enumerate(lines) if _looks_like_python_line(text)),
        len(lines),
    )
    return [
        text
        for text in lines[first_code:]
        if not text.strip()
        or _looks_like_python_line(text)
        or text.startswith(" ")
    ]
def _best_effort_python_format(lines: List[str]) -> List[str]:
    """Re-indent *lines* using simple block heuristics.

    Existing leading/trailing whitespace on every line is discarded and
    indentation is rebuilt from a running depth counter:

    * a line ending in ``:`` opens a block (following lines go one
      level deeper);
    * ``return``/``raise`` (bare or with a value), ``pass``, ``break``
      and ``continue`` are emitted at the current depth and then close
      the block, so the *next* line is one level shallower;
    * ``elif``/``else``/``except``/``finally`` clauses are placed one
      level shallower than the block they follow, unless that block was
      just closed by one of the statements above.

    Blank lines are emitted as empty strings and do not affect depth.
    """
    out: List[str] = []
    indent = 0
    # True right after a block-ending statement already lowered `indent`,
    # so a following else/except clause must not lower it a second time.
    block_just_closed = False
    block_enders = {"return", "pass", "break", "continue", "raise"}
    for raw in lines:
        line = raw.strip()
        if not line:
            out.append("")
            continue

        if re.match(r"(?:elif|else|except|finally)\b", line) and not block_just_closed:
            indent = max(0, indent - 1)

        out.append((" " * indent) + line)
        block_just_closed = False

        if line.endswith(":"):
            indent += 1
        elif line in block_enders or line.startswith(("return ", "raise ")):
            # Dedent only after emitting, so the statement itself stays
            # inside the block it terminates.
            indent = max(0, indent - 1)
            block_just_closed = True
    return out
def restore_code_from_structured(decoded: str) -> str:
    """Rebuild source text from a decoded, marker-delimited token stream.

    Steps, in order:

    1. Remove the ``<BOS>``, ``<EOS>``, ``<PROMPT>``, ``<PYTHON>`` and
       ``<JAVASCRIPT>`` markers.
    2. If a ``<CODE>`` marker is present, keep only what follows its
       first occurrence.
    3. Replace every underscore with a space and split on whitespace.
    4. Walk the tokens: ``<INDENT>`` / ``<DEDENT>`` raise / lower the
       indentation depth (never below zero) and ``<NL>`` ends the
       current line. Each finished line goes through
       ``_remove_non_python_noise``, ``_normalize_punctuation_spacing``
       and ``_fix_identifier_spacing``.
    5. Pass the lines through ``_trim_to_code`` and
       ``_best_effort_python_format``, drop blank lines at both ends and
       join with newlines.
    """

    def _finish(parts: List[str]) -> str:
        # Clean-up chain shared by <NL>-terminated lines and the final,
        # unterminated one.
        joined = " ".join(parts).strip()
        joined = _remove_non_python_noise(joined)
        joined = _normalize_punctuation_spacing(joined)
        return _fix_identifier_spacing(joined)

    body = decoded
    for marker in ("<BOS>", "<EOS>", "<PROMPT>", "<PYTHON>", "<JAVASCRIPT>"):
        body = body.replace(marker, "")
    _, found, tail = body.partition("<CODE>")
    if found:
        body = tail
    body = body.replace("_", " ")

    rows: List[str] = []
    pending: List[str] = []
    depth = 0
    for piece in body.strip().split():
        if piece == "<INDENT>":
            depth += 1
        elif piece == "<DEDENT>":
            depth = max(0, depth - 1)
        elif piece == "<NL>":
            finished = _finish(pending)
            # An empty line is still recorded so blank lines survive.
            rows.append((" " * depth) + finished if finished else "")
            pending = []
        else:
            pending.append(piece)

    # Flush tokens that were not followed by a closing <NL>; unlike the
    # <NL> case, an empty remainder is not recorded.
    if pending:
        finished = _finish(pending)
        if finished:
            rows.append((" " * depth) + finished)

    rows = _best_effort_python_format(_trim_to_code(rows))

    # Trim blank lines from both ends.
    start, stop = 0, len(rows)
    while start < stop and not rows[start].strip():
        start += 1
    while stop > start and not rows[stop - 1].strip():
        stop -= 1
    return "\n".join(rows[start:stop]).strip()