| """ |
| Component 6 evaluation helpers. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import ast |
| import json |
| import re |
| from pathlib import Path |
| from typing import Dict, List |
|
|
|
|
def python_syntax_ok(code: str) -> bool:
    """Return True when *code* parses as syntactically valid Python."""
    try:
        ast.parse(code)
    except Exception:
        # Broad on purpose: ast.parse can raise SyntaxError, ValueError
        # (NUL bytes), RecursionError, ... — any failure means "not OK".
        return False
    return True
|
|
|
|
def save_json(path: str, payload: Dict) -> None:
    """Write *payload* to *path* as pretty-printed UTF-8 JSON.

    Parent directories are created as needed; non-ASCII characters are
    written verbatim rather than escaped.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    target.write_text(serialized, encoding="utf-8")
|
|
|
|
| def _normalize_punctuation_spacing(text: str) -> str: |
| text = re.sub(r"\s+([,.:;\)\]\}])", r"\1", text) |
| text = re.sub(r"([\(\[\{])\s+", r"\1", text) |
| text = re.sub(r"\s*=\s*", " = ", text) |
| text = re.sub(r"\s*\+\s*", " + ", text) |
| text = re.sub(r"\s*-\s*", " - ", text) |
| text = re.sub(r"\s*\*\s*", " * ", text) |
| text = re.sub(r"\s*/\s*", " / ", text) |
| text = re.sub(r"\s*%\s*", " % ", text) |
| return re.sub(r"[ \t]+", " ", text).strip() |
|
|
|
|
| def _remove_non_python_noise(line: str) -> str: |
| line = line.replace("<UNK>", "1") |
| line = line.replace("\u0000", "") |
| line = line.replace("{", "") |
| line = line.replace("}", "") |
| line = line.replace(";", "") |
| return line |
|
|
|
|
| def _fix_identifier_spacing(line: str) -> str: |
| |
| m = re.match(r"^(\s*def\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*\(.*)$", line) |
| if m: |
| fn = re.sub(r"\s+", "_", m.group(2).strip()) |
| line = f"{m.group(1)}{fn}{m.group(3)}" |
|
|
| |
| m = re.match(r"^(\s*class\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*:.*)$", line) |
| if m: |
| cn = re.sub(r"\s+", "_", m.group(2).strip()) |
| line = f"{m.group(1)}{cn}{m.group(3)}" |
|
|
| |
| if "=" in line and "==" not in line: |
| lhs, rhs = line.split("=", 1) |
| lhs_clean = lhs.strip() |
| if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_\s]*", lhs_clean): |
| lhs_clean = re.sub(r"\s+", "_", lhs_clean) |
| line = f"{lhs_clean} = {rhs.strip()}" |
|
|
| return line |
|
|
|
|
| def _looks_like_python_line(line: str) -> bool: |
| if not line.strip(): |
| return False |
| starts = ( |
| "def ", |
| "class ", |
| "if ", |
| "for ", |
| "while ", |
| "try:", |
| "except", |
| "with ", |
| "return ", |
| "import ", |
| "from ", |
| "print(", |
| ) |
| s = line.strip() |
| if s.startswith(starts): |
| return True |
| if re.match(r"^[A-Za-z_][A-Za-z0-9_]*\s*=", s): |
| return True |
| return False |
|
|
|
|
def _trim_to_code(lines: List[str]) -> List[str]:
    """Drop a leading run of non-code lines, then keep only plausible code.

    After the non-Python prefix is removed, a line survives when it is
    blank, indented, or passes :func:`_looks_like_python_line`.
    """
    start = 0
    while start < len(lines) and not _looks_like_python_line(lines[start]):
        start += 1
    remainder = lines[start:]

    return [
        candidate
        for candidate in remainder
        if not candidate.strip()
        or _looks_like_python_line(candidate)
        or candidate.startswith(" ")
    ]
|
|
|
|
| def _best_effort_python_format(lines: List[str]) -> List[str]: |
| out: List[str] = [] |
| indent = 0 |
| for raw in lines: |
| line = raw.strip() |
| if not line: |
| out.append("") |
| continue |
|
|
| if line in {"return", "pass", "break", "continue"}: |
| indent = max(0, indent - 1) |
|
|
| out.append((" " * indent) + line) |
|
|
| if line.endswith(":"): |
| indent += 1 |
|
|
| return out |
|
|
|
|
def restore_code_from_structured(decoded: str) -> str:
    """Rebuild readable Python source from a structured decoded token string.

    The decoder emits control tokens: ``<INDENT>``/``<DEDENT>`` adjust the
    indentation level, ``<NL>`` terminates a line, and framing tokens
    (``<BOS>``, ``<EOS>``, ``<PROMPT>``, language tags, ``<CODE>``) delimit
    the payload.  Underscores in the payload stand in for spaces (decoded
    identifiers are re-joined later by :func:`_fix_identifier_spacing`).
    The reconstructed lines are then noise-cleaned, trimmed to code-looking
    content and re-indented heuristically.

    Refactor: the four-step line-finalization sequence was duplicated
    verbatim for the ``<NL>`` flush and the end-of-stream flush; it now
    lives in one nested helper.
    """

    def _finalize(tokens: List[str]) -> str:
        # One line's tokens -> cleaned, normalized text.
        joined = " ".join(tokens).strip()
        joined = _remove_non_python_noise(joined)
        joined = _normalize_punctuation_spacing(joined)
        return _fix_identifier_spacing(joined)

    text = decoded
    for tok in ["<BOS>", "<EOS>", "<PROMPT>", "<PYTHON>", "<JAVASCRIPT>"]:
        text = text.replace(tok, "")

    # Only the payload after the <CODE> marker is code.
    if "<CODE>" in text:
        text = text.split("<CODE>", 1)[1]

    # The tokenizer encodes spaces as underscores; undo that first.
    text = text.replace("_", " ")
    tokens = text.strip().split()

    lines: List[str] = []
    current_tokens: List[str] = []
    indent = 0

    for tok in tokens:
        if tok == "<INDENT>":
            indent += 1
            continue
        if tok == "<DEDENT>":
            indent = max(0, indent - 1)
            continue
        if tok == "<NL>":
            line = _finalize(current_tokens)
            if line:
                lines.append((" " * indent) + line)
            else:
                # An <NL> with no content still produces a blank line.
                lines.append("")
            current_tokens = []
            continue
        current_tokens.append(tok)

    # Flush a trailing line that was not <NL>-terminated; unlike the <NL>
    # case, an empty remainder adds nothing.
    if current_tokens:
        line = _finalize(current_tokens)
        if line:
            lines.append((" " * indent) + line)

    lines = _trim_to_code(lines)
    lines = _best_effort_python_format(lines)

    # Drop leading and trailing blank lines.
    while lines and not lines[0].strip():
        lines.pop(0)
    while lines and not lines[-1].strip():
        lines.pop()

    return "\n".join(lines).strip()
|
|