Spaces:
Sleeping
Sleeping
| """Verifier for code problems. | |
| Exposes :func:`verify_code_answer`, which executes candidate Python | |
| solutions in an isolated subprocess and returns ``True`` iff every test | |
| case in the provided verification metadata passes. | |
| The verifier supports two verification styles: | |
| * ``execute_and_assert`` — MBPP-style: run the candidate code followed | |
| by a list of ``assert`` statements; success iff the subprocess exits 0. | |
| * ``stdin_stdout`` — APPS-style: for each input/output pair, run the | |
| candidate code as a subprocess with the input on stdin and compare the | |
| (normalized) stdout to the expected output. | |
| Safety notes: | |
| * The model's code is *never* imported, ``exec``'d, or ``eval``'d in the | |
| parent process — it is always executed in a fresh subprocess via a | |
| temp file, with a wall-clock timeout. | |
| * On POSIX, a ``preexec_fn`` sets RLIMITs (soft and hard limit alike) on | |
| CPU time and address space to cap runaway solutions. These are | |
| best-effort — the parent ``subprocess.run(timeout=...)`` is the | |
| authoritative kill switch. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import subprocess | |
| import sys | |
| import tempfile | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Optional | |
# Address-space ceiling handed to ``RLIMIT_AS`` for every child process.
_MEMORY_LIMIT_BYTES = 512 * 1024 * 1024  # 512 MB
# CPU-time ceiling (seconds) handed to ``RLIMIT_CPU`` for every child process.
# NOTE(review): the wall-clock timeout is caller-supplied (default 5 s); the
# claim below only holds while callers pass a timeout of at most 15 s.
_CPU_LIMIT_SECONDS = 15  # >= subprocess timeout; parent timeout is authoritative.
def _set_child_limits() -> None:  # pragma: no cover — runs in child
    """Apply best-effort CPU and address-space rlimits to the current process.

    Intended as a ``preexec_fn``: it runs in the forked child just before the
    interpreter is exec'd. Each limit is set with identical soft and hard
    values. A limit that the OS refuses (``ValueError`` / ``OSError``) is
    skipped on its own; any other problem (for example the ``resource``
    module being unavailable) abandons the remaining limits silently, because
    the parent's wall-clock timeout is the real safety net.
    """
    try:
        import resource

        wanted = (
            ("RLIMIT_CPU", _CPU_LIMIT_SECONDS),
            ("RLIMIT_AS", _MEMORY_LIMIT_BYTES),
        )
        for limit_name, ceiling in wanted:
            # A missing RLIMIT_* attribute escapes to the outer handler.
            which = getattr(resource, limit_name)
            try:
                resource.setrlimit(which, (ceiling, ceiling))
            except (ValueError, OSError):
                continue
    except Exception:
        # Deliberately swallowed: limits are optional hardening only.
        pass
def _run_python(
    script_path: Path, stdin: str, timeout_seconds: int
) -> Optional[subprocess.CompletedProcess]:
    """Execute ``script_path`` in a brand-new Python interpreter.

    The child is started with ``-I`` (isolated mode), receives ``stdin`` as
    its standard input, and has stdout/stderr captured as text. On POSIX the
    child additionally gets the rlimits installed by ``_set_child_limits``.

    Args:
        script_path: File containing the program to run.
        stdin: Text fed to the child's standard input.
        timeout_seconds: Wall-clock budget enforced by ``subprocess.run``.

    Returns:
        The finished :class:`subprocess.CompletedProcess`, or ``None`` when
        the wall-clock timeout expired. Every other error is left to
        propagate to the caller.
    """
    limiter = None
    if os.name == "posix":
        limiter = _set_child_limits

    command = [sys.executable, "-I", str(script_path)]
    try:
        completed = subprocess.run(
            command,
            input=stdin,
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            preexec_fn=limiter,
        )
    except subprocess.TimeoutExpired:
        return None
    return completed
def _normalize_output(s: Any) -> str:
    """Canonicalize program output (actual or expected) before comparing.

    ``None`` becomes the empty string. A list is treated as one item per
    line; anything else is passed through ``str``. Line endings are
    converted to ``\\n``, trailing whitespace is removed from each line, and
    whitespace at both ends of the whole text is stripped.
    """
    if s is None:
        return ""

    raw = "\n".join(map(str, s)) if isinstance(s, list) else str(s)
    unix = raw.replace("\r\n", "\n").replace("\r", "\n")
    trimmed = "\n".join(row.rstrip() for row in unix.split("\n"))
    # The outer strip also discards any run of trailing blank lines, so no
    # separate pass is needed for them.
    return trimmed.strip()
def _coerce_stdin(value: Any) -> str:
    """Turn a stored test input into the text fed to the child's stdin.

    A list is joined with newlines (one item per line, no trailing newline),
    ``None`` maps to the empty string, and any other value is converted
    with ``str``.
    """
    if isinstance(value, list):
        return "\n".join(map(str, value))
    return "" if value is None else str(value)
def _verify_execute_and_assert(
    model_code: str, metadata: Dict[str, Any], timeout_seconds: int
) -> bool:
    """Run the candidate followed by its ``assert`` tests (MBPP style).

    The script is assembled as: ``test_imports``, the candidate code, the
    statements from ``test_list``, then a one-line epilogue that writes a
    random per-run marker to file descriptor 1.

    Args:
        model_code: Candidate Python source.
        metadata: Mapping read for ``"test_list"`` (required, non-empty) and
            ``"test_imports"`` (optional).
        timeout_seconds: Wall-clock budget for the subprocess.

    Returns:
        ``True`` only when the subprocess exits with status 0 *and* the
        marker appears in its stdout, i.e. execution actually reached the
        end of the assert list. ``False`` for an empty test list, a timeout,
        a non-zero exit, or a missing marker.
    """
    tests: List[str] = list(metadata.get("test_list") or [])
    test_imports: List[str] = list(metadata.get("test_imports") or [])
    if not tests:
        return False

    # Exit status alone is not proof that the asserts ran: a candidate that
    # calls sys.exit(0) / exit() / os._exit(0) terminates "successfully"
    # before any test executes. The marker is unpredictable per run and is
    # written via os.write so that a candidate reassigning sys.stdout does
    # not hide it. This is best-effort hardening, not a sandbox: code that
    # reads its own script file could still recover the marker.
    marker = os.urandom(16).hex()
    epilogue = f'__import__("os").write(1, b"\\n{marker}\\n")'

    script = (
        "\n".join(test_imports)
        + "\n"
        + model_code
        + "\n\n"
        + "\n".join(tests)
        + "\n"
        + epilogue
        + "\n"
    )
    with tempfile.TemporaryDirectory() as tmpdir:
        script_path = Path(tmpdir) / "candidate.py"
        script_path.write_text(script, encoding="utf-8")
        result = _run_python(script_path, stdin="", timeout_seconds=timeout_seconds)

    if result is None or result.returncode != 0:
        return False
    return marker in (result.stdout or "")
def _verify_stdin_stdout(
    model_code: str, metadata: Dict[str, Any], timeout_seconds: int
) -> bool:
    """Check the candidate against input/output pairs (APPS style).

    ``metadata["inputs"]`` and ``metadata["outputs"]`` must both be lists of
    the same non-zero length. The candidate is written to a temp file once
    and run once per pair; a pair passes when the run finishes in time,
    exits with status 0, and its normalized stdout equals the normalized
    expected output.

    Returns:
        ``True`` when every pair passes; ``False`` on malformed metadata or
        at the first failing pair.
    """
    cases_in = metadata.get("inputs") or []
    cases_out = metadata.get("outputs") or []

    well_formed = (
        isinstance(cases_in, list)
        and isinstance(cases_out, list)
        and len(cases_in) > 0
        and len(cases_in) == len(cases_out)
    )
    if not well_formed:
        return False

    with tempfile.TemporaryDirectory() as workdir:
        program = Path(workdir) / "candidate.py"
        program.write_text(model_code, encoding="utf-8")

        for given, wanted in zip(cases_in, cases_out):
            outcome = _run_python(
                program,
                stdin=_coerce_stdin(given),
                timeout_seconds=timeout_seconds,
            )
            finished_ok = outcome is not None and outcome.returncode == 0
            # Short-circuit keeps stdout untouched unless the run succeeded.
            if not finished_ok or (
                _normalize_output(outcome.stdout) != _normalize_output(wanted)
            ):
                return False
    return True
def verify_code_answer(
    model_code: str,
    verification_metadata: Dict[str, Any],
    timeout_seconds: int = 5,
) -> bool:
    """Report whether ``model_code`` passes all tests described by the metadata.

    The ``"verification_type"`` entry selects the strategy:
    ``"execute_and_assert"`` or ``"stdin_stdout"``. Anything else, an empty
    or non-string ``model_code``, or non-dict metadata yields ``False``.

    This function never raises: every exception, whether caused by the
    candidate or by the verification machinery itself, is converted into a
    ``False`` result.
    """
    try:
        has_code = isinstance(model_code, str) and bool(model_code.strip())
        if not has_code or not isinstance(verification_metadata, dict):
            return False

        style = verification_metadata.get("verification_type")
        if style == "execute_and_assert":
            checker = _verify_execute_and_assert
        elif style == "stdin_stdout":
            checker = _verify_stdin_stdout
        else:
            return False
        return checker(model_code, verification_metadata, timeout_seconds)
    except Exception:
        # Top-level boundary: failure of any kind means "not verified".
        return False