"""Tests for execution-based coder eval helpers.""" import subprocess from maris_core.code.execution_eval import CodeExecutionSpec, evaluate_code_response def test_evaluate_code_response_runs_python_assertions() -> None: result = evaluate_code_response( "```python\ndef normalize_email(email: str) -> str:\n normalized = email.strip().lower()\n if not normalized:\n raise ValueError('required')\n return normalized\n```", CodeExecutionSpec( language="python", test_code=( "assert normalize_email(' A@Example.COM ') == 'a@example.com'\n" "try:\n" " normalize_email(' ')\n" "except ValueError:\n" " pass\n" "else:\n" " raise AssertionError('expected ValueError')" ), ), ) assert result.available is True assert result.passed is True def test_evaluate_code_response_reports_python_failures() -> None: result = evaluate_code_response( "```python\ndef normalize_email(email: str) -> str:\n return email\n```", CodeExecutionSpec( language="python", test_code="assert normalize_email(' A@Example.COM ') == 'a@example.com'", ), ) assert result.available is True assert result.passed is False assert result.summary def test_evaluate_code_response_runs_typescript_assertions_with_isolated_env(monkeypatch) -> None: commands: list[tuple[list[str], dict[str, object]]] = [] def fake_which(name: str) -> str | None: if name in {"tsc", "node"}: return f"/usr/bin/{name}" return None def fake_run(command: list[str], **kwargs): commands.append((command, kwargs)) return subprocess.CompletedProcess(command, 0, "", "") monkeypatch.setattr("maris_core.code.execution_eval.shutil.which", fake_which) monkeypatch.setattr("maris_core.code.execution_eval.subprocess.run", fake_run) result = evaluate_code_response( "```typescript\nexport function nextDelay(attempt: number, baseMs = 250): number {\n" " if (attempt <= 0) return 0;\n" " return baseMs * 2 ** (attempt - 1);\n}\n```", CodeExecutionSpec( language="typescript", test_code=( "function assert(condition: boolean, message: string): void {" " if (!condition) throw new Error(message); }\n" "assert(nextDelay(0) === 0, 'attempt 0');" ), ), ) assert result.available is True assert result.passed is True assert len(commands) == 2 assert commands[0][0][0].endswith("tsc") assert commands[1][0][0].endswith("node") assert commands[0][1]["stdin"] is subprocess.DEVNULL assert isinstance(commands[0][1]["env"], dict) assert commands[0][1]["env"]["HOME"] def test_evaluate_code_response_runs_sql_harness_with_placeholder() -> None: result = evaluate_code_response( "```sql\nSELECT\n b.branch,\n e.language,\n" " AVG(CASE WHEN e.passed THEN 1.0 ELSE 0.0 END) AS execution_pass_rate,\n" " CASE WHEN AVG(CASE WHEN e.passed THEN 1.0 ELSE 0.0 END) < 0.8 THEN 1 ELSE 0 END AS is_regression\n" "FROM benchmark_results b\nJOIN execution_results e ON e.benchmark_run_id = b.id\n" "GROUP BY b.branch, e.language\n```", CodeExecutionSpec( language="sql", test_code=( "CREATE TABLE benchmark_results (id INTEGER PRIMARY KEY, branch TEXT);\n" "CREATE TABLE execution_results (benchmark_run_id INTEGER, language TEXT, passed INTEGER);\n" "INSERT INTO benchmark_results (id, branch) VALUES (1, 'coder');\n" "INSERT INTO execution_results (benchmark_run_id, language, passed) VALUES " "(1, 'typescript', 1), (1, 'typescript', 0), (1, 'rust', 1);\n" "CREATE TEMP TABLE actual AS {{CODE}};\n" "SELECT branch, language, execution_pass_rate, is_regression FROM actual;" ), ), ) assert result.available is True assert result.passed is True def test_evaluate_code_response_reports_sql_failures() -> None: result = evaluate_code_response( "```sql\nSELECT * FROM missing_table\n```", CodeExecutionSpec(language="sql"), ) assert result.available is True assert result.passed is False assert "SQL" in result.summary