| """Tests for execution-based coder eval helpers.""" | |
| import subprocess | |
| from maris_core.code.execution_eval import CodeExecutionSpec, evaluate_code_response | |
def test_evaluate_code_response_runs_python_assertions() -> None:
    """Check that a correct Python candidate passes its assertion harness.

    The fenced candidate defines ``normalize_email``; the harness asserts the
    normalized result and that blank input raises ``ValueError``.
    """
    # The candidate is spelled one source line per literal so its block
    # structure is visible: ``raise`` must be nested one level deeper than the
    # ``if`` that guards it, otherwise the snippet is not valid Python.
    response = (
        "```python\n"
        "def normalize_email(email: str) -> str:\n"
        "    normalized = email.strip().lower()\n"
        "    if not normalized:\n"
        "        raise ValueError('required')\n"
        "    return normalized\n"
        "```"
    )
    spec = CodeExecutionSpec(
        language="python",
        test_code=(
            "assert normalize_email(' A@Example.COM ') == 'a@example.com'\n"
            "try:\n"
            " normalize_email(' ')\n"
            "except ValueError:\n"
            " pass\n"
            "else:\n"
            " raise AssertionError('expected ValueError')"
        ),
    )

    result = evaluate_code_response(response, spec)

    assert result.available is True
    assert result.passed is True
def test_evaluate_code_response_reports_python_failures() -> None:
    """Check that a candidate failing its assertion is reported as not passed.

    The candidate returns its input unchanged, so the harness assertion on the
    normalized value does not hold; a non-empty summary is expected.
    """
    identity_candidate = "```python\ndef normalize_email(email: str) -> str:\n return email\n```"
    failing_spec = CodeExecutionSpec(
        language="python",
        test_code="assert normalize_email(' A@Example.COM ') == 'a@example.com'",
    )

    outcome = evaluate_code_response(identity_candidate, failing_spec)

    assert outcome.available is True
    assert outcome.passed is False
    assert outcome.summary
def test_evaluate_code_response_runs_typescript_assertions_with_isolated_env(monkeypatch) -> None:
    """Check the TypeScript path using stubbed tool lookup and process execution.

    ``shutil.which`` and ``subprocess.run`` are replaced as seen from the
    evaluator module, so no real ``tsc`` or ``node`` is needed. The stubbed
    runner records each invocation and always reports exit code 0.
    """
    recorded: list[tuple[list[str], dict[str, object]]] = []

    def stub_which(name: str) -> str | None:
        # Only the two tools this test expects to be looked up are "installed".
        return f"/usr/bin/{name}" if name in {"tsc", "node"} else None

    def stub_run(command: list[str], **kwargs):
        recorded.append((command, kwargs))
        return subprocess.CompletedProcess(command, 0, "", "")

    monkeypatch.setattr("maris_core.code.execution_eval.shutil.which", stub_which)
    monkeypatch.setattr("maris_core.code.execution_eval.subprocess.run", stub_run)

    candidate = (
        "```typescript\nexport function nextDelay(attempt: number, baseMs = 250): number {\n"
        " if (attempt <= 0) return 0;\n"
        " return baseMs * 2 ** (attempt - 1);\n}\n```"
    )
    harness = (
        "function assert(condition: boolean, message: string): void {"
        " if (!condition) throw new Error(message); }\n"
        "assert(nextDelay(0) === 0, 'attempt 0');"
    )

    outcome = evaluate_code_response(
        candidate,
        CodeExecutionSpec(language="typescript", test_code=harness),
    )

    assert outcome.available is True
    assert outcome.passed is True

    # Exactly two processes are expected: first ``tsc``, then ``node``.
    assert len(recorded) == 2
    (tsc_command, tsc_kwargs), (node_command, _node_kwargs) = recorded
    assert tsc_command[0].endswith("tsc")
    assert node_command[0].endswith("node")

    # The first invocation must detach stdin and pass an explicit environment
    # mapping that carries a non-empty HOME.
    assert tsc_kwargs["stdin"] is subprocess.DEVNULL
    assert isinstance(tsc_kwargs["env"], dict)
    assert tsc_kwargs["env"]["HOME"]
def test_evaluate_code_response_runs_sql_harness_with_placeholder() -> None:
    """Check that a SQL candidate passes against a harness containing ``{{CODE}}``.

    The harness creates and seeds two tables, materializes the candidate query
    at the ``{{CODE}}`` marker (presumably substituted by the evaluator), and
    then selects from the resulting table.
    """
    candidate_query = (
        "```sql\nSELECT\n b.branch,\n e.language,\n"
        " AVG(CASE WHEN e.passed THEN 1.0 ELSE 0.0 END) AS execution_pass_rate,\n"
        " CASE WHEN AVG(CASE WHEN e.passed THEN 1.0 ELSE 0.0 END) < 0.8 THEN 1 ELSE 0 END AS is_regression\n"
        "FROM benchmark_results b\nJOIN execution_results e ON e.benchmark_run_id = b.id\n"
        "GROUP BY b.branch, e.language\n```"
    )
    harness_sql = (
        "CREATE TABLE benchmark_results (id INTEGER PRIMARY KEY, branch TEXT);\n"
        "CREATE TABLE execution_results (benchmark_run_id INTEGER, language TEXT, passed INTEGER);\n"
        "INSERT INTO benchmark_results (id, branch) VALUES (1, 'coder');\n"
        "INSERT INTO execution_results (benchmark_run_id, language, passed) VALUES "
        "(1, 'typescript', 1), (1, 'typescript', 0), (1, 'rust', 1);\n"
        "CREATE TEMP TABLE actual AS {{CODE}};\n"
        "SELECT branch, language, execution_pass_rate, is_regression FROM actual;"
    )

    outcome = evaluate_code_response(
        candidate_query,
        CodeExecutionSpec(language="sql", test_code=harness_sql),
    )

    assert outcome.available is True
    assert outcome.passed is True
def test_evaluate_code_response_reports_sql_failures() -> None:
    """Check that a failing SQL candidate is reported with a SQL-labelled summary.

    The spec supplies no harness, so nothing in this test creates the table
    that the query selects from.
    """
    broken_query = "```sql\nSELECT * FROM missing_table\n```"
    bare_spec = CodeExecutionSpec(language="sql")

    outcome = evaluate_code_response(broken_query, bare_spec)

    assert outcome.available is True
    assert outcome.passed is False
    assert "SQL" in outcome.summary