maris-ai-master / core-python /tests /test_execution_eval.py
MarisUK's picture
Maris AI model sync
f440f03 verified
"""Tests for execution-based coder eval helpers."""
import subprocess
from maris_core.code.execution_eval import CodeExecutionSpec, evaluate_code_response
def test_evaluate_code_response_runs_python_assertions() -> None:
    """Check that a correct Python response passes its assertion harness.

    The response is a fenced ``python`` block defining ``normalize_email``;
    the harness asserts the normalized value and that blank input raises
    ``ValueError``.
    """
    # The function body needs real nested indentation: the ``raise`` must sit
    # one level deeper than its ``if``, otherwise the candidate code is an
    # IndentationError and can never pass.
    response = (
        "```python\n"
        "def normalize_email(email: str) -> str:\n"
        "    normalized = email.strip().lower()\n"
        "    if not normalized:\n"
        "        raise ValueError('required')\n"
        "    return normalized\n"
        "```"
    )
    spec = CodeExecutionSpec(
        language="python",
        test_code=(
            "assert normalize_email(' A@Example.COM ') == 'a@example.com'\n"
            "try:\n"
            " normalize_email(' ')\n"
            "except ValueError:\n"
            " pass\n"
            "else:\n"
            " raise AssertionError('expected ValueError')"
        ),
    )

    result = evaluate_code_response(response, spec)

    assert result.available is True
    # Surface the evaluator's own summary when the run unexpectedly fails.
    assert result.passed is True, result.summary
def test_evaluate_code_response_reports_python_failures() -> None:
    """Check that a Python response failing its assertion is reported.

    The candidate returns the email unchanged, so the harness assertion on
    the normalized value cannot hold; the result must be available, not
    passed, and carry a non-empty summary.
    """
    candidate = "```python\ndef normalize_email(email: str) -> str:\n return email\n```"
    harness = "assert normalize_email(' A@Example.COM ') == 'a@example.com'"
    spec = CodeExecutionSpec(language="python", test_code=harness)

    outcome = evaluate_code_response(candidate, spec)

    assert outcome.available is True
    assert outcome.passed is False
    assert outcome.summary
def test_evaluate_code_response_runs_typescript_assertions_with_isolated_env(monkeypatch) -> None:
    """Check the TypeScript path with ``which`` and ``subprocess.run`` stubbed.

    Both tools are reported as installed and every subprocess call is recorded
    and answered with a zero exit status. The test then verifies that exactly
    two commands ran (``tsc`` then ``node``) and that the first one received
    ``stdin=subprocess.DEVNULL`` plus an ``env`` dict with a truthy ``HOME``.
    """
    recorded_calls: list[tuple[list[str], dict[str, object]]] = []

    def stub_which(name: str) -> str | None:
        """Pretend only ``tsc`` and ``node`` are on the PATH."""
        return f"/usr/bin/{name}" if name in {"tsc", "node"} else None

    def stub_run(command: list[str], **kwargs):
        """Record the invocation and report a successful, silent process."""
        recorded_calls.append((command, kwargs))
        return subprocess.CompletedProcess(
            args=command, returncode=0, stdout="", stderr=""
        )

    monkeypatch.setattr("maris_core.code.execution_eval.shutil.which", stub_which)
    monkeypatch.setattr("maris_core.code.execution_eval.subprocess.run", stub_run)

    response = (
        "```typescript\nexport function nextDelay(attempt: number, baseMs = 250): number {\n"
        " if (attempt <= 0) return 0;\n"
        " return baseMs * 2 ** (attempt - 1);\n}\n```"
    )
    spec = CodeExecutionSpec(
        language="typescript",
        test_code=(
            "function assert(condition: boolean, message: string): void {"
            " if (!condition) throw new Error(message); }\n"
            "assert(nextDelay(0) === 0, 'attempt 0');"
        ),
    )

    outcome = evaluate_code_response(response, spec)

    assert outcome.available is True
    assert outcome.passed is True
    assert len(recorded_calls) == 2

    # Safe to unpack now that exactly two invocations are known to exist.
    (compile_argv, compile_kwargs), (run_argv, _run_kwargs) = recorded_calls
    assert compile_argv[0].endswith("tsc")
    assert run_argv[0].endswith("node")
    assert compile_kwargs["stdin"] is subprocess.DEVNULL
    compile_env = compile_kwargs["env"]
    assert isinstance(compile_env, dict)
    assert compile_env["HOME"]
def test_evaluate_code_response_runs_sql_harness_with_placeholder() -> None:
    """Check that a SQL response passes a harness containing ``{{CODE}}``.

    The harness creates and populates two tables, materialises the response
    query through the ``{{CODE}}`` placeholder into a temp table, and selects
    from it. The result must be both available and passed.
    """
    query_response = (
        "```sql\nSELECT\n b.branch,\n e.language,\n"
        " AVG(CASE WHEN e.passed THEN 1.0 ELSE 0.0 END) AS execution_pass_rate,\n"
        " CASE WHEN AVG(CASE WHEN e.passed THEN 1.0 ELSE 0.0 END) < 0.8 THEN 1 ELSE 0 END AS is_regression\n"
        "FROM benchmark_results b\nJOIN execution_results e ON e.benchmark_run_id = b.id\n"
        "GROUP BY b.branch, e.language\n```"
    )
    harness_sql = (
        "CREATE TABLE benchmark_results (id INTEGER PRIMARY KEY, branch TEXT);\n"
        "CREATE TABLE execution_results (benchmark_run_id INTEGER, language TEXT, passed INTEGER);\n"
        "INSERT INTO benchmark_results (id, branch) VALUES (1, 'coder');\n"
        "INSERT INTO execution_results (benchmark_run_id, language, passed) VALUES "
        "(1, 'typescript', 1), (1, 'typescript', 0), (1, 'rust', 1);\n"
        "CREATE TEMP TABLE actual AS {{CODE}};\n"
        "SELECT branch, language, execution_pass_rate, is_regression FROM actual;"
    )
    spec = CodeExecutionSpec(language="sql", test_code=harness_sql)

    outcome = evaluate_code_response(query_response, spec)

    assert outcome.available is True
    assert outcome.passed is True
def test_evaluate_code_response_reports_sql_failures() -> None:
    """Check that a failing SQL response is reported with a SQL summary.

    The query selects from a table named ``missing_table`` and the spec
    supplies no test code. The result must be available, not passed, and
    have a summary containing ``"SQL"``.
    """
    broken_query = "```sql\nSELECT * FROM missing_table\n```"
    spec = CodeExecutionSpec(language="sql")

    outcome = evaluate_code_response(broken_query, spec)

    assert outcome.available is True
    assert outcome.passed is False
    assert "SQL" in outcome.summary