maris-ai-master / core-python /tests /test_execution_eval.py

Maris AI model sync

f440f03 verified about 1 month ago

4.52 kB

	"""Tests for execution-based coder eval helpers."""

	import subprocess

	from maris_core.code.execution_eval import CodeExecutionSpec, evaluate_code_response


	def test_evaluate_code_response_runs_python_assertions() -> None:
	    """Check that a correct Python response is reported as available and passed.

	    The response is a fenced ``python`` block defining ``normalize_email``.
	    The spec's ``test_code`` asserts the normalized value and that
	    whitespace-only input raises ``ValueError``.
	    """
	    # The embedded snippet has to be valid Python on its own: the ``raise``
	    # must be nested one level deeper than the ``if`` that guards it,
	    # otherwise the snippet fails to compile before any assertion runs.
	    response = (
	        "```python\n"
	        "def normalize_email(email: str) -> str:\n"
	        "    normalized = email.strip().lower()\n"
	        "    if not normalized:\n"
	        "        raise ValueError('required')\n"
	        "    return normalized\n"
	        "```"
	    )
	    result = evaluate_code_response(
	        response,
	        CodeExecutionSpec(
	            language="python",
	            test_code=(
	                "assert normalize_email(' A@Example.COM ') == 'a@example.com'\n"
	                "try:\n"
	                " normalize_email(' ')\n"
	                "except ValueError:\n"
	                " pass\n"
	                "else:\n"
	                " raise AssertionError('expected ValueError')"
	            ),
	        ),
	    )

	    assert result.available is True
	    assert result.passed is True


	def test_evaluate_code_response_reports_python_failures() -> None:
	    """Check that a response failing its assertion is reported as not passed.

	    The submitted ``normalize_email`` returns its input unchanged, so the
	    assertion expecting a stripped, lower-cased address cannot hold. The
	    result must still be available and must carry a non-empty summary.
	    """
	    response = "```python\ndef normalize_email(email: str) -> str:\n return email\n```"
	    spec = CodeExecutionSpec(
	        language="python",
	        test_code="assert normalize_email(' A@Example.COM ') == 'a@example.com'",
	    )

	    outcome = evaluate_code_response(response, spec)

	    assert outcome.available is True
	    assert outcome.passed is False
	    assert outcome.summary


	def test_evaluate_code_response_runs_typescript_assertions_with_isolated_env(monkeypatch) -> None:
	    """Check that TypeScript evaluation runs ``tsc`` then ``node`` with a controlled environment.

	    ``shutil.which`` and ``subprocess.run`` are patched through
	    ``maris_core.code.execution_eval`` so no real toolchain is required.
	    Every invocation is recorded and reported as exit code 0 with empty output.
	    """
	    # Each entry is (argv, keyword arguments) of one patched subprocess.run call.
	    commands: list[tuple[list[str], dict[str, object]]] = []

	    def fake_which(name: str) -> str | None:
	        # Pretend only the TypeScript compiler and Node are on PATH.
	        if name in {"tsc", "node"}:
	            return f"/usr/bin/{name}"
	        return None

	    def fake_run(command: list[str], **kwargs):
	        # Record the call instead of spawning a process, then report success.
	        commands.append((command, kwargs))
	        return subprocess.CompletedProcess(command, 0, "", "")

	    monkeypatch.setattr("maris_core.code.execution_eval.shutil.which", fake_which)
	    monkeypatch.setattr("maris_core.code.execution_eval.subprocess.run", fake_run)

	    result = evaluate_code_response(
	        "```typescript\nexport function nextDelay(attempt: number, baseMs = 250): number {\n"
	        " if (attempt <= 0) return 0;\n"
	        " return baseMs * 2 ** (attempt - 1);\n}\n```",
	        CodeExecutionSpec(
	            language="typescript",
	            test_code=(
	                "function assert(condition: boolean, message: string): void {"
	                " if (!condition) throw new Error(message); }\n"
	                "assert(nextDelay(0) === 0, 'attempt 0');"
	            ),
	        ),
	    )

	    assert result.available is True
	    assert result.passed is True
	    # Exactly two subprocesses: the compiler first, then the runtime.
	    assert len(commands) == 2
	    assert commands[0][0][0].endswith("tsc")
	    assert commands[1][0][0].endswith("node")
	    # The compiler call must not inherit stdin and must receive an explicit
	    # environment mapping with a non-empty HOME.
	    assert commands[0][1]["stdin"] is subprocess.DEVNULL
	    assert isinstance(commands[0][1]["env"], dict)
	    assert commands[0][1]["env"]["HOME"]


	def test_evaluate_code_response_runs_sql_harness_with_placeholder() -> None:
	    """Check that a SQL response run through a harness containing ``{{CODE}}`` passes.

	    The harness creates and populates two tables, builds a temp table from
	    the ``{{CODE}}`` placeholder, and selects from it.
	    NOTE(review): the placeholder is presumably replaced with the response's
	    query by the evaluator -- confirm against ``execution_eval``.
	    """
	    response = (
	        "```sql\nSELECT\n b.branch,\n e.language,\n"
	        " AVG(CASE WHEN e.passed THEN 1.0 ELSE 0.0 END) AS execution_pass_rate,\n"
	        " CASE WHEN AVG(CASE WHEN e.passed THEN 1.0 ELSE 0.0 END) < 0.8 THEN 1 ELSE 0 END AS is_regression\n"
	        "FROM benchmark_results b\nJOIN execution_results e ON e.benchmark_run_id = b.id\n"
	        "GROUP BY b.branch, e.language\n```"
	    )
	    harness = (
	        "CREATE TABLE benchmark_results (id INTEGER PRIMARY KEY, branch TEXT);\n"
	        "CREATE TABLE execution_results (benchmark_run_id INTEGER, language TEXT, passed INTEGER);\n"
	        "INSERT INTO benchmark_results (id, branch) VALUES (1, 'coder');\n"
	        "INSERT INTO execution_results (benchmark_run_id, language, passed) VALUES "
	        "(1, 'typescript', 1), (1, 'typescript', 0), (1, 'rust', 1);\n"
	        "CREATE TEMP TABLE actual AS {{CODE}};\n"
	        "SELECT branch, language, execution_pass_rate, is_regression FROM actual;"
	    )
	    spec = CodeExecutionSpec(language="sql", test_code=harness)

	    outcome = evaluate_code_response(response, spec)

	    assert outcome.available is True
	    assert outcome.passed is True


	def test_evaluate_code_response_reports_sql_failures() -> None:
	    """Check that a failing SQL response is reported as not passed with "SQL" in its summary.

	    The query selects from ``missing_table`` and the spec supplies no
	    ``test_code``, so nothing in the test creates that table.
	    """
	    response = "```sql\nSELECT * FROM missing_table\n```"
	    spec = CodeExecutionSpec(language="sql")

	    outcome = evaluate_code_response(response, spec)

	    assert outcome.available is True
	    assert outcome.passed is False
	    assert "SQL" in outcome.summary