Spaces:

Ishangtxl
/

mainframe-modernization-openenv

Running

App Files Files Community

mainframe-modernization-openenv / tests /test_api_contract.py

Ishangtxl

Sync from GitHub c4e4dad

a537615 verified 27 days ago

raw

history blame contribute delete

8.14 kB

	import asyncio

	from fastapi.testclient import TestClient
	from openenv.core.env_server.mcp_types import CallToolAction

	from legacy_cobol_env.models import FinalSubmissionResult, RewardComponents
	from legacy_cobol_env.server.app import app
	from legacy_cobol_env.server.legacy_cobol_env_environment import (
	MAX_STEPS,
	LegacyCobolEnvironment,
	)
	from legacy_cobol_env.server.sandbox import CaseResult, EvaluationResult
	from legacy_cobol_env.tests.test_environment import GOOD_SOLUTION


	def call_obs(env: LegacyCobolEnvironment, tool_name: str, **arguments):
	return env.step(CallToolAction(tool_name=tool_name, arguments=arguments))


	async def call_obs_async(env: LegacyCobolEnvironment, tool_name: str, **arguments):
	return await env.step_async(CallToolAction(tool_name=tool_name, arguments=arguments))


	def result_data(observation):
	if hasattr(observation.result, "data"):
	return observation.result.data
	return observation.result


	def test_thirteenth_action_exceeds_max_steps_without_executing_tool():
	env = LegacyCobolEnvironment()
	env.reset(task_id="payroll_net_pay_001")

	for _ in range(MAX_STEPS):
	obs = call_obs(env, "read_cobol_file", filename="PAYROLL.cbl")
	assert obs.done is False

	blocked = call_obs(env, "write_python_solution", code=GOOD_SOLUTION)
	blocked_result = result_data(blocked)

	assert blocked.done is True
	assert blocked.reward == 0.0
	assert blocked_result["ok"] is False
	assert "max_steps" in blocked_result["error"]
	assert env.state.done is True
	assert env.state.step_count == MAX_STEPS
	assert env.state.draft_count == 0
	assert env.state.last_tool != "write_python_solution"


	def test_post_done_steps_are_terminal_noops_without_state_mutation():
	env = LegacyCobolEnvironment()
	env.reset(task_id="payroll_net_pay_001")

	written = result_data(call_obs(env, "write_python_solution", code=GOOD_SOLUTION))
	final_obs = call_obs(env, "submit_final", draft_id=written["draft_id"])
	assert final_obs.done is True

	state_before = env.state.model_dump()
	blocked = call_obs(env, "write_python_solution", code="def migrate(input_record):\n return ''\n")
	blocked_result = result_data(blocked)

	assert blocked.done is True
	assert blocked.reward == 0.0
	assert blocked_result["ok"] is False
	assert "terminal" in blocked_result["error"]
	assert env.state.model_dump() == state_before


	def test_final_reward_response_is_typed_and_clamped(monkeypatch):
	env = LegacyCobolEnvironment()
	env.reset(task_id="payroll_net_pay_001")
	written = result_data(call_obs(env, "write_python_solution", code=GOOD_SOLUTION))

	def out_of_bounds_components(hidden, fresh):
	return {
	"hidden_correctness": -0.25,
	"fresh_correctness": 1.25,
	"interface_contract": 2.0,
	"type_and_layout_fidelity": -1.0,
	"anti_hardcoding": 0.5,
	"safety": 9.0,
	}

	monkeypatch.setattr(env, "_reward_components", out_of_bounds_components)

	final = result_data(call_obs(env, "submit_final", draft_id=written["draft_id"]))

	typed_final = FinalSubmissionResult.model_validate(final)
	typed_components = RewardComponents.model_validate(final["components"])
	component_values = typed_components.model_dump().values()

	assert isinstance(typed_final.public_score, float)
	assert all(isinstance(value, float) for value in component_values)
	assert all(0.0 <= value <= 1.0 for value in component_values)
	assert 0.0 <= typed_final.public_score <= 1.0


	def test_fresh_timeout_counts_against_safety_component():
	env = LegacyCobolEnvironment()
	env.reset(task_id="payroll_net_pay_001")
	case = CaseResult(case_id="case", passed=True, input_summary="case", actual="x", expected="x")
	hidden = EvaluationResult(
	syntax_ok=True,
	safety_ok=True,
	interface_ok=True,
	timed_out=False,
	passed=1,
	total=1,
	case_results=[case],
	)
	fresh = EvaluationResult(
	syntax_ok=True,
	safety_ok=True,
	interface_ok=True,
	timed_out=True,
	passed=1,
	total=1,
	case_results=[case],
	)

	components = RewardComponents.model_validate(env._reward_components(hidden, fresh))

	assert components.safety == 0.0


	def test_async_max_step_guard_matches_sync_behavior():
	async def scenario():
	env = LegacyCobolEnvironment()
	env.reset(task_id="payroll_net_pay_001")

	for _ in range(MAX_STEPS):
	obs = await call_obs_async(env, "read_cobol_file", filename="PAYROLL.cbl")
	assert obs.done is False

	blocked = await call_obs_async(env, "write_python_solution", code=GOOD_SOLUTION)
	return env, blocked, result_data(blocked)

	env, blocked, blocked_result = asyncio.run(scenario())

	assert blocked.done is True
	assert blocked.reward == 0.0
	assert blocked_result["ok"] is False
	assert "max_steps" in blocked_result["error"]
	assert env.state.step_count == MAX_STEPS
	assert env.state.draft_count == 0


	def test_schema_exposes_project_typed_state_fields():
	schema = TestClient(app).get("/schema").json()

	state_properties = schema["state"]["properties"]
	action_properties = schema["action"]["properties"]
	observation_properties = schema["observation"]["properties"]

	assert schema["action"]["title"] == "ToolActionWrapper"
	assert schema["observation"]["title"] == "ToolObservationWrapper"
	assert "tool_name" in action_properties
	assert "result" in observation_properties
	for field_name in [
	"task_id",
	"draft_count",
	"visible_runs",
	"final_score",
	"reward_components",
	]:
	assert field_name in state_properties


	def test_rest_reset_step_and_state_share_one_episode():
	client = TestClient(app)

	reset = client.post("/reset", json={"task_id": "invoice_occurs_001"})
	assert reset.status_code == 200
	assert reset.json()["observation"]["result"]["ticket"]["task_id"] == "invoice_occurs_001"

	step = client.post(
	"/step",
	json={
	"action": {
	"tool_name": "read_cobol_file",
	"arguments": {"filename": "INVTOTAL.cbl"},
	}
	},
	)
	assert step.status_code == 200
	step_body = step.json()
	assert step_body["observation"]["result"]["data"]["ok"] is True
	assert step_body["reward"] == 0.02

	state = client.get("/state")
	assert state.status_code == 200
	state_body = state.json()
	assert state_body["task_id"] == "invoice_occurs_001"
	assert state_body["files_read"] == ["INVTOTAL.cbl"]
	assert state_body["last_tool"] == "read_cobol_file"
	assert state_body["step_count"] == 1


	def test_redundant_discovery_actions_do_not_repeat_progress_reward():
	env = LegacyCobolEnvironment()
	env.reset(task_id="payroll_net_pay_001")

	first_read = call_obs(env, "read_cobol_file", filename="PAYROLL.cbl")
	duplicate_read = call_obs(env, "read_cobol_file", filename="PAYROLL.cbl")
	first_parse = call_obs(env, "parse_copybook_layout", filename="EMPLOYEE_PAY.cpy")
	duplicate_parse = call_obs(env, "parse_copybook_layout", filename="EMPLOYEE_PAY.cpy")
	first_rules = call_obs(env, "inspect_business_rules")
	duplicate_rules = call_obs(env, "inspect_business_rules")

	assert first_read.reward == 0.02
	assert duplicate_read.reward == 0.0
	assert first_parse.reward == 0.03
	assert duplicate_parse.reward == 0.0
	assert first_rules.reward == 0.01
	assert duplicate_rules.reward == 0.0


	def test_visible_test_reward_only_pays_for_new_progress():
	env = LegacyCobolEnvironment()
	env.reset(task_id="payroll_net_pay_001")

	written = result_data(call_obs(env, "write_python_solution", code=GOOD_SOLUTION))
	first_visible = call_obs(env, "run_visible_tests", draft_id=written["draft_id"])
	repeated_visible = call_obs(env, "run_visible_tests", draft_id=written["draft_id"])

	assert first_visible.reward == 0.1
	assert repeated_visible.reward == 0.0