Spaces:

hanabhi
/

gridworld-env

Sleeping

gridworld-env / OpenEnv /examples /coding_env_inference.py

Abhilasha Kakoty

Initial deploy

7078f4d about 2 months ago

6.46 kB

	#!/usr/bin/env python3
	"""Solve a coding task with a hosted LLM via Hugging Face Inference.

	This script mirrors ``textarena_wordle_inference.py`` but targets the Coding
	environment. It launches the CodingEnv Docker image locally and asks an
	OpenAI-compatible model served through Hugging Face's router to iteratively
	produce Python code until the task is solved.

	Prerequisites
	-------------
	1. Build the Coding environment Docker image::

	docker build \
	-f envs/coding_env/server/Dockerfile \
	-t coding-env:latest .

	2. Set your Hugging Face token, or any other API key that is compatible with the OpenAI API::

	export HF_TOKEN=your_token_here
	export API_KEY=your_api_key_here

	3. Run the script::

	python examples/coding_env_inference.py

	The script keeps sending execution feedback to the model until it prints
	``Result: 338350`` or reaches the configured step limit.
	"""

	from __future__ import annotations

	import os
	import re
	from typing import List, Tuple

	from openai import OpenAI

	from coding_env import CodeAction, CodingEnv


	# ---------------------------------------------------------------------------
	# Configuration
	# ---------------------------------------------------------------------------

	# OpenAI-compatible endpoint exposed by the Hugging Face router.
	API_BASE_URL = "https://router.huggingface.co/v1"
	# API_KEY wins when both variables are set; HF_TOKEN is the fallback.
	API_KEY = os.getenv("API_KEY") or os.getenv("HF_TOKEN")

	MODEL = "openai/gpt-oss-120b:novita"
	# Upper bound on model/execute round trips before giving up.
	MAX_STEPS = 5
	# When true, each step's code and execution output are echoed to stdout.
	VERBOSE = True

	# Task statement sent to the model in the first user message.
	CODING_TASK = (
	    "Write Python code that prints the sum of squares of the integers "
	    "from 1 to 100 inclusive. "
	    "The final line must be exactly `Result: <value>` "
	    "with the correct number substituted."
	)
	# The task counts as solved once this text shows up in the program's stdout.
	EXPECTED_SUBSTRING = "Result: 338350"

	# System message that pins the reply format to a single fenced Python block.
	SYSTEM_PROMPT = (
	    "You are an expert Python programmer. "
	    "Respond with valid Python code that solves the user's task. "
	    "Always wrap your final answer in a fenced code block starting with "
	    "```python. "
	    "Provide a complete script that can be executed as-is, "
	    "with no commentary outside the code block."
	)


	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------


	def extract_python_code(text: str) -> str:
	    """Extract the first fenced code block from the model output.

	    A fence may be opened with ```python (matched case-insensitively) or
	    with a bare ```. Only the first complete fenced block is used.

	    Args:
	        text: Raw assistant message.

	    Returns:
	        The contents of the first fenced block with surrounding whitespace
	        stripped, or the stripped ``text`` itself when it contains no
	        complete fenced block.
	    """

	    # The non-greedy group stops at the first closing fence; DOTALL lets the
	    # captured code span multiple lines.
	    match = re.search(
	        r"```(?:python)?\s*(.*?)```",
	        text,
	        re.IGNORECASE | re.DOTALL,
	    )
	    if match:
	        return match.group(1).strip()
	    return text.strip()


	def format_feedback(
	    step: int,
	    stdout: str,
	    stderr: str,
	    exit_code: int,
	) -> str:
	    """Generate feedback text describing the previous execution.

	    Args:
	        step: 1-based index of the step that was just executed.
	        stdout: Captured standard output of the executed code.
	        stderr: Captured standard error of the executed code.
	        exit_code: Exit code reported for the executed code.

	    Returns:
	        A multi-line message reporting the exit code and both streams,
	        ending with an instruction to return an improved script. A stream
	        that is ``None``, empty, or whitespace-only is shown as ``<empty>``.
	    """

	    # Guard against None before calling .strip() so a missing stream is
	    # reported as empty instead of raising AttributeError.
	    stdout_display = stdout if stdout and stdout.strip() else "<empty>"
	    stderr_display = stderr if stderr and stderr.strip() else "<empty>"
	    return (
	        f"Execution feedback for step {step}:\n"
	        f"exit_code={exit_code}\n"
	        f"stdout:\n{stdout_display}\n"
	        f"stderr:\n{stderr_display}\n"
	        "If the task is not solved, return an improved Python script."
	    )


	def build_initial_prompt(task: str) -> str:
	    """Construct the first user prompt for the coding task.

	    Args:
	        task: Natural-language description of what the script must do.

	    Returns:
	        The prompt text: a short instruction, the task under a ``Task:``
	        heading, and a request to reply with one fenced Python block.
	    """

	    return (
	        "You must write Python code to satisfy the following task. "
	        "When executed, your script should behave exactly as described.\n\n"
	        f"Task:\n{task}\n\n"
	        "Reply with the full script in a single ```python code block."
	    )


	# ---------------------------------------------------------------------------
	# Solve loop
	# ---------------------------------------------------------------------------


	def solve_coding_task(
	    env: CodingEnv,
	    client: OpenAI,
	) -> Tuple[bool, List[str]]:
	    """Iteratively ask the model for code until the task is solved.

	    Each step requests a completion, extracts the code from the reply,
	    executes it through ``env.step`` and, if the task is not yet solved,
	    appends the execution feedback to the conversation for the next step.

	    Args:
	        env: Coding environment used to execute the generated code.
	        client: OpenAI-compatible client used to query ``MODEL``.

	    Returns:
	        A ``(solved, transcripts)`` pair. ``solved`` is true when a step
	        exits with code 0 and its stdout contains ``EXPECTED_SUBSTRING``;
	        ``transcripts`` holds one summary string per executed step.
	    """

	    history = [
	        {"role": "system", "content": SYSTEM_PROMPT},
	        {"role": "user", "content": build_initial_prompt(CODING_TASK)},
	    ]

	    # Reset the environment before the first step; the value returned by
	    # reset() is not used by this loop.
	    env.reset()

	    transcripts: List[str] = []

	    for step in range(1, MAX_STEPS + 1):
	        response = client.chat.completions.create(
	            model=MODEL,
	            messages=history,
	            max_tokens=2048,
	            temperature=0.2,
	        )

	        # The message content may be None; treat that as an empty reply
	        # instead of crashing on .strip().
	        assistant_message = (response.choices[0].message.content or "").strip()
	        history.append({"role": "assistant", "content": assistant_message})

	        code = extract_python_code(assistant_message)

	        if VERBOSE:
	            print(f"\n🛠️ Step {step}: executing model-produced code")
	            print(code)

	        result = env.step(CodeAction(code=code))
	        obs = result.observation

	        transcripts.append(
	            f"Step {step} | exit_code={obs.exit_code}\n"
	            f"stdout:\n{obs.stdout}\n"
	            f"stderr:\n{obs.stderr}\n"
	        )

	        if VERBOSE:
	            print(" ▶ exit_code:", obs.exit_code)
	            if obs.stdout:
	                print(" ▶ stdout:\n" + obs.stdout)
	            if obs.stderr:
	                print(" ▶ stderr:\n" + obs.stderr)

	        # `or ""` keeps the membership test valid if stdout is None.
	        solved = obs.exit_code == 0 and EXPECTED_SUBSTRING in (obs.stdout or "")
	        if solved:
	            return True, transcripts

	        history.append(
	            {
	                "role": "user",
	                "content": format_feedback(
	                    step,
	                    obs.stdout,
	                    obs.stderr,
	                    obs.exit_code,
	                ),
	            }
	        )

	        # Keep conversation history compact to avoid exceeding context
	        # limits. Always retain the system prompt AND the initial task
	        # prompt, then the 18 most recent messages; an even-sized tail
	        # starts on an assistant turn, so roles keep alternating.
	        if len(history) > 20:
	            history = history[:2] + history[-18:]

	    return False, transcripts


	# ---------------------------------------------------------------------------
	# Entrypoint
	# ---------------------------------------------------------------------------


	def main() -> None:
	    """Run one solve session against a locally launched coding environment.

	    Creates the model client, starts the environment from the
	    ``coding-env:latest`` Docker image, runs ``solve_coding_task`` and
	    prints the outcome followed by the per-step transcripts.

	    Raises:
	        SystemExit: If neither ``API_KEY`` nor ``HF_TOKEN`` is set.
	    """
	    if not API_KEY:
	        raise SystemExit(
	            "HF_TOKEN (or API_KEY) must be set to query the model."
	        )

	    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

	    env = CodingEnv.from_docker_image(
	        "coding-env:latest",
	        ports={8000: 8000},
	    )

	    # Always close the environment, even if the solve loop raises.
	    try:
	        success, transcripts = solve_coding_task(env, client)
	    finally:
	        env.close()

	    print(
	        "\n✅ Session complete"
	        if success
	        else "\n⚠️ Session finished without solving the task"
	    )
	    print("--- Execution transcripts ---")
	    for entry in transcripts:
	        print(entry)


	# Run the example only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()