data-cleaning-openenv / OpenEnv /examples /coding_env_inference.py
kaustubhg73's picture
Upload folder using huggingface_hub
e5f64b3 verified
#!/usr/bin/env python3
"""Solve a coding task with a hosted LLM via Hugging Face Inference.
This script mirrors ``textarena_wordle_inference.py`` but targets the Coding
environment. It launches the CodingEnv Docker image locally and asks an
OpenAI-compatible model served through Hugging Face's router to iteratively
produce Python code until the task is solved.
Prerequisites
-------------
1. Build the Coding environment Docker image::
docker build \
-f envs/coding_env/server/Dockerfile \
-t coding-env:latest .
2. Set your Hugging Face token, or any other API key that is compatible with
   the OpenAI API. Only one of the two variables is needed; ``API_KEY`` takes
   precedence when both are set::

       export HF_TOKEN=your_token_here
       # or
       export API_KEY=your_api_key_here
3. Run the script::
python examples/coding_env_inference.py
The script keeps sending execution feedback to the model until the generated
code prints ``Result: 338350`` or the configured step limit is reached.
"""
from __future__ import annotations
import os
import re
from typing import List, Tuple
from openai import OpenAI
from coding_env import CodeAction, CodingEnv
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Endpoint and credentials for the OpenAI-compatible router. API_KEY wins over
# HF_TOKEN when both environment variables are set.
API_BASE_URL = "https://router.huggingface.co/v1"
API_KEY = os.environ.get("API_KEY") or os.environ.get("HF_TOKEN")

# Model identifier sent with every chat-completion request.
MODEL = "openai/gpt-oss-120b:novita"

# Upper bound on model/execute round trips, and whether to echo each one.
MAX_STEPS = 5
VERBOSE = True

# Task statement shown to the model.
CODING_TASK = " ".join(
    [
        "Write Python code that prints the sum of squares of the integers "
        "from 1 to 100 inclusive.",
        "The final line must be exactly `Result: <value>` with the correct "
        "number substituted.",
    ]
)

# The task counts as solved once this text appears in the program's stdout.
EXPECTED_SUBSTRING = "Result: 338350"

# System message that pins the reply format to a single fenced Python block.
SYSTEM_PROMPT = " ".join(
    [
        "You are an expert Python programmer.",
        "Respond with valid Python code that solves the user's task.",
        "Always wrap your final answer in a fenced code block starting with "
        "```python.",
        "Provide a complete script that can be executed as-is, with no "
        "commentary outside the code block.",
    ]
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
# Opening fence, then everything up to the closing fence or -- for a reply
# that was cut off mid-block -- the end of the text. Group 2 is empty in the
# cut-off case.
_FENCE_RE = re.compile(r"```(.*?)(```|\Z)", re.DOTALL)
# A bare info string on the opening-fence line: ``python``, ``py``,
# ``python3``, or nothing at all.
_FENCE_TAG_RE = re.compile(r"[A-Za-z0-9_+.#-]*")
# Inline form (code on the same line as the fence): an optional ``python`` tag
# followed by whitespace.
_INLINE_TAG_RE = re.compile(r"(?:python)?\s*", re.IGNORECASE)


def extract_python_code(text: str) -> str:
    """Extract the first fenced code block from the model output.

    Args:
        text: Raw assistant message, possibly containing Markdown fences.

    Returns:
        The stripped contents of the first fenced block. When the opening
        fence line holds only a language tag (any tag, not just ``python``),
        that line is dropped. A block whose closing fence is missing is still
        extracted up to the end of ``text``. When no usable fence is found the
        whole ``text`` is returned stripped.
    """
    fence = _FENCE_RE.search(text)
    if fence is None:
        return text.strip()

    body, terminator = fence.group(1), fence.group(2)
    if not terminator and not body.strip():
        # A lone fence with nothing after it is not a code block; fall back to
        # the raw text rather than returning an empty script.
        return text.strip()

    first_line, newline, remainder = body.partition("\n")
    if newline and _FENCE_TAG_RE.fullmatch(first_line.strip()):
        # The fence line carried only a language tag (or nothing): the code is
        # everything after that line.
        return remainder.strip()

    # Code starts on the fence line itself; strip an optional leading
    # ``python`` tag and surrounding whitespace.
    return body[_INLINE_TAG_RE.match(body).end():].strip()
def format_feedback(
    step: int,
    stdout: str,
    stderr: str,
    exit_code: int,
) -> str:
    """Render the outcome of one execution as a follow-up user message.

    Args:
        step: Number of the step whose result is being reported.
        stdout: Captured standard output of the executed code.
        stderr: Captured standard error of the executed code.
        exit_code: Exit status reported for the executed code.

    Returns:
        A multi-line message listing the exit code and both streams, ending
        with a request for an improved script. A stream that is empty or
        whitespace-only is shown as ``<empty>``.
    """

    def _shown(stream: str) -> str:
        # Blank output is replaced by an explicit placeholder.
        return stream if stream.strip() else "<empty>"

    template = (
        "Execution feedback for step {step}:\n"
        "exit_code={exit_code}\n"
        "stdout:\n{out}\n"
        "stderr:\n{err}\n"
        "If the task is not solved, return an improved Python script."
    )
    return template.format(
        step=step,
        exit_code=exit_code,
        out=_shown(stdout),
        err=_shown(stderr),
    )
def build_initial_prompt(task: str) -> str:
    """Build the opening user message that states the coding task.

    Args:
        task: Description of what the generated script must do.

    Returns:
        The prompt text: a short preamble, the task under a ``Task:`` heading,
        and an instruction to reply with one fenced Python block.
    """
    preamble = (
        "You must write Python code to satisfy the following task. "
        "When executed, your script should behave exactly as described."
    )
    task_section = f"Task:\n{task}"
    reply_rule = "Reply with the full script in a single ```python code block."
    # Sections are separated by a blank line.
    return "\n\n".join((preamble, task_section, reply_rule))
# ---------------------------------------------------------------------------
# Task-solving loop
# ---------------------------------------------------------------------------
def solve_coding_task(
    env: CodingEnv,
    client: OpenAI,
) -> Tuple[bool, List[str]]:
    """Iteratively ask the model for code until the task is solved.

    Each step requests a completion, extracts the code from the reply, runs it
    through ``env.step`` and, if the task is not yet solved, appends the
    execution feedback to the conversation for the next attempt.

    Args:
        env: Coding environment; it is reset once and then stepped with a
            ``CodeAction`` per attempt.
        client: OpenAI-compatible client used for chat completions.

    Returns:
        ``(solved, transcripts)``. ``solved`` is True as soon as one run exits
        with code 0 and its stdout contains ``EXPECTED_SUBSTRING``, and False
        if ``MAX_STEPS`` attempts pass without that. ``transcripts`` holds one
        text record per executed step.
    """
    history = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_initial_prompt(CODING_TASK)},
    ]
    # The system prompt and the task statement must survive trimming;
    # otherwise the model would no longer be told what it has to solve.
    pinned = len(history)
    max_history = 20

    env.reset()
    transcripts: List[str] = []

    for step in range(1, MAX_STEPS + 1):
        response = client.chat.completions.create(
            model=MODEL,
            messages=history,
            max_tokens=2048,
            temperature=0.2,
        )
        # ``content`` may be None; treat that as an empty reply instead of
        # crashing on ``None.strip()``.
        assistant_message = (response.choices[0].message.content or "").strip()
        history.append({"role": "assistant", "content": assistant_message})

        code = extract_python_code(assistant_message)
        if VERBOSE:
            print(f"\n🛠️ Step {step}: executing model-produced code")
            print(code)

        result = env.step(CodeAction(code=code))
        obs = result.observation
        transcripts.append(
            (
                f"Step {step} | exit_code={obs.exit_code}\nstdout:\n{obs.stdout}\nstderr:\n{obs.stderr}\n"
            )
        )
        if VERBOSE:
            print(" ▶ exit_code:", obs.exit_code)
            if obs.stdout:
                print(" ▶ stdout:\n" + obs.stdout)
            if obs.stderr:
                print(" ▶ stderr:\n" + obs.stderr)

        solved = obs.exit_code == 0 and EXPECTED_SUBSTRING in obs.stdout
        if solved:
            return True, transcripts

        history.append(
            {
                "role": "user",
                "content": format_feedback(
                    step,
                    obs.stdout,
                    obs.stderr,
                    obs.exit_code,
                ),
            }
        )
        # Keep conversation history compact to avoid exceeding context limits.
        # The pinned prefix is kept and only the oldest assistant/feedback
        # exchanges are dropped. Messages are added in pairs, so the retained
        # tail starts on an assistant turn.
        if len(history) > max_history:
            history = history[:pinned] + history[-(max_history - pinned):]

    return False, transcripts
# ---------------------------------------------------------------------------
# Entrypoint
# ---------------------------------------------------------------------------
def main() -> None:
    """Run one solving session against a locally launched CodingEnv container.

    Exits with an error message when no API key is configured. Otherwise it
    creates the client, starts the environment from the ``coding-env:latest``
    image, runs ``solve_coding_task``, closes the environment, and prints the
    outcome followed by the per-step transcripts.

    Raises:
        SystemExit: If neither ``API_KEY`` nor ``HF_TOKEN`` provided a key.
    """
    if not API_KEY:
        raise SystemExit(
            "HF_TOKEN (or API_KEY) must be set to query the model."
        )

    llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    coding_env = CodingEnv.from_docker_image(
        "coding-env:latest",
        ports={8000: 8000},
    )
    # Close the environment even when the solving loop raises.
    try:
        success, transcripts = solve_coding_task(coding_env, llm_client)
    finally:
        coding_env.close()

    if success:
        print("\n✅ Session complete")
    else:
        print("\n⚠️ Session finished without solving the task")

    print("--- Execution transcripts ---")
    for entry in transcripts:
        print(entry)


if __name__ == "__main__":
    main()