Spaces:

kaustubhg73
/

data-cleaning-openenv

Paused

File size: 6,463 Bytes

e5f64b3

#!/usr/bin/env python3
"""Solve a coding task with a hosted LLM via Hugging Face Inference.

This script mirrors ``textarena_wordle_inference.py`` but targets the Coding
environment. It launches the CodingEnv Docker image locally and asks an
OpenAI-compatible model served through Hugging Face's router to iteratively
produce Python code until the task is solved.

Prerequisites
-------------
1. Build the Coding environment Docker image::

       docker build \
           -f envs/coding_env/server/Dockerfile \
           -t coding-env:latest .

2. Set your Hugging Face token, or any other API key that is compatible with the OpenAI API:

       export HF_TOKEN=your_token_here
       export API_KEY=your_api_key_here

3. Run the script::

       python examples/coding_env_inference.py

The script keeps sending execution feedback to the model until it prints
``Result: 338350`` or reaches the configured step limit.
"""

from __future__ import annotations

import os
import re
from typing import List, Tuple

from openai import OpenAI

from coding_env import CodeAction, CodingEnv


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

API_BASE_URL = "https://router.huggingface.co/v1"
API_KEY = os.getenv("API_KEY") or os.getenv("HF_TOKEN")

MODEL = "openai/gpt-oss-120b:novita"
MAX_STEPS = 5
VERBOSE = True

CODING_TASK = (
    "Write Python code that prints the sum of squares of the integers from 1 "
    "to 100 inclusive. The final line must be exactly `Result: <value>` with "
    "the correct number substituted."
)
EXPECTED_SUBSTRING = "Result: 338350"

SYSTEM_PROMPT = (
    "You are an expert Python programmer. Respond with valid Python code that "
    "solves the user's task. Always wrap your final answer in a fenced code "
    "block starting with ```python. Provide a complete script that can be "
    "executed as-is, with no commentary outside the code block."
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def extract_python_code(text: str) -> str:
    """Extract the first Python code block from the model output."""

    code_blocks = re.findall(
        r"```(?:python)?\s*(.*?)```",
        text,
        re.IGNORECASE | re.DOTALL,
    )
    if code_blocks:
        return code_blocks[0].strip()
    return text.strip()


def format_feedback(
    step: int,
    stdout: str,
    stderr: str,
    exit_code: int,
) -> str:
    """Generate feedback text describing the previous execution."""

    stdout_display = stdout if stdout.strip() else "<empty>"
    stderr_display = stderr if stderr.strip() else "<empty>"
    return (
        f"Execution feedback for step {step}:\n"
        f"exit_code={exit_code}\n"
        f"stdout:\n{stdout_display}\n"
        f"stderr:\n{stderr_display}\n"
        "If the task is not solved, return an improved Python script."
    )


def build_initial_prompt(task: str) -> str:
    """Construct the first user prompt for the coding task."""

    return (
        "You must write Python code to satisfy the following task. "
        "When executed, your script should behave exactly as described.\n\n"
        f"Task:\n{task}\n\n"
        "Reply with the full script in a single ```python code block."
    )


# ---------------------------------------------------------------------------
# Gameplay
# ---------------------------------------------------------------------------


def solve_coding_task(
    env: CodingEnv,
    client: OpenAI,
) -> Tuple[bool, List[str]]:
    """Iteratively ask the model for code until the task is solved."""

    history = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_initial_prompt(CODING_TASK)},
    ]

    obs = env.reset().observation

    transcripts: List[str] = []

    for step in range(1, MAX_STEPS + 1):
        response = client.chat.completions.create(
            model=MODEL,
            messages=history,
            max_tokens=2048,
            temperature=0.2,
        )

        assistant_message = response.choices[0].message.content.strip()
        history.append({"role": "assistant", "content": assistant_message})

        code = extract_python_code(assistant_message)

        if VERBOSE:
            print(f"\n🛠️  Step {step}: executing model-produced code")
            print(code)

        result = env.step(CodeAction(code=code))
        obs = result.observation

        transcripts.append(
            (
                f"Step {step} | exit_code={obs.exit_code}\nstdout:\n{obs.stdout}\nstderr:\n{obs.stderr}\n"
            )
        )

        if VERBOSE:
            print("   ▶ exit_code:", obs.exit_code)
            if obs.stdout:
                print("   ▶ stdout:\n" + obs.stdout)
            if obs.stderr:
                print("   ▶ stderr:\n" + obs.stderr)

        solved = obs.exit_code == 0 and EXPECTED_SUBSTRING in obs.stdout
        if solved:
            return True, transcripts

        history.append(
            {
                "role": "user",
                "content": format_feedback(
                    step,
                    obs.stdout,
                    obs.stderr,
                    obs.exit_code,
                ),
            }
        )

        # Keep conversation history compact to avoid exceeding context limits
        if len(history) > 20:
            history = [history[0]] + history[-19:]

    return False, transcripts


# ---------------------------------------------------------------------------
# Entrypoint
# ---------------------------------------------------------------------------


def main() -> None:
    if not API_KEY:
        raise SystemExit(
            "HF_TOKEN (or API_KEY) must be set to query the model."
        )

    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    env = CodingEnv.from_docker_image(
        "coding-env:latest",
        ports={8000: 8000},
    )

    try:
        success, transcripts = solve_coding_task(env, client)
    finally:
        env.close()

    print(
        "\n✅ Session complete"
        if success
        else "\n⚠️ Session finished without solving the task"
    )
    print("--- Execution transcripts ---")
    for entry in transcripts:
        print(entry)


if __name__ == "__main__":
    main()