Spaces:
Sleeping
Sleeping
"""
Inference Script — Dalaal Browser-Use Environment
===================================
MANDATORY
- Before submitting, ensure the following variables are defined in your environment configuration:
    API_BASE_URL      The API endpoint for the LLM.
    MODEL_NAME        The model identifier to use for inference.
    HF_TOKEN          Your Hugging Face / API key.
    LOCAL_IMAGE_NAME  The name of the local image to use for the environment, if you are using from_docker_image().
- The inference script must be named `inference.py` and placed in the root directory of the project.
- Participants must use the OpenAI client for all LLM calls, configured with the variables above.
STDOUT FORMAT
- The script must emit exactly three line types to stdout, in this order:
    [START] task=<task_name> env=<benchmark> model=<model_name>
    [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
    [END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
"""
| import asyncio | |
| import json | |
| import os | |
| import textwrap | |
| from typing import List, Optional | |
| from openai import OpenAI | |
| from dalaal_env import DalaalEnvAction, DalaalEnvEnv | |
# --- Settings read from the process environment at import time --------------
# Docker image for the environment; when unset, main() connects to an
# already-running server instead of starting a container.
IMAGE_NAME = os.environ.get("LOCAL_IMAGE_NAME")
# Credential for the LLM endpoint: HF_TOKEN takes precedence, API_KEY is the
# fallback when HF_TOKEN is unset or empty.
API_KEY = os.environ.get("HF_TOKEN") or os.environ.get("API_KEY")
API_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.environ.get("MODEL_NAME", "Qwen/Qwen3.5-27B")
TASK_NAME = os.environ.get("DALAAL_TASK", "todo_add")

# --- Fixed run parameters ---------------------------------------------------
BENCHMARK = "dalaal_env"  # value reported as env=... on the [START] line
MAX_STEPS = 15  # upper bound on agent steps per episode
TEMPERATURE = 0.0  # sampling temperature passed to the chat completion call
MAX_TOKENS = 300  # completion token cap passed to the chat completion call
# System prompt used as the first message of the conversation built in main().
# It tells the model how to read the accessibility tree, which JSON action
# shapes it may emit, and that the reply must be a bare JSON object (the reply
# is fed to parse_action()).
# NOTE(review): this listing shows no blank lines or indentation inside the
# literal; if the upstream prompt had any, restore them from the original file.
SYSTEM_PROMPT = textwrap.dedent("""\
You are a browser automation agent. You interact with web pages by reading
an accessibility tree and issuing actions.
ACCESSIBILITY TREE FORMAT:
Each element has an [ID] followed by its role and properties:
[1] heading "Page Title"
[2] textbox "Search" value=""
[3] button "Submit"
[4] checkbox "Accept terms" checked=false
AVAILABLE ACTIONS (respond with exactly one JSON object):
- Click an element: {"action_type": "click", "element_id": <id>}
- Type into an element: {"action_type": "type", "element_id": <id>, "text": "<text>"}
- Select a dropdown option: {"action_type": "select_option", "element_id": <id>, "text": "<option label>"}
- Press a key: {"action_type": "press_key", "key": "<key name>"}
- Scroll: {"action_type": "scroll", "direction": "up" or "down"}
- Go back: {"action_type": "go_back"}
- Signal task complete: {"action_type": "done"}
STRATEGY:
- Think step by step about what action to take next to accomplish the task.
- Each action changes the page. After typing text into an input, you typically
need to click a button to submit it.
- Do NOT repeat the same action if the page hasn't changed.
- When the task appears complete (e.g., you can see the expected result in the
accessibility tree), use {"action_type": "done"}.
RULES:
- Respond with ONLY a JSON object. No explanation, no markdown, no extra text.
- Use element IDs from the current accessibility tree.
- If you see an error, try a different approach.
""")
def log_start(task: str, env: str, model: str) -> None:
    """Write the ``[START]`` record that opens a run to stdout.

    Args:
        task: Task name reported as ``task=``.
        env: Benchmark name reported as ``env=``.
        model: Model identifier reported as ``model=``.
    """
    record = f"[START] task={task} env={env} model={model}"
    print(record, flush=True)
def _single_line(value: str) -> str:
    """Collapse line breaks in *value* so it cannot split a log record."""
    fragments = (fragment.strip() for fragment in str(value).splitlines())
    return " ".join(fragment for fragment in fragments if fragment)


def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Write one ``[STEP]`` record to stdout, always on a single line.

    Args:
        step: 1-based step number.
        action: Human-readable action string.
        reward: Reward for this step, printed with two decimals.
        done: Whether the episode finished, printed as ``true``/``false``.
        error: Error message for the step; ``None`` or empty prints ``null``.

    The stdout contract is line-oriented, so line breaks inside ``action`` or
    ``error`` (e.g. a multi-line environment error, or typed text containing a
    newline) are replaced by single spaces. Values without line breaks or
    surrounding whitespace are printed unchanged.
    """
    action_val = _single_line(action)
    # An error consisting only of whitespace/newlines flattens to "", which is
    # reported as "null" just like a missing error.
    error_val = (_single_line(error) if error else "") or "null"
    done_val = str(done).lower()
    print(
        f"[STEP] step={step} action={action_val} reward={reward:.2f} done={done_val} error={error_val}",
        flush=True,
    )
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Write the closing ``[END]`` record to stdout.

    Args:
        success: Overall outcome, printed as ``true``/``false``.
        steps: Number of steps taken.
        score: Final score, printed with three decimals.
        rewards: Per-step rewards, printed comma-separated with two decimals
            each (an empty list prints an empty ``rewards=`` field).
    """
    success_val = str(success).lower()
    rewards_val = ",".join(map("{:.2f}".format, rewards))
    record = f"[END] success={success_val} steps={steps} score={score:.3f} rewards={rewards_val}"
    print(record, flush=True)
def _last_json_object(text: str) -> dict:
    """Return the last top-level JSON object embedded in *text*.

    Raises:
        ValueError: If *text* contains no decodable JSON object.
    """
    decoder = json.JSONDecoder()
    found = None
    pos = text.find("{")
    while pos != -1:
        try:
            value, consumed = decoder.raw_decode(text[pos:])
        except json.JSONDecodeError:
            # Not a valid object at this brace; try the next one.
            pos = text.find("{", pos + 1)
            continue
        found = value
        # Resume after the decoded object so nested objects are not mistaken
        # for separate top-level candidates.
        pos = text.find("{", pos + consumed)
    if found is None:
        raise ValueError("no JSON object found in LLM response")
    return found


def parse_action(text: str) -> DalaalEnvAction:
    """Parse an LLM response into a DalaalEnvAction.

    Accepts a bare JSON object, optionally wrapped in markdown code fences.
    If the reply as a whole is not valid JSON (prose around the object, a
    reasoning preamble, ...), the last top-level JSON object found in the
    text is used, since a model that "thinks out loud" puts its final answer
    last.

    Args:
        text: Raw text returned by the model.

    Returns:
        The action built from the JSON object's keys.

    Raises:
        ValueError: If no JSON object can be extracted, or the decoded JSON
            is not an object (``json.JSONDecodeError`` is a ``ValueError``).
    """
    cleaned = text.strip()
    # Strip markdown code fences if present (fence lines may be indented).
    if cleaned.startswith("```"):
        kept = [line for line in cleaned.split("\n") if not line.lstrip().startswith("```")]
        cleaned = "\n".join(kept).strip()
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        data = _last_json_object(cleaned)
    if not isinstance(data, dict):
        raise ValueError(f"expected a JSON object, got {type(data).__name__}")
    return DalaalEnvAction(**data)
def build_user_prompt(
    task: str,
    tree: str,
    url: str,
    step: int,
    max_steps: int,
    last_error: Optional[str],
) -> str:
    """Assemble the per-step user message for the model.

    Sections are separated by a blank line, in this order: task, step
    counter, URL, the previous action's error (only when non-empty), the
    accessibility tree, and the closing instruction.
    """
    header = [
        f"TASK: {task}",
        f"STEP: {step}/{max_steps}",
        f"URL: {url}",
    ]
    error_section = [f"LAST ACTION ERROR: {last_error}"] if last_error else []
    footer = [
        f"ACCESSIBILITY TREE:\n{tree}",
        "Respond with your next action as a JSON object.",
    ]
    return "\n\n".join([*header, *error_section, *footer])
def get_action_from_llm(
    client: OpenAI,
    messages: list,
    task: str,
    tree: str,
    url: str,
    step: int,
    max_steps: int,
    last_error: Optional[str],
) -> tuple[DalaalEnvAction, str]:
    """Ask the model for the next action and parse its reply.

    The new user prompt is appended to ``messages`` in place; on a successful
    completion the assistant reply is appended as well, so the list carries
    the conversation across steps.

    Returns:
        ``(action, raw_text)``. If the request or the parsing fails, the
        error is printed as a ``[DEBUG]`` line and ``(done action, "")`` is
        returned instead of raising.
    """
    messages.append(
        {
            "role": "user",
            "content": build_user_prompt(task, tree, url, step, max_steps, last_error),
        }
    )
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        reply = response.choices[0].message.content
        reply = reply.strip() if reply else ""
        messages.append({"role": "assistant", "content": reply})
        print(f"[DEBUG] LLM response: {reply}", flush=True)
        action = parse_action(reply)
    except Exception as exc:
        # Best-effort: any failure ends the episode with a "done" action.
        print(f"[DEBUG] LLM/parse error: {exc}", flush=True)
        return DalaalEnvAction(action_type="done"), ""
    return action, reply
async def _close_quietly(env) -> None:
    """Close *env*, printing (not raising) any error so cleanup never masks the real failure."""
    try:
        await env.close()
    except Exception as exc:
        print(f"[DEBUG] env.close() error: {exc}", flush=True)


async def _connect_with_retry(env, attempts: int = 5, delay_s: float = 3.0) -> None:
    """Call ``env.connect()``, retrying on ConnectionError/OSError; re-raise after the last attempt."""
    for attempt in range(1, attempts + 1):
        try:
            await env.connect()
            return
        except (ConnectionError, OSError) as exc:
            if attempt == attempts:
                raise
            print(f"[DEBUG] WS connect attempt {attempt} failed: {exc}, retrying...", flush=True)
            await asyncio.sleep(delay_s)


async def main() -> None:
    """Run one episode of ``TASK_NAME`` and emit the [START]/[STEP]/[END] records.

    When ``IMAGE_NAME`` is set, a local Docker container is started and the
    env client is connected to it; otherwise the client connects to
    ``DALAAL_ENV_URL`` (default ``http://localhost:8000``). The agent loop
    then runs for at most ``MAX_STEPS`` steps or until the environment
    reports ``done``. The env is always closed and the ``[END]`` record
    always printed once the episode has started, even if a step raises.
    """
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    if IMAGE_NAME:
        from openenv.core.containers.runtime import LocalDockerProvider

        provider = LocalDockerProvider()
        base_url = provider.start_container(IMAGE_NAME)
        provider.wait_for_ready(base_url, timeout_s=60.0)
        env = DalaalEnvEnv(base_url=base_url, provider=provider)
        # Retry WebSocket connect — server may need extra time after health check passes.
        try:
            await _connect_with_retry(env)
        except Exception:
            # env.close() is the only teardown this script has; without it a
            # failed connect would leave the env open (and, presumably, the
            # container handed over via provider= running — TODO confirm).
            await _close_quietly(env)
            raise
    else:
        env = DalaalEnvEnv(base_url=os.getenv("DALAAL_ENV_URL", "http://localhost:8000"))
        await env.connect()

    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False

    log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
    try:
        result = await env.reset(task=TASK_NAME)
        obs = result.observation
        # Conversation history for multi-turn reasoning.
        messages = [{"role": "system", "content": SYSTEM_PROMPT}]

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                break

            # The OpenAI client call is synchronous; running it on a worker
            # thread keeps the event loop free while the request is in
            # flight, instead of stalling it for the whole request with the
            # env connection open.
            action, _raw = await asyncio.to_thread(
                get_action_from_llm,
                client=client,
                messages=messages,
                task=obs.task_description,
                tree=obs.accessibility_tree,
                url=obs.url,
                step=step,
                max_steps=obs.max_steps,
                last_error=obs.last_action_error,
            )

            result = await env.step(action)
            obs = result.observation
            reward = result.reward or 0.0  # a missing reward counts as 0.0
            done = result.done
            error = obs.last_action_error

            rewards.append(reward)
            steps_taken = step
            action_str = f"{action.action_type}({action.element_id or action.text or action.key or ''})"
            log_step(step=step, action=action_str, reward=reward, done=done, error=error)

            if done:
                break

        # Final reward is the last reward (which encodes success).
        if rewards and rewards[-1] > 0:
            score = rewards[-1]
            success = True
        else:
            score = 0.0
            success = False
        score = min(max(score, 0.0), 1.0)  # clamp to [0, 1]
    finally:
        await _close_quietly(env)
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
# Script entry point: run the async main() on a fresh event loop, but only
# when this file is executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main())