| """BrowserGym MiniWoB example with Qwen deciding the next action. |
| |
| This is an inference example for the BrowserGym environment. It uses the OpenAI |
| client and a vision-language model to decide the next action. We use the Hugging Face |
| Inference Providers API to access the model, but you can use any other provider that |
| is compatible with the OpenAI API. |
| |
| Prerequisites: |
| - (Optional) Export the MiniWoB URL if you are hosting the tasks yourself |
| (must include the `/miniwob/` suffix); the BrowserGym Docker image now |
| serves the MiniWoB bundle internally on port 8888. |
| - Export your Hugging Face token for the router: |
| `export HF_TOKEN=your_token_here` |
| |
| Usage: |
| python examples/browsergym_example.py |
| """ |
|
|
| import os |
| import re |
| import base64 |
| import textwrap |
| from io import BytesIO |
| from typing import List, Optional, Dict |
|
|
| from openai import OpenAI |
| import numpy as np |
| from PIL import Image |
|
|
| from browsergym_env import BrowserGymAction, BrowserGymEnv |
|
|
# --- Model endpoint ---------------------------------------------------------
# OpenAI-compatible base URL of the Hugging Face router.
API_BASE_URL = "https://router.huggingface.co/v1"
# HF_TOKEN wins; API_KEY is consulted only when HF_TOKEN is unset or empty.
API_KEY = os.environ.get("HF_TOKEN") or os.environ.get("API_KEY")
MODEL_NAME = "Qwen/Qwen3-VL-30B-A3B-Instruct:novita"

# --- Episode and sampling settings ------------------------------------------
# Upper bound on the number of model/environment steps taken by main().
MAX_STEPS = 8
# NOTE(review): not referenced by the code visible in this file.
MAX_DOM_CHARS = 3500
# Sampling parameters forwarded to the chat-completion request.
TEMPERATURE = 0.2
MAX_TOKENS = 200
# Action used when the model reply is empty, unparsable, or the request fails.
FALLBACK_ACTION = "noop()"

# NOTE(review): not referenced by the code visible in this file.
DEBUG = True

# --- Reply parsing ----------------------------------------------------------
# Leading "action:" / "next action -" style label at the start of a line.
ACTION_PREFIX_RE = re.compile(r"^(action|next action)\s*[:\-]\s*", re.IGNORECASE)
# An identifier followed by a parenthesised argument list; the greedy ``.*``
# with DOTALL extends to the last ")" in the searched text, across newlines.
ACTION_PATTERN = re.compile(r"[A-Za-z_]+\s*\(.*\)", re.DOTALL)
|
|
|
|
# System message sent with every request: tells the model to answer with a
# single BrowserGym action string and nothing else.
SYSTEM_PROMPT = "\n".join(
    (
        "You control a web browser through BrowserGym.",
        "Reply with exactly one action string.",
        "The action must be a valid BrowserGym command such as:",
        "- noop()",
        "- click('<BID>')",
        "- type('selector', 'text to enter')",
        "- fill('selector', 'text to enter')",
        "- send_keys('Enter')",
        "- scroll('down')",
        "Use single quotes around string arguments.",
        "When clicking, use the BrowserGym element IDs (BIDs) listed in the user message.",
        "If you are unsure, respond with noop().",
        "Do not include explanations or additional text.",
    )
)
|
|
|
|
def build_history_lines(history: List[str]) -> str:
    """Render the most recent history entries as one newline-separated string.

    Args:
        history: Step summaries in the order they were recorded.

    Returns:
        The last four entries joined by newlines, or the literal string
        ``"None"`` when ``history`` is empty.
    """
    return "\n".join(history[-4:]) if history else "None"
|
|
|
|
def extract_screenshot_uri(observation) -> Optional[str]:
    """Encode the observation's screenshot as a PNG ``data:`` URI.

    Args:
        observation: Object exposing a ``screenshot`` attribute. The value is
            converted to a ``uint8`` NumPy array before being encoded.

    Returns:
        A ``data:image/png;base64,...`` string, or ``None`` when the
        observation carries no screenshot.
    """
    pixels = observation.screenshot
    if pixels is None:
        return None

    png_buffer = BytesIO()
    Image.fromarray(np.array(pixels, dtype=np.uint8)).save(png_buffer, format="PNG")
    # getvalue() returns the whole buffer regardless of the stream position.
    encoded = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
    return "data:image/png;base64," + encoded
|
|
|
|
def extract_clickable_elements(observation) -> List[Dict[str, str]]:
    """Collect BrowserGym element IDs that can be clicked.

    Reads ``observation.metadata["browsergym_obs"]["extra_element_properties"]``
    (a mapping of element ID to a property dict) and keeps the entries whose
    ``"clickable"`` property is truthy. A missing or ``None`` value at any
    level is treated as empty.

    Args:
        observation: Object that may expose a ``metadata`` mapping.

    Returns:
        One ``{"bid": ..., "bbox": ...}`` dict per clickable element, sorted
        by ``bid`` compared as a string. ``bbox`` holds the bounding-box
        values joined with ``", "``, or ``"?"`` when no box is available.
    """
    metadata = getattr(observation, "metadata", None) or {}
    obs_dict = metadata.get("browsergym_obs") or {}
    extra_props = obs_dict.get("extra_element_properties") or {}

    clickables: List[Dict[str, str]] = []
    for bid, props in extra_props.items():
        # Tolerate an element whose property entry is None or empty.
        if not props or not props.get("clickable"):
            continue

        bbox = props.get("bbox")
        # Explicit None/length test so array-like boxes are not truth-tested.
        has_bbox = bbox is not None and len(bbox) > 0
        # Box values may be numbers; str.join only accepts strings, so each
        # value is converted first.
        bbox_str = ", ".join(str(value) for value in bbox) if has_bbox else "?"
        clickables.append({"bid": str(bid), "bbox": bbox_str})

    clickables.sort(key=lambda item: item["bid"])
    return clickables
|
|
|
|
def build_user_prompt(step: int, observation, history: List[str]) -> str:
    """Compose the text part of the user message for one step.

    Args:
        step: Step number shown on the first line of the prompt.
        observation: Object exposing ``goal``, ``url`` and
            ``last_action_error``; it is also passed to
            ``extract_clickable_elements`` to list clickable element IDs.
        history: Earlier step summaries, rendered by ``build_history_lines``.

    Returns:
        The prompt text with no leading or trailing whitespace.
    """
    goal = observation.goal or "(not provided)"
    url = observation.url or "(unknown)"
    error_note = "Yes" if observation.last_action_error else "No"

    clickables = extract_clickable_elements(observation)
    if clickables:
        actions_hint = "\n".join(
            f" - {item['bid']} (bbox: {item['bbox']})" for item in clickables
        )
    else:
        actions_hint = " (none detected)"

    # Assemble the lines explicitly instead of running textwrap.dedent over an
    # f-string: dedent only removes whitespace common to every line, so a
    # multi-line interpolated value (several history entries or element
    # hints) would shrink that common margin and leave the template's source
    # indentation inside the prompt.
    prompt_lines = [
        f"Step: {step}",
        f"Goal: {goal}",
        f"Current URL: {url}",
        "Previous steps:",
        build_history_lines(history),
        f"Last action error: {error_note}",
        "",
        f"Available clickable element IDs: {actions_hint}",
        "",
        "Reply with exactly one BrowserGym action string.",
    ]
    return "\n".join(prompt_lines).strip()
|
|
|
|
def parse_model_action(response_text: str) -> str:
    """Extract a single action string from the model's reply.

    Each non-empty line is tried in order, after removing a leading
    ``action:`` / ``next action:`` style label. If no line contains something
    shaped like a call, the whole reply is searched so that an action spread
    over several lines can still be found.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The first match of ``ACTION_PATTERN`` with every whitespace run
        collapsed to one space, or ``FALLBACK_ACTION`` when the reply is
        empty or contains no match.
    """
    if not response_text:
        return FALLBACK_ACTION

    # Per-line candidates first (label stripped), the full reply last.
    trimmed_lines = (raw.strip() for raw in response_text.splitlines())
    candidates = [ACTION_PREFIX_RE.sub("", text) for text in trimmed_lines if text]
    candidates.append(response_text)

    for candidate in candidates:
        found = ACTION_PATTERN.search(candidate)
        if found is not None:
            return re.sub(r"\s+", " ", found.group(0).strip())

    return FALLBACK_ACTION
|
|
|
|
def main() -> None:
    """Run one MiniWoB ``click-test`` episode with the model picking actions.

    Starts the BrowserGym environment from the ``browsergym-env:latest``
    Docker image and resets it. For at most ``MAX_STEPS`` steps it sends the
    text prompt (plus the screenshot when one is available) to the chat
    model, parses the reply into an action string and steps the environment.
    A failed model request is reported and replaced by ``FALLBACK_ACTION``.
    The environment is closed on the way out, whether or not the loop raised.
    """
    llm = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    env = BrowserGymEnv.from_docker_image(
        image="browsergym-env:latest",
        env_vars={
            "BROWSERGYM_BENCHMARK": "miniwob",
            "BROWSERGYM_TASK_NAME": "click-test",
        },
    )

    history: List[str] = []

    try:
        result = env.reset()
        observation = result.observation
        print(f"Episode goal: {observation.goal}")

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                print("Environment signalled done. Stopping early.")
                break

            # User message: text prompt first, screenshot second when present.
            user_content = [
                {
                    "type": "text",
                    "text": build_user_prompt(step, observation, history),
                }
            ]
            screenshot_uri = extract_screenshot_uri(observation)
            if screenshot_uri:
                user_content.append(
                    {"type": "image_url", "image_url": {"url": screenshot_uri}}
                )

            system_message = {
                "role": "system",
                "content": [{"type": "text", "text": SYSTEM_PROMPT}],
            }
            user_message = {"role": "user", "content": user_content}

            # Best effort: any request failure degrades to the fallback action
            # instead of aborting the episode.
            try:
                completion = llm.chat.completions.create(
                    model=MODEL_NAME,
                    messages=[system_message, user_message],
                    temperature=TEMPERATURE,
                    max_tokens=MAX_TOKENS,
                    stream=False,
                )
                response_text = completion.choices[0].message.content or ""
            except Exception as exc:
                print(f"Model request failed ({exc}). Using fallback action.")
                response_text = FALLBACK_ACTION

            action_str = parse_model_action(response_text)
            print(f"Step {step}: model suggested -> {action_str}")

            result = env.step(BrowserGymAction(action_str=action_str))
            observation = result.observation

            # A missing reward is reported as 0.0.
            reward = result.reward or 0.0
            action_error = observation.last_action_error
            error_flag = " ERROR" if action_error else ""
            history.append(
                f"Step {step}: {action_str} -> reward {reward:+.2f}{error_flag}"
            )
            print(
                " Reward: "
                f"{reward:+.2f} | Done: {result.done} | Last action error: "
                f"{action_error}"
            )

            if result.done:
                print("Episode complete.")
                break
        else:
            # Reached only when the loop ran through every step without a break.
            print(f"Reached max steps ({MAX_STEPS}).")
    finally:
        env.close()
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|