Spaces:

hanabhi
/

gridworld-env

Sleeping

gridworld-env / OpenEnv /examples /browsergym_example.py

Abhilasha Kakoty

Initial deploy

7078f4d about 2 months ago

8.16 kB

	"""BrowserGym MiniWoB example with Qwen deciding the next action.

	This is an inference example for the BrowserGym environment. It uses the OpenAI
	client and a vision language model to decide the next action. We use Hugging Face
	Inference Providers API to access the model, but you can use any other provider that
	is compatible with the OpenAI API.

	Prerequisites:
	- (Optional) Export the MiniWoB URL if you are hosting the tasks yourself
	(must include the `/miniwob/` suffix); the BrowserGym Docker image now
	serves the MiniWoB bundle internally on port 8888.
	- Export your Hugging Face token for the router:
	`export HF_TOKEN=your_token_here`

	Usage:
	python examples/browsergym_example.py
	"""

	import os
	import re
	import base64
	import textwrap
	from io import BytesIO
	from typing import List, Optional, Dict

	from openai import OpenAI
	import numpy as np
	from PIL import Image

	from browsergym_env import BrowserGymAction, BrowserGymEnv

	# Endpoint and credentials for the OpenAI-compatible chat completions client.
	API_BASE_URL = "https://router.huggingface.co/v1"
	# HF_TOKEN takes precedence; API_KEY is the fallback environment variable.
	API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
	MODEL_NAME = "Qwen/Qwen3-VL-30B-A3B-Instruct:novita"

	# Episode and sampling settings used by main().
	MAX_STEPS = 8
	MAX_DOM_CHARS = 3500  # not referenced by the code visible in this file
	TEMPERATURE = 0.2
	MAX_TOKENS = 200
	# Action used when the model reply is empty or contains no action string.
	FALLBACK_ACTION = "noop()"

	DEBUG = True  # not referenced by the code visible in this file

	# Optional "Action:" / "Next action -" label in front of the action string.
	# The alternation must be a bare `|`; an escaped `\|` matches a literal pipe.
	ACTION_PREFIX_RE = re.compile(
	    r"^(action|next action)\s*[:\-]\s*",
	    re.IGNORECASE,
	)
	# A call-like token such as noop() or click('12'): identifier, optional
	# whitespace, then a parenthesised argument list of any length (DOTALL lets
	# the arguments span several lines).
	ACTION_PATTERN = re.compile(r"[A-Za-z_]+\s*\(.*\)", re.DOTALL)


	# System message sent with every chat completion request in main(). It tells
	# the model to answer with a single action string and nothing else, which is
	# what parse_model_action() expects to find in the reply.
	# NOTE(review): the example commands listed below (type, send_keys,
	# scroll('down')) should be checked against the action set the BrowserGym
	# server actually accepts - TODO confirm.
	SYSTEM_PROMPT = textwrap.dedent(
	"""
	You control a web browser through BrowserGym.
	Reply with exactly one action string.
	The action must be a valid BrowserGym command such as:
	- noop()
	- click('<BID>')
	- type('selector', 'text to enter')
	- fill('selector', 'text to enter')
	- send_keys('Enter')
	- scroll('down')
	Use single quotes around string arguments.
	When clicking, use the BrowserGym element IDs (BIDs) listed in the user message.
	If you are unsure, respond with noop().
	Do not include explanations or additional text.
	"""
	).strip()


	def build_history_lines(history: List[str], max_lines: int = 4) -> str:
	    """Return the most recent history entries joined by newlines.

	    Args:
	        history: Step summaries in chronological order.
	        max_lines: How many of the latest entries to keep (default 4).

	    Returns:
	        The last ``max_lines`` entries separated by newlines, or the string
	        ``"None"`` when there is nothing to show.
	    """
	    # A non-positive window must not fall through to history[-0:], which
	    # would return the whole list instead of nothing.
	    if not history or max_lines <= 0:
	        return "None"
	    return "\n".join(history[-max_lines:])


	def extract_screenshot_uri(observation) -> Optional[str]:
	    """Encode the observation's screenshot as a PNG ``data:`` URI.

	    Args:
	        observation: Object whose ``screenshot`` attribute holds pixel data
	            convertible to a ``uint8`` NumPy array, or ``None``.

	    Returns:
	        A ``data:image/png;base64,...`` string, or ``None`` when the
	        screenshot is missing, ``None`` or empty.
	    """
	    screenshot = getattr(observation, "screenshot", None)
	    if screenshot is None:
	        return None

	    pixels = np.asarray(screenshot, dtype=np.uint8)
	    if pixels.size == 0:
	        # Nothing to encode; treat an empty screenshot like a missing one.
	        return None

	    buffer = BytesIO()
	    Image.fromarray(pixels).save(buffer, format="PNG")
	    # getvalue() returns the whole buffer regardless of the stream position.
	    encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
	    return f"data:image/png;base64,{encoded}"


	def extract_clickable_elements(observation) -> List[Dict[str, str]]:
	    """Collect the BrowserGym element IDs that are flagged as clickable.

	    Reads ``observation.metadata["browsergym_obs"]["extra_element_properties"]``
	    and keeps the entries whose ``clickable`` property is truthy. Any missing
	    or ``None`` level in that chain is treated as empty.

	    Args:
	        observation: Object that may expose a ``metadata`` mapping.

	    Returns:
	        A list of ``{"bid": ..., "bbox": ...}`` dicts sorted by ``bid`` (string
	        order). ``bbox`` is the comma-separated bounding box, or ``"?"`` when
	        no box is available.
	    """
	    metadata = getattr(observation, "metadata", {}) or {}
	    obs_dict = metadata.get("browsergym_obs", {}) or {}
	    extra_props = obs_dict.get("extra_element_properties", {}) or {}

	    clickables: List[Dict[str, str]] = []
	    for bid, props in extra_props.items():
	        props = props or {}
	        if not props.get("clickable"):
	            continue

	        bbox = props.get("bbox") or []
	        # Coordinates may be numbers; str.join only accepts strings.
	        bbox_str = ", ".join(str(value) for value in bbox) if bbox else "?"
	        clickables.append({"bid": str(bid), "bbox": bbox_str})

	    # Keep a stable ordering for readability
	    clickables.sort(key=lambda item: item["bid"])
	    return clickables


	def build_user_prompt(step: int, observation, history: List[str]) -> str:
	    """Build the text part of the user message for one step.

	    Args:
	        step: 1-based step counter shown to the model.
	        observation: Current observation; ``goal``, ``url`` and
	            ``last_action_error`` are read from it, and the clickable element
	            list is obtained through ``extract_clickable_elements``.
	        history: Summaries of earlier steps, summarised through
	            ``build_history_lines``.

	    Returns:
	        The prompt text without leading or trailing whitespace.
	    """
	    goal = observation.goal or "(not provided)"
	    url = observation.url or "(unknown)"
	    error_note = "Yes" if observation.last_action_error else "No"

	    clickables = extract_clickable_elements(observation)
	    if clickables:
	        clickable_lines = ["Available clickable element IDs:"]
	        clickable_lines.extend(
	            f" - {item['bid']} (bbox: {item['bbox']})" for item in clickables
	        )
	    else:
	        clickable_lines = ["Available clickable element IDs: (none detected)"]

	    # The prompt is assembled line by line instead of dedenting one f-string:
	    # multi-line interpolated values (history, clickable list) would remove
	    # the common margin textwrap.dedent relies on and leave stray indentation.
	    prompt_lines = [
	        f"Step: {step}",
	        f"Goal: {goal}",
	        f"Current URL: {url}",
	        "Previous steps:",
	        build_history_lines(history),
	        f"Last action error: {error_note}",
	        "",
	        *clickable_lines,
	        "",
	        "Reply with exactly one BrowserGym action string.",
	    ]
	    return "\n".join(prompt_lines).strip()


	def _normalize_action(candidate: str) -> str:
	    """Trim *candidate* and collapse each whitespace run to a single space."""
	    return re.sub(r"\s+", " ", candidate.strip())


	def parse_model_action(response_text: str) -> str:
	    """Extract a single BrowserGym action string from the model reply.

	    The reply is scanned line by line; an optional "Action:" style label is
	    removed from each line before looking for a call-like token. If no single
	    line contains one, the whole reply is searched so that an action spread
	    over several lines is still found.

	    Args:
	        response_text: Raw text returned by the model (may be empty).

	    Returns:
	        The first action string found, with whitespace normalised, or
	        ``FALLBACK_ACTION`` when the reply is empty or contains no action.
	    """
	    if not response_text:
	        return FALLBACK_ACTION

	    # Prefer the first line that looks like an action string
	    for raw_line in response_text.splitlines():
	        line = raw_line.strip()
	        if not line:
	            continue
	        line = ACTION_PREFIX_RE.sub("", line)
	        match = ACTION_PATTERN.search(line)
	        if match:
	            return _normalize_action(match.group(0))

	    # Fall back to searching the whole response
	    match = ACTION_PATTERN.search(response_text)
	    if match:
	        return _normalize_action(match.group(0))

	    return FALLBACK_ACTION


	def main() -> None:
	    """Run one MiniWoB ``click-test`` episode with the model choosing actions.

	    Starts the BrowserGym environment from the ``browsergym-env:latest`` Docker
	    image, then for up to ``MAX_STEPS`` steps sends the current prompt (and
	    screenshot, when available) to the model, parses the reply into an action
	    string and applies it. The environment is always closed on exit.
	    """
	    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

	    env = BrowserGymEnv.from_docker_image(
	        image="browsergym-env:latest",
	        env_vars={
	            "BROWSERGYM_BENCHMARK": "miniwob",
	            "BROWSERGYM_TASK_NAME": "click-test",
	        },
	    )

	    history: List[str] = []

	    try:
	        result = env.reset()
	        observation = result.observation
	        print(f"Episode goal: {observation.goal}")

	        for step in range(1, MAX_STEPS + 1):
	            if result.done:
	                print("Environment signalled done. Stopping early.")
	                break

	            user_prompt = build_user_prompt(step, observation, history)
	            user_content = [{"type": "text", "text": user_prompt}]
	            screenshot_uri = extract_screenshot_uri(observation)
	            if screenshot_uri:
	                user_content.append(
	                    {
	                        "type": "image_url",
	                        "image_url": {"url": screenshot_uri},
	                    }
	                )

	            messages = [
	                {
	                    "role": "system",
	                    "content": [{"type": "text", "text": SYSTEM_PROMPT}],
	                },
	                {
	                    "role": "user",
	                    "content": user_content,
	                },
	            ]

	            # A failed request must not abort the episode: report it and let
	            # the parser turn the fallback text into the fallback action.
	            try:
	                completion = client.chat.completions.create(
	                    model=MODEL_NAME,
	                    messages=messages,
	                    temperature=TEMPERATURE,
	                    max_tokens=MAX_TOKENS,
	                    stream=False,
	                )
	                response_text = completion.choices[0].message.content or ""
	            # pylint: disable=broad-except
	            except Exception as exc:  # noqa: BLE001
	                failure_msg = f"Model request failed ({exc}). Using fallback action."
	                print(failure_msg)
	                response_text = FALLBACK_ACTION

	            action_str = parse_model_action(response_text)
	            print(f"Step {step}: model suggested -> {action_str}")

	            result = env.step(BrowserGymAction(action_str=action_str))
	            observation = result.observation

	            # A missing reward is reported as 0.0.
	            reward = result.reward or 0.0
	            error_flag = " ERROR" if observation.last_action_error else ""
	            history_line = (
	                f"Step {step}: {action_str} -> reward {reward:+.2f}{error_flag}"
	            )
	            history.append(history_line)
	            print(
	                " Reward: "
	                f"{reward:+.2f} | Done: {result.done} | Last action error: "
	                f"{observation.last_action_error}"
	            )

	            if result.done:
	                print("Episode complete.")
	                break

	        else:
	            # Reached only when the loop finished without hitting a break.
	            print(f"Reached max steps ({MAX_STEPS}).")

	    finally:
	        env.close()


	# Run the example only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()