Spaces:

ankitdhiman
/

dalaal-env

Sleeping

App Files Files Community

dalaal-env / inference.py

ankitdhiman

Upload folder using huggingface_hub

dc4806d verified about 2 months ago

raw

history blame contribute delete

8.69 kB

	"""
	Inference Script — Dalaal Browser-Use Environment
	===================================
	MANDATORY
	- Before submitting, ensure the following variables are defined in your environment configuration:
	API_BASE_URL The API endpoint for the LLM.
	MODEL_NAME The model identifier to use for inference.
	HF_TOKEN Your Hugging Face / API key.
	LOCAL_IMAGE_NAME The name of the local image to use for the environment if you are using from_docker_image()

	- The inference script must be named `inference.py` and placed in the root directory of the project
	- Participants must use OpenAI Client for all LLM calls using above variables

	STDOUT FORMAT
	- The script must emit exactly three line types to stdout, in this order:

	[START] task=<task_name> env=<benchmark> model=<model_name>
	[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
	[END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
	"""

	import asyncio
	import json
	import os
	import textwrap
	from typing import List, Optional

	from openai import OpenAI

	from dalaal_env import DalaalEnvAction, DalaalEnvEnv

	# Runtime configuration, read once from the process environment at import time.
	# Docker image to start for the environment; when unset, main() connects to
	# an already-running server instead.
	IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
	# Credential handed to the OpenAI client; a non-empty HF_TOKEN takes
	# precedence over API_KEY.
	API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
	# Base URL of the OpenAI-compatible endpoint used for all LLM calls.
	API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
	# Model identifier sent with every chat-completion request.
	MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen3.5-27B")

	# Task passed to env.reset() and reported in the [START] line.
	TASK_NAME = os.getenv("DALAAL_TASK", "todo_add")
	# Benchmark name reported in the [START] line.
	BENCHMARK = "dalaal_env"
	# Upper bound on the number of agent steps attempted by main().
	MAX_STEPS = 15
	# Sampling temperature passed to the chat-completion request.
	TEMPERATURE = 0.0
	# max_tokens value passed to the chat-completion request.
	MAX_TOKENS = 300

	# System message placed first in the conversation built by main(). It
	# describes the accessibility-tree format and the JSON action vocabulary
	# that parse_action() later decodes from the model's reply.
	SYSTEM_PROMPT = textwrap.dedent("""\
	You are a browser automation agent. You interact with web pages by reading
	an accessibility tree and issuing actions.

	ACCESSIBILITY TREE FORMAT:
	Each element has an [ID] followed by its role and properties:
	[1] heading "Page Title"
	[2] textbox "Search" value=""
	[3] button "Submit"
	[4] checkbox "Accept terms" checked=false

	AVAILABLE ACTIONS (respond with exactly one JSON object):
	- Click an element: {"action_type": "click", "element_id": <id>}
	- Type into an element: {"action_type": "type", "element_id": <id>, "text": "<text>"}
	- Select a dropdown option: {"action_type": "select_option", "element_id": <id>, "text": "<option label>"}
	- Press a key: {"action_type": "press_key", "key": "<key name>"}
	- Scroll: {"action_type": "scroll", "direction": "up" or "down"}
	- Go back: {"action_type": "go_back"}
	- Signal task complete: {"action_type": "done"}

	STRATEGY:
	- Think step by step about what action to take next to accomplish the task.
	- Each action changes the page. After typing text into an input, you typically
	need to click a button to submit it.
	- Do NOT repeat the same action if the page hasn't changed.
	- When the task appears complete (e.g., you can see the expected result in the
	accessibility tree), use {"action_type": "done"}.

	RULES:
	- Respond with ONLY a JSON object. No explanation, no markdown, no extra text.
	- Use element IDs from the current accessibility tree.
	- If you see an error, try a different approach.
	""")


	def log_start(task: str, env: str, model: str) -> None:
	    """Write the single ``[START]`` line naming the task, benchmark and model."""
	    header = f"[START] task={task} env={env} model={model}"
	    print(header, flush=True)


	def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
	    """Write one ``[STEP]`` line describing the outcome of a single action.

	    The reward is printed with two decimals, ``done`` as its lowercased
	    string form, and a missing or empty ``error`` as the literal ``null``.
	    """
	    shown_error = error or "null"
	    done_flag = str(done).lower()
	    line = f"[STEP] step={step} action={action} reward={reward:.2f} done={done_flag} error={shown_error}"
	    print(line, flush=True)


	def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
	    """Write the closing ``[END]`` line summarising the episode.

	    Each reward is printed with two decimals and joined by commas; the score
	    is printed with three decimals.
	    """
	    formatted = [format(value, ".2f") for value in rewards]
	    joined = ",".join(formatted)
	    outcome = str(success).lower()
	    print(f"[END] success={outcome} steps={steps} score={score:.3f} rewards={joined}", flush=True)


	def _first_json_object(text: str) -> Optional[dict]:
	    """Return the first decodable JSON object embedded in *text*, or None."""
	    decoder = json.JSONDecoder()
	    start = text.find("{")
	    while start != -1:
	        try:
	            candidate, _ = decoder.raw_decode(text, start)
	        except json.JSONDecodeError:
	            candidate = None
	        if isinstance(candidate, dict):
	            return candidate
	        # This brace did not open a valid object; try the next one.
	        start = text.find("{", start + 1)
	    return None


	def parse_action(text: str) -> DalaalEnvAction:
	    """Parse an LLM reply into a DalaalEnvAction.

	    The reply is stripped, and lines starting with a markdown code fence are
	    removed when the reply opens with one. The remainder is decoded as JSON.
	    If that fails, or yields something other than an object, the text is
	    scanned for the first embedded JSON object so that replies wrapped in
	    prose or reasoning text are still usable.

	    Args:
	        text: Raw text returned by the model.

	    Returns:
	        The action built by passing the decoded object's keys to
	        ``DalaalEnvAction`` as keyword arguments.

	    Raises:
	        json.JSONDecodeError: If no JSON object can be found in the reply.
	        TypeError: If the reply is valid JSON but contains no object.
	        Any exception ``DalaalEnvAction`` raises for unsupported fields also
	        propagates unchanged.
	    """
	    text = text.strip()
	    # Strip markdown code fences if present
	    if text.startswith("```"):
	        kept = [line for line in text.split("\n") if not line.startswith("```")]
	        text = "\n".join(kept).strip()

	    try:
	        data = json.loads(text)
	    except json.JSONDecodeError:
	        # Not pure JSON: look for an object embedded in surrounding text.
	        data = _first_json_object(text)
	        if data is None:
	            raise

	    if not isinstance(data, dict):
	        embedded = _first_json_object(text)
	        if embedded is None:
	            raise TypeError(f"expected a JSON object, got {type(data).__name__}")
	        data = embedded

	    return DalaalEnvAction(**data)


	def build_user_prompt(
	    task: str,
	    tree: str,
	    url: str,
	    step: int,
	    max_steps: int,
	    last_error: Optional[str],
	) -> str:
	    """Assemble the per-step user message shown to the model.

	    Sections are separated by blank lines, in this order: task, step counter,
	    URL, the previous action's error (only when non-empty), the accessibility
	    tree, and a closing instruction.
	    """
	    header = [f"TASK: {task}", f"STEP: {step}/{max_steps}", f"URL: {url}"]
	    error_note = [f"LAST ACTION ERROR: {last_error}"] if last_error else []
	    footer = [
	        f"ACCESSIBILITY TREE:\n{tree}",
	        "Respond with your next action as a JSON object.",
	    ]
	    return "\n\n".join(header + error_note + footer)


	def get_action_from_llm(
	    client: OpenAI,
	    messages: list,
	    task: str,
	    tree: str,
	    url: str,
	    step: int,
	    max_steps: int,
	    last_error: Optional[str],
	) -> tuple[DalaalEnvAction, str]:
	    """Ask the LLM for the next action and parse its reply.

	    The user prompt for this step is appended to ``messages`` in place, and
	    so is the assistant reply when the request succeeds, so the list carries
	    the conversation across steps.

	    Args:
	        client: OpenAI client used for the chat-completion request.
	        messages: Conversation history; mutated in place.
	        task, tree, url, step, max_steps, last_error: Forwarded unchanged to
	            ``build_user_prompt``.

	    Returns:
	        ``(action, raw_text)``. If the request or the parsing fails, the
	        action is a ``done`` action and ``raw_text`` is whatever reply text
	        had been received (empty if the request itself failed).
	    """
	    user_prompt = build_user_prompt(task, tree, url, step, max_steps, last_error)
	    messages.append({"role": "user", "content": user_prompt})

	    # Kept outside the try so a parse failure can still report the reply.
	    text = ""
	    try:
	        completion = client.chat.completions.create(
	            model=MODEL_NAME,
	            messages=messages,
	            temperature=TEMPERATURE,
	            max_tokens=MAX_TOKENS,
	            stream=False,
	        )
	        text = (completion.choices[0].message.content or "").strip()
	        messages.append({"role": "assistant", "content": text})
	        print(f"[DEBUG] LLM response: {text}", flush=True)
	        return parse_action(text), text
	    except Exception as exc:
	        # Deliberate best-effort: any failure ends the episode with "done"
	        # instead of crashing the run.
	        print(f"[DEBUG] LLM/parse error: {exc}", flush=True)
	        return DalaalEnvAction(action_type="done"), text


	async def main() -> None:
	    """Run one episode of ``TASK_NAME`` and report it on stdout.

	    Connects to the environment: a freshly started local container when
	    ``LOCAL_IMAGE_NAME`` is set, otherwise the server at ``DALAAL_ENV_URL``
	    (default ``http://localhost:8000``). It then alternates LLM decisions and
	    environment steps for at most ``MAX_STEPS`` steps. Once ``[START]`` has
	    been logged, ``[END]`` is always emitted, even if the episode raises.
	    """
	    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

	    if IMAGE_NAME:
	        from openenv.core.containers.runtime import LocalDockerProvider
	        provider = LocalDockerProvider()
	        base_url = provider.start_container(IMAGE_NAME)
	        provider.wait_for_ready(base_url, timeout_s=60.0)
	        env = DalaalEnvEnv(base_url=base_url, provider=provider)
	        # Retry WebSocket connect — server may need extra time after health check passes
	        for attempt in range(5):
	            try:
	                await env.connect()
	                break
	            except (ConnectionError, OSError) as e:
	                if attempt == 4:
	                    raise
	                print(f"[DEBUG] WS connect attempt {attempt + 1} failed: {e}, retrying...", flush=True)
	                await asyncio.sleep(3)
	    else:
	        env = DalaalEnvEnv(base_url=os.getenv("DALAAL_ENV_URL", "http://localhost:8000"))
	        await env.connect()

	    rewards: List[float] = []
	    steps_taken = 0
	    score = 0.0
	    success = False

	    log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)

	    try:
	        result = await env.reset(task=TASK_NAME)
	        obs = result.observation

	        # Conversation history for multi-turn reasoning
	        messages = [{"role": "system", "content": SYSTEM_PROMPT}]

	        for step in range(1, MAX_STEPS + 1):
	            if result.done:
	                break

	            # The OpenAI client call is synchronous. Running it in a worker
	            # thread keeps the event loop responsive while waiting on the
	            # model (presumably so the environment connection can still be
	            # serviced; confirm against the env client).
	            action, _ = await asyncio.to_thread(
	                get_action_from_llm,
	                client=client,
	                messages=messages,
	                task=obs.task_description,
	                tree=obs.accessibility_tree,
	                url=obs.url,
	                step=step,
	                max_steps=obs.max_steps,
	                last_error=obs.last_action_error,
	            )

	            result = await env.step(action)
	            obs = result.observation

	            reward = result.reward or 0.0
	            done = result.done
	            error = obs.last_action_error

	            rewards.append(reward)
	            steps_taken = step

	            action_str = f"{action.action_type}({action.element_id or action.text or action.key or ''})"
	            log_step(step=step, action=action_str, reward=reward, done=done, error=error)

	            if done:
	                break

	        # Final reward is the last reward (which encodes success)
	        if rewards and rewards[-1] > 0:
	            score = rewards[-1]
	            success = True
	        else:
	            score = 0.0
	            success = False

	        # Clamp the reported score into [0, 1].
	        score = min(max(score, 0.0), 1.0)

	    finally:
	        # A failing close must not prevent the [END] line from being written.
	        try:
	            await env.close()
	        except Exception as e:
	            print(f"[DEBUG] env.close() error: {e}", flush=True)
	        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)


	# Script entry point: run the async episode to completion when executed directly.
	if __name__ == "__main__":
	    asyncio.run(main())