# Uploaded by: burtenshaw (HF Staff)
# Commit message: Upload folder using huggingface_hub
# Commit: 42cc6d2 (verified)
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
Android Environment HTTP Client.
This module provides the client for connecting to an Android Environment server
over HTTP.
"""
from typing import Any, Dict
from core.client_types import StepResult
from core.env_server.types import State
from core.http_env_client import HTTPEnvClient
from .models import AndroidAction, AndroidObservation
class AndroidEnv(HTTPEnvClient[AndroidAction, AndroidObservation]):
    """
    HTTP client for the Android Environment.

    This client connects to an AndroidEnvironment HTTP server running in a
    container with an Android emulator. It provides methods to interact with
    Android applications through touchscreen gestures.

    Example:
        >>> # Connect to a running server
        >>> client = AndroidEnv(base_url="http://localhost:8000")
        >>> result = client.reset()
        >>> print(result.observation.screen_width, result.observation.screen_height)
        >>>
        >>> # Tap on the screen
        >>> result = client.step(
        ...     AndroidAction(tool_name="tap", parameters={"x": 0.5, "y": 0.3})
        ... )
        >>> print(result.reward, result.done)

    Example with Docker:
        >>> # Automatically start container and connect
        >>> client = AndroidEnv.from_docker_image(
        ...     "android-env:latest",
        ...     environment={
        ...         "ANDROID_AVD_NAME": "Pixel_6_API_33",
        ...         "ANDROID_TASK_PATH": "/workspace/tasks/my_task.textproto"
        ...     }
        ... )
        >>> result = client.reset()
        >>> result = client.step(
        ...     AndroidAction(tool_name="tap", parameters={"x": 0.5, "y": 0.5})
        ... )
        >>> # View screen image (base64)
        >>> print(result.observation.screen_image[:50])  # First 50 chars
        >>> client.close()

    Example with high-level gestures:
        >>> # Swipe gesture
        >>> result = client.step(AndroidAction(
        ...     tool_name="swipe",
        ...     parameters={"x1": 0.5, "y1": 0.8, "x2": 0.5, "y2": 0.2}
        ... ))
        >>>
        >>> # Type text (if supported by task)
        >>> result = client.step(AndroidAction(
        ...     tool_name="type_text",
        ...     parameters={"text": "Hello Android"}
        ... ))
        >>>
        >>> # Press system button
        >>> result = client.step(AndroidAction(
        ...     tool_name="press_button",
        ...     parameters={"button": "HOME"}
        ... ))
    """

    def _step_payload(self, action: AndroidAction) -> Dict[str, Any]:
        """
        Convert AndroidAction to JSON payload for step request.

        Args:
            action: AndroidAction instance with tool_name and parameters.

        Returns:
            Dictionary with the keys ``tool_name``, ``parameters`` and
            ``metadata`` copied from the action, suitable for JSON encoding.
        """
        return {
            "tool_name": action.tool_name,
            "parameters": action.parameters,
            "metadata": action.metadata,
        }

    def _parse_result(self, payload: Dict[str, Any]) -> StepResult[AndroidObservation]:
        """
        Parse server response into StepResult[AndroidObservation].

        ``reward`` and ``done`` are taken from the top level of ``payload``
        when present, and otherwise from the nested ``observation`` dict.
        NOTE(review): the OpenEnv HTTP server is understood to emit these two
        fields at the top level rather than inside ``observation`` - confirm
        against the server's serialization; the nested fallback keeps
        payloads that embed them working either way.

        Args:
            payload: JSON response from server.

        Returns:
            StepResult with AndroidObservation containing screen state.
        """
        # A missing *or* null "observation" is treated as an empty dict so the
        # per-field defaults below apply instead of raising AttributeError.
        obs_data = payload.get("observation") or {}

        # Prefer top-level values; fall back to the nested ones when absent.
        reward = payload.get("reward")
        if reward is None:
            reward = obs_data.get("reward")

        done = payload.get("done")
        if done is None:
            done = obs_data.get("done", False)

        observation = AndroidObservation(
            screen_image=obs_data.get("screen_image", ""),
            screen_width=obs_data.get("screen_width", 0),
            screen_height=obs_data.get("screen_height", 0),
            timestamp_ms=obs_data.get("timestamp_ms", 0),
            orientation=obs_data.get("orientation", 0),
            extras=obs_data.get("extras", {}),
            pixels_shape=obs_data.get("pixels_shape"),
            done=done,
            reward=reward,
            metadata=obs_data.get("metadata", {}),
        )
        return StepResult(
            observation=observation,
            reward=reward,
            done=done,
        )

    def _parse_state(self, payload: Dict[str, Any]) -> State:
        """
        Parse server response into State object.

        Args:
            payload: JSON response from /state endpoint.

        Returns:
            State object with ``episode_id`` (None when the key is absent)
            and ``step_count`` (0 when the key is absent).
        """
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )