Spaces:
Runtime error
Runtime error
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
Android Environment HTTP Client.

This module provides the client for connecting to an Android Environment server
over HTTP.
"""
| from typing import Any, Dict | |
| from core.client_types import StepResult | |
| from core.env_server.types import State | |
| from core.http_env_client import HTTPEnvClient | |
| from .models import AndroidAction, AndroidObservation | |
class AndroidEnv(HTTPEnvClient[AndroidAction, AndroidObservation]):
    """
    HTTP client for the Android Environment.

    This client connects to an AndroidEnvironment HTTP server running in a
    container with an Android emulator. It provides methods to interact with
    Android applications through touchscreen gestures.

    The class itself only defines the three conversion hooks below
    (``_step_payload``, ``_parse_result``, ``_parse_state``); everything else
    used in the examples (``reset``, ``step``, ``close``,
    ``from_docker_image``) is not defined here and is expected to come from
    ``HTTPEnvClient``.

    Example:
        >>> # Connect to a running server
        >>> client = AndroidEnv(base_url="http://localhost:8000")
        >>> result = client.reset()
        >>> print(result.observation.screen_width, result.observation.screen_height)
        >>>
        >>> # Tap on the screen
        >>> result = client.step(
        ...     AndroidAction(tool_name="tap", parameters={"x": 0.5, "y": 0.3})
        ... )
        >>> print(result.reward, result.done)

    Example with Docker:
        >>> # Automatically start container and connect
        >>> client = AndroidEnv.from_docker_image(
        ...     "android-env:latest",
        ...     environment={
        ...         "ANDROID_AVD_NAME": "Pixel_6_API_33",
        ...         "ANDROID_TASK_PATH": "/workspace/tasks/my_task.textproto"
        ...     }
        ... )
        >>> result = client.reset()
        >>> result = client.step(
        ...     AndroidAction(tool_name="tap", parameters={"x": 0.5, "y": 0.5})
        ... )
        >>> # View screen image (base64)
        >>> print(result.observation.screen_image[:50])  # First 50 chars
        >>> client.close()

    Example with high-level gestures:
        >>> # Swipe gesture
        >>> result = client.step(AndroidAction(
        ...     tool_name="swipe",
        ...     parameters={"x1": 0.5, "y1": 0.8, "x2": 0.5, "y2": 0.2}
        ... ))
        >>>
        >>> # Type text (if supported by task)
        >>> result = client.step(AndroidAction(
        ...     tool_name="type_text",
        ...     parameters={"text": "Hello Android"}
        ... ))
        >>>
        >>> # Press system button
        >>> result = client.step(AndroidAction(
        ...     tool_name="press_button",
        ...     parameters={"button": "HOME"}
        ... ))
    """

    def _step_payload(self, action: AndroidAction) -> Dict[str, Any]:
        """
        Convert AndroidAction to JSON payload for step request.

        Args:
            action: AndroidAction instance with tool_name and parameters.

        Returns:
            Dictionary with the keys ``tool_name``, ``parameters`` and
            ``metadata``, copied from the action's attributes of the same
            names.
        """
        return {
            "tool_name": action.tool_name,
            "parameters": action.parameters,
            "metadata": action.metadata,
        }

    def _parse_result(self, payload: Dict[str, Any]) -> StepResult[AndroidObservation]:
        """
        Parse server response into StepResult[AndroidObservation].

        ``reward`` and ``done`` are taken from the top level of the response
        when present there, and otherwise from the nested ``observation``
        object, so both response layouts are handled.

        Args:
            payload: JSON response from server.

        Returns:
            StepResult with AndroidObservation containing screen state. Fields
            missing from the response fall back to empty/zero defaults
            (``pixels_shape`` and ``reward`` fall back to ``None``).
        """
        # `or {}` also covers an explicit JSON null for "observation".
        obs_data = payload.get("observation") or {}

        # NOTE(review): the OpenEnv HTTP server is understood to return
        # reward/done beside "observation" rather than inside it - confirm
        # against the server. Top-level values win; nested ones are the
        # fallback, which preserves the previous behaviour for such payloads.
        reward = payload.get("reward")
        if reward is None:
            reward = obs_data.get("reward")

        done = payload.get("done")
        if done is None:
            done = obs_data.get("done", False)

        observation = AndroidObservation(
            screen_image=obs_data.get("screen_image", ""),
            screen_width=obs_data.get("screen_width", 0),
            screen_height=obs_data.get("screen_height", 0),
            timestamp_ms=obs_data.get("timestamp_ms", 0),
            orientation=obs_data.get("orientation", 0),
            extras=obs_data.get("extras", {}),
            pixels_shape=obs_data.get("pixels_shape"),
            done=done,
            reward=reward,
            metadata=obs_data.get("metadata", {}),
        )
        return StepResult(
            observation=observation,
            reward=reward,
            done=done,
        )

    def _parse_state(self, payload: Dict[str, Any]) -> State:
        """
        Parse server response into State object.

        Args:
            payload: JSON response from /state endpoint.

        Returns:
            State object with episode_id (``None`` when absent) and
            step_count (``0`` when absent).
        """
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )