File size: 4,688 Bytes
42cc6d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Android Environment HTTP Client.

This module provides the client for connecting to an Android Environment server
over HTTP.
"""

from typing import Any, Dict

from core.client_types import StepResult
from core.env_server.types import State
from core.http_env_client import HTTPEnvClient

from .models import AndroidAction, AndroidObservation


class AndroidEnv(HTTPEnvClient[AndroidAction, AndroidObservation]):
    """
    HTTP client for the Android Environment.

    This client connects to an AndroidEnvironment HTTP server running in a
    container with an Android emulator. It provides methods to interact with
    Android applications through touchscreen gestures.

    The class itself only supplies the three (de)serialization hooks used by
    the ``HTTPEnvClient`` base class: ``_step_payload``, ``_parse_result`` and
    ``_parse_state``.

    Example:
        >>> # Connect to a running server
        >>> client = AndroidEnv(base_url="http://localhost:8000")
        >>> result = client.reset()
        >>> print(result.observation.screen_width, result.observation.screen_height)
        >>>
        >>> # Tap on the screen
        >>> result = client.step(
        ...     AndroidAction(tool_name="tap", parameters={"x": 0.5, "y": 0.3})
        ... )
        >>> print(result.reward, result.done)

    Example with Docker:
        >>> # Automatically start container and connect
        >>> client = AndroidEnv.from_docker_image(
        ...     "android-env:latest",
        ...     environment={
        ...         "ANDROID_AVD_NAME": "Pixel_6_API_33",
        ...         "ANDROID_TASK_PATH": "/workspace/tasks/my_task.textproto"
        ...     }
        ... )
        >>> result = client.reset()
        >>> result = client.step(
        ...     AndroidAction(tool_name="tap", parameters={"x": 0.5, "y": 0.5})
        ... )
        >>> # View screen image (base64)
        >>> print(result.observation.screen_image[:50])  # First 50 chars
        >>> client.close()

    Example with high-level gestures:
        >>> # Swipe gesture
        >>> result = client.step(AndroidAction(
        ...     tool_name="swipe",
        ...     parameters={"x1": 0.5, "y1": 0.8, "x2": 0.5, "y2": 0.2}
        ... ))
        >>>
        >>> # Type text (if supported by task)
        >>> result = client.step(AndroidAction(
        ...     tool_name="type_text",
        ...     parameters={"text": "Hello Android"}
        ... ))
        >>>
        >>> # Press system button
        >>> result = client.step(AndroidAction(
        ...     tool_name="press_button",
        ...     parameters={"button": "HOME"}
        ... ))
    """

    def _step_payload(self, action: AndroidAction) -> Dict[str, Any]:
        """
        Convert AndroidAction to JSON payload for step request.

        Args:
            action: AndroidAction instance with tool_name and parameters.

        Returns:
            Dictionary with the keys ``tool_name``, ``parameters`` and
            ``metadata`` copied from the action, suitable for JSON encoding.
        """
        return {
            "tool_name": action.tool_name,
            "parameters": action.parameters,
            "metadata": action.metadata,
        }

    def _parse_result(self, payload: Dict[str, Any]) -> StepResult[AndroidObservation]:
        """
        Parse server response into StepResult[AndroidObservation].

        ``reward`` and ``done`` are taken from the top level of the payload
        when those keys are present, and otherwise from inside the
        ``observation`` dict. Every observation field that is absent from the
        response falls back to a neutral default (empty string, ``0``, empty
        dict, ``None`` or ``False``).

        Args:
            payload: JSON response from server.

        Returns:
            StepResult with AndroidObservation containing screen state. The
            same reward/done values are set on both the StepResult and the
            observation.
        """
        # A missing or null "observation" is treated as an empty observation
        # so the defaults below apply instead of raising on ``None.get``.
        obs_data = payload.get("observation") or {}

        # NOTE(review): the OpenEnv HTTP server appears to return reward/done
        # next to "observation" rather than inside it - confirm against
        # core.env_server. The nested lookup is kept as a fallback so either
        # response layout is parsed correctly.
        reward = payload.get("reward", obs_data.get("reward"))
        done = payload.get("done", obs_data.get("done", False))

        observation = AndroidObservation(
            screen_image=obs_data.get("screen_image", ""),
            screen_width=obs_data.get("screen_width", 0),
            screen_height=obs_data.get("screen_height", 0),
            timestamp_ms=obs_data.get("timestamp_ms", 0),
            orientation=obs_data.get("orientation", 0),
            extras=obs_data.get("extras", {}),
            pixels_shape=obs_data.get("pixels_shape"),
            done=done,
            reward=reward,
            metadata=obs_data.get("metadata", {}),
        )

        return StepResult(
            observation=observation,
            reward=reward,
            done=done,
        )

    def _parse_state(self, payload: Dict[str, Any]) -> State:
        """
        Parse server response into State object.

        Args:
            payload: JSON response from /state endpoint.

        Returns:
            State object with episode_id and step_count. A missing
            ``episode_id`` becomes ``None`` and a missing ``step_count``
            becomes ``0``.
        """
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )