File size: 3,084 Bytes
42cc6d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Data models for the Android Environment.

The Android environment provides access to Android applications and the
Android OS through a touchscreen interface. Actions represent touch events
and gestures, while observations contain screen pixels and metadata.
"""

from dataclasses import dataclass, field
from typing import Any, Dict, Optional

from core.env_server.types import Action, Observation


@dataclass(kw_only=True)
class AndroidAction(Action):
    """Action for the Android environment.

    An action is expressed as a tool name plus a dictionary of parameters for
    that tool, following RFC 004's ToolCallAction pattern. The dataclass is
    declared with ``kw_only=True``, so every field must be passed by keyword.

    This class only declares the fields; it performs no validation of
    ``tool_name`` or of the keys/values in ``parameters``.

    Attributes:
        tool_name: Name of the interaction to perform. Required. The names
            used in the examples below are "tap", "swipe", "type_text",
            "press_button" and "touch_event".
        parameters: Arguments for the selected tool, keyed by parameter name.
            Defaults to a new empty dict for each instance.

    Examples:
        # Tap at specific coordinates
        AndroidAction(
            tool_name="tap",
            parameters={"x": 0.5, "y": 0.3}
        )

        # Swipe gesture
        AndroidAction(
            tool_name="swipe",
            parameters={"x1": 0.2, "y1": 0.5, "x2": 0.8, "y2": 0.5, "duration_ms": 300}
        )

        # Type text
        AndroidAction(
            tool_name="type_text",
            parameters={"text": "Hello World"}
        )

        # Press system button
        AndroidAction(
            tool_name="press_button",
            parameters={"button": "HOME"}  # HOME, BACK, MENU, etc.
        )

        # Raw touch event (for advanced control)
        AndroidAction(
            tool_name="touch_event",
            parameters={
                "action_type": "TOUCH",  # TOUCH, LIFT, REPEAT
                "touch_position": [0.5, 0.3],  # normalized [0, 1]
                "duration_ms": 100
            }
        )

    Note:
        Only ``touch_position`` is explicitly described as normalized to
        [0, 1]. The tap/swipe coordinates in the examples look normalized as
        well, but that is an assumption to confirm against the code that
        executes these actions.
    """

    # Action type: "tap", "swipe", "type_text", "press_button", "touch_event".
    # No default, so callers must always supply it.
    tool_name: str
    # Tool-specific arguments. default_factory gives each instance its own
    # dict rather than sharing one mutable default across instances.
    parameters: Dict[str, Any] = field(default_factory=dict)


@dataclass(kw_only=True)
class AndroidObservation(Observation):
    """Observation from the Android environment.

    Contains the current screen state as an image plus additional metadata
    about the Android system and task state. The dataclass is declared with
    ``kw_only=True``, so every field must be passed by keyword. Fields
    inherited from ``Observation`` are defined outside this module and are
    not described here.

    Attributes:
        screen_image: Base64-encoded image (JPEG or PNG) of current screen.
            Required.
        screen_width: Width of the screen in pixels. Required.
        screen_height: Height of the screen in pixels. Required.
        timestamp_ms: Timestamp of the observation in milliseconds.
            Defaults to 0. The reference point (epoch vs. episode start) is
            not specified in this module.
        orientation: Screen orientation (0, 90, 180, 270 degrees).
            Defaults to 0.
        extras: Additional task-specific information (e.g., accessibility tree,
                current app package, system state). Defaults to a new empty
                dict for each instance.
        pixels_shape: Optional shape of the raw pixel array as
            (height, width, channels), kept for reference. Defaults to None.
    """

    # Required screen payload and dimensions (no defaults).
    screen_image: str  # Base64-encoded image
    screen_width: int
    screen_height: int

    # Optional metadata; both default to 0.
    timestamp_ms: int = 0
    orientation: int = 0  # degrees: 0, 90, 180, 270

    # Task extras from android_env (accessibility info, package names, etc.)
    # default_factory gives each instance its own dict.
    extras: Dict[str, Any] = field(default_factory=dict)

    # Optional: Include raw pixels shape for reference
    pixels_shape: Optional[tuple[int, int, int]] = None  # (height, width, channels)