File size: 3,912 Bytes
6a42990
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from tinytroupe.agent.mental_faculty import TinyMentalFaculty
from tinytroupe.tools import browser
import textwrap

class BrowserFaculty(TinyMentalFaculty):
    """
    Mental faculty that routes an agent's browser actions to the helpers in
    `tinytroupe.tools.browser` and reports each outcome back via `agent.see`.
    """

    def __init__(self):
        faculty_name = "Browser Navigation"
        super().__init__(faculty_name)

    def process_action(self, agent, action: dict) -> bool:
        """
        Carry out a single browser action on behalf of `agent`.

        The action's "type" selects which browser helper is called. The
        "target" and "content" entries are read with `dict.get`, so a missing
        key reaches the helper as None. Once the helper returns, a textual
        description of what was done is handed to `agent.see`.

        Returns True when the type is one of the browser actions handled here.
        Returns False, without calling the browser or the agent, for any
        other type.
        """
        kind = action.get("type")
        text = action.get("content")
        selector = action.get("target")

        # Each branch performs the browser call first and only then builds the
        # observation, so a failing helper never produces a report.
        if kind == "See":
            shot_location = browser.screenshot()
            observation = f"Took a screenshot and saved it to {shot_location}. I will now analyze the screenshot."
        elif kind == "Click":
            browser.click(selector)
            observation = f"Clicked on element with selector: {selector}"
        elif kind == "Write":
            browser.fill(selector, text)
            observation = f"Typed '{text}' into element with selector: {selector}"
        elif kind == "Submit":
            browser.submit_form(selector)
            observation = f"Submitted form with element: {selector}"
        elif kind == "Wait":
            browser.wait_for_element(selector)
            observation = f"Waited for element: {selector}"
        elif kind == "Scroll":
            browser.scroll_page(text)
            observation = f"Scrolled page {text}"
        elif kind == "Hover":
            browser.hover_element(selector)
            observation = f"Hovered over element: {selector}"
        elif kind == "Keyboard_Key":
            browser.press_key(text)
            observation = f"Pressed key: {text}"
        elif kind == "ScanPage":
            details = browser.get_page_info()
            observation = f"Scanned page and found the following information: {details}"
        else:
            # Not a browser action: signal that nothing was handled.
            return False

        agent.see(observation)
        return True

    def actions_definitions_prompt(self) -> str:
        """
        Build the prompt fragment listing every browser action and the meaning
        of its `target` / `content` fields, with common indentation removed.
        """
        return textwrap.dedent("""
          - See: Take a screenshot of the current page. The `content` will be a placeholder for vision.
          - Click: Click on an element on the page. The `target` should be a CSS selector for the element.
          - Write: Type text into an element on the page. The `target` should be a CSS selector for the element, and `content` should be the text to type.
          - Submit: Submit a form on the page. The `target` should be a CSS selector for a form or an element within a form.
          - Wait: Wait for an element to appear on the page. The `target` should be a CSS selector for the element.
          - Scroll: Scroll the page. The `content` should be 'up' or 'down'.
          - Hover: Hover over an element on the page. The `target` should be a CSS selector for the element.
          - Keyboard_Key: Press a key on the keyboard. The `content` should be the key to press (e.g., 'Enter', 'ArrowDown').
          - ScanPage: Get information about the current page, such as links and form elements.
        """)

    def actions_constraints_prompt(self) -> str:
        """
        Build the prompt fragment advising when to use the browser actions,
        with common indentation removed.
        """
        return textwrap.dedent("""
        - Use See to get a visual representation of the page to help you decide on the next action.
        - Use ScanPage to get a list of interactive elements to help you decide on the next action.
        - Use Click, Write, and other actions to interact with elements on the page to accomplish the task.
        """)