File size: 9,445 Bytes
2a5ead4
a58e6a3
2a5ead4
 
 
 
 
 
 
 
 
c0c69f5
4f207be
 
583c5ee
2a5ead4
583c5ee
2a5ead4
 
 
4f207be
2a5ead4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f207be
59d77e5
 
2a5ead4
 
a58e6a3
2a5ead4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9bcfe23
583c5ee
 
2a5ead4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d3d041
 
2a5ead4
 
9e3f857
0d3d041
2a5ead4
 
9bcfe23
 
 
 
 
2a5ead4
 
0d3d041
2a5ead4
0d3d041
 
 
 
 
 
 
2a5ead4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9bcfe23
 
 
 
 
2a5ead4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
583c5ee
 
 
 
2a5ead4
 
583c5ee
2a5ead4
583c5ee
 
 
 
 
 
 
 
 
 
 
 
 
2a5ead4
 
 
 
 
 
 
 
 
 
 
 
583c5ee
2a5ead4
 
 
 
 
 
 
 
583c5ee
2a5ead4
 
 
 
 
 
 
 
 
 
 
 
9e3f857
2a5ead4
 
 
 
 
 
9e3f857
 
583c5ee
4e3db9c
9e3f857
2a5ead4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
"""
Web agent backend - autonomous agent with web tools (search, read, screenshot).

Uses the same tool-calling loop pattern as code.py:
  LLM call → parse tool_calls → execute → update history → repeat
"""
import json
import logging
import re
from typing import List, Dict, Optional

from .tools import (
    web_search, read_url,
    execute_web_search, execute_read_url,
    extract_and_download_images,
)
from .image import resize_image_for_vlm

logger = logging.getLogger(__name__)

# Tool schemas advertised to the LLM (function-calling definitions from .tools).
# NOTE(review): execute_tool also handles "screenshot_url", but that tool is not
# listed here, so the model cannot request it — confirm whether intentional.
TOOLS = [web_search, read_url]

# Hard cap on agent-loop iterations to prevent runaway tool-calling.
MAX_TURNS = 20


def execute_tool(tool_name: str, args: dict, serper_key: str) -> dict:
    """
    Execute a tool by name and return a result dict.

    Args:
        tool_name: One of "web_search", "read_url", "screenshot_url".
        args: Parsed tool-call arguments supplied by the LLM.
        serper_key: API key forwarded to the search backend.

    Returns:
        dict with keys:
        - "content": str result for the LLM
        - "image": optional base64 PNG (for screenshot_url)
        - "display": dict with display-friendly data for the frontend
    """
    if tool_name == "web_search":
        query = args.get("query", "")
        num_results = args.get("num_results", 5)
        result_str = execute_web_search(query, serper_key, num_results)
        return {
            "content": result_str,
            "display": {"type": "search", "query": query, "results": result_str}
        }

    elif tool_name == "read_url":
        url = args.get("url", "")
        chunk = args.get("chunk", 0)
        use_html = args.get("use_html", False)
        content = execute_read_url(url, chunk=chunk, use_html=use_html)
        return {
            "content": content,
            "display": {"type": "page", "url": url, "length": len(content), "markdown": content}
        }

    elif tool_name == "screenshot_url":
        url = args.get("url", "")
        # BUG FIX: execute_screenshot_url is not imported at module level
        # (see the .tools import block), so reaching this branch raised
        # NameError. Import lazily and degrade to a structured error when the
        # helper is unavailable in this build.
        try:
            from .tools import execute_screenshot_url
        except ImportError:
            return {
                "content": f"Failed to take screenshot of {url}. The page may require JavaScript or be inaccessible.",
                "display": {"type": "screenshot_error", "url": url}
            }
        base64_png = execute_screenshot_url(url)
        if base64_png:
            return {
                "content": "Screenshot captured successfully. The image is attached.",
                "image": base64_png,
                "display": {"type": "screenshot", "url": url}
            }
        else:
            return {
                "content": f"Failed to take screenshot of {url}. The page may require JavaScript or be inaccessible.",
                "display": {"type": "screenshot_error", "url": url}
            }

    # Unknown tool name from the model — report it back rather than raise.
    return {"content": f"Unknown tool: {tool_name}", "display": {"type": "error"}}


def stream_agent_execution(
    client,
    model: str,
    messages: List[Dict],
    serper_key: str,
    extra_params: Optional[Dict] = None,
    abort_event=None,
    multimodal: bool = False
):
    """
    Run the agent tool-calling loop.

    Mutates `messages` in place (appends assistant/tool turns) and streams
    SSE-style event dicts to the caller. The loop ends when the model stops
    requesting tools or MAX_TURNS is reached; if no <result> tag was ever
    produced, nudge_for_result is invoked for a final answer.

    Args:
        client: LLM client passed through to call_llm (opaque here).
        model: Model identifier for call_llm.
        messages: Chat history in OpenAI message format; modified in place.
        serper_key: Search API key forwarded to execute_tool.
        extra_params: Optional extra kwargs forwarded to call_llm.
        abort_event: Optional threading.Event-like object; when set, the loop
            yields {"type": "aborted"} and returns.
        multimodal: When True, screenshots / page images are attached to tool
            responses as image_url parts so a VLM can see them.

    Yields dicts with SSE event types:
      - thinking: { content }
      - content: { content }
      - tool_start: { tool, args }
      - tool_result: { tool, result, image? }
      - result_preview: { content }
      - result: { content }
      - generating: {}
      - retry: { attempt, max_attempts, delay, message }
      - error: { content }
      - done: {}
    """
    # Local import — presumably avoids a circular import with .agents; confirm.
    from .agents import call_llm

    turns = 0
    done = False
    has_result = False          # True once a <result> tag has been streamed
    debug_call_number = 0       # carried across turns so call_llm numbering is continuous

    while not done and turns < MAX_TURNS:
        # Check abort before each turn
        if abort_event and abort_event.is_set():
            yield {"type": "aborted"}
            return

        turns += 1

        # LLM call with retries and debug events. call_llm yields passthrough
        # events (retry/error/aborted/...) and finally a dict carrying the
        # actual response under the private "_response" key.
        response = None
        for event in call_llm(client, model, messages, tools=TOOLS, extra_params=extra_params, abort_event=abort_event, call_number=debug_call_number):
            if "_response" in event:
                response = event["_response"]
                debug_call_number = event["_call_number"]
            else:
                yield event
                if event.get("type") in ("error", "aborted"):
                    return

        # call_llm ended without producing a response (e.g. exhausted retries).
        if response is None:
            return

        # --- Parse response ---
        assistant_message = response.choices[0].message
        content = assistant_message.content or ""
        tool_calls = assistant_message.tool_calls or []

        # Check for <result> tags (case-insensitive, may span lines)
        result_match = re.search(r'<result>(.*?)</result>', content, re.DOTALL | re.IGNORECASE)
        result_content = None
        thinking_content = content

        if result_match:
            result_content = result_match.group(1).strip()
            # Strip the <result> block so only the reasoning text remains
            thinking_content = re.sub(r'<result>.*?</result>', '', content, flags=re.DOTALL | re.IGNORECASE).strip()

        # Send thinking/content: text alongside tool calls is "thinking";
        # text with no tool calls is the final "content".
        if thinking_content.strip():
            if tool_calls:
                yield {"type": "thinking", "content": thinking_content}
            else:
                yield {"type": "content", "content": thinking_content}

        # Send result preview (final "result" event is emitted after tools run)
        if result_content:
            yield {"type": "result_preview", "content": result_content}

        # --- Handle tool calls ---
        if tool_calls:
            for tool_call in tool_calls:
                # Check abort between tool calls
                if abort_event and abort_event.is_set():
                    yield {"type": "aborted"}
                    return

                func_name = tool_call.function.name

                # Parse arguments; on bad JSON, record the error as the tool
                # output so the model can recover, and move to the next call.
                try:
                    args = json.loads(tool_call.function.arguments)
                except json.JSONDecodeError as e:
                    output = f"Error parsing arguments: {e}"
                    messages.append({
                        "role": "assistant",
                        "content": content,
                        "tool_calls": [{"id": tool_call.id, "type": "function", "function": {"name": func_name, "arguments": tool_call.function.arguments}}]
                    })
                    messages.append({"role": "tool", "tool_call_id": tool_call.id, "content": output})
                    yield {"type": "error", "content": output}
                    continue

                # Signal tool start (include IDs for history reconstruction)
                yield {
                    "type": "tool_start",
                    "tool": func_name,
                    "args": args,
                    "tool_call_id": tool_call.id,
                    "arguments": tool_call.function.arguments,
                    "thinking": content,
                }

                # Execute tool
                result = execute_tool(func_name, args, serper_key)

                # Build tool response content for LLM
                if result.get("image") and multimodal:
                    # Send screenshot as multimodal content so VLM can see it
                    vlm_image = resize_image_for_vlm(result["image"])
                    tool_response_content = [
                        {"type": "text", "text": result["content"]},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{vlm_image}"}}
                    ]
                elif func_name == "read_url" and multimodal:
                    # Extract and include page images so VLM can see them
                    page_images = extract_and_download_images(result["content"])
                    if page_images:
                        tool_response_content = [{"type": "text", "text": result["content"]}]
                        for img_b64 in page_images:
                            vlm_img = resize_image_for_vlm(img_b64)
                            tool_response_content.append({
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{vlm_img}"}
                            })
                    else:
                        tool_response_content = result["content"]
                else:
                    tool_response_content = result["content"]

                # Add to message history.
                # NOTE(review): one assistant message is appended per tool_call,
                # each repeating the full `content` and carrying a single
                # tool_call — with parallel tool calls this duplicates the
                # assistant text in history; confirm downstream consumers
                # expect this shape.
                messages.append({
                    "role": "assistant",
                    "content": content,
                    "tool_calls": [{"id": tool_call.id, "type": "function", "function": {"name": func_name, "arguments": tool_call.function.arguments}}]
                })
                messages.append({
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": tool_response_content
                })

                # Signal tool result to frontend (include response for history)
                tool_result_event = {
                    "type": "tool_result",
                    "tool": func_name,
                    "tool_call_id": tool_call.id,
                    "result": result.get("display", {}),
                    "response": result.get("content", ""),
                }
                if result.get("image"):
                    tool_result_event["image"] = result["image"]
                yield tool_result_event

        else:
            # No tool calls — we're done
            messages.append({"role": "assistant", "content": content})
            done = True

        # Send result if found
        if result_content:
            has_result = True
            yield {"type": "result", "content": result_content}

        # Signal between-turn processing
        if not done:
            yield {"type": "generating"}

    # If agent finished without a <result>, nudge it for one
    if not has_result:
        from .agents import nudge_for_result
        yield from nudge_for_result(client, model, messages, extra_params=extra_params, call_number=debug_call_number)

    yield {"type": "done"}