"""
Web agent backend - autonomous agent with web tools (search, read, screenshot).

Uses the same tool-calling loop pattern as code.py:
LLM call → parse tool_calls → execute → update history → repeat
"""
import json
import logging
import re
from typing import List, Dict, Optional

from .tools import (
    web_search, read_url,
    execute_web_search, execute_read_url,
    extract_and_download_images,
)
from .image import resize_image_for_vlm

logger = logging.getLogger(__name__)

TOOLS = [web_search, read_url]
MAX_TURNS = 20

# Matches a final answer wrapped in <result>...</result> (case-insensitive,
# may span multiple lines). Compiled once because it is used on every turn.
_RESULT_TAG_RE = re.compile(r'<result>(.*?)</result>', re.DOTALL | re.IGNORECASE)


def execute_tool(tool_name: str, args: dict, serper_key: str) -> dict:
    """
    Execute a tool by name and return result dict.

    Args:
        tool_name: Name of the tool requested by the LLM.
        args: Decoded tool arguments; missing keys fall back to defaults.
        serper_key: Key forwarded to execute_web_search.

    Returns:
        dict with keys:
        - "content": str result for the LLM
        - "image": optional base64 PNG (for screenshot_url)
        - "display": dict with display-friendly data for frontend

        Unknown tool names produce an "Unknown tool" content string with an
        error display payload instead of raising.
    """
    if tool_name == "web_search":
        query = args.get("query", "")
        num_results = args.get("num_results", 5)
        result_str = execute_web_search(query, serper_key, num_results)
        return {
            "content": result_str,
            "display": {"type": "search", "query": query, "results": result_str}
        }

    if tool_name == "read_url":
        url = args.get("url", "")
        chunk = args.get("chunk", 0)
        use_html = args.get("use_html", False)
        content = execute_read_url(url, chunk=chunk, use_html=use_html)
        return {
            "content": content,
            "display": {"type": "page", "url": url, "length": len(content), "markdown": content}
        }

    if tool_name == "screenshot_url":
        # execute_screenshot_url is not part of this module's top-level
        # imports, so resolve it lazily. If .tools does not provide it, report
        # the tool as unknown rather than crashing with a NameError.
        try:
            from .tools import execute_screenshot_url
        except ImportError:
            logger.warning("screenshot_url requested but no screenshot backend is available")
            return {"content": f"Unknown tool: {tool_name}", "display": {"type": "error"}}

        url = args.get("url", "")
        base64_png = execute_screenshot_url(url)
        if base64_png:
            return {
                "content": "Screenshot captured successfully. The image is attached.",
                "image": base64_png,
                "display": {"type": "screenshot", "url": url}
            }
        return {
            "content": f"Failed to take screenshot of {url}. The page may require JavaScript or be inaccessible.",
            "display": {"type": "screenshot_error", "url": url}
        }

    return {"content": f"Unknown tool: {tool_name}", "display": {"type": "error"}}


def _tool_call_message(content: str, tool_call) -> dict:
    """Build the assistant history entry that records a single tool call."""
    return {
        "role": "assistant",
        "content": content,
        "tool_calls": [{
            "id": tool_call.id,
            "type": "function",
            "function": {
                "name": tool_call.function.name,
                "arguments": tool_call.function.arguments,
            },
        }]
    }


def _image_part(image_b64: str) -> dict:
    """Wrap a base64 image as an image_url content part after resize_image_for_vlm."""
    vlm_image = resize_image_for_vlm(image_b64)
    return {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{vlm_image}"}
    }


def _build_tool_response_content(func_name: str, result: dict, multimodal: bool):
    """Return the tool message content: plain text, or text plus images when multimodal."""
    text = result["content"]
    if not multimodal:
        return text

    if result.get("image"):
        # Send screenshot as multimodal content so VLM can see it
        return [{"type": "text", "text": text}, _image_part(result["image"])]

    if func_name == "read_url":
        # Extract and include page images so VLM can see them
        page_images = extract_and_download_images(text)
        if page_images:
            return [{"type": "text", "text": text}] + [_image_part(img) for img in page_images]

    return text


def stream_agent_execution(
    client,
    model: str,
    messages: List[Dict],
    serper_key: str,
    extra_params: Optional[Dict] = None,
    abort_event=None,
    multimodal: bool = False
):
    """
    Run the agent tool-calling loop.

    Each turn calls the LLM, forwards its events, executes any requested
    tools and appends the exchange to ``messages`` (mutated in place). The
    loop ends when the model replies without tool calls, when MAX_TURNS is
    reached, or on abort/error. If no <result> block was produced, the agent
    is nudged for one before the final ``done`` event.

    Args:
        client: LLM client passed through to call_llm.
        model: Model name passed through to call_llm.
        messages: Conversation history; assistant and tool messages are appended.
        serper_key: Key forwarded to the web_search tool.
        extra_params: Optional extra parameters passed through to call_llm.
        abort_event: Optional object with ``is_set()``; checked before each
            turn and between tool calls.
        multimodal: When true, screenshots and page images are attached to
            tool responses as image_url parts.

    Yields dicts with SSE event types:
    - thinking: { content }
    - content: { content }
    - tool_start: { tool, args, tool_call_id, arguments, thinking }
    - tool_result: { tool, tool_call_id, result, response, image? }
    - result_preview: { content }
    - result: { content }
    - generating: {}
    - retry: { attempt, max_attempts, delay, message }
    - error: { content }
    - aborted: {}
    - done: {}
    """
    from .agents import call_llm

    turns = 0
    done = False
    has_result = False
    debug_call_number = 0

    while not done and turns < MAX_TURNS:
        # Check abort before each turn
        if abort_event and abort_event.is_set():
            yield {"type": "aborted"}
            return

        turns += 1

        # LLM call with retries and debug events
        response = None
        for event in call_llm(client, model, messages, tools=TOOLS, extra_params=extra_params,
                              abort_event=abort_event, call_number=debug_call_number):
            if "_response" in event:
                response = event["_response"]
                debug_call_number = event["_call_number"]
            else:
                yield event
                if event.get("type") in ("error", "aborted"):
                    return

        if response is None:
            return

        # --- Parse response ---
        assistant_message = response.choices[0].message
        content = assistant_message.content or ""
        tool_calls = assistant_message.tool_calls or []

        # Check for <result> tags
        result_match = _RESULT_TAG_RE.search(content)
        result_content = None
        thinking_content = content

        if result_match:
            result_content = result_match.group(1).strip()
            # Everything outside the <result> block is treated as thinking/content
            thinking_content = _RESULT_TAG_RE.sub('', content).strip()

        # Send thinking/content
        if thinking_content.strip():
            if tool_calls:
                yield {"type": "thinking", "content": thinking_content}
            else:
                yield {"type": "content", "content": thinking_content}

        # Send result preview
        if result_content:
            yield {"type": "result_preview", "content": result_content}

        # --- Handle tool calls ---
        if tool_calls:
            for tool_call in tool_calls:
                # Check abort between tool calls
                if abort_event and abort_event.is_set():
                    yield {"type": "aborted"}
                    return

                func_name = tool_call.function.name

                # Parse arguments. execute_tool needs a dict, so a missing
                # payload (TypeError) or a non-object JSON value is reported
                # the same way as malformed JSON.
                try:
                    args = json.loads(tool_call.function.arguments)
                    if not isinstance(args, dict):
                        raise ValueError("tool arguments must be a JSON object")
                except (TypeError, ValueError) as e:  # JSONDecodeError is a ValueError
                    output = f"Error parsing arguments: {e}"
                    logger.warning("Could not parse arguments for tool %s: %s", func_name, e)
                    # Record the failed call so the model sees the error next turn
                    messages.append(_tool_call_message(content, tool_call))
                    messages.append({"role": "tool", "tool_call_id": tool_call.id, "content": output})
                    yield {"type": "error", "content": output}
                    continue

                # Signal tool start (include IDs for history reconstruction)
                yield {
                    "type": "tool_start",
                    "tool": func_name,
                    "args": args,
                    "tool_call_id": tool_call.id,
                    "arguments": tool_call.function.arguments,
                    "thinking": content,
                }

                # Execute tool
                result = execute_tool(func_name, args, serper_key)

                # Build tool response content for LLM
                tool_response_content = _build_tool_response_content(func_name, result, multimodal)

                # Add to message history
                messages.append(_tool_call_message(content, tool_call))
                messages.append({
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": tool_response_content
                })

                # Signal tool result to frontend (include response for history)
                tool_result_event = {
                    "type": "tool_result",
                    "tool": func_name,
                    "tool_call_id": tool_call.id,
                    "result": result.get("display", {}),
                    "response": result.get("content", ""),
                }
                if result.get("image"):
                    tool_result_event["image"] = result["image"]
                yield tool_result_event

        else:
            # No tool calls — we're done
            messages.append({"role": "assistant", "content": content})
            done = True

        # Send result if found
        if result_content:
            has_result = True
            yield {"type": "result", "content": result_content}

        # Signal between-turn processing
        if not done:
            yield {"type": "generating"}

    # If agent finished without a <result>, nudge it for one
    if not has_result:
        from .agents import nudge_for_result
        yield from nudge_for_result(client, model, messages, extra_params=extra_params,
                                    call_number=debug_call_number)

    yield {"type": "done"}