File size: 9,445 Bytes
2a5ead4
a58e6a3
2a5ead4
 
 
 
 
 
 
 
 
c0c69f5
4f207be
 
583c5ee
2a5ead4
583c5ee
2a5ead4
 
 
4f207be
2a5ead4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f207be
59d77e5
 
2a5ead4
 
a58e6a3
2a5ead4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9bcfe23
583c5ee
 
2a5ead4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d3d041
 
2a5ead4
 
9e3f857
0d3d041
2a5ead4
 
9bcfe23
 
 
 
 
2a5ead4
 
0d3d041
2a5ead4
0d3d041
 
 
 
 
 
 
2a5ead4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9bcfe23
 
 
 
 
2a5ead4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
583c5ee
 
 
 
2a5ead4
 
583c5ee
2a5ead4
583c5ee
 
 
 
 
 
 
 
 
 
 
 
 
2a5ead4
 
 
 
 
 
 
 
 
 
 
 
583c5ee
2a5ead4
 
 
 
 
 
 
 
583c5ee
2a5ead4
 
 
 
 
 
 
 
 
 
 
 
9e3f857
2a5ead4
 
 
 
 
 
9e3f857
 
583c5ee
4e3db9c
9e3f857
2a5ead4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
"""
Web agent backend - autonomous agent with web tools (search, read, screenshot).

Uses the same tool-calling loop pattern as code.py:
  LLM call → parse tool_calls → execute → update history → repeat
"""
import json
import logging
import re
from typing import List, Dict, Optional

from .tools import (
    web_search, read_url,
    execute_web_search, execute_read_url,
    extract_and_download_images,
)
from .image import resize_image_for_vlm

logger = logging.getLogger(__name__)

# Tool schemas advertised to the LLM (function-calling definitions from .tools).
# NOTE(review): execute_tool also handles "screenshot_url", but that tool is not
# listed here, so the model cannot request it — confirm whether intentional.
TOOLS = [web_search, read_url]

# Hard cap on agent-loop iterations to prevent runaway tool-calling.
MAX_TURNS = 20


def execute_tool(tool_name: str, args: dict, serper_key: str) -> dict:
    """
    Execute a tool by name and return a result dict.

    Args:
        tool_name: One of "web_search", "read_url", "screenshot_url".
        args: Parsed tool-call arguments supplied by the LLM.
        serper_key: API key forwarded to the search backend.

    Returns:
        dict with keys:
        - "content": str result for the LLM
        - "image": optional base64 PNG (for screenshot_url)
        - "display": dict with display-friendly data for the frontend
    """
    if tool_name == "web_search":
        query = args.get("query", "")
        num_results = args.get("num_results", 5)
        result_str = execute_web_search(query, serper_key, num_results)
        return {
            "content": result_str,
            "display": {"type": "search", "query": query, "results": result_str}
        }

    elif tool_name == "read_url":
        url = args.get("url", "")
        chunk = args.get("chunk", 0)
        use_html = args.get("use_html", False)
        content = execute_read_url(url, chunk=chunk, use_html=use_html)
        return {
            "content": content,
            "display": {"type": "page", "url": url, "length": len(content), "markdown": content}
        }

    elif tool_name == "screenshot_url":
        url = args.get("url", "")
        # BUG FIX: execute_screenshot_url is not imported at module level
        # (see the .tools import block), so reaching this branch raised
        # NameError. Import lazily and degrade to a structured error when the
        # helper is unavailable in this build.
        try:
            from .tools import execute_screenshot_url
        except ImportError:
            return {
                "content": f"Failed to take screenshot of {url}. The page may require JavaScript or be inaccessible.",
                "display": {"type": "screenshot_error", "url": url}
            }
        base64_png = execute_screenshot_url(url)
        if base64_png:
            return {
                "content": "Screenshot captured successfully. The image is attached.",
                "image": base64_png,
                "display": {"type": "screenshot", "url": url}
            }
        else:
            return {
                "content": f"Failed to take screenshot of {url}. The page may require JavaScript or be inaccessible.",
                "display": {"type": "screenshot_error", "url": url}
            }

    # Unknown tool name from the model — report it back rather than raise.
    return {"content": f"Unknown tool: {tool_name}", "display": {"type": "error"}}


def stream_agent_execution(
    client,
    model: str,
    messages: List[Dict],
    serper_key: str,
    extra_params: Optional[Dict] = None,
    abort_event=None,
    multimodal: bool = False
):
    """
    Run the agent tool-calling loop.

    Mutates `messages` in place (appends assistant/tool turns) and streams
    SSE-style event dicts to the caller. The loop ends when the model stops
    requesting tools or MAX_TURNS is reached; if no <result> tag was ever
    produced, nudge_for_result is invoked for a final answer.

    Args:
        client: LLM client passed through to call_llm (opaque here).
        model: Model identifier for call_llm.
        messages: Chat history in OpenAI message format; modified in place.
        serper_key: Search API key forwarded to execute_tool.
        extra_params: Optional extra kwargs forwarded to call_llm.
        abort_event: Optional threading.Event-like object; when set, the loop
            yields {"type": "aborted"} and returns.
        multimodal: When True, screenshots / page images are attached to tool
            responses as image_url parts so a VLM can see them.

    Yields dicts with SSE event types:
      - thinking: { content }
      - content: { content }
      - tool_start: { tool, args }
      - tool_result: { tool, result, image? }
      - result_preview: { content }
      - result: { content }
      - generating: {}
      - retry: { attempt, max_attempts, delay, message }
      - error: { content }
      - done: {}
    """
    # Local import — presumably avoids a circular import with .agents; confirm.
    from .agents import call_llm

    turns = 0
    done = False
    has_result = False          # True once a <result> tag has been streamed
    debug_call_number = 0       # carried across turns so call_llm numbering is continuous

    while not done and turns < MAX_TURNS:
        # Check abort before each turn
        if abort_event and abort_event.is_set():
            yield {"type": "aborted"}
            return

        turns += 1

        # LLM call with retries and debug events. call_llm yields passthrough
        # events (retry/error/aborted/...) and finally a dict carrying the
        # actual response under the private "_response" key.
        response = None
        for event in call_llm(client, model, messages, tools=TOOLS, extra_params=extra_params, abort_event=abort_event, call_number=debug_call_number):
            if "_response" in event:
                response = event["_response"]
                debug_call_number = event["_call_number"]
            else:
                yield event
                if event.get("type") in ("error", "aborted"):
                    return

        # call_llm ended without producing a response (e.g. exhausted retries).
        if response is None:
            return

        # --- Parse response ---
        assistant_message = response.choices[0].message
        content = assistant_message.content or ""
        tool_calls = assistant_message.tool_calls or []

        # Check for <result> tags (case-insensitive, may span lines)
        result_match = re.search(r'<result>(.*?)</result>', content, re.DOTALL | re.IGNORECASE)
        result_content = None
        thinking_content = content

        if result_match:
            result_content = result_match.group(1).strip()
            # Strip the <result> block so only the reasoning text remains
            thinking_content = re.sub(r'<result>.*?</result>', '', content, flags=re.DOTALL | re.IGNORECASE).strip()

        # Send thinking/content: text alongside tool calls is "thinking";
        # text with no tool calls is the final "content".
        if thinking_content.strip():
            if tool_calls:
                yield {"type": "thinking", "content": thinking_content}
            else:
                yield {"type": "content", "content": thinking_content}

        # Send result preview (final "result" event is emitted after tools run)
        if result_content:
            yield {"type": "result_preview", "content": result_content}

        # --- Handle tool calls ---
        if tool_calls:
            for tool_call in tool_calls:
                # Check abort between tool calls
                if abort_event and abort_event.is_set():
                    yield {"type": "aborted"}
                    return

                func_name = tool_call.function.name

                # Parse arguments; on bad JSON, record the error as the tool
                # output so the model can recover, and move to the next call.
                try:
                    args = json.loads(tool_call.function.arguments)
                except json.JSONDecodeError as e:
                    output = f"Error parsing arguments: {e}"
                    messages.append({
                        "role": "assistant",
                        "content": content,
                        "tool_calls": [{"id": tool_call.id, "type": "function", "function": {"name": func_name, "arguments": tool_call.function.arguments}}]
                    })
                    messages.append({"role": "tool", "tool_call_id": tool_call.id, "content": output})
                    yield {"type": "error", "content": output}
                    continue

                # Signal tool start (include IDs for history reconstruction)
                yield {
                    "type": "tool_start",
                    "tool": func_name,
                    "args": args,
                    "tool_call_id": tool_call.id,
                    "arguments": tool_call.function.arguments,
                    "thinking": content,
                }

                # Execute tool
                result = execute_tool(func_name, args, serper_key)

                # Build tool response content for LLM
                if result.get("image") and multimodal:
                    # Send screenshot as multimodal content so VLM can see it
                    vlm_image = resize_image_for_vlm(result["image"])
                    tool_response_content = [
                        {"type": "text", "text": result["content"]},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{vlm_image}"}}
                    ]
                elif func_name == "read_url" and multimodal:
                    # Extract and include page images so VLM can see them
                    page_images = extract_and_download_images(result["content"])
                    if page_images:
                        tool_response_content = [{"type": "text", "text": result["content"]}]
                        for img_b64 in page_images:
                            vlm_img = resize_image_for_vlm(img_b64)
                            tool_response_content.append({
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{vlm_img}"}
                            })
                    else:
                        tool_response_content = result["content"]
                else:
                    tool_response_content = result["content"]

                # Add to message history.
                # NOTE(review): one assistant message is appended per tool_call,
                # each repeating the full `content` and carrying a single
                # tool_call — with parallel tool calls this duplicates the
                # assistant text in history; confirm downstream consumers
                # expect this shape.
                messages.append({
                    "role": "assistant",
                    "content": content,
                    "tool_calls": [{"id": tool_call.id, "type": "function", "function": {"name": func_name, "arguments": tool_call.function.arguments}}]
                })
                messages.append({
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": tool_response_content
                })

                # Signal tool result to frontend (include response for history)
                tool_result_event = {
                    "type": "tool_result",
                    "tool": func_name,
                    "tool_call_id": tool_call.id,
                    "result": result.get("display", {}),
                    "response": result.get("content", ""),
                }
                if result.get("image"):
                    tool_result_event["image"] = result["image"]
                yield tool_result_event

        else:
            # No tool calls — we're done
            messages.append({"role": "assistant", "content": content})
            done = True

        # Send result if found
        if result_content:
            has_result = True
            yield {"type": "result", "content": result_content}

        # Signal between-turn processing
        if not done:
            yield {"type": "generating"}

    # If agent finished without a <result>, nudge it for one
    if not has_result:
        from .agents import nudge_for_result
        yield from nudge_for_result(client, model, messages, extra_params=extra_params, call_number=debug_call_number)

    yield {"type": "done"}