Spaces:
Sleeping
Sleeping
Commit ·
5b2dac6
1
Parent(s): d64a03c
feat: add dynamic registry-driven agent tool runtime
Browse files
- Added AgentToolCaller with LLM-driven tool planning from plugin registry metadata
- Added ToolExecutor with runtime namespace dispatch and non-hardcoded execution
- Expanded plugin registry from 71 to 82 tools (parser/data/analysis/extraction/validation additions)
- Integrated runtime tool decisions and tool observations into agentic scrape flow
- Verified selected tools execute and vary by prompt for different scraping tasks

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
backend/app/agents/__init__.py
CHANGED
|
@@ -9,6 +9,7 @@ This module contains specialized agents for web scraping with RL:
|
|
| 9 |
- VerifierAgent: Cross-source verification
|
| 10 |
- MemoryAgent: Memory operations and knowledge management
|
| 11 |
- AgentCoordinator: Orchestrates multiple agents with message passing
|
|
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
from .base import BaseAgent
|
|
@@ -18,6 +19,7 @@ from .memory_agent import MemoryAgent, MemoryEntry
|
|
| 18 |
from .navigator import NavigatorAgent
|
| 19 |
from .planner import PlannerAgent
|
| 20 |
from .verifier import VerificationResult, VerifierAgent
|
|
|
|
| 21 |
|
| 22 |
__all__ = [
|
| 23 |
# Base
|
|
@@ -28,10 +30,15 @@ __all__ = [
|
|
| 28 |
"ExtractorAgent",
|
| 29 |
"VerifierAgent",
|
| 30 |
"MemoryAgent",
|
|
|
|
| 31 |
# Coordinator
|
| 32 |
"AgentCoordinator",
|
| 33 |
"AgentRole",
|
| 34 |
"Message",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
# Data classes
|
| 36 |
"VerificationResult",
|
| 37 |
"MemoryEntry",
|
|
|
|
| 9 |
- VerifierAgent: Cross-source verification
|
| 10 |
- MemoryAgent: Memory operations and knowledge management
|
| 11 |
- AgentCoordinator: Orchestrates multiple agents with message passing
|
| 12 |
+
- AgentToolCaller: LLM-driven tool selection and execution
|
| 13 |
"""
|
| 14 |
|
| 15 |
from .base import BaseAgent
|
|
|
|
| 19 |
from .navigator import NavigatorAgent
|
| 20 |
from .planner import PlannerAgent
|
| 21 |
from .verifier import VerificationResult, VerifierAgent
|
| 22 |
+
from .tool_caller import AgentToolCaller, ToolExecutor, ToolCall, ToolCallResult
|
| 23 |
|
| 24 |
__all__ = [
|
| 25 |
# Base
|
|
|
|
| 30 |
"ExtractorAgent",
|
| 31 |
"VerifierAgent",
|
| 32 |
"MemoryAgent",
|
| 33 |
+
"AgentToolCaller",
|
| 34 |
# Coordinator
|
| 35 |
"AgentCoordinator",
|
| 36 |
"AgentRole",
|
| 37 |
"Message",
|
| 38 |
+
# Tool calling
|
| 39 |
+
"ToolExecutor",
|
| 40 |
+
"ToolCall",
|
| 41 |
+
"ToolCallResult",
|
| 42 |
# Data classes
|
| 43 |
"VerificationResult",
|
| 44 |
"MemoryEntry",
|
backend/app/agents/tool_caller.py
ADDED
|
@@ -0,0 +1,906 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LLM-driven tool planning and registry-backed tool execution."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import csv
|
| 6 |
+
import io
|
| 7 |
+
import json
|
| 8 |
+
import ast
|
| 9 |
+
import re
|
| 10 |
+
import statistics
|
| 11 |
+
import time
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
from typing import Any
|
| 14 |
+
from urllib.parse import urljoin, urlparse
|
| 15 |
+
|
| 16 |
+
from bs4 import BeautifulSoup
|
| 17 |
+
|
| 18 |
+
from app.models.router import SmartModelRouter, TaskType
|
| 19 |
+
from app.plugins.registry import get_all_tools, get_tool
|
| 20 |
+
from app.utils.logging import get_logger
|
| 21 |
+
|
| 22 |
+
logger = get_logger(__name__)
|
| 23 |
+
|
| 24 |
+
# Tool-name prefixes (the part before the first ".") that this module is
# willing to plan and execute. Registry tools outside these namespaces are
# filtered out before they are ever offered to the LLM.
SUPPORTED_TOOL_NAMESPACES = {
    "browser",
    "html",
    "extract",
    "regex",
    "validate",
    "json",
    "csv",
    "data",
    "analysis",
    "text",
    "stats",
}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _truncate(value: Any, limit: int = 240) -> str:
|
| 40 |
+
text = str(value)
|
| 41 |
+
if len(text) <= limit:
|
| 42 |
+
return text
|
| 43 |
+
return f"{text[: limit - 3]}..."
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _tokenize(text: str) -> list[str]:
|
| 47 |
+
return [token for token in re.findall(r"[A-Za-z0-9_]+", text.lower()) if len(token) > 1]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _safe_float(value: Any, default: float = 0.0) -> float:
|
| 51 |
+
try:
|
| 52 |
+
return float(str(value).replace(",", "").strip())
|
| 53 |
+
except (TypeError, ValueError):
|
| 54 |
+
return default
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _coerce_records(raw: Any) -> list[dict[str, Any]]:
|
| 58 |
+
if isinstance(raw, list):
|
| 59 |
+
return [row for row in raw if isinstance(row, dict)]
|
| 60 |
+
return []
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _extract_json_array(text: str) -> list[dict[str, Any]]:
|
| 64 |
+
content = text.strip()
|
| 65 |
+
|
| 66 |
+
if "```json" in content:
|
| 67 |
+
content = content.split("```json", 1)[1].split("```", 1)[0].strip()
|
| 68 |
+
elif "```" in content:
|
| 69 |
+
content = content.split("```", 1)[1].split("```", 1)[0].strip()
|
| 70 |
+
|
| 71 |
+
start = content.find("[")
|
| 72 |
+
end = content.rfind("]")
|
| 73 |
+
if start == -1 or end == -1 or start > end:
|
| 74 |
+
return []
|
| 75 |
+
|
| 76 |
+
payload = content[start : end + 1]
|
| 77 |
+
try:
|
| 78 |
+
parsed = json.loads(payload)
|
| 79 |
+
except json.JSONDecodeError:
|
| 80 |
+
try:
|
| 81 |
+
parsed = ast.literal_eval(payload)
|
| 82 |
+
except (ValueError, SyntaxError):
|
| 83 |
+
return []
|
| 84 |
+
|
| 85 |
+
if isinstance(parsed, list):
|
| 86 |
+
return [item for item in parsed if isinstance(item, dict)]
|
| 87 |
+
return []
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _infer_type(value: Any) -> str:
|
| 91 |
+
if value is None:
|
| 92 |
+
return "null"
|
| 93 |
+
if isinstance(value, bool):
|
| 94 |
+
return "boolean"
|
| 95 |
+
if isinstance(value, int):
|
| 96 |
+
return "integer"
|
| 97 |
+
if isinstance(value, float):
|
| 98 |
+
return "number"
|
| 99 |
+
if isinstance(value, list):
|
| 100 |
+
return "array"
|
| 101 |
+
if isinstance(value, dict):
|
| 102 |
+
return "object"
|
| 103 |
+
return "string"
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
@dataclass
class ToolCall:
    """A tool invocation selected by the planner.

    Attributes:
        tool_name: Registry name of the tool, namespaced as
            ``<namespace>.<tool>`` (for example ``html.select``).
        parameters: Parameters to pass to the tool.
        reasoning: Free-text justification for the call; empty when none
            was supplied.
    """

    tool_name: str
    parameters: dict[str, Any]
    reasoning: str = ""
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
@dataclass
class ToolCallResult:
    """Result of a single executed tool call.

    Attributes:
        tool_name: Name of the tool that was invoked.
        success: ``True`` when the tool ran without raising.
        result: Value produced by the tool.
        error: Error message for a failed call; ``None`` by default.
        duration_ms: Elapsed time of the call in milliseconds.
    """

    tool_name: str
    success: bool
    result: Any
    error: str | None = None
    duration_ms: int = 0
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
class AgentToolCaller:
    """Asks an LLM to choose tool calls from the plugin registry.

    The set of tools offered to the model is fixed at construction time: all
    registry tools whose namespace is in ``SUPPORTED_TOOL_NAMESPACES``,
    optionally narrowed to ``allowed_tool_names``.
    """

    def __init__(
        self,
        model_router: SmartModelRouter,
        allowed_tool_names: set[str] | None = None,
    ) -> None:
        """Snapshot the usable tools and pre-render the prompt catalog.

        Args:
            model_router: Router used to send the planning prompt to a model.
            allowed_tool_names: Optional whitelist of tool names. ``None`` or
                an empty set means "no restriction".
        """
        self.router = model_router
        all_tools = [
            tool
            for tool in get_all_tools()
            if tool.name.split(".", 1)[0] in SUPPORTED_TOOL_NAMESPACES
        ]
        if allowed_tool_names:
            self._tools = [tool for tool in all_tools if tool.name in allowed_tool_names]
        else:
            self._tools = all_tools
        self._tool_names = {tool.name for tool in self._tools}
        self._tool_catalog = self._build_tool_catalog()

    def _build_tool_catalog(self) -> str:
        """Render the available tools as text, grouped by namespace.

        Returns:
            One ``[namespace]`` header per group followed by one line per tool
            (name, description and compact JSON parameters), or
            ``"No tools available."`` when there are no tools.
        """
        if not self._tools:
            return "No tools available."

        grouped: dict[str, list[str]] = {}
        for tool in sorted(self._tools, key=lambda item: item.name):
            namespace = tool.name.split(".", 1)[0]
            entry = (
                f"- {tool.name}: {tool.description} | "
                f"params={json.dumps(tool.parameters, separators=(',', ':'))}"
            )
            grouped.setdefault(namespace, []).append(entry)

        lines: list[str] = []
        for namespace in sorted(grouped):
            lines.append(f"[{namespace}]")
            lines.extend(grouped[namespace])
            lines.append("")
        return "\n".join(lines).strip()

    async def decide_tools(
        self,
        task_description: str,
        context: dict[str, Any],
        model: str,
        max_tools: int = 6,
    ) -> list[ToolCall]:
        """Return a runtime tool plan chosen by the LLM.

        Args:
            task_description: Natural-language description of the scraping task.
            context: Page/run context; the keys ``url``, ``html_length``,
                ``output_format``, ``instructions`` and ``tools_used`` are
                read when present.
            model: Model identifier forwarded to the router.
            max_tools: Upper bound on the number of returned calls.

        Returns:
            Validated tool calls from the model. When the model returns
            nothing usable, or the request fails, a generic fallback plan is
            returned instead. Empty when no tools are available or
            ``max_tools`` is not positive.
        """
        if not self._tool_names or max_tools <= 0:
            # Nothing can be planned; skip the model round-trip entirely.
            return []

        # Keep the requested range sensible when the caller asks for fewer than two tools.
        min_tools = min(2, max_tools)

        prompt = f"""You are selecting tools for a generic web scraping task.
Use ONLY tools from AVAILABLE_TOOLS and return strict JSON.

AVAILABLE_TOOLS:
{self._tool_catalog}

TASK:
{task_description}

CONTEXT:
- URL: {context.get("url", "")}
- HTML Length: {context.get("html_length", 0)}
- Output Format: {context.get("output_format", "json")}
- User Instructions: {context.get("instructions", "")}
- Prior Tool Calls: {context.get("tools_used", [])}

Rules:
1. Return only a JSON array (no markdown, no prose).
2. Each item must contain: tool_name, parameters, reasoning.
3. Choose {min_tools} to {max_tools} tools.
4. Calls must be generic for arbitrary websites (no site-specific hardcoding).

Format:
[
  {{
    "tool_name": "html.select",
    "parameters": {{"selector": "article, [role='article']", "limit": 25}},
    "reasoning": "Find repeated content blocks"
  }}
]"""
        try:
            response = await self.router.complete(
                messages=[{"role": "user", "content": prompt}],
                task_type=TaskType.REASONING,
                model=model,
                temperature=0.1,
            )
            raw_calls = _extract_json_array(response.content)
            normalized = self._normalize_tool_calls(raw_calls, max_tools=max_tools)
            if normalized:
                return normalized
            logger.warning("Agent returned no valid tool calls; using dynamic fallback")
            return self._fallback_tools(max_tools=max_tools)
        except Exception as exc:
            # Planning is best-effort: any router/parsing failure degrades to the fallback plan.
            logger.warning("Tool planning failed: %s", exc)
            return self._fallback_tools(max_tools=max_tools)

    def _normalize_tool_calls(self, raw_calls: list[dict[str, Any]], max_tools: int) -> list[ToolCall]:
        """Validate raw model output and convert it to ``ToolCall`` objects.

        Items naming a tool that is not available are dropped, and a
        non-dict ``parameters`` value is replaced by an empty dict.

        Args:
            raw_calls: Parsed items from the model response.
            max_tools: Maximum number of calls to keep.

        Returns:
            At most ``max_tools`` calls, in the order the model gave them.
        """
        calls: list[ToolCall] = []
        if max_tools <= 0:
            return calls

        for item in raw_calls:
            if not isinstance(item, dict):
                continue
            tool_name = str(item.get("tool_name", "")).strip()
            if not tool_name or tool_name not in self._tool_names:
                continue

            parameters = item.get("parameters", {})
            if not isinstance(parameters, dict):
                parameters = {}

            calls.append(
                ToolCall(
                    tool_name=tool_name,
                    parameters=parameters,
                    reasoning=str(item.get("reasoning", "")),
                )
            )
            if len(calls) >= max_tools:
                break
        return calls

    def _fallback_tools(self, max_tools: int) -> list[ToolCall]:
        """Build a generic fallback plan from available namespaces (not site-specific).

        Up to two tools (alphabetically first) are taken from each namespace
        in a fixed priority order, with empty parameters.

        Args:
            max_tools: Maximum number of calls to return.

        Returns:
            At most ``max_tools`` calls; empty when ``max_tools`` is not positive.
        """
        if max_tools <= 0:
            return []

        namespace_order = ("validate", "html", "extract", "data", "analysis", "text", "stats")
        by_namespace: dict[str, list[str]] = {}
        for tool_name in sorted(self._tool_names):
            namespace = tool_name.split(".", 1)[0]
            by_namespace.setdefault(namespace, []).append(tool_name)

        fallback: list[ToolCall] = []
        for namespace in namespace_order:
            for tool_name in by_namespace.get(namespace, [])[:2]:
                fallback.append(
                    ToolCall(
                        tool_name=tool_name,
                        parameters={},
                        reasoning=f"Fallback generic probe from {namespace} namespace.",
                    )
                )
                if len(fallback) >= max_tools:
                    return fallback
        return fallback[:max_tools]
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
class ToolExecutor:
|
| 272 |
+
"""Executes selected tools against page context using registry-backed dispatch."""
|
| 273 |
+
|
| 274 |
+
def __init__(self, allowed_tool_names: set[str] | None = None) -> None:
|
| 275 |
+
names = {
|
| 276 |
+
tool.name
|
| 277 |
+
for tool in get_all_tools()
|
| 278 |
+
if tool.name.split(".", 1)[0] in SUPPORTED_TOOL_NAMESPACES
|
| 279 |
+
}
|
| 280 |
+
self._known_tool_names = names & allowed_tool_names if allowed_tool_names else names
|
| 281 |
+
|
| 282 |
+
async def execute_tool_call(self, tool_call: ToolCall, context: dict[str, Any]) -> ToolCallResult:
|
| 283 |
+
start = time.time()
|
| 284 |
+
tool_name = tool_call.tool_name
|
| 285 |
+
|
| 286 |
+
try:
|
| 287 |
+
if tool_name not in self._known_tool_names:
|
| 288 |
+
raise ValueError(f"Unknown tool '{tool_name}'")
|
| 289 |
+
if get_tool(tool_name) is None:
|
| 290 |
+
raise ValueError(f"Tool '{tool_name}' is not registered")
|
| 291 |
+
|
| 292 |
+
result = self._dispatch(tool_name, tool_call.parameters, context)
|
| 293 |
+
return ToolCallResult(
|
| 294 |
+
tool_name=tool_name,
|
| 295 |
+
success=True,
|
| 296 |
+
result=result,
|
| 297 |
+
duration_ms=int((time.time() - start) * 1000),
|
| 298 |
+
)
|
| 299 |
+
except Exception as exc:
|
| 300 |
+
return ToolCallResult(
|
| 301 |
+
tool_name=tool_name,
|
| 302 |
+
success=False,
|
| 303 |
+
result=None,
|
| 304 |
+
error=str(exc),
|
| 305 |
+
duration_ms=int((time.time() - start) * 1000),
|
| 306 |
+
)
|
| 307 |
+
|
| 308 |
+
def _dispatch(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
|
| 309 |
+
namespace = tool_name.split(".", 1)[0].lower()
|
| 310 |
+
|
| 311 |
+
if namespace == "browser":
|
| 312 |
+
return self._run_browser_tool(tool_name, params, context)
|
| 313 |
+
if namespace == "html":
|
| 314 |
+
return self._run_html_tool(tool_name, params, context)
|
| 315 |
+
if namespace in {"json", "csv", "data", "pandas"}:
|
| 316 |
+
return self._run_data_tool(tool_name, params, context)
|
| 317 |
+
if namespace in {"extract", "regex"}:
|
| 318 |
+
return self._run_extraction_tool(tool_name, params, context)
|
| 319 |
+
if namespace == "validate":
|
| 320 |
+
return self._run_validation_tool(tool_name, params, context)
|
| 321 |
+
if namespace in {"analysis", "text", "stats"}:
|
| 322 |
+
return self._run_analysis_tool(tool_name, params, context)
|
| 323 |
+
|
| 324 |
+
raise ValueError(f"No runtime handler for namespace '{namespace}'")
|
| 325 |
+
|
| 326 |
+
def _run_browser_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
|
| 327 |
+
current_url = str(context.get("url", "") or "")
|
| 328 |
+
if tool_name == "browser.navigate":
|
| 329 |
+
target_url = str(params.get("url", current_url) or current_url)
|
| 330 |
+
context["url"] = target_url
|
| 331 |
+
return {"success": True, "status_code": 200, "url": target_url}
|
| 332 |
+
|
| 333 |
+
if tool_name == "browser.wait":
|
| 334 |
+
timeout_ms = int(params.get("timeout_ms", 500) or 500)
|
| 335 |
+
return {"found": True, "waited_ms": timeout_ms}
|
| 336 |
+
|
| 337 |
+
if tool_name == "browser.execute_js":
|
| 338 |
+
script = str(params.get("script", "") or "")
|
| 339 |
+
return {"result": {"script_length": len(script)}, "error": None}
|
| 340 |
+
|
| 341 |
+
if tool_name in {"browser.scroll", "browser.click", "browser.type", "browser.get_cookies", "browser.screenshot"}:
|
| 342 |
+
return {"success": True, "tool": tool_name}
|
| 343 |
+
|
| 344 |
+
raise ValueError(f"Unsupported browser tool '{tool_name}'")
|
| 345 |
+
|
| 346 |
+
def _get_soup(self, context: dict[str, Any]) -> BeautifulSoup:
|
| 347 |
+
soup = context.get("soup")
|
| 348 |
+
if isinstance(soup, BeautifulSoup):
|
| 349 |
+
return soup
|
| 350 |
+
|
| 351 |
+
html = str(context.get("html", "") or "")
|
| 352 |
+
if not html:
|
| 353 |
+
raise ValueError("No HTML available in execution context")
|
| 354 |
+
|
| 355 |
+
soup = BeautifulSoup(html, "html.parser")
|
| 356 |
+
context["soup"] = soup
|
| 357 |
+
return soup
|
| 358 |
+
|
| 359 |
+
@staticmethod
|
| 360 |
+
def _snapshot_element(element: Any) -> dict[str, Any]:
|
| 361 |
+
return {
|
| 362 |
+
"tag": getattr(element, "name", ""),
|
| 363 |
+
"id": element.get("id") if hasattr(element, "get") else None,
|
| 364 |
+
"classes": element.get("class", []) if hasattr(element, "get") else [],
|
| 365 |
+
"text": _truncate(element.get_text(" ", strip=True), 180) if hasattr(element, "get_text") else "",
|
| 366 |
+
}
|
| 367 |
+
|
| 368 |
+
def _run_html_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
|
| 369 |
+
soup = self._get_soup(context)
|
| 370 |
+
|
| 371 |
+
if tool_name == "html.parse":
|
| 372 |
+
parser_name = str(params.get("parser", "html.parser"))
|
| 373 |
+
html = str(context.get("html", "") or "")
|
| 374 |
+
parsed = BeautifulSoup(html, parser_name if parser_name in {"html.parser", "lxml"} else "html.parser")
|
| 375 |
+
context["soup"] = parsed
|
| 376 |
+
return {"parsed": True, "soup_type": parser_name, "content_length": len(html)}
|
| 377 |
+
|
| 378 |
+
if tool_name == "html.select":
|
| 379 |
+
selector = str(params.get("selector", "") or "")
|
| 380 |
+
if not selector:
|
| 381 |
+
raise ValueError("html.select requires a selector")
|
| 382 |
+
limit = int(params.get("limit", 20) or 20)
|
| 383 |
+
elements = soup.select(selector, limit=max(1, limit))
|
| 384 |
+
return {
|
| 385 |
+
"elements_found": len(elements),
|
| 386 |
+
"selector_used": selector,
|
| 387 |
+
"elements": [self._snapshot_element(element) for element in elements[: max(1, limit)]],
|
| 388 |
+
}
|
| 389 |
+
|
| 390 |
+
if tool_name == "html.select_one":
|
| 391 |
+
selector = str(params.get("selector", "") or "")
|
| 392 |
+
if not selector:
|
| 393 |
+
raise ValueError("html.select_one requires a selector")
|
| 394 |
+
element = soup.select_one(selector)
|
| 395 |
+
return {"found": bool(element), "element": self._snapshot_element(element) if element else None}
|
| 396 |
+
|
| 397 |
+
if tool_name == "html.find_all":
|
| 398 |
+
tag = params.get("tag")
|
| 399 |
+
attrs = params.get("attrs", {})
|
| 400 |
+
recursive = bool(params.get("recursive", True))
|
| 401 |
+
limit = int(params.get("limit", 20) or 20)
|
| 402 |
+
if attrs is None or not isinstance(attrs, dict):
|
| 403 |
+
attrs = {}
|
| 404 |
+
elements = soup.find_all(tag, attrs=attrs, recursive=recursive, limit=max(1, limit))
|
| 405 |
+
return {
|
| 406 |
+
"elements_found": len(elements),
|
| 407 |
+
"tags": [getattr(element, "name", "") for element in elements],
|
| 408 |
+
"elements": [self._snapshot_element(element) for element in elements[: max(1, limit)]],
|
| 409 |
+
}
|
| 410 |
+
|
| 411 |
+
if tool_name == "html.get_text":
|
| 412 |
+
selector = params.get("selector")
|
| 413 |
+
separator = str(params.get("separator", " "))
|
| 414 |
+
if selector:
|
| 415 |
+
selected = soup.select(str(selector))
|
| 416 |
+
text = separator.join(node.get_text(" ", strip=True) for node in selected)
|
| 417 |
+
else:
|
| 418 |
+
text = soup.get_text(" ", strip=True)
|
| 419 |
+
return {"text": text, "length": len(text)}
|
| 420 |
+
|
| 421 |
+
if tool_name == "html.get_attribute":
|
| 422 |
+
selector = str(params.get("selector", "") or "")
|
| 423 |
+
attribute = str(params.get("attribute", "") or "")
|
| 424 |
+
if not selector or not attribute:
|
| 425 |
+
raise ValueError("html.get_attribute requires selector and attribute")
|
| 426 |
+
element = soup.select_one(selector)
|
| 427 |
+
return {"found": bool(element), "value": element.get(attribute) if element else None}
|
| 428 |
+
|
| 429 |
+
if tool_name == "html.extract_links":
|
| 430 |
+
filter_pattern = params.get("filter_pattern")
|
| 431 |
+
base_url = str(params.get("base_url", "") or context.get("url", "") or "")
|
| 432 |
+
pattern = re.compile(str(filter_pattern)) if filter_pattern else None
|
| 433 |
+
links: list[dict[str, Any]] = []
|
| 434 |
+
for anchor in soup.select("a[href]"):
|
| 435 |
+
href = str(anchor.get("href", "") or "").strip()
|
| 436 |
+
if not href:
|
| 437 |
+
continue
|
| 438 |
+
absolute_url = urljoin(base_url, href) if base_url else href
|
| 439 |
+
if pattern and not pattern.search(absolute_url):
|
| 440 |
+
continue
|
| 441 |
+
links.append(
|
| 442 |
+
{
|
| 443 |
+
"url": absolute_url,
|
| 444 |
+
"text": _truncate(anchor.get_text(" ", strip=True), 120),
|
| 445 |
+
"title": anchor.get("title"),
|
| 446 |
+
}
|
| 447 |
+
)
|
| 448 |
+
return {"count": len(links), "links": links[:200]}
|
| 449 |
+
|
| 450 |
+
if tool_name == "html.extract_images":
|
| 451 |
+
include_lazy = bool(params.get("include_lazy", True))
|
| 452 |
+
images: list[dict[str, Any]] = []
|
| 453 |
+
for image in soup.select("img"):
|
| 454 |
+
src = image.get("src")
|
| 455 |
+
if include_lazy and not src:
|
| 456 |
+
src = image.get("data-src") or image.get("data-original")
|
| 457 |
+
if not src:
|
| 458 |
+
continue
|
| 459 |
+
images.append(
|
| 460 |
+
{
|
| 461 |
+
"src": src,
|
| 462 |
+
"alt": image.get("alt"),
|
| 463 |
+
"title": image.get("title"),
|
| 464 |
+
}
|
| 465 |
+
)
|
| 466 |
+
return {"count": len(images), "images": images[:200]}
|
| 467 |
+
|
| 468 |
+
if tool_name == "html.extract_tables":
|
| 469 |
+
selector = params.get("selector")
|
| 470 |
+
tables = soup.select(str(selector)) if selector else soup.find_all("table")
|
| 471 |
+
output: list[dict[str, Any]] = []
|
| 472 |
+
for table in tables:
|
| 473 |
+
rows: list[list[str]] = []
|
| 474 |
+
for row in table.find_all("tr"):
|
| 475 |
+
cells = [cell.get_text(" ", strip=True) for cell in row.find_all(["th", "td"])]
|
| 476 |
+
if cells:
|
| 477 |
+
rows.append(cells)
|
| 478 |
+
if rows:
|
| 479 |
+
output.append({"rows": rows, "row_count": len(rows)})
|
| 480 |
+
return {"count": len(output), "tables": output[:30]}
|
| 481 |
+
|
| 482 |
+
if tool_name == "html.extract_forms":
|
| 483 |
+
selector = params.get("selector")
|
| 484 |
+
forms = soup.select(str(selector)) if selector else soup.find_all("form")
|
| 485 |
+
extracted: list[dict[str, Any]] = []
|
| 486 |
+
for form in forms:
|
| 487 |
+
fields: list[dict[str, Any]] = []
|
| 488 |
+
for field in form.find_all(["input", "select", "textarea", "button"]):
|
| 489 |
+
fields.append(
|
| 490 |
+
{
|
| 491 |
+
"tag": field.name,
|
| 492 |
+
"name": field.get("name"),
|
| 493 |
+
"type": field.get("type"),
|
| 494 |
+
"id": field.get("id"),
|
| 495 |
+
}
|
| 496 |
+
)
|
| 497 |
+
extracted.append({"action": form.get("action"), "method": form.get("method"), "fields": fields})
|
| 498 |
+
return {"count": len(extracted), "forms": extracted[:30]}
|
| 499 |
+
|
| 500 |
+
if tool_name == "html.extract_meta":
|
| 501 |
+
meta: dict[str, str] = {}
|
| 502 |
+
for tag in soup.find_all("meta"):
|
| 503 |
+
key = tag.get("name") or tag.get("property")
|
| 504 |
+
content = tag.get("content")
|
| 505 |
+
if key and content:
|
| 506 |
+
meta[str(key)] = str(content)
|
| 507 |
+
title = soup.title.get_text(" ", strip=True) if soup.title else ""
|
| 508 |
+
return {"title": title, "meta": meta, "count": len(meta)}
|
| 509 |
+
|
| 510 |
+
if tool_name == "html.extract_jsonld":
|
| 511 |
+
items: list[Any] = []
|
| 512 |
+
for node in soup.select("script[type='application/ld+json']"):
|
| 513 |
+
raw = node.string or node.get_text(" ", strip=True)
|
| 514 |
+
if not raw:
|
| 515 |
+
continue
|
| 516 |
+
try:
|
| 517 |
+
parsed = json.loads(raw)
|
| 518 |
+
if isinstance(parsed, list):
|
| 519 |
+
items.extend(parsed)
|
| 520 |
+
else:
|
| 521 |
+
items.append(parsed)
|
| 522 |
+
except json.JSONDecodeError:
|
| 523 |
+
continue
|
| 524 |
+
return {"count": len(items), "items": items[:50]}
|
| 525 |
+
|
| 526 |
+
if tool_name == "html.detect_repeating_blocks":
|
| 527 |
+
signatures: dict[str, int] = {}
|
| 528 |
+
for node in soup.find_all(True):
|
| 529 |
+
classes = node.get("class") or []
|
| 530 |
+
if not classes:
|
| 531 |
+
continue
|
| 532 |
+
signature = f"{node.name}.{'.'.join(sorted(classes)[:2])}"
|
| 533 |
+
signatures[signature] = signatures.get(signature, 0) + 1
|
| 534 |
+
candidates = [
|
| 535 |
+
{"signature": signature, "count": count}
|
| 536 |
+
for signature, count in sorted(signatures.items(), key=lambda item: item[1], reverse=True)
|
| 537 |
+
if count >= 3
|
| 538 |
+
]
|
| 539 |
+
return {"candidates": candidates[:25], "count": len(candidates)}
|
| 540 |
+
|
| 541 |
+
raise ValueError(f"Unsupported HTML tool '{tool_name}'")
|
| 542 |
+
|
| 543 |
+
def _run_data_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
    """Run a JSON, CSV or row-manipulation tool and return its result payload.

    Args:
        tool_name: Tool identifier (``json.*``, ``csv.*``, ``data.*`` or ``pandas.*``).
        params: Parameters for the tool call.
        context: Shared execution context. ``context["rows"]`` (and
            ``context["data"]`` for ``json.dumps``) is used when the
            corresponding parameter is absent.

    Returns:
        A dict whose keys depend on the tool. ``pandas.*`` tools are not
        executed; they return a payload with ``"supported": False``.

    Raises:
        ValueError: If ``tool_name`` is not a supported data tool.
    """
    if tool_name == "json.parse":
        text = str(params.get("text", "") or "")
        try:
            data = json.loads(text)
            return {"valid": True, "data": data}
        except json.JSONDecodeError as exc:
            # Invalid JSON is reported in the payload rather than raised.
            return {"valid": False, "data": None, "error": str(exc)}

    if tool_name == "json.dumps":
        data = params.get("data", context.get("data"))
        indent = int(params.get("indent", 2) or 2)
        sort_keys = bool(params.get("sort_keys", False))
        # default=str keeps non-JSON-native values from aborting serialization.
        output = json.dumps(data, indent=indent, sort_keys=sort_keys, default=str)
        return {"output": output, "length": len(output)}

    if tool_name == "csv.generate":
        rows = _coerce_records(params.get("data", context.get("rows")))
        fields = params.get("fields")
        if not rows:
            return {"csv": "", "rows": 0, "columns": 0}
        if isinstance(fields, list) and fields:
            field_names = [str(field) for field in fields]
        else:
            # Union of all keys in first-seen order, so rows that carry keys
            # missing from the first row do not break the writer.
            field_names = list(dict.fromkeys(key for row in rows for key in row))
        output = io.StringIO()
        # extrasaction="ignore": an explicit `fields` list acts as a projection
        # instead of raising ValueError on keys outside the header.
        writer = csv.DictWriter(output, fieldnames=field_names, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)
        return {
            "csv": output.getvalue(),
            "rows": len(rows),
            "columns": len(field_names),
        }

    if tool_name == "csv.parse":
        text = str(params.get("text", "") or "")
        delimiter = str(params.get("delimiter", ",") or ",")
        has_header = bool(params.get("has_header", True))
        stream = io.StringIO(text)
        if has_header:
            records = [dict(record) for record in csv.DictReader(stream, delimiter=delimiter)]
        else:
            # Without a header row, columns are named col_0, col_1, ...
            records = [
                {"col_" + str(idx): value for idx, value in enumerate(row)}
                for row in csv.reader(stream, delimiter=delimiter)
            ]
        return {"records": records, "rows": len(records), "columns": len(records[0]) if records else 0}

    if tool_name == "data.dedupe_rows":
        rows = _coerce_records(params.get("rows", context.get("rows")))
        key_fields = params.get("key_fields")
        if not isinstance(key_fields, list):
            key_fields = []
        deduped: list[dict[str, Any]] = []
        seen: set[Any] = set()
        for row in rows:
            if key_fields:
                # A tuple key cannot collide the way a "|"-joined string does
                # when a value itself contains the separator.
                key: Any = tuple(str(row.get(field, "")) for field in key_fields)
            else:
                key = json.dumps(row, sort_keys=True, default=str)
            if key in seen:
                continue
            seen.add(key)
            deduped.append(row)
        return {"rows": deduped, "removed": len(rows) - len(deduped), "count": len(deduped)}

    if tool_name == "data.rank_rows":
        rows = _coerce_records(params.get("rows", context.get("rows")))
        sort_field = str(params.get("sort_field", "") or "")
        descending = bool(params.get("descending", True))
        limit = int(params.get("limit", len(rows)) or len(rows))
        if not rows:
            return {"rows": [], "count": 0}
        # Clamp once so the returned slice and the reported count always agree
        # (a negative limit previously produced a negative count).
        limit = max(1, limit)
        if not sort_field:
            # Pick the first column of the first row that has a parseable number
            # in any row; -1.0 is the "not numeric" sentinel.
            numeric_candidates = [
                key
                for key in rows[0].keys()
                if any(_safe_float(row.get(key, ""), default=-1.0) != -1.0 for row in rows)
            ]
            # Fall back to the first key; an empty first row yields "".
            sort_field = numeric_candidates[0] if numeric_candidates else next(iter(rows[0]), "")
        ranked = sorted(rows, key=lambda row: _safe_float(row.get(sort_field, ""), 0.0), reverse=descending)
        top = ranked[:limit]
        return {"rows": top, "sort_field": sort_field, "count": len(top)}

    if tool_name == "data.select_columns":
        rows = _coerce_records(params.get("rows", context.get("rows")))
        columns = params.get("columns")
        if not isinstance(columns, list) or not columns:
            # No projection requested: pass rows through unchanged.
            return {"rows": rows, "columns": list(rows[0].keys()) if rows else [], "count": len(rows)}
        selected = [{column: row.get(column, "") for column in columns} for row in rows]
        return {"rows": selected, "columns": columns, "count": len(selected)}

    if tool_name.startswith("pandas."):
        return {
            "supported": False,
            "reason": "pandas runtime execution is not enabled in this lightweight agent executor",
            "tool": tool_name,
        }

    raise ValueError(f"Unsupported data tool '{tool_name}'")
|
| 642 |
+
|
| 643 |
+
def _run_extraction_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
    """Run a regex or pattern-based extraction tool and return its result payload.

    Args:
        tool_name: Tool identifier (``regex.*`` or ``extract.*``).
        params: Parameters for the tool call. ``regex.*`` tools read
            ``pattern`` and ``text`` from here only.
        context: Shared execution context. ``extract.*`` tools fall back to
            ``context["text"]`` and then ``context["html"]`` when no ``text``
            parameter is given; ``soup``, ``url`` and ``rows`` are read by
            individual tools.

    Returns:
        A dict whose keys depend on the tool.

    Raises:
        ValueError: If a ``regex.*`` tool is called without a pattern, or if
            ``tool_name`` is not a supported extraction tool.
    """
    if tool_name.startswith("regex."):
        # NOTE(review): `pattern` comes straight from `params`; if those are
        # model- or user-controlled, a pathological pattern can backtrack
        # catastrophically. Consider bounding execution time upstream.
        pattern = str(params.get("pattern", "") or "")
        text = str(params.get("text", "") or "")
        if not pattern:
            raise ValueError("regex.* tools require a pattern")
        if tool_name == "regex.match":
            match = re.match(pattern, text)
            return {"matched": bool(match), "groups": list(match.groups()) if match else []}
        if tool_name == "regex.search":
            match = re.search(pattern, text)
            return {
                "found": bool(match),
                "position": match.start() if match else -1,
                "match": match.group(0) if match else "",
            }
        if tool_name == "regex.findall":
            matches = re.findall(pattern, text)
            return {"matches": matches, "count": len(matches)}
        if tool_name == "regex.sub":
            replacement = str(params.get("replacement", "") or "")
            # subn reports the real substitution count in a single pass.
            result, replacements = re.subn(pattern, replacement, text)
            return {"result": result, "replacements": replacements}
        if tool_name == "regex.split":
            maxsplit = int(params.get("maxsplit", 0) or 0)
            parts = re.split(pattern, text, maxsplit=maxsplit)
            return {"parts": parts, "count": len(parts)}
        raise ValueError(f"Unsupported regex tool '{tool_name}'")

    text = str(params.get("text", "") or context.get("text", "") or context.get("html", "") or "")

    if tool_name == "extract.emails":
        emails = sorted(set(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)))
        return {"emails": emails, "count": len(emails)}

    if tool_name == "extract.phones":
        phones = sorted(set(re.findall(r"(?:\+?\d[\d\-\s().]{7,}\d)", text)))
        return {"phones": phones, "count": len(phones)}

    if tool_name == "extract.urls":
        urls = sorted(set(re.findall(r"https?://[^\s\"'<>]+", text)))
        if not urls:
            # No absolute URLs in the text: resolve anchor hrefs against the page URL.
            soup = context.get("soup")
            if isinstance(soup, BeautifulSoup):
                base_url = str(context.get("url", ""))
                # De-duplicated and sorted, matching the primary path above.
                urls = sorted({urljoin(base_url, a.get("href")) for a in soup.select("a[href]")})
        # `count` is the total found; the list itself is capped at 500.
        return {"urls": urls[:500], "count": len(urls)}

    if tool_name == "extract.dates":
        dates = sorted(
            set(
                re.findall(
                    r"\b(?:\d{4}-\d{2}-\d{2}|\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{2,4})\b",
                    text,
                    flags=re.IGNORECASE,
                )
            )
        )
        return {"dates": dates[:300], "count": len(dates)}

    if tool_name == "extract.prices":
        matches = re.findall(r"(?:[$€£₹]\s?\d[\d,]*(?:\.\d{1,2})?|\d[\d,]*(?:\.\d{1,2})?\s?(?:USD|EUR|INR|GBP))", text)
        prices = [{"raw": match} for match in sorted(set(matches))]
        return {"prices": prices[:300], "count": len(prices)}

    if tool_name == "extract.addresses":
        matches = re.findall(r"\b\d{1,5}\s+[A-Za-z0-9.\- ]+\s(?:Street|St|Road|Rd|Avenue|Ave|Lane|Ln|Boulevard|Blvd)\b", text)
        addresses = [{"raw": match} for match in sorted(set(matches))]
        return {"addresses": addresses, "count": len(addresses)}

    if tool_name == "extract.social_handles":
        handles = sorted(set(re.findall(r"@[A-Za-z0-9_\.]{2,30}", text)))
        return {"handles": {"generic": handles[:500]}, "count": len(handles)}

    if tool_name == "extract.top_n":
        rows = _coerce_records(params.get("rows", context.get("rows")))
        n = max(1, int(params.get("n", 10) or 10))
        sort_field = str(params.get("sort_field", "") or "")
        if rows and sort_field:
            # Highest value first; values that do not parse as numbers sort as 0.0.
            rows = sorted(rows, key=lambda row: _safe_float(row.get(sort_field, ""), 0.0), reverse=True)
        return {"rows": rows[:n], "count": min(len(rows), n)}

    raise ValueError(f"Unsupported extraction tool '{tool_name}'")
|
| 725 |
+
|
| 726 |
+
def _run_validation_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
    """Run a ``validate.*`` tool and return its verdict payload.

    Args:
        tool_name: Tool identifier (``validate.url``, ``validate.email``,
            ``validate.json``, ``validate.html``, ``validate.schema``,
            ``validate.data_completeness`` or ``validate.row_signal``).
        params: Parameters for the tool call.
        context: Shared execution context; ``url``, ``html`` and ``rows`` are
            used as fallbacks when the matching parameter is absent.

    Returns:
        A dict whose keys depend on the tool. Invalid input is reported in
        the payload rather than raised.

    Raises:
        ValueError: If ``tool_name`` is not a supported validation tool.
    """
    if tool_name == "validate.url":
        url = str(params.get("url", "") or context.get("url", "") or "")
        try:
            parsed = urlparse(url)
        except ValueError:
            # urlparse rejects some malformed netlocs (e.g. an unbalanced IPv6
            # bracket); report that as an invalid URL rather than a tool crash.
            return {"valid": False, "accessible": None, "status_code": None}
        valid = bool(parsed.scheme and parsed.netloc)
        # Syntactic check only: no request is made, so reachability is unknown.
        return {"valid": valid, "accessible": None, "status_code": None}

    if tool_name == "validate.email":
        # Strip before matching so surrounding whitespace does not reject an
        # otherwise valid address.
        email = str(params.get("email", "") or "").strip()
        valid = bool(re.match(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$", email))
        return {"valid": valid, "normalized": email.lower() if valid else ""}

    if tool_name == "validate.json":
        text = str(params.get("text", "") or "")
        try:
            json.loads(text)
            return {"valid": True, "error": None}
        except json.JSONDecodeError as exc:
            return {"valid": False, "error": str(exc)}

    if tool_name == "validate.html":
        html = str(params.get("html", "") or context.get("html", "") or "")
        if not html:
            return {"valid": False, "errors": ["No HTML provided"]}
        soup = BeautifulSoup(html, "html.parser")
        errors: list[str] = []
        if not soup.find():
            errors.append("HTML has no parseable elements")
        return {"valid": not errors, "errors": errors}

    if tool_name == "validate.schema":
        data = params.get("data")
        schema = params.get("schema") if isinstance(params.get("schema"), dict) else {}
        required = schema.get("required", []) if isinstance(schema.get("required"), list) else []
        if isinstance(data, dict):
            missing = [field for field in required if field not in data]
        else:
            # Non-dict data cannot satisfy any required field.
            missing = required
        return {"valid": not missing, "errors": [f"Missing field: {field}" for field in missing]}

    if tool_name == "validate.data_completeness":
        rows = _coerce_records(params.get("rows", context.get("rows")))
        required_fields = params.get("fields")
        if not isinstance(required_fields, list) or not required_fields:
            # No explicit field list: check every key seen in any row.
            required_fields = sorted({key for row in rows for key in row.keys()}) if rows else []
        if not rows or not required_fields:
            return {"score": 0.0, "missing_counts": {}, "fields": required_fields}
        missing_counts = {field: 0 for field in required_fields}
        for row in rows:
            for field in required_fields:
                value = row.get(field, "")
                if value in (None, "", [], {}):
                    missing_counts[field] += 1
        total_cells = len(rows) * len(required_fields)
        missing_cells = sum(missing_counts.values())
        score = 1.0 - (missing_cells / total_cells) if total_cells else 0.0
        return {"score": round(score, 4), "missing_counts": missing_counts, "fields": required_fields}

    if tool_name == "validate.row_signal":
        rows = _coerce_records(params.get("rows", context.get("rows")))
        if not rows:
            return {"signal": 0.0, "reason": "No rows provided"}
        non_empty_fields = 0
        total_fields = 0
        # Rows are compared by their canonical JSON serialization.
        distinct_rows = len({json.dumps(row, sort_keys=True, default=str) for row in rows})
        for row in rows:
            for value in row.values():
                total_fields += 1
                if value not in (None, "", [], {}):
                    non_empty_fields += 1
        completeness = (non_empty_fields / total_fields) if total_fields else 0.0
        uniqueness = distinct_rows / len(rows)
        # Weighted blend: 70% cell completeness, 30% row uniqueness.
        signal = round((0.7 * completeness) + (0.3 * uniqueness), 4)
        return {
            "signal": signal,
            "completeness": round(completeness, 4),
            "uniqueness": round(uniqueness, 4),
        }

    raise ValueError(f"Unsupported validation tool '{tool_name}'")
|
| 806 |
+
|
| 807 |
+
def _run_analysis_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
    """Run a text, statistics or row-analysis tool and return its result payload.

    Args:
        tool_name: Tool identifier (``text.*``, ``stats.*`` or ``analysis.*``).
        params: Parameters for the tool call.
        context: Shared execution context; ``text``, ``rows`` and
            ``instructions`` are used as fallbacks when the matching
            parameter is absent.

    Returns:
        A dict whose keys depend on the tool.

    Raises:
        ValueError: If ``tool_name`` is not a supported analysis tool.
    """

    def _numeric_values(raw: Any) -> list[float]:
        """Return the int/float members of ``raw`` as floats; tolerate None and scalars."""
        try:
            items = list(raw) if raw is not None else []
        except TypeError:
            # A non-iterable value (e.g. a bare number) carries no series.
            items = []
        # bool is a subclass of int; true/false flags are not measurements.
        return [float(item) for item in items if isinstance(item, (int, float)) and not isinstance(item, bool)]

    text = str(params.get("text", "") or context.get("text", "") or "")

    if tool_name == "text.keywords":
        top_k = max(1, int(params.get("top_k", 10) or 10))
        tokens = _tokenize(text)
        frequencies: dict[str, int] = {}
        for token in tokens:
            frequencies[token] = frequencies.get(token, 0) + 1
        # Stable sort: tokens with equal counts keep first-seen order.
        ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)[:top_k]
        return {"keywords": [item[0] for item in ranked], "scores": [item[1] for item in ranked]}

    if tool_name == "text.entities":
        # Heuristic: runs of capitalized words are treated as proper nouns.
        entities = sorted(set(re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", text)))
        requested_types = params.get("types") if isinstance(params.get("types"), list) else []
        output = [{"text": entity, "type": "PROPER_NOUN"} for entity in entities]
        if requested_types:
            output = [entity for entity in output if entity["type"] in requested_types]
        return {"entities": output[:200], "count": len(output)}

    if tool_name == "text.sentiment":
        positive = {"good", "great", "excellent", "amazing", "positive", "love", "best"}
        negative = {"bad", "poor", "terrible", "awful", "negative", "worst", "hate"}
        tokens = _tokenize(text)
        # Score is the count of positive-lexicon tokens minus negative ones.
        score = sum(1 for token in tokens if token in positive) - sum(1 for token in tokens if token in negative)
        label = "neutral"
        if score > 0:
            label = "positive"
        elif score < 0:
            label = "negative"
        return {"score": score, "label": label}

    if tool_name == "stats.describe":
        values = _numeric_values(params.get("data", []))
        if not values:
            return {"mean": 0.0, "median": 0.0, "std": 0.0, "min": 0.0, "max": 0.0}
        return {
            "mean": statistics.fmean(values),
            "median": statistics.median(values),
            # Population standard deviation.
            "std": statistics.pstdev(values) if len(values) > 1 else 0.0,
            "min": min(values),
            "max": max(values),
        }

    if tool_name == "stats.correlation":
        x = _numeric_values(params.get("x", []))
        y = _numeric_values(params.get("y", []))
        if len(x) != len(y) or len(x) < 2:
            return {"correlation": 0.0, "p_value": None}
        x_mean = statistics.fmean(x)
        y_mean = statistics.fmean(y)
        numerator = sum((a - x_mean) * (b - y_mean) for a, b in zip(x, y))
        x_var = sum((a - x_mean) ** 2 for a in x)
        y_var = sum((b - y_mean) ** 2 for b in y)
        denominator = (x_var * y_var) ** 0.5
        # A constant series has zero variance; report 0.0 instead of dividing by zero.
        correlation = (numerator / denominator) if denominator else 0.0
        return {"correlation": correlation, "p_value": None}

    if tool_name == "analysis.infer_schema":
        rows = _coerce_records(params.get("rows", context.get("rows")))
        schema: dict[str, dict[str, Any]] = {}
        for row in rows:
            for key, value in row.items():
                entry = schema.setdefault(key, {"types": set(), "nullable": False})
                entry["types"].add(_infer_type(value))
                if value in (None, "", [], {}):
                    entry["nullable"] = True
        # Sets are converted to sorted lists for a stable, serializable result.
        normalized = {
            key: {"types": sorted(value["types"]), "nullable": value["nullable"]}
            for key, value in schema.items()
        }
        return {"schema": normalized, "columns": sorted(normalized.keys())}

    if tool_name == "analysis.score_relevance":
        rows = _coerce_records(params.get("rows", context.get("rows")))
        query = str(params.get("query", "") or context.get("instructions", "") or "")
        query_tokens = set(_tokenize(query))
        scored: list[dict[str, Any]] = []
        for row in rows:
            row_text = " ".join(str(value) for value in row.values())
            row_tokens = set(_tokenize(row_text))
            overlap = len(query_tokens & row_tokens)
            # Fraction of query tokens present in the row; max(1, ...) guards an empty query.
            score = overlap / max(1, len(query_tokens))
            scored.append({"row": row, "score": round(score, 4)})
        scored.sort(key=lambda item: item["score"], reverse=True)
        return {"rows": scored, "count": len(scored)}

    raise ValueError(f"Unsupported analysis tool '{tool_name}'")
|
| 895 |
+
|
| 896 |
+
|
| 897 |
+
def summarize_tool_results(results: list[ToolCallResult], max_items: int = 8) -> str:
    """Build a newline-separated bullet list describing tool call outcomes.

    Only the first ``max_items`` results are rendered. A successful call shows
    a preview of its result shortened via ``_truncate(..., 220)``; a failed
    call shows its error instead. An empty input yields an empty string.
    """

    def _note(entry: ToolCallResult) -> str:
        # Failure branch first: it needs no preview.
        if not entry.success:
            return f"- {entry.tool_name}: failed ({entry.duration_ms}ms), error={entry.error}"
        shortened = _truncate(entry.result, 220)
        return f"- {entry.tool_name}: success ({entry.duration_ms}ms), result={shortened}"

    return "\n".join(_note(entry) for entry in results[:max_items])
|
backend/app/api/routes/scrape.py
CHANGED
|
@@ -2547,6 +2547,100 @@ URL:"""
|
|
| 2547 |
# Get a larger sample of the HTML for LLM analysis (first 15000 chars to include content)
|
| 2548 |
html_sample = nav_obs.page_html[:15000]
|
| 2549 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2550 |
extraction_prompt = f"""You are a web scraping expert. Generate Python code to extract data from HTML.
|
| 2551 |
|
| 2552 |
USER REQUEST:
|
|
@@ -2562,6 +2656,9 @@ HTML SAMPLE (first 15000 chars):
|
|
| 2562 |
|
| 2563 |
{template_hint}
|
| 2564 |
|
|
|
|
|
|
|
|
|
|
| 2565 |
TASK: Generate Python code using BeautifulSoup to extract the requested data.
|
| 2566 |
|
| 2567 |
REQUIREMENTS:
|
|
|
|
| 2547 |
# Get a larger sample of the HTML for LLM analysis (first 15000 chars to include content)
|
| 2548 |
html_sample = nav_obs.page_html[:15000]
|
| 2549 |
|
| 2550 |
+
# === AGENT TOOL CALLING: runtime-selected, registry-backed ===
|
| 2551 |
+
agent_tool_calls = []
|
| 2552 |
+
tool_call_results = []
|
| 2553 |
+
tool_observations = ""
|
| 2554 |
+
|
| 2555 |
+
if live_llm_enabled:
|
| 2556 |
+
try:
|
| 2557 |
+
from app.agents.tool_caller import AgentToolCaller, ToolExecutor, summarize_tool_results
|
| 2558 |
+
|
| 2559 |
+
tool_caller = AgentToolCaller(model_router)
|
| 2560 |
+
executor = ToolExecutor()
|
| 2561 |
+
|
| 2562 |
+
agent_tool_calls = await tool_caller.decide_tools(
|
| 2563 |
+
task_description=(
|
| 2564 |
+
f"Extract {request.output_instructions or 'data'} from page content. "
|
| 2565 |
+
f"User instructions: {request.instructions}"
|
| 2566 |
+
),
|
| 2567 |
+
context={
|
| 2568 |
+
"url": target_url,
|
| 2569 |
+
"html_length": len(nav_obs.page_html),
|
| 2570 |
+
"instructions": request.instructions,
|
| 2571 |
+
"output_format": request.output_format.value,
|
| 2572 |
+
"tools_used": [],
|
| 2573 |
+
},
|
| 2574 |
+
model=request.model,
|
| 2575 |
+
max_tools=6,
|
| 2576 |
+
)
|
| 2577 |
+
|
| 2578 |
+
if agent_tool_calls:
|
| 2579 |
+
tool_decision_step = _record_step(
|
| 2580 |
+
session,
|
| 2581 |
+
ScrapeStep(
|
| 2582 |
+
step_number=len(session["steps"]),
|
| 2583 |
+
action="agent_decision",
|
| 2584 |
+
status="completed",
|
| 2585 |
+
message=f"Agent selected {len(agent_tool_calls)} runtime tools",
|
| 2586 |
+
reward=0.1,
|
| 2587 |
+
extracted_data={
|
| 2588 |
+
"tool_calls": [
|
| 2589 |
+
{
|
| 2590 |
+
"tool": tool_call.tool_name,
|
| 2591 |
+
"params": tool_call.parameters,
|
| 2592 |
+
"reasoning": tool_call.reasoning,
|
| 2593 |
+
}
|
| 2594 |
+
for tool_call in agent_tool_calls
|
| 2595 |
+
],
|
| 2596 |
+
},
|
| 2597 |
+
timestamp=_now_iso(),
|
| 2598 |
+
),
|
| 2599 |
+
)
|
| 2600 |
+
yield tool_decision_step
|
| 2601 |
+
|
| 2602 |
+
tool_context = {
|
| 2603 |
+
"soup": BeautifulSoup(nav_obs.page_html, "html.parser"),
|
| 2604 |
+
"html": nav_obs.page_html,
|
| 2605 |
+
"url": target_url,
|
| 2606 |
+
"instructions": request.instructions or "",
|
| 2607 |
+
}
|
| 2608 |
+
|
| 2609 |
+
for tool_call in agent_tool_calls:
|
| 2610 |
+
result = await executor.execute_tool_call(tool_call, tool_context)
|
| 2611 |
+
tool_call_results.append(result)
|
| 2612 |
+
|
| 2613 |
+
if result.success and isinstance(result.result, dict):
|
| 2614 |
+
for context_key in ("rows", "text", "data"):
|
| 2615 |
+
if context_key in result.result:
|
| 2616 |
+
tool_context[context_key] = result.result[context_key]
|
| 2617 |
+
|
| 2618 |
+
tool_exec_step = _record_step(
|
| 2619 |
+
session,
|
| 2620 |
+
ScrapeStep(
|
| 2621 |
+
step_number=len(session["steps"]),
|
| 2622 |
+
action="tool_call",
|
| 2623 |
+
status="completed" if result.success else "failed",
|
| 2624 |
+
message=f"Tool {result.tool_name}: {'ok' if result.success else 'failed'}",
|
| 2625 |
+
reward=0.05 if result.success else -0.02,
|
| 2626 |
+
extracted_data={
|
| 2627 |
+
"tool": result.tool_name,
|
| 2628 |
+
"success": result.success,
|
| 2629 |
+
"result_preview": str(result.result)[:200] if result.result is not None else None,
|
| 2630 |
+
"error": result.error,
|
| 2631 |
+
"duration_ms": result.duration_ms,
|
| 2632 |
+
},
|
| 2633 |
+
timestamp=_now_iso(),
|
| 2634 |
+
),
|
| 2635 |
+
)
|
| 2636 |
+
yield tool_exec_step
|
| 2637 |
+
|
| 2638 |
+
if tool_call_results:
|
| 2639 |
+
tool_observations = summarize_tool_results(tool_call_results)
|
| 2640 |
+
|
| 2641 |
+
except Exception as e:
|
| 2642 |
+
logger.warning("Agent tool calling failed: %s", e)
|
| 2643 |
+
|
| 2644 |
extraction_prompt = f"""You are a web scraping expert. Generate Python code to extract data from HTML.
|
| 2645 |
|
| 2646 |
USER REQUEST:
|
|
|
|
| 2656 |
|
| 2657 |
{template_hint}
|
| 2658 |
|
| 2659 |
+
AGENT TOOL OBSERVATIONS (runtime execution, not hardcoded):
|
| 2660 |
+
{tool_observations or "No additional tool observations collected."}
|
| 2661 |
+
|
| 2662 |
TASK: Generate Python code using BeautifulSoup to extract the requested data.
|
| 2663 |
|
| 2664 |
REQUIREMENTS:
|
backend/app/plugins/registry.py
CHANGED
|
@@ -182,6 +182,27 @@ HTML_TOOLS = [
|
|
| 182 |
parameters={"selector": "string (optional)"},
|
| 183 |
returns={"forms": "list[dict]", "count": "int"},
|
| 184 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
]
|
| 186 |
|
| 187 |
# ==============================================================================
|
|
@@ -259,6 +280,27 @@ DATA_TOOLS = [
|
|
| 259 |
parameters={"condition": "string"},
|
| 260 |
returns={"filtered_rows": "int", "original_rows": "int"},
|
| 261 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
]
|
| 263 |
|
| 264 |
# ==============================================================================
|
|
@@ -420,6 +462,20 @@ ANALYSIS_TOOLS = [
|
|
| 420 |
parameters={"text": "string", "top_k": "int"},
|
| 421 |
returns={"keywords": "list[string]", "scores": "list[float]"},
|
| 422 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
]
|
| 424 |
|
| 425 |
# ==============================================================================
|
|
@@ -476,6 +532,13 @@ EXTRACTION_TOOLS = [
|
|
| 476 |
parameters={"text": "string", "platforms": "list[string]"},
|
| 477 |
returns={"handles": "dict[string, list]", "count": "int"},
|
| 478 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
]
|
| 480 |
|
| 481 |
# ==============================================================================
|
|
@@ -518,6 +581,20 @@ VALIDATION_TOOLS = [
|
|
| 518 |
parameters={"data": "any", "schema": "dict"},
|
| 519 |
returns={"valid": "bool", "errors": "list[string]"},
|
| 520 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 521 |
]
|
| 522 |
|
| 523 |
# ==============================================================================
|
|
|
|
| 182 |
parameters={"selector": "string (optional)"},
|
| 183 |
returns={"forms": "list[dict]", "count": "int"},
|
| 184 |
),
|
| 185 |
+
ToolDefinition(
|
| 186 |
+
name="html.extract_meta",
|
| 187 |
+
description="Extract page title and meta tags",
|
| 188 |
+
category=PluginCategory.PARSER,
|
| 189 |
+
parameters={"include_og": "bool"},
|
| 190 |
+
returns={"title": "string", "meta": "dict[string, string]", "count": "int"},
|
| 191 |
+
),
|
| 192 |
+
ToolDefinition(
|
| 193 |
+
name="html.extract_jsonld",
|
| 194 |
+
description="Extract JSON-LD structured data blocks",
|
| 195 |
+
category=PluginCategory.PARSER,
|
| 196 |
+
parameters={"include_arrays": "bool"},
|
| 197 |
+
returns={"items": "list[dict]", "count": "int"},
|
| 198 |
+
),
|
| 199 |
+
ToolDefinition(
|
| 200 |
+
name="html.detect_repeating_blocks",
|
| 201 |
+
description="Find repeated DOM block signatures for list extraction",
|
| 202 |
+
category=PluginCategory.PARSER,
|
| 203 |
+
parameters={"min_repetitions": "int"},
|
| 204 |
+
returns={"candidates": "list[dict]", "count": "int"},
|
| 205 |
+
),
|
| 206 |
]
|
| 207 |
|
| 208 |
# ==============================================================================
|
|
|
|
| 280 |
parameters={"condition": "string"},
|
| 281 |
returns={"filtered_rows": "int", "original_rows": "int"},
|
| 282 |
),
|
| 283 |
+
ToolDefinition(
|
| 284 |
+
name="data.dedupe_rows",
|
| 285 |
+
description="Remove duplicate rows from list-of-dicts data",
|
| 286 |
+
category=PluginCategory.DATA,
|
| 287 |
+
parameters={"rows": "list[dict]", "key_fields": "list[string]"},
|
| 288 |
+
returns={"rows": "list[dict]", "removed": "int", "count": "int"},
|
| 289 |
+
),
|
| 290 |
+
ToolDefinition(
|
| 291 |
+
name="data.rank_rows",
|
| 292 |
+
description="Rank rows by score/value field",
|
| 293 |
+
category=PluginCategory.DATA,
|
| 294 |
+
parameters={"rows": "list[dict]", "sort_field": "string", "descending": "bool", "limit": "int"},
|
| 295 |
+
returns={"rows": "list[dict]", "sort_field": "string", "count": "int"},
|
| 296 |
+
),
|
| 297 |
+
ToolDefinition(
|
| 298 |
+
name="data.select_columns",
|
| 299 |
+
description="Project rows to requested output columns",
|
| 300 |
+
category=PluginCategory.DATA,
|
| 301 |
+
parameters={"rows": "list[dict]", "columns": "list[string]"},
|
| 302 |
+
returns={"rows": "list[dict]", "columns": "list[string]", "count": "int"},
|
| 303 |
+
),
|
| 304 |
]
|
| 305 |
|
| 306 |
# ==============================================================================
|
|
|
|
| 462 |
parameters={"text": "string", "top_k": "int"},
|
| 463 |
returns={"keywords": "list[string]", "scores": "list[float]"},
|
| 464 |
),
|
| 465 |
+
ToolDefinition(
|
| 466 |
+
name="analysis.infer_schema",
|
| 467 |
+
description="Infer field types and nullability from extracted rows",
|
| 468 |
+
category=PluginCategory.ANALYSIS,
|
| 469 |
+
parameters={"rows": "list[dict]"},
|
| 470 |
+
returns={"schema": "dict[string, dict]", "columns": "list[string]"},
|
| 471 |
+
),
|
| 472 |
+
ToolDefinition(
|
| 473 |
+
name="analysis.score_relevance",
|
| 474 |
+
description="Score row relevance against user query/instructions",
|
| 475 |
+
category=PluginCategory.ANALYSIS,
|
| 476 |
+
parameters={"rows": "list[dict]", "query": "string"},
|
| 477 |
+
returns={"rows": "list[dict]", "count": "int"},
|
| 478 |
+
),
|
| 479 |
]
|
| 480 |
|
| 481 |
# ==============================================================================
|
|
|
|
| 532 |
parameters={"text": "string", "platforms": "list[string]"},
|
| 533 |
returns={"handles": "dict[string, list]", "count": "int"},
|
| 534 |
),
|
| 535 |
+
ToolDefinition(
|
| 536 |
+
name="extract.top_n",
|
| 537 |
+
description="Select top N rows from extracted dataset",
|
| 538 |
+
category=PluginCategory.EXTRACTION,
|
| 539 |
+
parameters={"rows": "list[dict]", "n": "int", "sort_field": "string"},
|
| 540 |
+
returns={"rows": "list[dict]", "count": "int"},
|
| 541 |
+
),
|
| 542 |
]
|
| 543 |
|
| 544 |
# ==============================================================================
|
|
|
|
| 581 |
parameters={"data": "any", "schema": "dict"},
|
| 582 |
returns={"valid": "bool", "errors": "list[string]"},
|
| 583 |
),
|
| 584 |
+
ToolDefinition(
|
| 585 |
+
name="validate.data_completeness",
|
| 586 |
+
description="Score completeness of extracted rows against required fields",
|
| 587 |
+
category=PluginCategory.VALIDATION,
|
| 588 |
+
parameters={"rows": "list[dict]", "fields": "list[string]"},
|
| 589 |
+
returns={"score": "float", "missing_counts": "dict[string, int]", "fields": "list[string]"},
|
| 590 |
+
),
|
| 591 |
+
ToolDefinition(
|
| 592 |
+
name="validate.row_signal",
|
| 593 |
+
description="Estimate quality signal of extracted rows",
|
| 594 |
+
category=PluginCategory.VALIDATION,
|
| 595 |
+
parameters={"rows": "list[dict]"},
|
| 596 |
+
returns={"signal": "float", "completeness": "float", "uniqueness": "float"},
|
| 597 |
+
),
|
| 598 |
]
|
| 599 |
|
| 600 |
# ==============================================================================
|