NeerajCodz committed on
Commit
5b2dac6
·
1 Parent(s): d64a03c

feat: add dynamic registry-driven agent tool runtime

Browse files

- Added AgentToolCaller with LLM-driven tool planning from plugin registry metadata
- Added ToolExecutor with runtime namespace dispatch and non-hardcoded execution
- Expanded plugin registry from 71 to 82 tools (parser/data/analysis/extraction/validation additions)
- Integrated runtime tool decisions and tool observations into agentic scrape flow
- Verified selected tools execute and vary by prompt for different scraping tasks

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

backend/app/agents/__init__.py CHANGED
@@ -9,6 +9,7 @@ This module contains specialized agents for web scraping with RL:
9
  - VerifierAgent: Cross-source verification
10
  - MemoryAgent: Memory operations and knowledge management
11
  - AgentCoordinator: Orchestrates multiple agents with message passing
 
12
  """
13
 
14
  from .base import BaseAgent
@@ -18,6 +19,7 @@ from .memory_agent import MemoryAgent, MemoryEntry
18
  from .navigator import NavigatorAgent
19
  from .planner import PlannerAgent
20
  from .verifier import VerificationResult, VerifierAgent
 
21
 
22
  __all__ = [
23
  # Base
@@ -28,10 +30,15 @@ __all__ = [
28
  "ExtractorAgent",
29
  "VerifierAgent",
30
  "MemoryAgent",
 
31
  # Coordinator
32
  "AgentCoordinator",
33
  "AgentRole",
34
  "Message",
 
 
 
 
35
  # Data classes
36
  "VerificationResult",
37
  "MemoryEntry",
 
9
  - VerifierAgent: Cross-source verification
10
  - MemoryAgent: Memory operations and knowledge management
11
  - AgentCoordinator: Orchestrates multiple agents with message passing
12
+ - AgentToolCaller: LLM-driven tool selection and execution
13
  """
14
 
15
  from .base import BaseAgent
 
19
  from .navigator import NavigatorAgent
20
  from .planner import PlannerAgent
21
  from .verifier import VerificationResult, VerifierAgent
22
+ from .tool_caller import AgentToolCaller, ToolExecutor, ToolCall, ToolCallResult
23
 
24
  __all__ = [
25
  # Base
 
30
  "ExtractorAgent",
31
  "VerifierAgent",
32
  "MemoryAgent",
33
+ "AgentToolCaller",
34
  # Coordinator
35
  "AgentCoordinator",
36
  "AgentRole",
37
  "Message",
38
+ # Tool calling
39
+ "ToolExecutor",
40
+ "ToolCall",
41
+ "ToolCallResult",
42
  # Data classes
43
  "VerificationResult",
44
  "MemoryEntry",
backend/app/agents/tool_caller.py ADDED
@@ -0,0 +1,906 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LLM-driven tool planning and registry-backed tool execution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import io
7
+ import json
8
+ import ast
9
+ import re
10
+ import statistics
11
+ import time
12
+ from dataclasses import dataclass
13
+ from typing import Any
14
+ from urllib.parse import urljoin, urlparse
15
+
16
+ from bs4 import BeautifulSoup
17
+
18
+ from app.models.router import SmartModelRouter, TaskType
19
+ from app.plugins.registry import get_all_tools, get_tool
20
+ from app.utils.logging import get_logger
21
+
22
+ logger = get_logger(__name__)
23
+
24
+ SUPPORTED_TOOL_NAMESPACES = {
25
+ "browser",
26
+ "html",
27
+ "extract",
28
+ "regex",
29
+ "validate",
30
+ "json",
31
+ "csv",
32
+ "data",
33
+ "analysis",
34
+ "text",
35
+ "stats",
36
+ }
37
+
38
+
39
+ def _truncate(value: Any, limit: int = 240) -> str:
40
+ text = str(value)
41
+ if len(text) <= limit:
42
+ return text
43
+ return f"{text[: limit - 3]}..."
44
+
45
+
46
+ def _tokenize(text: str) -> list[str]:
47
+ return [token for token in re.findall(r"[A-Za-z0-9_]+", text.lower()) if len(token) > 1]
48
+
49
+
50
+ def _safe_float(value: Any, default: float = 0.0) -> float:
51
+ try:
52
+ return float(str(value).replace(",", "").strip())
53
+ except (TypeError, ValueError):
54
+ return default
55
+
56
+
57
+ def _coerce_records(raw: Any) -> list[dict[str, Any]]:
58
+ if isinstance(raw, list):
59
+ return [row for row in raw if isinstance(row, dict)]
60
+ return []
61
+
62
+
63
+ def _extract_json_array(text: str) -> list[dict[str, Any]]:
64
+ content = text.strip()
65
+
66
+ if "```json" in content:
67
+ content = content.split("```json", 1)[1].split("```", 1)[0].strip()
68
+ elif "```" in content:
69
+ content = content.split("```", 1)[1].split("```", 1)[0].strip()
70
+
71
+ start = content.find("[")
72
+ end = content.rfind("]")
73
+ if start == -1 or end == -1 or start > end:
74
+ return []
75
+
76
+ payload = content[start : end + 1]
77
+ try:
78
+ parsed = json.loads(payload)
79
+ except json.JSONDecodeError:
80
+ try:
81
+ parsed = ast.literal_eval(payload)
82
+ except (ValueError, SyntaxError):
83
+ return []
84
+
85
+ if isinstance(parsed, list):
86
+ return [item for item in parsed if isinstance(item, dict)]
87
+ return []
88
+
89
+
90
+ def _infer_type(value: Any) -> str:
91
+ if value is None:
92
+ return "null"
93
+ if isinstance(value, bool):
94
+ return "boolean"
95
+ if isinstance(value, int):
96
+ return "integer"
97
+ if isinstance(value, float):
98
+ return "number"
99
+ if isinstance(value, list):
100
+ return "array"
101
+ if isinstance(value, dict):
102
+ return "object"
103
+ return "string"
104
+
105
+
106
@dataclass
class ToolCall:
    """One planned tool invocation: which tool, with what arguments, and why."""

    # Registry name of the tool, e.g. "html.select".
    tool_name: str
    # Keyword-style arguments for the tool.
    parameters: dict[str, Any]
    # Free-text justification attached to the call; empty when none is given.
    reasoning: str = ""
113
+
114
+
115
@dataclass
class ToolCallResult:
    """Outcome record for one executed tool call."""

    # Name of the tool that was run.
    tool_name: str
    # True when the tool returned without raising.
    success: bool
    # Whatever the tool returned.
    result: Any
    # Error text for a failed call; None by default.
    error: str | None = None
    # Execution time in milliseconds; 0 by default.
    duration_ms: int = 0
124
+
125
+
126
class AgentToolCaller:
    """Asks an LLM to choose tool calls from the plugin registry.

    Selectable tools are the registry entries whose namespace (the text
    before the first ``.`` in the tool name) is listed in
    ``SUPPORTED_TOOL_NAMESPACES``, optionally narrowed by an allow-list.
    """

    def __init__(
        self,
        model_router: SmartModelRouter,
        allowed_tool_names: set[str] | None = None,
    ) -> None:
        """Collect the selectable tools and pre-render the prompt catalog.

        Args:
            model_router: Router whose ``complete`` coroutine is awaited by
                ``decide_tools``.
            allowed_tool_names: Optional allow-list of tool names. ``None``
                and an empty set both leave every supported tool selectable.
        """
        self.router = model_router
        all_tools = [
            tool
            for tool in get_all_tools()
            if tool.name.split(".", 1)[0] in SUPPORTED_TOOL_NAMESPACES
        ]
        if allowed_tool_names:
            self._tools = [tool for tool in all_tools if tool.name in allowed_tool_names]
        else:
            self._tools = all_tools
        self._tool_names = {tool.name for tool in self._tools}
        self._tool_catalog = self._build_tool_catalog()

    def _build_tool_catalog(self) -> str:
        """Render the selectable tools as text, grouped by namespace.

        Namespaces and the tools inside them are listed alphabetically; each
        tool line carries its description and compact-JSON parameters.
        """
        if not self._tools:
            return "No tools available."

        grouped: dict[str, list[str]] = {}
        for tool in sorted(self._tools, key=lambda item: item.name):
            namespace = tool.name.split(".", 1)[0]
            entry = (
                f"- {tool.name}: {tool.description} | "
                f"params={json.dumps(tool.parameters, separators=(',', ':'))}"
            )
            grouped.setdefault(namespace, []).append(entry)

        lines: list[str] = []
        for namespace in sorted(grouped):
            lines.append(f"[{namespace}]")
            lines.extend(grouped[namespace])
            lines.append("")
        return "\n".join(lines).strip()

    async def decide_tools(
        self,
        task_description: str,
        context: dict[str, Any],
        model: str,
        max_tools: int = 6,
    ) -> list[ToolCall]:
        """Return a runtime tool plan chosen by the LLM.

        Args:
            task_description: What the scrape should achieve.
            context: Values interpolated into the prompt; the keys read are
                ``url``, ``html_length``, ``output_format``, ``instructions``
                and ``tools_used``.
            model: Model identifier forwarded to the router.
            max_tools: Upper bound on the number of calls returned.

        Returns:
            Validated tool calls. When the model reply yields none, or the
            request fails, the generic fallback plan is returned instead. An
            empty list is returned when no tools are selectable or
            ``max_tools`` is not positive.
        """

        if max_tools <= 0 or not self._tool_names:
            return []

        # Keep rule 3 of the prompt sensible when fewer than two tools are allowed.
        min_tools = min(2, max_tools)

        prompt = f"""You are selecting tools for a generic web scraping task.
Use ONLY tools from AVAILABLE_TOOLS and return strict JSON.

AVAILABLE_TOOLS:
{self._tool_catalog}

TASK:
{task_description}

CONTEXT:
- URL: {context.get("url", "")}
- HTML Length: {context.get("html_length", 0)}
- Output Format: {context.get("output_format", "json")}
- User Instructions: {context.get("instructions", "")}
- Prior Tool Calls: {context.get("tools_used", [])}

Rules:
1. Return only a JSON array (no markdown, no prose).
2. Each item must contain: tool_name, parameters, reasoning.
3. Choose {min_tools} to {max_tools} tools.
4. Calls must be generic for arbitrary websites (no site-specific hardcoding).

Format:
[
{{
"tool_name": "html.select",
"parameters": {{"selector": "article, [role='article']", "limit": 25}},
"reasoning": "Find repeated content blocks"
}}
]"""
        try:
            response = await self.router.complete(
                messages=[{"role": "user", "content": prompt}],
                task_type=TaskType.REASONING,
                model=model,
                temperature=0.1,
            )
            raw_calls = _extract_json_array(response.content)
            normalized = self._normalize_tool_calls(raw_calls, max_tools=max_tools)
            if normalized:
                return normalized
            logger.warning("Agent returned no valid tool calls; using dynamic fallback")
            return self._fallback_tools(max_tools=max_tools)
        except Exception as exc:
            # Planning is best-effort: any failure degrades to the fallback plan.
            logger.warning("Tool planning failed: %s", exc)
            return self._fallback_tools(max_tools=max_tools)

    def _normalize_tool_calls(self, raw_calls: list[dict[str, Any]], max_tools: int) -> list[ToolCall]:
        """Turn raw planner items into ``ToolCall`` objects.

        Items that are not dicts, or that name a tool outside the selectable
        set, are skipped. Non-dict ``parameters`` are replaced by ``{}``. At
        most ``max_tools`` calls are returned.
        """
        calls: list[ToolCall] = []
        if max_tools <= 0:
            return calls

        for item in raw_calls:
            if not isinstance(item, dict):
                continue
            tool_name = str(item.get("tool_name", "")).strip()
            if not tool_name or tool_name not in self._tool_names:
                continue

            parameters = item.get("parameters", {})
            if not isinstance(parameters, dict):
                parameters = {}

            calls.append(
                ToolCall(
                    tool_name=tool_name,
                    parameters=parameters,
                    reasoning=str(item.get("reasoning", "")),
                )
            )
            if len(calls) >= max_tools:
                break
        return calls

    def _fallback_tools(self, max_tools: int) -> list[ToolCall]:
        """Build a generic fallback plan from available namespaces (not site-specific).

        Namespaces are visited in a fixed order and contribute their first
        two tools (alphabetically), each with empty parameters, until
        ``max_tools`` calls are collected.
        """
        if max_tools <= 0:
            return []

        namespace_order = ("validate", "html", "extract", "data", "analysis", "text", "stats")
        by_namespace: dict[str, list[str]] = {}
        for tool_name in sorted(self._tool_names):
            namespace = tool_name.split(".", 1)[0]
            by_namespace.setdefault(namespace, []).append(tool_name)

        fallback: list[ToolCall] = []
        for namespace in namespace_order:
            for tool_name in by_namespace.get(namespace, [])[:2]:
                fallback.append(
                    ToolCall(
                        tool_name=tool_name,
                        parameters={},
                        reasoning=f"Fallback generic probe from {namespace} namespace.",
                    )
                )
                if len(fallback) >= max_tools:
                    return fallback
        return fallback
269
+
270
+
271
+ class ToolExecutor:
272
+ """Executes selected tools against page context using registry-backed dispatch."""
273
+
274
def __init__(self, allowed_tool_names: set[str] | None = None) -> None:
    """Record which registry tools this executor is willing to run.

    Only tools whose namespace (the text before the first ``.``) is in
    ``SUPPORTED_TOOL_NAMESPACES`` are considered. A non-empty
    ``allowed_tool_names`` narrows that set further; ``None`` or an empty
    set leaves it unrestricted.
    """
    supported: set[str] = set()
    for tool in get_all_tools():
        namespace = tool.name.split(".", 1)[0]
        if namespace in SUPPORTED_TOOL_NAMESPACES:
            supported.add(tool.name)

    if allowed_tool_names:
        self._known_tool_names = supported & allowed_tool_names
    else:
        self._known_tool_names = supported
281
+
282
async def execute_tool_call(self, tool_call: ToolCall, context: dict[str, Any]) -> ToolCallResult:
    """Run one planned tool call and report the outcome.

    Exceptions are not propagated: any failure is returned as a
    ``ToolCallResult`` with ``success=False`` and the error text.

    Args:
        tool_call: The call to execute.
        context: Mutable execution context passed through to the handler.

    Returns:
        The tool output (or error) together with the elapsed milliseconds.
    """
    # perf_counter is monotonic, so the measured duration cannot go negative
    # or jump when the system clock is adjusted mid-call.
    start = time.perf_counter()
    tool_name = tool_call.tool_name

    def elapsed_ms() -> int:
        return int((time.perf_counter() - start) * 1000)

    try:
        if tool_name not in self._known_tool_names:
            raise ValueError(f"Unknown tool '{tool_name}'")
        if get_tool(tool_name) is None:
            raise ValueError(f"Tool '{tool_name}' is not registered")

        result = self._dispatch(tool_name, tool_call.parameters, context)
        return ToolCallResult(
            tool_name=tool_name,
            success=True,
            result=result,
            duration_ms=elapsed_ms(),
        )
    except Exception as exc:
        # Deliberate boundary: one failing tool must not abort the whole plan.
        return ToolCallResult(
            tool_name=tool_name,
            success=False,
            result=None,
            error=str(exc),
            duration_ms=elapsed_ms(),
        )
307
+
308
+ def _dispatch(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
309
+ namespace = tool_name.split(".", 1)[0].lower()
310
+
311
+ if namespace == "browser":
312
+ return self._run_browser_tool(tool_name, params, context)
313
+ if namespace == "html":
314
+ return self._run_html_tool(tool_name, params, context)
315
+ if namespace in {"json", "csv", "data", "pandas"}:
316
+ return self._run_data_tool(tool_name, params, context)
317
+ if namespace in {"extract", "regex"}:
318
+ return self._run_extraction_tool(tool_name, params, context)
319
+ if namespace == "validate":
320
+ return self._run_validation_tool(tool_name, params, context)
321
+ if namespace in {"analysis", "text", "stats"}:
322
+ return self._run_analysis_tool(tool_name, params, context)
323
+
324
+ raise ValueError(f"No runtime handler for namespace '{namespace}'")
325
+
326
+ def _run_browser_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
327
+ current_url = str(context.get("url", "") or "")
328
+ if tool_name == "browser.navigate":
329
+ target_url = str(params.get("url", current_url) or current_url)
330
+ context["url"] = target_url
331
+ return {"success": True, "status_code": 200, "url": target_url}
332
+
333
+ if tool_name == "browser.wait":
334
+ timeout_ms = int(params.get("timeout_ms", 500) or 500)
335
+ return {"found": True, "waited_ms": timeout_ms}
336
+
337
+ if tool_name == "browser.execute_js":
338
+ script = str(params.get("script", "") or "")
339
+ return {"result": {"script_length": len(script)}, "error": None}
340
+
341
+ if tool_name in {"browser.scroll", "browser.click", "browser.type", "browser.get_cookies", "browser.screenshot"}:
342
+ return {"success": True, "tool": tool_name}
343
+
344
+ raise ValueError(f"Unsupported browser tool '{tool_name}'")
345
+
346
def _get_soup(self, context: dict[str, Any]) -> BeautifulSoup:
    """Return the parsed document for ``context``, parsing it on first use.

    A ``BeautifulSoup`` already stored under ``context["soup"]`` is reused.
    Otherwise ``context["html"]`` is parsed with ``html.parser`` and the
    result is stored back under ``context["soup"]``.

    Raises:
        ValueError: If there is neither a cached soup nor any HTML.
    """
    cached = context.get("soup")
    if not isinstance(cached, BeautifulSoup):
        markup = str(context.get("html", "") or "")
        if not markup:
            raise ValueError("No HTML available in execution context")
        cached = BeautifulSoup(markup, "html.parser")
        context["soup"] = cached
    return cached
358
+
359
+ @staticmethod
360
+ def _snapshot_element(element: Any) -> dict[str, Any]:
361
+ return {
362
+ "tag": getattr(element, "name", ""),
363
+ "id": element.get("id") if hasattr(element, "get") else None,
364
+ "classes": element.get("class", []) if hasattr(element, "get") else [],
365
+ "text": _truncate(element.get_text(" ", strip=True), 180) if hasattr(element, "get_text") else "",
366
+ }
367
+
368
def _run_html_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
    """Run a single ``html.*`` tool against the page stored in ``context``.

    Args:
        tool_name: Fully qualified tool name, e.g. ``"html.select"``.
        params: Tool parameters chosen by the planner.
        context: Execution context. The parsed document comes from
            ``self._get_soup(context)``; ``html.parse`` additionally
            overwrites ``context["soup"]``.

    Returns:
        A dict whose keys depend on the tool.

    Raises:
        ValueError: If a required parameter is missing or the tool name is
            not handled here. Errors from ``self._get_soup`` propagate.
    """
    soup = self._get_soup(context)

    if tool_name == "html.parse":
        requested_parser = str(params.get("parser", "html.parser"))
        # Only these two parsers are accepted; anything else falls back, and
        # the response reports the parser that was actually used.
        parser_name = requested_parser if requested_parser in {"html.parser", "lxml"} else "html.parser"
        html = str(context.get("html", "") or "")
        parsed = BeautifulSoup(html, parser_name)
        context["soup"] = parsed
        return {"parsed": True, "soup_type": parser_name, "content_length": len(html)}

    if tool_name == "html.select":
        selector = str(params.get("selector", "") or "")
        if not selector:
            raise ValueError("html.select requires a selector")
        limit = max(1, int(params.get("limit", 20) or 20))
        elements = soup.select(selector, limit=limit)
        return {
            "elements_found": len(elements),
            "selector_used": selector,
            "elements": [self._snapshot_element(element) for element in elements[:limit]],
        }

    if tool_name == "html.select_one":
        selector = str(params.get("selector", "") or "")
        if not selector:
            raise ValueError("html.select_one requires a selector")
        element = soup.select_one(selector)
        return {"found": bool(element), "element": self._snapshot_element(element) if element else None}

    if tool_name == "html.find_all":
        tag = params.get("tag")
        attrs = params.get("attrs", {})
        recursive = bool(params.get("recursive", True))
        limit = max(1, int(params.get("limit", 20) or 20))
        if not isinstance(attrs, dict):
            attrs = {}
        elements = soup.find_all(tag, attrs=attrs, recursive=recursive, limit=limit)
        return {
            "elements_found": len(elements),
            "tags": [getattr(element, "name", "") for element in elements],
            "elements": [self._snapshot_element(element) for element in elements[:limit]],
        }

    if tool_name == "html.get_text":
        selector = params.get("selector")
        separator = str(params.get("separator", " "))
        if selector:
            selected = soup.select(str(selector))
            # The separator joins the matched nodes; text inside each node is
            # joined with a single space.
            text = separator.join(node.get_text(" ", strip=True) for node in selected)
        else:
            text = soup.get_text(" ", strip=True)
        return {"text": text, "length": len(text)}

    if tool_name == "html.get_attribute":
        selector = str(params.get("selector", "") or "")
        attribute = str(params.get("attribute", "") or "")
        if not selector or not attribute:
            raise ValueError("html.get_attribute requires selector and attribute")
        element = soup.select_one(selector)
        return {"found": bool(element), "value": element.get(attribute) if element else None}

    if tool_name == "html.extract_links":
        filter_pattern = params.get("filter_pattern")
        base_url = str(params.get("base_url", "") or context.get("url", "") or "")
        # Compiled once, outside the loop; an invalid pattern raises here.
        pattern = re.compile(str(filter_pattern)) if filter_pattern else None
        links: list[dict[str, Any]] = []
        for anchor in soup.select("a[href]"):
            href = str(anchor.get("href", "") or "").strip()
            if not href:
                continue
            absolute_url = urljoin(base_url, href) if base_url else href
            if pattern and not pattern.search(absolute_url):
                continue
            links.append(
                {
                    "url": absolute_url,
                    "text": _truncate(anchor.get_text(" ", strip=True), 120),
                    "title": anchor.get("title"),
                }
            )
        # "count" is the total found; the list itself is capped at 200.
        return {"count": len(links), "links": links[:200]}

    if tool_name == "html.extract_images":
        include_lazy = bool(params.get("include_lazy", True))
        images: list[dict[str, Any]] = []
        for image in soup.select("img"):
            src = image.get("src")
            if include_lazy and not src:
                src = image.get("data-src") or image.get("data-original")
            if not src:
                continue
            images.append(
                {
                    "src": src,
                    "alt": image.get("alt"),
                    "title": image.get("title"),
                }
            )
        return {"count": len(images), "images": images[:200]}

    if tool_name == "html.extract_tables":
        selector = params.get("selector")
        tables = soup.select(str(selector)) if selector else soup.find_all("table")
        output: list[dict[str, Any]] = []
        for table in tables:
            rows: list[list[str]] = []
            for row in table.find_all("tr"):
                cells = [cell.get_text(" ", strip=True) for cell in row.find_all(["th", "td"])]
                if cells:
                    rows.append(cells)
            if rows:
                output.append({"rows": rows, "row_count": len(rows)})
        return {"count": len(output), "tables": output[:30]}

    if tool_name == "html.extract_forms":
        selector = params.get("selector")
        forms = soup.select(str(selector)) if selector else soup.find_all("form")
        extracted: list[dict[str, Any]] = []
        for form in forms:
            fields = [
                {
                    "tag": field.name,
                    "name": field.get("name"),
                    "type": field.get("type"),
                    "id": field.get("id"),
                }
                for field in form.find_all(["input", "select", "textarea", "button"])
            ]
            extracted.append({"action": form.get("action"), "method": form.get("method"), "fields": fields})
        return {"count": len(extracted), "forms": extracted[:30]}

    if tool_name == "html.extract_meta":
        meta: dict[str, str] = {}
        for tag in soup.find_all("meta"):
            key = tag.get("name") or tag.get("property")
            content = tag.get("content")
            if key and content:
                meta[str(key)] = str(content)
        title = soup.title.get_text(" ", strip=True) if soup.title else ""
        return {"title": title, "meta": meta, "count": len(meta)}

    if tool_name == "html.extract_jsonld":
        items: list[Any] = []
        for node in soup.select("script[type='application/ld+json']"):
            raw = node.string or node.get_text(" ", strip=True)
            if not raw:
                continue
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                # Malformed JSON-LD blocks are skipped rather than failing the tool.
                continue
            if isinstance(parsed, list):
                items.extend(parsed)
            else:
                items.append(parsed)
        return {"count": len(items), "items": items[:50]}

    if tool_name == "html.detect_repeating_blocks":
        # A signature is the tag name plus up to two of its sorted class names.
        signatures: dict[str, int] = {}
        for node in soup.find_all(True):
            classes = node.get("class") or []
            if not classes:
                continue
            signature = f"{node.name}.{'.'.join(sorted(classes)[:2])}"
            signatures[signature] = signatures.get(signature, 0) + 1
        candidates = [
            {"signature": signature, "count": count}
            for signature, count in sorted(signatures.items(), key=lambda item: item[1], reverse=True)
            if count >= 3
        ]
        return {"candidates": candidates[:25], "count": len(candidates)}

    raise ValueError(f"Unsupported HTML tool '{tool_name}'")
542
+
543
+ def _run_data_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
544
+ if tool_name == "json.parse":
545
+ text = str(params.get("text", "") or "")
546
+ try:
547
+ data = json.loads(text)
548
+ return {"valid": True, "data": data}
549
+ except json.JSONDecodeError as exc:
550
+ return {"valid": False, "data": None, "error": str(exc)}
551
+
552
+ if tool_name == "json.dumps":
553
+ data = params.get("data", context.get("data"))
554
+ indent = int(params.get("indent", 2) or 2)
555
+ sort_keys = bool(params.get("sort_keys", False))
556
+ output = json.dumps(data, indent=indent, sort_keys=sort_keys, default=str)
557
+ return {"output": output, "length": len(output)}
558
+
559
+ if tool_name == "csv.generate":
560
+ rows = _coerce_records(params.get("data", context.get("rows")))
561
+ fields = params.get("fields")
562
+ field_names = [str(field) for field in fields] if isinstance(fields, list) and fields else None
563
+ if not rows:
564
+ return {"csv": "", "rows": 0, "columns": 0}
565
+ output = io.StringIO()
566
+ writer = csv.DictWriter(output, fieldnames=field_names or list(rows[0].keys()))
567
+ writer.writeheader()
568
+ for row in rows:
569
+ writer.writerow(row)
570
+ csv_text = output.getvalue()
571
+ return {
572
+ "csv": csv_text,
573
+ "rows": len(rows),
574
+ "columns": len(writer.fieldnames or []),
575
+ }
576
+
577
+ if tool_name == "csv.parse":
578
+ text = str(params.get("text", "") or "")
579
+ delimiter = str(params.get("delimiter", ",") or ",")
580
+ has_header = bool(params.get("has_header", True))
581
+ stream = io.StringIO(text)
582
+ if has_header:
583
+ reader = csv.DictReader(stream, delimiter=delimiter)
584
+ records = [dict(record) for record in reader]
585
+ else:
586
+ reader = csv.reader(stream, delimiter=delimiter)
587
+ rows = list(reader)
588
+ records = [{"col_" + str(idx): value for idx, value in enumerate(row)} for row in rows]
589
+ return {"records": records, "rows": len(records), "columns": len(records[0]) if records else 0}
590
+
591
+ if tool_name == "data.dedupe_rows":
592
+ rows = _coerce_records(params.get("rows", context.get("rows")))
593
+ key_fields = params.get("key_fields")
594
+ if not isinstance(key_fields, list):
595
+ key_fields = []
596
+ deduped: list[dict[str, Any]] = []
597
+ seen: set[str] = set()
598
+ for row in rows:
599
+ if key_fields:
600
+ key = "|".join(str(row.get(field, "")) for field in key_fields)
601
+ else:
602
+ key = json.dumps(row, sort_keys=True, default=str)
603
+ if key in seen:
604
+ continue
605
+ seen.add(key)
606
+ deduped.append(row)
607
+ return {"rows": deduped, "removed": len(rows) - len(deduped), "count": len(deduped)}
608
+
609
+ if tool_name == "data.rank_rows":
610
+ rows = _coerce_records(params.get("rows", context.get("rows")))
611
+ sort_field = str(params.get("sort_field", "") or "")
612
+ descending = bool(params.get("descending", True))
613
+ limit = int(params.get("limit", len(rows)) or len(rows))
614
+ if not rows:
615
+ return {"rows": [], "count": 0}
616
+ if not sort_field:
617
+ numeric_candidates = [
618
+ key
619
+ for key in rows[0].keys()
620
+ if any(_safe_float(row.get(key, ""), default=-1.0) != -1.0 for row in rows)
621
+ ]
622
+ sort_field = numeric_candidates[0] if numeric_candidates else list(rows[0].keys())[0]
623
+ ranked = sorted(rows, key=lambda row: _safe_float(row.get(sort_field, ""), 0.0), reverse=descending)
624
+ return {"rows": ranked[: max(1, limit)], "sort_field": sort_field, "count": min(len(ranked), limit)}
625
+
626
+ if tool_name == "data.select_columns":
627
+ rows = _coerce_records(params.get("rows", context.get("rows")))
628
+ columns = params.get("columns")
629
+ if not isinstance(columns, list) or not columns:
630
+ return {"rows": rows, "columns": list(rows[0].keys()) if rows else []}
631
+ selected = [{column: row.get(column, "") for column in columns} for row in rows]
632
+ return {"rows": selected, "columns": columns, "count": len(selected)}
633
+
634
+ if tool_name.startswith("pandas."):
635
+ return {
636
+ "supported": False,
637
+ "reason": "pandas runtime execution is not enabled in this lightweight agent executor",
638
+ "tool": tool_name,
639
+ }
640
+
641
+ raise ValueError(f"Unsupported data tool '{tool_name}'")
642
+
643
+ def _run_extraction_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
644
+ if tool_name.startswith("regex."):
645
+ pattern = str(params.get("pattern", "") or "")
646
+ text = str(params.get("text", "") or "")
647
+ if not pattern:
648
+ raise ValueError("regex.* tools require a pattern")
649
+ if tool_name == "regex.match":
650
+ match = re.match(pattern, text)
651
+ return {"matched": bool(match), "groups": list(match.groups()) if match else []}
652
+ if tool_name == "regex.search":
653
+ match = re.search(pattern, text)
654
+ return {
655
+ "found": bool(match),
656
+ "position": match.start() if match else -1,
657
+ "match": match.group(0) if match else "",
658
+ }
659
+ if tool_name == "regex.findall":
660
+ matches = re.findall(pattern, text)
661
+ return {"matches": matches, "count": len(matches)}
662
+ if tool_name == "regex.sub":
663
+ replacement = str(params.get("replacement", "") or "")
664
+ result = re.sub(pattern, replacement, text)
665
+ return {"result": result, "replacements": max(0, len(re.findall(pattern, text)))}
666
+ if tool_name == "regex.split":
667
+ maxsplit = int(params.get("maxsplit", 0) or 0)
668
+ parts = re.split(pattern, text, maxsplit=maxsplit)
669
+ return {"parts": parts, "count": len(parts)}
670
+ raise ValueError(f"Unsupported regex tool '{tool_name}'")
671
+
672
+ text = str(params.get("text", "") or context.get("text", "") or context.get("html", "") or "")
673
+
674
+ if tool_name == "extract.emails":
675
+ emails = sorted(set(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)))
676
+ return {"emails": emails, "count": len(emails)}
677
+
678
+ if tool_name == "extract.phones":
679
+ phones = sorted(set(re.findall(r"(?:\+?\d[\d\-\s().]{7,}\d)", text)))
680
+ return {"phones": phones, "count": len(phones)}
681
+
682
+ if tool_name == "extract.urls":
683
+ urls = sorted(set(re.findall(r"https?://[^\s\"'<>]+", text)))
684
+ if not urls:
685
+ soup = context.get("soup")
686
+ if isinstance(soup, BeautifulSoup):
687
+ urls = [urljoin(str(context.get("url", "")), a.get("href")) for a in soup.select("a[href]")]
688
+ return {"urls": urls[:500], "count": len(urls)}
689
+
690
+ if tool_name == "extract.dates":
691
+ dates = sorted(
692
+ set(
693
+ re.findall(
694
+ r"\b(?:\d{4}-\d{2}-\d{2}|\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{2,4})\b",
695
+ text,
696
+ flags=re.IGNORECASE,
697
+ )
698
+ )
699
+ )
700
+ return {"dates": dates[:300], "count": len(dates)}
701
+
702
+ if tool_name == "extract.prices":
703
+ matches = re.findall(r"(?:[$€£₹]\s?\d[\d,]*(?:\.\d{1,2})?|\d[\d,]*(?:\.\d{1,2})?\s?(?:USD|EUR|INR|GBP))", text)
704
+ prices = [{"raw": match} for match in sorted(set(matches))]
705
+ return {"prices": prices[:300], "count": len(prices)}
706
+
707
+ if tool_name == "extract.addresses":
708
+ matches = re.findall(r"\b\d{1,5}\s+[A-Za-z0-9.\- ]+\s(?:Street|St|Road|Rd|Avenue|Ave|Lane|Ln|Boulevard|Blvd)\b", text)
709
+ addresses = [{"raw": match} for match in sorted(set(matches))]
710
+ return {"addresses": addresses, "count": len(addresses)}
711
+
712
+ if tool_name == "extract.social_handles":
713
+ handles = sorted(set(re.findall(r"@[A-Za-z0-9_\.]{2,30}", text)))
714
+ return {"handles": {"generic": handles[:500]}, "count": len(handles)}
715
+
716
+ if tool_name == "extract.top_n":
717
+ rows = _coerce_records(params.get("rows", context.get("rows")))
718
+ n = max(1, int(params.get("n", 10) or 10))
719
+ sort_field = str(params.get("sort_field", "") or "")
720
+ if rows and sort_field:
721
+ rows = sorted(rows, key=lambda row: _safe_float(row.get(sort_field, ""), 0.0), reverse=True)
722
+ return {"rows": rows[:n], "count": min(len(rows), n)}
723
+
724
+ raise ValueError(f"Unsupported extraction tool '{tool_name}'")
725
+
726
def _run_validation_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
    """Run one tool from the ``validate.*`` namespace and return its result payload.

    Args:
        tool_name: Fully qualified tool name, e.g. ``"validate.email"``.
        params: Tool parameters for this call.
        context: Shared execution context. ``url``, ``html`` and ``rows`` are
            read from it as fallbacks when the matching parameter is absent.

    Returns:
        A dict whose keys depend on the tool:
        ``validate.url`` -> ``valid``, ``accessible``, ``status_code``
        (the last two are always ``None``; no network request is made);
        ``validate.email`` -> ``valid``, ``normalized``;
        ``validate.json`` -> ``valid``, ``error``;
        ``validate.html`` / ``validate.schema`` -> ``valid``, ``errors``;
        ``validate.data_completeness`` -> ``score``, ``missing_counts``, ``fields``;
        ``validate.row_signal`` -> ``signal`` plus ``completeness`` and
        ``uniqueness`` (or ``reason`` when there are no rows).

    Raises:
        ValueError: If ``tool_name`` is not a supported validation tool.
    """
    # Values counted as "missing" when scoring row quality.
    blank_values = (None, "", [], {})

    if tool_name == "validate.url":
        url = str(params.get("url", "") or context.get("url", "") or "")
        try:
            parsed = urlparse(url)
        except ValueError:
            # urlparse rejects malformed netlocs (e.g. an unclosed IPv6 bracket);
            # a validator should report that as invalid instead of raising.
            return {"valid": False, "accessible": None, "status_code": None}
        valid = bool(parsed.scheme and parsed.netloc)
        return {"valid": valid, "accessible": None, "status_code": None}

    if tool_name == "validate.email":
        # Trim before matching so the validity check and the normalized value
        # agree on how surrounding whitespace is treated.
        email = str(params.get("email", "") or "").strip()
        valid = bool(re.fullmatch(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", email))
        return {"valid": valid, "normalized": email.lower() if valid else ""}

    if tool_name == "validate.json":
        text = str(params.get("text", "") or "")
        try:
            json.loads(text)
        except json.JSONDecodeError as exc:
            return {"valid": False, "error": str(exc)}
        return {"valid": True, "error": None}

    if tool_name == "validate.html":
        html = str(params.get("html", "") or context.get("html", "") or "")
        if not html:
            return {"valid": False, "errors": ["No HTML provided"]}
        soup = BeautifulSoup(html, "html.parser")
        errors: list[str] = []
        # The only check performed: the parser must find at least one element.
        if not soup.find():
            errors.append("HTML has no parseable elements")
        return {"valid": not errors, "errors": errors}

    if tool_name == "validate.schema":
        data = params.get("data")
        schema = params.get("schema") if isinstance(params.get("schema"), dict) else {}
        required = schema.get("required", []) if isinstance(schema.get("required"), list) else []
        # Only the "required" list is enforced; non-dict data misses every field.
        if isinstance(data, dict):
            missing = [field for field in required if field not in data]
        else:
            missing = required
        return {"valid": not missing, "errors": [f"Missing field: {field}" for field in missing]}

    if tool_name == "validate.data_completeness":
        rows = _coerce_records(params.get("rows", context.get("rows")))
        required_fields = params.get("fields")
        if not isinstance(required_fields, list) or not required_fields:
            # No explicit field list: score against every key seen in any row.
            required_fields = sorted({key for row in rows for key in row.keys()}) if rows else []
        if not rows or not required_fields:
            return {"score": 0.0, "missing_counts": {}, "fields": required_fields}
        missing_counts = {field: 0 for field in required_fields}
        for row in rows:
            for field in required_fields:
                if row.get(field, "") in blank_values:
                    missing_counts[field] += 1
        total_cells = len(rows) * len(required_fields)
        missing_cells = sum(missing_counts.values())
        score = 1.0 - (missing_cells / total_cells) if total_cells else 0.0
        return {"score": round(score, 4), "missing_counts": missing_counts, "fields": required_fields}

    if tool_name == "validate.row_signal":
        rows = _coerce_records(params.get("rows", context.get("rows")))
        if not rows:
            return {"signal": 0.0, "reason": "No rows provided"}
        # Rows are compared by their canonical JSON form so key order is ignored.
        distinct_rows = len({json.dumps(row, sort_keys=True, default=str) for row in rows})
        total_fields = 0
        non_empty_fields = 0
        for row in rows:
            for value in row.values():
                total_fields += 1
                if value not in blank_values:
                    non_empty_fields += 1
        completeness = (non_empty_fields / total_fields) if total_fields else 0.0
        uniqueness = distinct_rows / len(rows)
        # Weighted blend: 70% field completeness, 30% row uniqueness.
        signal = round((0.7 * completeness) + (0.3 * uniqueness), 4)
        return {
            "signal": signal,
            "completeness": round(completeness, 4),
            "uniqueness": round(uniqueness, 4),
        }

    raise ValueError(f"Unsupported validation tool '{tool_name}'")
806
+
807
def _run_analysis_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
    """Run one text/statistics/analysis tool and return its result payload.

    Supported tools: ``text.keywords``, ``text.entities``, ``text.sentiment``,
    ``stats.describe``, ``stats.correlation``, ``analysis.infer_schema`` and
    ``analysis.score_relevance``.

    Args:
        tool_name: Fully qualified tool name.
        params: Tool parameters for this call.
        context: Shared execution context. ``text``, ``rows`` and
            ``instructions`` are read from it as fallbacks when the matching
            parameter is absent.

    Returns:
        A dict whose keys depend on the selected tool.

    Raises:
        ValueError: If ``tool_name`` is not a supported analysis tool.
    """
    text = str(params.get("text", "") or context.get("text", "") or "")

    def is_number(value: Any) -> bool:
        return isinstance(value, (int, float))

    def as_list(value: Any) -> list[Any]:
        # Parameters may arrive as null or a scalar; treat anything that is
        # not a list/tuple as "no data".
        return list(value) if isinstance(value, (list, tuple)) else []

    if tool_name == "text.keywords":
        try:
            top_k = max(1, int(params.get("top_k", 10) or 10))
        except (TypeError, ValueError, OverflowError):
            # Unparseable top_k falls back to the default instead of failing.
            top_k = 10
        frequencies: dict[str, int] = {}
        for token in _tokenize(text):
            frequencies[token] = frequencies.get(token, 0) + 1
        # Stable sort: equally frequent tokens keep first-seen order.
        ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)[:top_k]
        return {"keywords": [item[0] for item in ranked], "scores": [item[1] for item in ranked]}

    if tool_name == "text.entities":
        # Heuristic only: runs of capitalised words are reported as proper nouns.
        entities = sorted(set(re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", text)))
        requested_types = params.get("types") if isinstance(params.get("types"), list) else []
        output = [{"text": entity, "type": "PROPER_NOUN"} for entity in entities]
        if requested_types:
            output = [entity for entity in output if entity["type"] in requested_types]
        # "count" is the full match count; the returned list is capped at 200.
        return {"entities": output[:200], "count": len(output)}

    if tool_name == "text.sentiment":
        positive = {"good", "great", "excellent", "amazing", "positive", "love", "best"}
        negative = {"bad", "poor", "terrible", "awful", "negative", "worst", "hate"}
        tokens = _tokenize(text)
        score = sum(1 for token in tokens if token in positive) - sum(1 for token in tokens if token in negative)
        if score > 0:
            label = "positive"
        elif score < 0:
            label = "negative"
        else:
            label = "neutral"
        return {"score": score, "label": label}

    if tool_name == "stats.describe":
        values = [float(item) for item in as_list(params.get("data")) if is_number(item)]
        if not values:
            return {"mean": 0.0, "median": 0.0, "std": 0.0, "min": 0.0, "max": 0.0}
        return {
            "mean": statistics.fmean(values),
            "median": statistics.median(values),
            "std": statistics.pstdev(values) if len(values) > 1 else 0.0,
            "min": min(values),
            "max": max(values),
        }

    if tool_name == "stats.correlation":
        no_correlation = {"correlation": 0.0, "p_value": None}
        raw_x = as_list(params.get("x"))
        raw_y = as_list(params.get("y"))
        if len(raw_x) != len(raw_y):
            return no_correlation
        # Filter pairwise so a non-numeric entry drops its whole observation
        # rather than shifting every later value against the other series.
        pairs = [
            (float(a), float(b))
            for a, b in zip(raw_x, raw_y)
            if is_number(a) and is_number(b)
        ]
        if len(pairs) < 2:
            return no_correlation
        x_mean = statistics.fmean(a for a, _ in pairs)
        y_mean = statistics.fmean(b for _, b in pairs)
        numerator = sum((a - x_mean) * (b - y_mean) for a, b in pairs)
        x_var = sum((a - x_mean) ** 2 for a, _ in pairs)
        y_var = sum((b - y_mean) ** 2 for _, b in pairs)
        denominator = (x_var * y_var) ** 0.5
        if not denominator:
            # A constant series has no defined correlation; report 0.0.
            return no_correlation
        # Clamp away floating-point overshoot just outside [-1, 1].
        correlation = max(-1.0, min(1.0, numerator / denominator))
        return {"correlation": correlation, "p_value": None}

    if tool_name == "analysis.infer_schema":
        rows = _coerce_records(params.get("rows", context.get("rows")))
        schema: dict[str, dict[str, Any]] = {}
        for row in rows:
            for key, value in row.items():
                entry = schema.setdefault(key, {"types": set(), "nullable": False})
                entry["types"].add(_infer_type(value))
                if value in (None, "", [], {}):
                    entry["nullable"] = True
        # Sets are not JSON-serialisable; emit sorted lists instead.
        normalized = {
            key: {"types": sorted(value["types"]), "nullable": value["nullable"]}
            for key, value in schema.items()
        }
        return {"schema": normalized, "columns": sorted(normalized.keys())}

    if tool_name == "analysis.score_relevance":
        rows = _coerce_records(params.get("rows", context.get("rows")))
        query = str(params.get("query", "") or context.get("instructions", "") or "")
        query_tokens = set(_tokenize(query))
        scored: list[dict[str, Any]] = []
        for row in rows:
            row_text = " ".join(str(value) for value in row.values())
            row_tokens = set(_tokenize(row_text))
            overlap = len(query_tokens & row_tokens)
            # Fraction of query tokens present in the row; max() avoids /0.
            score = overlap / max(1, len(query_tokens))
            scored.append({"row": row, "score": round(score, 4)})
        scored.sort(key=lambda item: item["score"], reverse=True)
        return {"rows": scored, "count": len(scored)}

    raise ValueError(f"Unsupported analysis tool '{tool_name}'")
895
+
896
+
897
def summarize_tool_results(results: list[ToolCallResult], max_items: int = 8) -> str:
    """Build a short bullet list describing tool outcomes for use in a prompt.

    Only the first ``max_items`` results are included. Each one becomes a
    single ``- <tool>: ...`` line carrying either a truncated result preview
    or the error message, and the lines are joined with newlines.
    """

    def describe(outcome: ToolCallResult) -> str:
        if not outcome.success:
            return f"- {outcome.tool_name}: failed ({outcome.duration_ms}ms), error={outcome.error}"
        # Keep previews short so the summary stays compact.
        preview = _truncate(outcome.result, 220)
        return f"- {outcome.tool_name}: success ({outcome.duration_ms}ms), result={preview}"

    return "\n".join(describe(outcome) for outcome in results[:max_items])
backend/app/api/routes/scrape.py CHANGED
@@ -2547,6 +2547,100 @@ URL:"""
2547
  # Get a larger sample of the HTML for LLM analysis (first 15000 chars to include content)
2548
  html_sample = nav_obs.page_html[:15000]
2549
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2550
  extraction_prompt = f"""You are a web scraping expert. Generate Python code to extract data from HTML.
2551
 
2552
  USER REQUEST:
@@ -2562,6 +2656,9 @@ HTML SAMPLE (first 15000 chars):
2562
 
2563
  {template_hint}
2564
 
 
 
 
2565
  TASK: Generate Python code using BeautifulSoup to extract the requested data.
2566
 
2567
  REQUIREMENTS:
 
2547
  # Get a larger sample of the HTML for LLM analysis (first 15000 chars to include content)
2548
  html_sample = nav_obs.page_html[:15000]
2549
 
2550
+ # === AGENT TOOL CALLING: runtime-selected, registry-backed ===
2551
+ agent_tool_calls = []
2552
+ tool_call_results = []
2553
+ tool_observations = ""
2554
+
2555
+ if live_llm_enabled:
2556
+ try:
2557
+ from app.agents.tool_caller import AgentToolCaller, ToolExecutor, summarize_tool_results
2558
+
2559
+ tool_caller = AgentToolCaller(model_router)
2560
+ executor = ToolExecutor()
2561
+
2562
+ agent_tool_calls = await tool_caller.decide_tools(
2563
+ task_description=(
2564
+ f"Extract {request.output_instructions or 'data'} from page content. "
2565
+ f"User instructions: {request.instructions}"
2566
+ ),
2567
+ context={
2568
+ "url": target_url,
2569
+ "html_length": len(nav_obs.page_html),
2570
+ "instructions": request.instructions,
2571
+ "output_format": request.output_format.value,
2572
+ "tools_used": [],
2573
+ },
2574
+ model=request.model,
2575
+ max_tools=6,
2576
+ )
2577
+
2578
+ if agent_tool_calls:
2579
+ tool_decision_step = _record_step(
2580
+ session,
2581
+ ScrapeStep(
2582
+ step_number=len(session["steps"]),
2583
+ action="agent_decision",
2584
+ status="completed",
2585
+ message=f"Agent selected {len(agent_tool_calls)} runtime tools",
2586
+ reward=0.1,
2587
+ extracted_data={
2588
+ "tool_calls": [
2589
+ {
2590
+ "tool": tool_call.tool_name,
2591
+ "params": tool_call.parameters,
2592
+ "reasoning": tool_call.reasoning,
2593
+ }
2594
+ for tool_call in agent_tool_calls
2595
+ ],
2596
+ },
2597
+ timestamp=_now_iso(),
2598
+ ),
2599
+ )
2600
+ yield tool_decision_step
2601
+
2602
+ tool_context = {
2603
+ "soup": BeautifulSoup(nav_obs.page_html, "html.parser"),
2604
+ "html": nav_obs.page_html,
2605
+ "url": target_url,
2606
+ "instructions": request.instructions or "",
2607
+ }
2608
+
2609
+ for tool_call in agent_tool_calls:
2610
+ result = await executor.execute_tool_call(tool_call, tool_context)
2611
+ tool_call_results.append(result)
2612
+
2613
+ if result.success and isinstance(result.result, dict):
2614
+ for context_key in ("rows", "text", "data"):
2615
+ if context_key in result.result:
2616
+ tool_context[context_key] = result.result[context_key]
2617
+
2618
+ tool_exec_step = _record_step(
2619
+ session,
2620
+ ScrapeStep(
2621
+ step_number=len(session["steps"]),
2622
+ action="tool_call",
2623
+ status="completed" if result.success else "failed",
2624
+ message=f"Tool {result.tool_name}: {'ok' if result.success else 'failed'}",
2625
+ reward=0.05 if result.success else -0.02,
2626
+ extracted_data={
2627
+ "tool": result.tool_name,
2628
+ "success": result.success,
2629
+ "result_preview": str(result.result)[:200] if result.result is not None else None,
2630
+ "error": result.error,
2631
+ "duration_ms": result.duration_ms,
2632
+ },
2633
+ timestamp=_now_iso(),
2634
+ ),
2635
+ )
2636
+ yield tool_exec_step
2637
+
2638
+ if tool_call_results:
2639
+ tool_observations = summarize_tool_results(tool_call_results)
2640
+
2641
+ except Exception as e:
2642
+ logger.warning("Agent tool calling failed: %s", e)
2643
+
2644
  extraction_prompt = f"""You are a web scraping expert. Generate Python code to extract data from HTML.
2645
 
2646
  USER REQUEST:
 
2656
 
2657
  {template_hint}
2658
 
2659
+ AGENT TOOL OBSERVATIONS (runtime execution, not hardcoded):
2660
+ {tool_observations or "No additional tool observations collected."}
2661
+
2662
  TASK: Generate Python code using BeautifulSoup to extract the requested data.
2663
 
2664
  REQUIREMENTS:
backend/app/plugins/registry.py CHANGED
@@ -182,6 +182,27 @@ HTML_TOOLS = [
182
  parameters={"selector": "string (optional)"},
183
  returns={"forms": "list[dict]", "count": "int"},
184
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  ]
186
 
187
  # ==============================================================================
@@ -259,6 +280,27 @@ DATA_TOOLS = [
259
  parameters={"condition": "string"},
260
  returns={"filtered_rows": "int", "original_rows": "int"},
261
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  ]
263
 
264
  # ==============================================================================
@@ -420,6 +462,20 @@ ANALYSIS_TOOLS = [
420
  parameters={"text": "string", "top_k": "int"},
421
  returns={"keywords": "list[string]", "scores": "list[float]"},
422
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
  ]
424
 
425
  # ==============================================================================
@@ -476,6 +532,13 @@ EXTRACTION_TOOLS = [
476
  parameters={"text": "string", "platforms": "list[string]"},
477
  returns={"handles": "dict[string, list]", "count": "int"},
478
  ),
 
 
 
 
 
 
 
479
  ]
480
 
481
  # ==============================================================================
@@ -518,6 +581,20 @@ VALIDATION_TOOLS = [
518
  parameters={"data": "any", "schema": "dict"},
519
  returns={"valid": "bool", "errors": "list[string]"},
520
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
521
  ]
522
 
523
  # ==============================================================================
 
182
  parameters={"selector": "string (optional)"},
183
  returns={"forms": "list[dict]", "count": "int"},
184
  ),
185
+ ToolDefinition(
186
+ name="html.extract_meta",
187
+ description="Extract page title and meta tags",
188
+ category=PluginCategory.PARSER,
189
+ parameters={"include_og": "bool"},
190
+ returns={"title": "string", "meta": "dict[string, string]", "count": "int"},
191
+ ),
192
+ ToolDefinition(
193
+ name="html.extract_jsonld",
194
+ description="Extract JSON-LD structured data blocks",
195
+ category=PluginCategory.PARSER,
196
+ parameters={"include_arrays": "bool"},
197
+ returns={"items": "list[dict]", "count": "int"},
198
+ ),
199
+ ToolDefinition(
200
+ name="html.detect_repeating_blocks",
201
+ description="Find repeated DOM block signatures for list extraction",
202
+ category=PluginCategory.PARSER,
203
+ parameters={"min_repetitions": "int"},
204
+ returns={"candidates": "list[dict]", "count": "int"},
205
+ ),
206
  ]
207
 
208
  # ==============================================================================
 
280
  parameters={"condition": "string"},
281
  returns={"filtered_rows": "int", "original_rows": "int"},
282
  ),
283
+ ToolDefinition(
284
+ name="data.dedupe_rows",
285
+ description="Remove duplicate rows from list-of-dicts data",
286
+ category=PluginCategory.DATA,
287
+ parameters={"rows": "list[dict]", "key_fields": "list[string]"},
288
+ returns={"rows": "list[dict]", "removed": "int", "count": "int"},
289
+ ),
290
+ ToolDefinition(
291
+ name="data.rank_rows",
292
+ description="Rank rows by score/value field",
293
+ category=PluginCategory.DATA,
294
+ parameters={"rows": "list[dict]", "sort_field": "string", "descending": "bool", "limit": "int"},
295
+ returns={"rows": "list[dict]", "sort_field": "string", "count": "int"},
296
+ ),
297
+ ToolDefinition(
298
+ name="data.select_columns",
299
+ description="Project rows to requested output columns",
300
+ category=PluginCategory.DATA,
301
+ parameters={"rows": "list[dict]", "columns": "list[string]"},
302
+ returns={"rows": "list[dict]", "columns": "list[string]", "count": "int"},
303
+ ),
304
  ]
305
 
306
  # ==============================================================================
 
462
  parameters={"text": "string", "top_k": "int"},
463
  returns={"keywords": "list[string]", "scores": "list[float]"},
464
  ),
465
+ ToolDefinition(
466
+ name="analysis.infer_schema",
467
+ description="Infer field types and nullability from extracted rows",
468
+ category=PluginCategory.ANALYSIS,
469
+ parameters={"rows": "list[dict]"},
470
+ returns={"schema": "dict[string, dict]", "columns": "list[string]"},
471
+ ),
472
+ ToolDefinition(
473
+ name="analysis.score_relevance",
474
+ description="Score row relevance against user query/instructions",
475
+ category=PluginCategory.ANALYSIS,
476
+ parameters={"rows": "list[dict]", "query": "string"},
477
+ returns={"rows": "list[dict]", "count": "int"},
478
+ ),
479
  ]
480
 
481
  # ==============================================================================
 
532
  parameters={"text": "string", "platforms": "list[string]"},
533
  returns={"handles": "dict[string, list]", "count": "int"},
534
  ),
535
+ ToolDefinition(
536
+ name="extract.top_n",
537
+ description="Select top N rows from extracted dataset",
538
+ category=PluginCategory.EXTRACTION,
539
+ parameters={"rows": "list[dict]", "n": "int", "sort_field": "string"},
540
+ returns={"rows": "list[dict]", "count": "int"},
541
+ ),
542
  ]
543
 
544
  # ==============================================================================
 
581
  parameters={"data": "any", "schema": "dict"},
582
  returns={"valid": "bool", "errors": "list[string]"},
583
  ),
584
+ ToolDefinition(
585
+ name="validate.data_completeness",
586
+ description="Score completeness of extracted rows against required fields",
587
+ category=PluginCategory.VALIDATION,
588
+ parameters={"rows": "list[dict]", "fields": "list[string]"},
589
+ returns={"score": "float", "missing_counts": "dict[string, int]", "fields": "list[string]"},
590
+ ),
591
+ ToolDefinition(
592
+ name="validate.row_signal",
593
+ description="Estimate quality signal of extracted rows",
594
+ category=PluginCategory.VALIDATION,
595
+ parameters={"rows": "list[dict]"},
596
+ returns={"signal": "float", "completeness": "float", "uniqueness": "float"},
597
+ ),
598
  ]
599
 
600
  # ==============================================================================