Spaces:

NeerajCodz
/

scrapeRL

Sleeping

NeerajCodz commited on Apr 8

Commit

5b2dac6

1 Parent(s): d64a03c

feat: add dynamic registry-driven agent tool runtime

- Added AgentToolCaller with LLM-driven tool planning from plugin registry metadata\n- Added ToolExecutor with runtime namespace dispatch and non-hardcoded execution\n- Expanded plugin registry from 71 to 82 tools (parser/data/analysis/extraction/validation additions)\n- Integrated runtime tool decisions and tool observations into agentic scrape flow\n- Verified selected tools execute and vary by prompt for different scraping tasks\n\nCo-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Files changed (4) hide show

backend/app/agents/__init__.py +7 -0
backend/app/agents/tool_caller.py +906 -0
backend/app/api/routes/scrape.py +97 -0
backend/app/plugins/registry.py +77 -0

backend/app/agents/__init__.py CHANGED Viewed

@@ -9,6 +9,7 @@ This module contains specialized agents for web scraping with RL:
 - VerifierAgent: Cross-source verification
 - MemoryAgent: Memory operations and knowledge management
 - AgentCoordinator: Orchestrates multiple agents with message passing
 """
 from .base import BaseAgent
@@ -18,6 +19,7 @@ from .memory_agent import MemoryAgent, MemoryEntry
 from .navigator import NavigatorAgent
 from .planner import PlannerAgent
 from .verifier import VerificationResult, VerifierAgent
 __all__ = [
     # Base
@@ -28,10 +30,15 @@ __all__ = [
     "ExtractorAgent",
     "VerifierAgent",
     "MemoryAgent",
     # Coordinator
     "AgentCoordinator",
     "AgentRole",
     "Message",
     # Data classes
     "VerificationResult",
     "MemoryEntry",

 - VerifierAgent: Cross-source verification
 - MemoryAgent: Memory operations and knowledge management
 - AgentCoordinator: Orchestrates multiple agents with message passing
+- AgentToolCaller: LLM-driven tool selection and execution
 """
 from .base import BaseAgent
 from .navigator import NavigatorAgent
 from .planner import PlannerAgent
 from .verifier import VerificationResult, VerifierAgent
+from .tool_caller import AgentToolCaller, ToolExecutor, ToolCall, ToolCallResult
 __all__ = [
     # Base
     "ExtractorAgent",
     "VerifierAgent",
     "MemoryAgent",
+    "AgentToolCaller",
     # Coordinator
     "AgentCoordinator",
     "AgentRole",
     "Message",
+    # Tool calling
+    "ToolExecutor",
+    "ToolCall",
+    "ToolCallResult",
     # Data classes
     "VerificationResult",
     "MemoryEntry",

backend/app/agents/tool_caller.py ADDED Viewed

	@@ -0,0 +1,906 @@

+"""LLM-driven tool planning and registry-backed tool execution."""
+from __future__ import annotations
+import csv
+import io
+import json
+import ast
+import re
+import statistics
+import time
+from dataclasses import dataclass
+from typing import Any
+from urllib.parse import urljoin, urlparse
+from bs4 import BeautifulSoup
+from app.models.router import SmartModelRouter, TaskType
+from app.plugins.registry import get_all_tools, get_tool
+from app.utils.logging import get_logger
+logger = get_logger(__name__)
+SUPPORTED_TOOL_NAMESPACES = {
+    "browser",
+    "html",
+    "extract",
+    "regex",
+    "validate",
+    "json",
+    "csv",
+    "data",
+    "analysis",
+    "text",
+    "stats",
+}
+def _truncate(value: Any, limit: int = 240) -> str:
+    text = str(value)
+    if len(text) <= limit:
+        return text
+    return f"{text[: limit - 3]}..."
+def _tokenize(text: str) -> list[str]:
+    return [token for token in re.findall(r"[A-Za-z0-9_]+", text.lower()) if len(token) > 1]
+def _safe_float(value: Any, default: float = 0.0) -> float:
+    try:
+        return float(str(value).replace(",", "").strip())
+    except (TypeError, ValueError):
+        return default
+def _coerce_records(raw: Any) -> list[dict[str, Any]]:
+    if isinstance(raw, list):
+        return [row for row in raw if isinstance(row, dict)]
+    return []
+def _extract_json_array(text: str) -> list[dict[str, Any]]:
+    content = text.strip()
+    if "```json" in content:
+        content = content.split("```json", 1)[1].split("```", 1)[0].strip()
+    elif "```" in content:
+        content = content.split("```", 1)[1].split("```", 1)[0].strip()
+    start = content.find("[")
+    end = content.rfind("]")
+    if start == -1 or end == -1 or start > end:
+        return []
+    payload = content[start : end + 1]
+    try:
+        parsed = json.loads(payload)
+    except json.JSONDecodeError:
+        try:
+            parsed = ast.literal_eval(payload)
+        except (ValueError, SyntaxError):
+            return []
+    if isinstance(parsed, list):
+        return [item for item in parsed if isinstance(item, dict)]
+    return []
+def _infer_type(value: Any) -> str:
+    if value is None:
+        return "null"
+    if isinstance(value, bool):
+        return "boolean"
+    if isinstance(value, int):
+        return "integer"
+    if isinstance(value, float):
+        return "number"
+    if isinstance(value, list):
+        return "array"
+    if isinstance(value, dict):
+        return "object"
+    return "string"
+@dataclass
+class ToolCall:
+    """A tool invocation selected by the planner."""
+    tool_name: str
+    parameters: dict[str, Any]
+    reasoning: str = ""
+@dataclass
+class ToolCallResult:
+    """Result of a single executed tool call."""
+    tool_name: str
+    success: bool
+    result: Any
+    error: str | None = None
+    duration_ms: int = 0
+class AgentToolCaller:
+    """Asks an LLM to choose tool calls from the plugin registry."""
+    def __init__(
+        self,
+        model_router: SmartModelRouter,
+        allowed_tool_names: set[str] | None = None,
+    ) -> None:
+        self.router = model_router
+        all_tools = [
+            tool
+            for tool in get_all_tools()
+            if tool.name.split(".", 1)[0] in SUPPORTED_TOOL_NAMESPACES
+        ]
+        if allowed_tool_names:
+            self._tools = [tool for tool in all_tools if tool.name in allowed_tool_names]
+        else:
+            self._tools = all_tools
+        self._tool_names = {tool.name for tool in self._tools}
+        self._tool_catalog = self._build_tool_catalog()
+    def _build_tool_catalog(self) -> str:
+        if not self._tools:
+            return "No tools available."
+        grouped: dict[str, list[str]] = {}
+        for tool in sorted(self._tools, key=lambda item: item.name):
+            namespace = tool.name.split(".", 1)[0]
+            entry = (
+                f"- {tool.name}: {tool.description} | "
+                f"params={json.dumps(tool.parameters, separators=(',', ':'))}"
+            )
+            grouped.setdefault(namespace, []).append(entry)
+        lines: list[str] = []
+        for namespace in sorted(grouped):
+            lines.append(f"[{namespace}]")
+            lines.extend(grouped[namespace])
+            lines.append("")
+        return "\n".join(lines).strip()
+    async def decide_tools(
+        self,
+        task_description: str,
+        context: dict[str, Any],
+        model: str,
+        max_tools: int = 6,
+    ) -> list[ToolCall]:
+        """Return a runtime tool plan chosen by the LLM."""
+        if not self._tool_names:
+            return []
+        prompt = f"""You are selecting tools for a generic web scraping task.
+Use ONLY tools from AVAILABLE_TOOLS and return strict JSON.
+AVAILABLE_TOOLS:
+{self._tool_catalog}
+TASK:
+{task_description}
+CONTEXT:
+- URL: {context.get("url", "")}
+- HTML Length: {context.get("html_length", 0)}
+- Output Format: {context.get("output_format", "json")}
+- User Instructions: {context.get("instructions", "")}
+- Prior Tool Calls: {context.get("tools_used", [])}
+Rules:
+1. Return only a JSON array (no markdown, no prose).
+2. Each item must contain: tool_name, parameters, reasoning.
+3. Choose 2 to {max_tools} tools.
+4. Calls must be generic for arbitrary websites (no site-specific hardcoding).
+Format:
+[
+  {{
+    "tool_name": "html.select",
+    "parameters": {{"selector": "article, [role='article']", "limit": 25}},
+    "reasoning": "Find repeated content blocks"
+  }}
+]"""
+        try:
+            response = await self.router.complete(
+                messages=[{"role": "user", "content": prompt}],
+                task_type=TaskType.REASONING,
+                model=model,
+                temperature=0.1,
+            )
+            raw_calls = _extract_json_array(response.content)
+            normalized = self._normalize_tool_calls(raw_calls, max_tools=max_tools)
+            if normalized:
+                return normalized
+            logger.warning("Agent returned no valid tool calls; using dynamic fallback")
+            return self._fallback_tools(max_tools=max_tools)
+        except Exception as exc:
+            logger.warning("Tool planning failed: %s", exc)
+            return self._fallback_tools(max_tools=max_tools)
+    def _normalize_tool_calls(self, raw_calls: list[dict[str, Any]], max_tools: int) -> list[ToolCall]:
+        calls: list[ToolCall] = []
+        for item in raw_calls:
+            tool_name = str(item.get("tool_name", "")).strip()
+            if not tool_name or tool_name not in self._tool_names:
+                continue
+            parameters = item.get("parameters", {})
+            if not isinstance(parameters, dict):
+                parameters = {}
+            calls.append(
+                ToolCall(
+                    tool_name=tool_name,
+                    parameters=parameters,
+                    reasoning=str(item.get("reasoning", "")),
+                )
+            )
+            if len(calls) >= max_tools:
+                break
+        return calls
+    def _fallback_tools(self, max_tools: int) -> list[ToolCall]:
+        """Build a generic fallback plan from available namespaces (not site-specific)."""
+        namespace_order = ("validate", "html", "extract", "data", "analysis", "text", "stats")
+        by_namespace: dict[str, list[str]] = {}
+        for tool_name in sorted(self._tool_names):
+            namespace = tool_name.split(".", 1)[0]
+            by_namespace.setdefault(namespace, []).append(tool_name)
+        fallback: list[ToolCall] = []
+        for namespace in namespace_order:
+            for tool_name in by_namespace.get(namespace, [])[:2]:
+                fallback.append(
+                    ToolCall(
+                        tool_name=tool_name,
+                        parameters={},
+                        reasoning=f"Fallback generic probe from {namespace} namespace.",
+                    )
+                )
+                if len(fallback) >= max_tools:
+                    return fallback
+        return fallback[:max_tools]
+class ToolExecutor:
+    """Executes selected tools against page context using registry-backed dispatch."""
+    def __init__(self, allowed_tool_names: set[str] | None = None) -> None:
+        names = {
+            tool.name
+            for tool in get_all_tools()
+            if tool.name.split(".", 1)[0] in SUPPORTED_TOOL_NAMESPACES
+        }
+        self._known_tool_names = names & allowed_tool_names if allowed_tool_names else names
+    async def execute_tool_call(self, tool_call: ToolCall, context: dict[str, Any]) -> ToolCallResult:
+        start = time.time()
+        tool_name = tool_call.tool_name
+        try:
+            if tool_name not in self._known_tool_names:
+                raise ValueError(f"Unknown tool '{tool_name}'")
+            if get_tool(tool_name) is None:
+                raise ValueError(f"Tool '{tool_name}' is not registered")
+            result = self._dispatch(tool_name, tool_call.parameters, context)
+            return ToolCallResult(
+                tool_name=tool_name,
+                success=True,
+                result=result,
+                duration_ms=int((time.time() - start) * 1000),
+            )
+        except Exception as exc:
+            return ToolCallResult(
+                tool_name=tool_name,
+                success=False,
+                result=None,
+                error=str(exc),
+                duration_ms=int((time.time() - start) * 1000),
+            )
+    def _dispatch(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
+        namespace = tool_name.split(".", 1)[0].lower()
+        if namespace == "browser":
+            return self._run_browser_tool(tool_name, params, context)
+        if namespace == "html":
+            return self._run_html_tool(tool_name, params, context)
+        if namespace in {"json", "csv", "data", "pandas"}:
+            return self._run_data_tool(tool_name, params, context)
+        if namespace in {"extract", "regex"}:
+            return self._run_extraction_tool(tool_name, params, context)
+        if namespace == "validate":
+            return self._run_validation_tool(tool_name, params, context)
+        if namespace in {"analysis", "text", "stats"}:
+            return self._run_analysis_tool(tool_name, params, context)
+        raise ValueError(f"No runtime handler for namespace '{namespace}'")
+    def _run_browser_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
+        current_url = str(context.get("url", "") or "")
+        if tool_name == "browser.navigate":
+            target_url = str(params.get("url", current_url) or current_url)
+            context["url"] = target_url
+            return {"success": True, "status_code": 200, "url": target_url}
+        if tool_name == "browser.wait":
+            timeout_ms = int(params.get("timeout_ms", 500) or 500)
+            return {"found": True, "waited_ms": timeout_ms}
+        if tool_name == "browser.execute_js":
+            script = str(params.get("script", "") or "")
+            return {"result": {"script_length": len(script)}, "error": None}
+        if tool_name in {"browser.scroll", "browser.click", "browser.type", "browser.get_cookies", "browser.screenshot"}:
+            return {"success": True, "tool": tool_name}
+        raise ValueError(f"Unsupported browser tool '{tool_name}'")
+    def _get_soup(self, context: dict[str, Any]) -> BeautifulSoup:
+        soup = context.get("soup")
+        if isinstance(soup, BeautifulSoup):
+            return soup
+        html = str(context.get("html", "") or "")
+        if not html:
+            raise ValueError("No HTML available in execution context")
+        soup = BeautifulSoup(html, "html.parser")
+        context["soup"] = soup
+        return soup
+    @staticmethod
+    def _snapshot_element(element: Any) -> dict[str, Any]:
+        return {
+            "tag": getattr(element, "name", ""),
+            "id": element.get("id") if hasattr(element, "get") else None,
+            "classes": element.get("class", []) if hasattr(element, "get") else [],
+            "text": _truncate(element.get_text(" ", strip=True), 180) if hasattr(element, "get_text") else "",
+        }
+    def _run_html_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
+        soup = self._get_soup(context)
+        if tool_name == "html.parse":
+            parser_name = str(params.get("parser", "html.parser"))
+            html = str(context.get("html", "") or "")
+            parsed = BeautifulSoup(html, parser_name if parser_name in {"html.parser", "lxml"} else "html.parser")
+            context["soup"] = parsed
+            return {"parsed": True, "soup_type": parser_name, "content_length": len(html)}
+        if tool_name == "html.select":
+            selector = str(params.get("selector", "") or "")
+            if not selector:
+                raise ValueError("html.select requires a selector")
+            limit = int(params.get("limit", 20) or 20)
+            elements = soup.select(selector, limit=max(1, limit))
+            return {
+                "elements_found": len(elements),
+                "selector_used": selector,
+                "elements": [self._snapshot_element(element) for element in elements[: max(1, limit)]],
+            }
+        if tool_name == "html.select_one":
+            selector = str(params.get("selector", "") or "")
+            if not selector:
+                raise ValueError("html.select_one requires a selector")
+            element = soup.select_one(selector)
+            return {"found": bool(element), "element": self._snapshot_element(element) if element else None}
+        if tool_name == "html.find_all":
+            tag = params.get("tag")
+            attrs = params.get("attrs", {})
+            recursive = bool(params.get("recursive", True))
+            limit = int(params.get("limit", 20) or 20)
+            if attrs is None or not isinstance(attrs, dict):
+                attrs = {}
+            elements = soup.find_all(tag, attrs=attrs, recursive=recursive, limit=max(1, limit))
+            return {
+                "elements_found": len(elements),
+                "tags": [getattr(element, "name", "") for element in elements],
+                "elements": [self._snapshot_element(element) for element in elements[: max(1, limit)]],
+            }
+        if tool_name == "html.get_text":
+            selector = params.get("selector")
+            separator = str(params.get("separator", " "))
+            if selector:
+                selected = soup.select(str(selector))
+                text = separator.join(node.get_text(" ", strip=True) for node in selected)
+            else:
+                text = soup.get_text(" ", strip=True)
+            return {"text": text, "length": len(text)}
+        if tool_name == "html.get_attribute":
+            selector = str(params.get("selector", "") or "")
+            attribute = str(params.get("attribute", "") or "")
+            if not selector or not attribute:
+                raise ValueError("html.get_attribute requires selector and attribute")
+            element = soup.select_one(selector)
+            return {"found": bool(element), "value": element.get(attribute) if element else None}
+        if tool_name == "html.extract_links":
+            filter_pattern = params.get("filter_pattern")
+            base_url = str(params.get("base_url", "") or context.get("url", "") or "")
+            pattern = re.compile(str(filter_pattern)) if filter_pattern else None
+            links: list[dict[str, Any]] = []
+            for anchor in soup.select("a[href]"):
+                href = str(anchor.get("href", "") or "").strip()
+                if not href:
+                    continue
+                absolute_url = urljoin(base_url, href) if base_url else href
+                if pattern and not pattern.search(absolute_url):
+                    continue
+                links.append(
+                    {
+                        "url": absolute_url,
+                        "text": _truncate(anchor.get_text(" ", strip=True), 120),
+                        "title": anchor.get("title"),
+                    }
+                )
+            return {"count": len(links), "links": links[:200]}
+        if tool_name == "html.extract_images":
+            include_lazy = bool(params.get("include_lazy", True))
+            images: list[dict[str, Any]] = []
+            for image in soup.select("img"):
+                src = image.get("src")
+                if include_lazy and not src:
+                    src = image.get("data-src") or image.get("data-original")
+                if not src:
+                    continue
+                images.append(
+                    {
+                        "src": src,
+                        "alt": image.get("alt"),
+                        "title": image.get("title"),
+                    }
+                )
+            return {"count": len(images), "images": images[:200]}
+        if tool_name == "html.extract_tables":
+            selector = params.get("selector")
+            tables = soup.select(str(selector)) if selector else soup.find_all("table")
+            output: list[dict[str, Any]] = []
+            for table in tables:
+                rows: list[list[str]] = []
+                for row in table.find_all("tr"):
+                    cells = [cell.get_text(" ", strip=True) for cell in row.find_all(["th", "td"])]
+                    if cells:
+                        rows.append(cells)
+                if rows:
+                    output.append({"rows": rows, "row_count": len(rows)})
+            return {"count": len(output), "tables": output[:30]}
+        if tool_name == "html.extract_forms":
+            selector = params.get("selector")
+            forms = soup.select(str(selector)) if selector else soup.find_all("form")
+            extracted: list[dict[str, Any]] = []
+            for form in forms:
+                fields: list[dict[str, Any]] = []
+                for field in form.find_all(["input", "select", "textarea", "button"]):
+                    fields.append(
+                        {
+                            "tag": field.name,
+                            "name": field.get("name"),
+                            "type": field.get("type"),
+                            "id": field.get("id"),
+                        }
+                    )
+                extracted.append({"action": form.get("action"), "method": form.get("method"), "fields": fields})
+            return {"count": len(extracted), "forms": extracted[:30]}
+        if tool_name == "html.extract_meta":
+            meta: dict[str, str] = {}
+            for tag in soup.find_all("meta"):
+                key = tag.get("name") or tag.get("property")
+                content = tag.get("content")
+                if key and content:
+                    meta[str(key)] = str(content)
+            title = soup.title.get_text(" ", strip=True) if soup.title else ""
+            return {"title": title, "meta": meta, "count": len(meta)}
+        if tool_name == "html.extract_jsonld":
+            items: list[Any] = []
+            for node in soup.select("script[type='application/ld+json']"):
+                raw = node.string or node.get_text(" ", strip=True)
+                if not raw:
+                    continue
+                try:
+                    parsed = json.loads(raw)
+                    if isinstance(parsed, list):
+                        items.extend(parsed)
+                    else:
+                        items.append(parsed)
+                except json.JSONDecodeError:
+                    continue
+            return {"count": len(items), "items": items[:50]}
+        if tool_name == "html.detect_repeating_blocks":
+            signatures: dict[str, int] = {}
+            for node in soup.find_all(True):
+                classes = node.get("class") or []
+                if not classes:
+                    continue
+                signature = f"{node.name}.{'.'.join(sorted(classes)[:2])}"
+                signatures[signature] = signatures.get(signature, 0) + 1
+            candidates = [
+                {"signature": signature, "count": count}
+                for signature, count in sorted(signatures.items(), key=lambda item: item[1], reverse=True)
+                if count >= 3
+            ]
+            return {"candidates": candidates[:25], "count": len(candidates)}
+        raise ValueError(f"Unsupported HTML tool '{tool_name}'")
+    def _run_data_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
+        if tool_name == "json.parse":
+            text = str(params.get("text", "") or "")
+            try:
+                data = json.loads(text)
+                return {"valid": True, "data": data}
+            except json.JSONDecodeError as exc:
+                return {"valid": False, "data": None, "error": str(exc)}
+        if tool_name == "json.dumps":
+            data = params.get("data", context.get("data"))
+            indent = int(params.get("indent", 2) or 2)
+            sort_keys = bool(params.get("sort_keys", False))
+            output = json.dumps(data, indent=indent, sort_keys=sort_keys, default=str)
+            return {"output": output, "length": len(output)}
+        if tool_name == "csv.generate":
+            rows = _coerce_records(params.get("data", context.get("rows")))
+            fields = params.get("fields")
+            field_names = [str(field) for field in fields] if isinstance(fields, list) and fields else None
+            if not rows:
+                return {"csv": "", "rows": 0, "columns": 0}
+            output = io.StringIO()
+            writer = csv.DictWriter(output, fieldnames=field_names or list(rows[0].keys()))
+            writer.writeheader()
+            for row in rows:
+                writer.writerow(row)
+            csv_text = output.getvalue()
+            return {
+                "csv": csv_text,
+                "rows": len(rows),
+                "columns": len(writer.fieldnames or []),
+            }
+        if tool_name == "csv.parse":
+            text = str(params.get("text", "") or "")
+            delimiter = str(params.get("delimiter", ",") or ",")
+            has_header = bool(params.get("has_header", True))
+            stream = io.StringIO(text)
+            if has_header:
+                reader = csv.DictReader(stream, delimiter=delimiter)
+                records = [dict(record) for record in reader]
+            else:
+                reader = csv.reader(stream, delimiter=delimiter)
+                rows = list(reader)
+                records = [{"col_" + str(idx): value for idx, value in enumerate(row)} for row in rows]
+            return {"records": records, "rows": len(records), "columns": len(records[0]) if records else 0}
+        if tool_name == "data.dedupe_rows":
+            rows = _coerce_records(params.get("rows", context.get("rows")))
+            key_fields = params.get("key_fields")
+            if not isinstance(key_fields, list):
+                key_fields = []
+            deduped: list[dict[str, Any]] = []
+            seen: set[str] = set()
+            for row in rows:
+                if key_fields:
+                    key = "|".join(str(row.get(field, "")) for field in key_fields)
+                else:
+                    key = json.dumps(row, sort_keys=True, default=str)
+                if key in seen:
+                    continue
+                seen.add(key)
+                deduped.append(row)
+            return {"rows": deduped, "removed": len(rows) - len(deduped), "count": len(deduped)}
+        if tool_name == "data.rank_rows":
+            rows = _coerce_records(params.get("rows", context.get("rows")))
+            sort_field = str(params.get("sort_field", "") or "")
+            descending = bool(params.get("descending", True))
+            limit = int(params.get("limit", len(rows)) or len(rows))
+            if not rows:
+                return {"rows": [], "count": 0}
+            if not sort_field:
+                numeric_candidates = [
+                    key
+                    for key in rows[0].keys()
+                    if any(_safe_float(row.get(key, ""), default=-1.0) != -1.0 for row in rows)
+                ]
+                sort_field = numeric_candidates[0] if numeric_candidates else list(rows[0].keys())[0]
+            ranked = sorted(rows, key=lambda row: _safe_float(row.get(sort_field, ""), 0.0), reverse=descending)
+            return {"rows": ranked[: max(1, limit)], "sort_field": sort_field, "count": min(len(ranked), limit)}
+        if tool_name == "data.select_columns":
+            rows = _coerce_records(params.get("rows", context.get("rows")))
+            columns = params.get("columns")
+            if not isinstance(columns, list) or not columns:
+                return {"rows": rows, "columns": list(rows[0].keys()) if rows else []}
+            selected = [{column: row.get(column, "") for column in columns} for row in rows]
+            return {"rows": selected, "columns": columns, "count": len(selected)}
+        if tool_name.startswith("pandas."):
+            return {
+                "supported": False,
+                "reason": "pandas runtime execution is not enabled in this lightweight agent executor",
+                "tool": tool_name,
+            }
+        raise ValueError(f"Unsupported data tool '{tool_name}'")
+    def _run_extraction_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
+        if tool_name.startswith("regex."):
+            pattern = str(params.get("pattern", "") or "")
+            text = str(params.get("text", "") or "")
+            if not pattern:
+                raise ValueError("regex.* tools require a pattern")
+            if tool_name == "regex.match":
+                match = re.match(pattern, text)
+                return {"matched": bool(match), "groups": list(match.groups()) if match else []}
+            if tool_name == "regex.search":
+                match = re.search(pattern, text)
+                return {
+                    "found": bool(match),
+                    "position": match.start() if match else -1,
+                    "match": match.group(0) if match else "",
+                }
+            if tool_name == "regex.findall":
+                matches = re.findall(pattern, text)
+                return {"matches": matches, "count": len(matches)}
+            if tool_name == "regex.sub":
+                replacement = str(params.get("replacement", "") or "")
+                result = re.sub(pattern, replacement, text)
+                return {"result": result, "replacements": max(0, len(re.findall(pattern, text)))}
+            if tool_name == "regex.split":
+                maxsplit = int(params.get("maxsplit", 0) or 0)
+                parts = re.split(pattern, text, maxsplit=maxsplit)
+                return {"parts": parts, "count": len(parts)}
+            raise ValueError(f"Unsupported regex tool '{tool_name}'")
+        text = str(params.get("text", "") or context.get("text", "") or context.get("html", "") or "")
+        if tool_name == "extract.emails":
+            emails = sorted(set(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)))
+            return {"emails": emails, "count": len(emails)}
+        if tool_name == "extract.phones":
+            phones = sorted(set(re.findall(r"(?:\+?\d[\d\-\s().]{7,}\d)", text)))
+            return {"phones": phones, "count": len(phones)}
+        if tool_name == "extract.urls":
+            urls = sorted(set(re.findall(r"https?://[^\s\"'<>]+", text)))
+            if not urls:
+                soup = context.get("soup")
+                if isinstance(soup, BeautifulSoup):
+                    urls = [urljoin(str(context.get("url", "")), a.get("href")) for a in soup.select("a[href]")]
+            return {"urls": urls[:500], "count": len(urls)}
+        if tool_name == "extract.dates":
+            dates = sorted(
+                set(
+                    re.findall(
+                        r"\b(?:\d{4}-\d{2}-\d{2}|\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{2,4})\b",
+                        text,
+                        flags=re.IGNORECASE,
+                    )
+                )
+            )
+            return {"dates": dates[:300], "count": len(dates)}
+        if tool_name == "extract.prices":
+            matches = re.findall(r"(?:[$€£₹]\s?\d[\d,]*(?:\.\d{1,2})?|\d[\d,]*(?:\.\d{1,2})?\s?(?:USD|EUR|INR|GBP))", text)
+            prices = [{"raw": match} for match in sorted(set(matches))]
+            return {"prices": prices[:300], "count": len(prices)}
+        if tool_name == "extract.addresses":
+            matches = re.findall(r"\b\d{1,5}\s+[A-Za-z0-9.\- ]+\s(?:Street|St|Road|Rd|Avenue|Ave|Lane|Ln|Boulevard|Blvd)\b", text)
+            addresses = [{"raw": match} for match in sorted(set(matches))]
+            return {"addresses": addresses, "count": len(addresses)}
+        if tool_name == "extract.social_handles":
+            handles = sorted(set(re.findall(r"@[A-Za-z0-9_\.]{2,30}", text)))
+            return {"handles": {"generic": handles[:500]}, "count": len(handles)}
+        if tool_name == "extract.top_n":
+            rows = _coerce_records(params.get("rows", context.get("rows")))
+            n = max(1, int(params.get("n", 10) or 10))
+            sort_field = str(params.get("sort_field", "") or "")
+            if rows and sort_field:
+                rows = sorted(rows, key=lambda row: _safe_float(row.get(sort_field, ""), 0.0), reverse=True)
+            return {"rows": rows[:n], "count": min(len(rows), n)}
+        raise ValueError(f"Unsupported extraction tool '{tool_name}'")
+    def _run_validation_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
+        if tool_name == "validate.url":
+            url = str(params.get("url", "") or context.get("url", "") or "")
+            parsed = urlparse(url)
+            valid = bool(parsed.scheme and parsed.netloc)
+            return {"valid": valid, "accessible": None, "status_code": None}
+        if tool_name == "validate.email":
+            email = str(params.get("email", "") or "")
+            valid = bool(re.match(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$", email))
+            return {"valid": valid, "normalized": email.strip().lower() if valid else ""}
+        if tool_name == "validate.json":
+            text = str(params.get("text", "") or "")
+            try:
+                json.loads(text)
+                return {"valid": True, "error": None}
+            except json.JSONDecodeError as exc:
+                return {"valid": False, "error": str(exc)}
+        if tool_name == "validate.html":
+            html = str(params.get("html", "") or context.get("html", "") or "")
+            if not html:
+                return {"valid": False, "errors": ["No HTML provided"]}
+            soup = BeautifulSoup(html, "html.parser")
+            errors: list[str] = []
+            if not soup.find():
+                errors.append("HTML has no parseable elements")
+            return {"valid": not errors, "errors": errors}
+        if tool_name == "validate.schema":
+            data = params.get("data")
+            schema = params.get("schema") if isinstance(params.get("schema"), dict) else {}
+            required = schema.get("required", []) if isinstance(schema.get("required"), list) else []
+            if isinstance(data, dict):
+                missing = [field for field in required if field not in data]
+            else:
+                missing = required
+            return {"valid": not missing, "errors": [f"Missing field: {field}" for field in missing]}
+        if tool_name == "validate.data_completeness":
+            rows = _coerce_records(params.get("rows", context.get("rows")))
+            required_fields = params.get("fields")
+            if not isinstance(required_fields, list) or not required_fields:
+                required_fields = sorted({key for row in rows for key in row.keys()}) if rows else []
+            if not rows or not required_fields:
+                return {"score": 0.0, "missing_counts": {}, "fields": required_fields}
+            missing_counts = {field: 0 for field in required_fields}
+            for row in rows:
+                for field in required_fields:
+                    value = row.get(field, "")
+                    if value in (None, "", [], {}):
+                        missing_counts[field] += 1
+            total_cells = len(rows) * len(required_fields)
+            missing_cells = sum(missing_counts.values())
+            score = 1.0 - (missing_cells / total_cells) if total_cells else 0.0
+            return {"score": round(score, 4), "missing_counts": missing_counts, "fields": required_fields}
+        if tool_name == "validate.row_signal":
+            rows = _coerce_records(params.get("rows", context.get("rows")))
+            if not rows:
+                return {"signal": 0.0, "reason": "No rows provided"}
+            non_empty_fields = 0
+            total_fields = 0
+            distinct_rows = len({json.dumps(row, sort_keys=True, default=str) for row in rows})
+            for row in rows:
+                for value in row.values():
+                    total_fields += 1
+                    if value not in (None, "", [], {}):
+                        non_empty_fields += 1
+            completeness = (non_empty_fields / total_fields) if total_fields else 0.0
+            uniqueness = distinct_rows / len(rows)
+            signal = round((0.7 * completeness) + (0.3 * uniqueness), 4)
+            return {
+                "signal": signal,
+                "completeness": round(completeness, 4),
+                "uniqueness": round(uniqueness, 4),
+            }
+        raise ValueError(f"Unsupported validation tool '{tool_name}'")
+    def _run_analysis_tool(self, tool_name: str, params: dict[str, Any], context: dict[str, Any]) -> Any:
+        text = str(params.get("text", "") or context.get("text", "") or "")
+        if tool_name == "text.keywords":
+            top_k = max(1, int(params.get("top_k", 10) or 10))
+            tokens = _tokenize(text)
+            frequencies: dict[str, int] = {}
+            for token in tokens:
+                frequencies[token] = frequencies.get(token, 0) + 1
+            ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)[:top_k]
+            return {"keywords": [item[0] for item in ranked], "scores": [item[1] for item in ranked]}
+        if tool_name == "text.entities":
+            entities = sorted(set(re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", text)))
+            requested_types = params.get("types") if isinstance(params.get("types"), list) else []
+            output = [{"text": entity, "type": "PROPER_NOUN"} for entity in entities]
+            if requested_types:
+                output = [entity for entity in output if entity["type"] in requested_types]
+            return {"entities": output[:200], "count": len(output)}
+        if tool_name == "text.sentiment":
+            positive = {"good", "great", "excellent", "amazing", "positive", "love", "best"}
+            negative = {"bad", "poor", "terrible", "awful", "negative", "worst", "hate"}
+            tokens = _tokenize(text)
+            score = sum(1 for token in tokens if token in positive) - sum(1 for token in tokens if token in negative)
+            label = "neutral"
+            if score > 0:
+                label = "positive"
+            elif score < 0:
+                label = "negative"
+            return {"score": score, "label": label}
+        if tool_name == "stats.describe":
+            values = [float(item) for item in params.get("data", []) if isinstance(item, (int, float))]
+            if not values:
+                return {"mean": 0.0, "median": 0.0, "std": 0.0, "min": 0.0, "max": 0.0}
+            return {
+                "mean": statistics.fmean(values),
+                "median": statistics.median(values),
+                "std": statistics.pstdev(values) if len(values) > 1 else 0.0,
+                "min": min(values),
+                "max": max(values),
+            }
+        if tool_name == "stats.correlation":
+            x = [float(item) for item in params.get("x", []) if isinstance(item, (int, float))]
+            y = [float(item) for item in params.get("y", []) if isinstance(item, (int, float))]
+            if len(x) != len(y) or len(x) < 2:
+                return {"correlation": 0.0, "p_value": None}
+            x_mean = statistics.fmean(x)
+            y_mean = statistics.fmean(y)
+            numerator = sum((a - x_mean) * (b - y_mean) for a, b in zip(x, y))
+            x_var = sum((a - x_mean) ** 2 for a in x)
+            y_var = sum((b - y_mean) ** 2 for b in y)
+            denominator = (x_var * y_var) ** 0.5
+            correlation = (numerator / denominator) if denominator else 0.0
+            return {"correlation": correlation, "p_value": None}
+        if tool_name == "analysis.infer_schema":
+            rows = _coerce_records(params.get("rows", context.get("rows")))
+            schema: dict[str, dict[str, Any]] = {}
+            for row in rows:
+                for key, value in row.items():
+                    entry = schema.setdefault(key, {"types": set(), "nullable": False})
+                    entry["types"].add(_infer_type(value))
+                    if value in (None, "", [], {}):
+                        entry["nullable"] = True
+            normalized = {
+                key: {"types": sorted(value["types"]), "nullable": value["nullable"]}
+                for key, value in schema.items()
+            }
+            return {"schema": normalized, "columns": sorted(normalized.keys())}
+        if tool_name == "analysis.score_relevance":
+            rows = _coerce_records(params.get("rows", context.get("rows")))
+            query = str(params.get("query", "") or context.get("instructions", "") or "")
+            query_tokens = set(_tokenize(query))
+            scored: list[dict[str, Any]] = []
+            for row in rows:
+                row_text = " ".join(str(value) for value in row.values())
+                row_tokens = set(_tokenize(row_text))
+                overlap = len(query_tokens & row_tokens)
+                score = overlap / max(1, len(query_tokens))
+                scored.append({"row": row, "score": round(score, 4)})
+            scored.sort(key=lambda item: item["score"], reverse=True)
+            return {"rows": scored, "count": len(scored)}
+        raise ValueError(f"Unsupported analysis tool '{tool_name}'")
+def summarize_tool_results(results: list[ToolCallResult], max_items: int = 8) -> str:
+    """Render compact tool result notes for downstream prompting."""
+    lines: list[str] = []
+    for result in results[:max_items]:
+        if result.success:
+            preview = _truncate(result.result, 220)
+            lines.append(f"- {result.tool_name}: success ({result.duration_ms}ms), result={preview}")
+        else:
+            lines.append(f"- {result.tool_name}: failed ({result.duration_ms}ms), error={result.error}")
+    return "\n".join(lines)

backend/app/api/routes/scrape.py CHANGED Viewed

@@ -2547,6 +2547,100 @@ URL:"""
     # Get a larger sample of the HTML for LLM analysis (first 15000 chars to include content)
     html_sample = nav_obs.page_html[:15000]
     extraction_prompt = f"""You are a web scraping expert. Generate Python code to extract data from HTML.
 USER REQUEST:
@@ -2562,6 +2656,9 @@ HTML SAMPLE (first 15000 chars):
 {template_hint}
 TASK: Generate Python code using BeautifulSoup to extract the requested data.
 REQUIREMENTS:

     # Get a larger sample of the HTML for LLM analysis (first 15000 chars to include content)
     html_sample = nav_obs.page_html[:15000]
+    # === AGENT TOOL CALLING: runtime-selected, registry-backed ===
+    agent_tool_calls = []
+    tool_call_results = []
+    tool_observations = ""
+    if live_llm_enabled:
+        try:
+            from app.agents.tool_caller import AgentToolCaller, ToolExecutor, summarize_tool_results
+            tool_caller = AgentToolCaller(model_router)
+            executor = ToolExecutor()
+            agent_tool_calls = await tool_caller.decide_tools(
+                task_description=(
+                    f"Extract {request.output_instructions or 'data'} from page content. "
+                    f"User instructions: {request.instructions}"
+                ),
+                context={
+                    "url": target_url,
+                    "html_length": len(nav_obs.page_html),
+                    "instructions": request.instructions,
+                    "output_format": request.output_format.value,
+                    "tools_used": [],
+                },
+                model=request.model,
+                max_tools=6,
+            )
+            if agent_tool_calls:
+                tool_decision_step = _record_step(
+                    session,
+                    ScrapeStep(
+                        step_number=len(session["steps"]),
+                        action="agent_decision",
+                        status="completed",
+                        message=f"Agent selected {len(agent_tool_calls)} runtime tools",
+                        reward=0.1,
+                        extracted_data={
+                            "tool_calls": [
+                                {
+                                    "tool": tool_call.tool_name,
+                                    "params": tool_call.parameters,
+                                    "reasoning": tool_call.reasoning,
+                                }
+                                for tool_call in agent_tool_calls
+                            ],
+                        },
+                        timestamp=_now_iso(),
+                    ),
+                )
+                yield tool_decision_step
+            tool_context = {
+                "soup": BeautifulSoup(nav_obs.page_html, "html.parser"),
+                "html": nav_obs.page_html,
+                "url": target_url,
+                "instructions": request.instructions or "",
+            }
+            for tool_call in agent_tool_calls:
+                result = await executor.execute_tool_call(tool_call, tool_context)
+                tool_call_results.append(result)
+                if result.success and isinstance(result.result, dict):
+                    for context_key in ("rows", "text", "data"):
+                        if context_key in result.result:
+                            tool_context[context_key] = result.result[context_key]
+                tool_exec_step = _record_step(
+                    session,
+                    ScrapeStep(
+                        step_number=len(session["steps"]),
+                        action="tool_call",
+                        status="completed" if result.success else "failed",
+                        message=f"Tool {result.tool_name}: {'ok' if result.success else 'failed'}",
+                        reward=0.05 if result.success else -0.02,
+                        extracted_data={
+                            "tool": result.tool_name,
+                            "success": result.success,
+                            "result_preview": str(result.result)[:200] if result.result is not None else None,
+                            "error": result.error,
+                            "duration_ms": result.duration_ms,
+                        },
+                        timestamp=_now_iso(),
+                    ),
+                )
+                yield tool_exec_step
+            if tool_call_results:
+                tool_observations = summarize_tool_results(tool_call_results)
+        except Exception as e:
+            logger.warning("Agent tool calling failed: %s", e)
     extraction_prompt = f"""You are a web scraping expert. Generate Python code to extract data from HTML.
 USER REQUEST:
 {template_hint}
+AGENT TOOL OBSERVATIONS (runtime execution, not hardcoded):
+{tool_observations or "No additional tool observations collected."}
 TASK: Generate Python code using BeautifulSoup to extract the requested data.
 REQUIREMENTS:

backend/app/plugins/registry.py CHANGED Viewed

@@ -182,6 +182,27 @@ HTML_TOOLS = [
         parameters={"selector": "string (optional)"},
         returns={"forms": "list[dict]", "count": "int"},
     ),
 ]
 # ==============================================================================
@@ -259,6 +280,27 @@ DATA_TOOLS = [
         parameters={"condition": "string"},
         returns={"filtered_rows": "int", "original_rows": "int"},
     ),
 ]
 # ==============================================================================
@@ -420,6 +462,20 @@ ANALYSIS_TOOLS = [
         parameters={"text": "string", "top_k": "int"},
         returns={"keywords": "list[string]", "scores": "list[float]"},
     ),
 ]
 # ==============================================================================
@@ -476,6 +532,13 @@ EXTRACTION_TOOLS = [
         parameters={"text": "string", "platforms": "list[string]"},
         returns={"handles": "dict[string, list]", "count": "int"},
     ),
 ]
 # ==============================================================================
@@ -518,6 +581,20 @@ VALIDATION_TOOLS = [
         parameters={"data": "any", "schema": "dict"},
         returns={"valid": "bool", "errors": "list[string]"},
     ),
 ]
 # ==============================================================================

         parameters={"selector": "string (optional)"},
         returns={"forms": "list[dict]", "count": "int"},
     ),
+    ToolDefinition(
+        name="html.extract_meta",
+        description="Extract page title and meta tags",
+        category=PluginCategory.PARSER,
+        parameters={"include_og": "bool"},
+        returns={"title": "string", "meta": "dict[string, string]", "count": "int"},
+    ),
+    ToolDefinition(
+        name="html.extract_jsonld",
+        description="Extract JSON-LD structured data blocks",
+        category=PluginCategory.PARSER,
+        parameters={"include_arrays": "bool"},
+        returns={"items": "list[dict]", "count": "int"},
+    ),
+    ToolDefinition(
+        name="html.detect_repeating_blocks",
+        description="Find repeated DOM block signatures for list extraction",
+        category=PluginCategory.PARSER,
+        parameters={"min_repetitions": "int"},
+        returns={"candidates": "list[dict]", "count": "int"},
+    ),
 ]
 # ==============================================================================
         parameters={"condition": "string"},
         returns={"filtered_rows": "int", "original_rows": "int"},
     ),
+    ToolDefinition(
+        name="data.dedupe_rows",
+        description="Remove duplicate rows from list-of-dicts data",
+        category=PluginCategory.DATA,
+        parameters={"rows": "list[dict]", "key_fields": "list[string]"},
+        returns={"rows": "list[dict]", "removed": "int", "count": "int"},
+    ),
+    ToolDefinition(
+        name="data.rank_rows",
+        description="Rank rows by score/value field",
+        category=PluginCategory.DATA,
+        parameters={"rows": "list[dict]", "sort_field": "string", "descending": "bool", "limit": "int"},
+        returns={"rows": "list[dict]", "sort_field": "string", "count": "int"},
+    ),
+    ToolDefinition(
+        name="data.select_columns",
+        description="Project rows to requested output columns",
+        category=PluginCategory.DATA,
+        parameters={"rows": "list[dict]", "columns": "list[string]"},
+        returns={"rows": "list[dict]", "columns": "list[string]", "count": "int"},
+    ),
 ]
 # ==============================================================================
         parameters={"text": "string", "top_k": "int"},
         returns={"keywords": "list[string]", "scores": "list[float]"},
     ),
+    ToolDefinition(
+        name="analysis.infer_schema",
+        description="Infer field types and nullability from extracted rows",
+        category=PluginCategory.ANALYSIS,
+        parameters={"rows": "list[dict]"},
+        returns={"schema": "dict[string, dict]", "columns": "list[string]"},
+    ),
+    ToolDefinition(
+        name="analysis.score_relevance",
+        description="Score row relevance against user query/instructions",
+        category=PluginCategory.ANALYSIS,
+        parameters={"rows": "list[dict]", "query": "string"},
+        returns={"rows": "list[dict]", "count": "int"},
+    ),
 ]
 # ==============================================================================
         parameters={"text": "string", "platforms": "list[string]"},
         returns={"handles": "dict[string, list]", "count": "int"},
     ),
+    ToolDefinition(
+        name="extract.top_n",
+        description="Select top N rows from extracted dataset",
+        category=PluginCategory.EXTRACTION,
+        parameters={"rows": "list[dict]", "n": "int", "sort_field": "string"},
+        returns={"rows": "list[dict]", "count": "int"},
+    ),
 ]
 # ==============================================================================
         parameters={"data": "any", "schema": "dict"},
         returns={"valid": "bool", "errors": "list[string]"},
     ),
+    ToolDefinition(
+        name="validate.data_completeness",
+        description="Score completeness of extracted rows against required fields",
+        category=PluginCategory.VALIDATION,
+        parameters={"rows": "list[dict]", "fields": "list[string]"},
+        returns={"score": "float", "missing_counts": "dict[string, int]", "fields": "list[string]"},
+    ),
+    ToolDefinition(
+        name="validate.row_signal",
+        description="Estimate quality signal of extracted rows",
+        category=PluginCategory.VALIDATION,
+        parameters={"rows": "list[dict]"},
+        returns={"signal": "float", "completeness": "float", "uniqueness": "float"},
+    ),
 ]
 # ==============================================================================