Spaces:

NeerajCodz
/

scrapeRL

Sleeping

File size: 159,081 Bytes

"""Scraping endpoints with SSE and websocket live updates."""

from __future__ import annotations

import asyncio
import csv
import io
import json
import logging
import os
import re
import shutil
import tempfile
import time
import uuid
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Any, AsyncGenerator
from urllib.error import HTTPError, URLError
from urllib.parse import quote_plus, urljoin, urlparse
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
from fastapi import APIRouter, BackgroundTasks, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field

from app.config import Settings
from app.api.deps import (
    MemoryManagerDep,
    SettingsDep,
    get_model_router,
    create_environment,
    remove_environment,
)
from app.models.router import SmartModelRouter, TaskType
from app.api.routes.plugins import PLUGIN_REGISTRY
from app.api.routes.websocket import get_connection_manager
from app.core.action import Action, ActionType
from app.memory.manager import MemoryManager, MemoryType
from app.plugins.python_sandbox import (
    DEFAULT_ANALYSIS_CODE,
    SandboxExecutionResult,
    execute_python_sandbox,
)
from app.search.engine import SearchEngineRouter
from app.search.providers.duckduckgo import DuckDuckGoProvider
from app.sites import match_site_template, serialize_site_template

# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)
# FastAPI router for this module; routes attached to it get the "/scrape" prefix
# and the "Scraping" tag.
router = APIRouter(prefix="/scrape", tags=["Scraping"])


def parse_html(html: str) -> BeautifulSoup:
    """Build a BeautifulSoup tree from raw markup using the ``html.parser`` backend."""
    soup = BeautifulSoup(html, "html.parser")
    return soup


class OutputFormat(str, Enum):
    """Supported output formats.

    Members also subclass ``str``, so each one compares equal to its
    lowercase string value.
    """

    JSON = "json"  # rendered with json.dumps by format_output
    CSV = "csv"  # rendered as comma-separated text by format_output
    MARKDOWN = "markdown"  # rendered as a headed bullet list by format_output
    TEXT = "text"  # no dedicated branch in format_output; falls through to "key: value" lines


class TaskComplexity(str, Enum):
    """Task complexity levels.

    Read by ``_extract_fields_for_complexity`` to decide how many
    extraction fields are requested.
    """

    LOW = "low"  # base fields only
    MEDIUM = "medium"  # base fields plus meta/images/data
    HIGH = "high"  # everything in MEDIUM plus scripts/forms/tables


class ScrapeRequest(BaseModel):
    """Request model for scraping.

    ``assets`` and ``instructions`` are required (declared with ``...``);
    every other field has a default.
    """

    # Required inputs.
    assets: list[str] = Field(..., description="List of URLs or asset identifiers")
    instructions: str = Field(..., description="Scraping instructions")
    # Output shaping.
    output_instructions: str = Field(
        default="Return as JSON",
        description="Output format instructions",
    )
    output_format: OutputFormat = Field(
        default=OutputFormat.JSON,
        description="Desired output format",
    )
    complexity: TaskComplexity = Field(
        default=TaskComplexity.MEDIUM,
        description="Task complexity",
    )
    # Session / model selection.
    session_id: str | None = Field(default=None, description="Optional client-provided session ID")
    model: str = Field(default="llama-3.3-70b", description="AI model to use")
    provider: str = Field(default="nvidia", description="AI provider")
    # Feature toggles; list defaults use default_factory so instances never share a list.
    enable_memory: bool = Field(default=True, description="Enable memory features")
    enable_plugins: list[str] = Field(default_factory=list, description="Enabled plugin IDs")
    selected_agents: list[str] = Field(default_factory=list, description="Enabled agent roles/modules")
    max_steps: int = Field(default=50, description="Maximum steps per URL")
    python_code: str | None = Field(
        default=None,
        description="Optional sandboxed Python analysis code (must assign to variable `result`)",
    )


class ScrapeStep(BaseModel):
    """A single step in the scraping process.

    Instances are serialised with ``model_dump()`` by ``_record_step`` and
    appended to the session's ``steps`` list.
    """

    step_number: int  # callers in this file pass a 1-based position in the session's step list
    action: str  # e.g. "tool_call" (set by _create_tool_call_step)
    url: str | None = None
    status: str  # e.g. "running" or "completed"
    message: str  # human-readable summary of the step
    reward: float = 0.0
    extracted_data: dict[str, Any] | None = None
    duration_ms: float | None = None
    timestamp: str  # callers in this file pass the ISO string from _now_iso()


class ScrapeResponse(BaseModel):
    """Final scrape response.

    All fields are required except ``sandbox_artifacts``, which defaults
    to an empty list.
    """

    session_id: str
    status: str
    total_steps: int
    total_reward: float
    extracted_data: dict[str, Any]
    output: str  # presumably the text produced by format_output -- verify against caller
    output_format: OutputFormat
    duration_seconds: float
    urls_processed: int
    errors: list[str]
    enabled_plugins: list[str]
    requested_plugins: list[str]
    selected_agents: list[str]
    memory_enabled: bool
    # default_factory gives each response its own list instance.
    sandbox_artifacts: list[str] = Field(default_factory=list)


# In-process registry of scraping sessions keyed by session ID; written by
# create_session/update_session and cleared per-entry by remove_session.
_active_sessions: dict[str, dict[str, Any]] = {}


def _now_iso() -> str:
    """Return UTC timestamp in ISO format."""

    return datetime.now(timezone.utc).isoformat()


def _sse_event(event: dict[str, Any]) -> str:
    """Serialize a dictionary as one SSE event."""

    return f"data: {json.dumps(event, default=str)}\n\n"


def get_session(session_id: str) -> dict[str, Any] | None:
    """Look up a stored session; yields ``None`` when the ID is unknown."""

    try:
        return _active_sessions[session_id]
    except KeyError:
        return None


def _is_agent_plugin_id(plugin_id: str) -> bool:
    """Check if a plugin id actually belongs to an agent/skill."""

    lowered = plugin_id.lower()
    return lowered.startswith("skill-") or lowered == "web_scraper"


def _resolve_enabled_plugins(
    requested_plugins: list[str],
) -> tuple[list[str], list[str]]:
    """Resolve requested plugin IDs against installed plugin registry."""

    if not requested_plugins:
        return [], []

    available: set[str] = {
        plugin["id"]
        for category_name, category in PLUGIN_REGISTRY.items()
        if category_name != "skills"
        for plugin in category
        if plugin.get("installed")
    }
    unique_requested = list(dict.fromkeys(requested_plugins))
    enabled = [plugin_id for plugin_id in unique_requested if plugin_id in available]
    missing = [
        plugin_id
        for plugin_id in unique_requested
        if plugin_id not in available and not _is_agent_plugin_id(plugin_id)
    ]
    return enabled, missing


def create_session(session_id: str, request: ScrapeRequest, enabled_plugins: list[str]) -> dict[str, Any]:
    """Create and store a scraping session.

    A fresh temporary sandbox directory is created for the session and its
    path is stored under ``"sandbox_dir"``. The session dict is registered in
    ``_active_sessions`` under ``session_id`` (replacing any existing entry).

    Args:
        session_id: Identifier for the session; may be client-provided.
        request: The originating scrape request.
        enabled_plugins: Plugin IDs resolved as enabled for this session.

    Returns:
        The newly created session dictionary.
    """

    # The ID may come from the client, so never splice it raw into a filesystem
    # prefix: a path separator would make mkdtemp fail or point outside the temp
    # root. IDs made only of letters, digits, "_" and "-" (e.g. UUIDs) pass
    # through unchanged apart from trimming of leading/trailing underscores
    # and the 80-character cap.
    safe_id = _safe_artifact_name(session_id)
    sandbox_dir = Path(tempfile.mkdtemp(prefix=f"scraperl-session-{safe_id}-"))
    session = {
        "id": session_id,
        "request": request,
        "status": "running",
        "steps": [],
        "total_reward": 0.0,
        "extracted_data": {},
        "errors": [],
        "start_time": time.time(),
        "current_url_index": 0,
        "enabled_plugins": enabled_plugins,
        "resolved_assets": [],
        "sandbox_dir": str(sandbox_dir),
    }
    _active_sessions[session_id] = session
    return session


def update_session(session_id: str, updates: dict[str, Any]) -> dict[str, Any] | None:
    """Merge *updates* into a stored session.

    Returns the updated session dict, or ``None`` when no session with that
    ID is registered.
    """

    if session_id not in _active_sessions:
        return None
    stored = _active_sessions[session_id]
    stored.update(updates)
    return _active_sessions[session_id]


def remove_session(session_id: str) -> bool:
    """Drop a session and delete its sandbox directory.

    Returns ``True`` when a session was removed, ``False`` when the ID was
    not registered. Sandbox deletion errors are ignored.
    """

    if session_id not in _active_sessions:
        return False

    sandbox_path = _active_sessions[session_id].get("sandbox_dir")
    if sandbox_path:
        # Best-effort cleanup: failures while deleting are suppressed.
        shutil.rmtree(sandbox_path, ignore_errors=True)
    del _active_sessions[session_id]
    return True


def _safe_artifact_name(value: str) -> str:
    """Create a safe artifact filename stem."""

    sanitized = re.sub(r"[^a-zA-Z0-9_-]+", "_", value).strip("_")
    return sanitized[:80] or "artifact"


def _write_session_artifact(session: dict[str, Any], file_name: str, content: str) -> None:
    """Write a text artifact to the session sandbox."""

    sandbox_dir = session.get("sandbox_dir")
    if not sandbox_dir:
        return
    path = Path(sandbox_dir) / file_name
    path.write_text(content, encoding="utf-8")


def _write_session_json_artifact(session: dict[str, Any], file_name: str, data: Any) -> None:
    """Write a JSON artifact to the session sandbox."""

    sandbox_dir = session.get("sandbox_dir")
    if not sandbox_dir:
        return
    path = Path(sandbox_dir) / file_name
    path.write_text(json.dumps(data, indent=2, default=str), encoding="utf-8")


def _list_session_artifacts(session: dict[str, Any]) -> list[str]:
    """List files currently written to the session sandbox."""

    sandbox_dir = session.get("sandbox_dir")
    if not sandbox_dir:
        return []
    base = Path(sandbox_dir)
    if not base.exists():
        return []
    return sorted([file.name for file in base.iterdir() if file.is_file()])


def _create_tool_call_step(
    session: dict[str, Any],
    tool_name: str,
    description: str,
    parameters: dict[str, Any],
    status: str = "running",
    result: dict[str, Any] | None = None,
    reward: float = 0.0,
    url: str | None = None,
) -> dict[str, Any]:
    """Record a ``tool_call`` step on the session and return its event payload.

    The step message is a call-style rendering of the tool and its arguments
    (each argument JSON-encoded and truncated to 40 characters). For a
    completed step that has a non-empty result, the message instead shows a
    preview of the first two result entries, capped at 50 characters.
    """

    def _preview(value: Any) -> str:
        # JSON-encode, then shorten long renderings to 37 chars + ellipsis.
        text = json.dumps(value, default=str)
        if len(text) > 40:
            return f"{text[:37]}..."
        return text

    next_number = len(session.get("steps", [])) + 1

    rendered_args = ", ".join(f"{name}={_preview(arg)}" for name, arg in parameters.items())
    message = f"{tool_name}({rendered_args})"
    if status == "completed" and result:
        leading_pairs = list(result.items())[:2]
        summary = ", ".join(f"{key}={val}" for key, val in leading_pairs)
        message = f"{tool_name}() → {summary[:50]}"

    details: dict[str, Any] = {
        "tool_name": tool_name,
        "tool_description": description,
        "parameters": parameters,
    }
    if result:
        # Only attach the result when there is something to show.
        details["result"] = result

    step = ScrapeStep(
        step_number=next_number,
        action="tool_call",
        url=url,
        status=status,
        message=message,
        reward=reward,
        extracted_data=details,
        timestamp=_now_iso(),
    )
    return _record_step(session, step)


def _record_step(session: dict[str, Any], step: ScrapeStep) -> dict[str, Any]:
    """Store and return a step event payload."""

    payload = step.model_dump()
    session["steps"].append(payload)
    return {"type": "step", "data": payload}


def _csv_escape(value: Any) -> str:
    """Escape one CSV value."""

    text = str(value)
    if any(ch in text for ch in [",", '"', "\n"]):
        text = '"' + text.replace('"', '""') + '"'
    return text


def _rows_to_csv(rows: list[dict[str, Any]], preferred_headers: list[str] | None = None) -> str:
    """Render list-of-dicts rows as CSV text."""

    if not rows:
        return ""
    headers = preferred_headers or list(rows[0].keys())
    lines = [",".join(_csv_escape(h) for h in headers)]
    for row in rows:
        lines.append(",".join(_csv_escape(row.get(h, "")) for h in headers))
    return "\n".join(lines)


def _flatten_for_csv(data: dict[str, Any]) -> tuple[list[str], list[list[str]]]:
    """Flatten extracted dict into CSV headers and rows."""

    if not data:
        return [], []

    if all(isinstance(value, dict) for value in data.values()):
        all_headers = sorted({k for value in data.values() if isinstance(value, dict) for k in value.keys()})
        headers = ["asset", *all_headers]
        rows = []
        for asset, values in data.items():
            value_dict = values if isinstance(values, dict) else {}
            row = [_csv_escape(asset), *[_csv_escape(value_dict.get(key, "")) for key in all_headers]]
            rows.append(row)
        return headers, rows

    headers = ["key", "value"]
    rows = [[_csv_escape(k), _csv_escape(v)] for k, v in data.items()]
    return headers, rows


async def format_output(data: dict[str, Any], output_format: OutputFormat, _instructions: str) -> str:
    """Format extracted data based on requested output format.

    Args:
        data: Extracted data to render.
        output_format: Target format. JSON, CSV and MARKDOWN have dedicated
            renderers; anything else falls through to ``key: value`` lines.
        _instructions: Currently unused.

    Returns:
        The rendered text.

    CSV rendering tries, in order: a pre-rendered ``csv_output`` entry, a
    ``rows`` list of dicts (optionally ordered by a ``columns`` list), and
    finally a flattening of the dict via ``_flatten_for_csv``.
    """

    if output_format == OutputFormat.JSON:
        return json.dumps(data, indent=2, default=str)

    if output_format == OutputFormat.CSV:
        # A pre-formatted CSV payload wins over everything else.
        if isinstance(data, dict) and "csv_output" in data:
            return data["csv_output"]

        # Tabular payload: a list of row dicts, optionally with explicit columns.
        rows = data.get("rows") if isinstance(data, dict) else None
        if isinstance(rows, list) and all(isinstance(row, dict) for row in rows):
            columns = data.get("columns")
            preferred_headers = columns if isinstance(columns, list) else None
            return _rows_to_csv(rows, preferred_headers=preferred_headers)

        headers, flat_rows = _flatten_for_csv(data)
        if not headers:
            return ""
        # _flatten_for_csv escapes row cells but returns raw headers, so escape
        # them here; otherwise a key containing a comma/quote/newline corrupts
        # the header line.
        csv_lines = [",".join(_csv_escape(header) for header in headers)]
        csv_lines.extend(",".join(row) for row in flat_rows)
        return "\n".join(csv_lines)

    if output_format == OutputFormat.MARKDOWN:
        md_lines: list[str] = ["# Extracted Data", ""]
        for key, value in data.items():
            md_lines.append(f"## {key}")
            if isinstance(value, dict):
                md_lines.extend(f"- **{sub_key}**: {sub_value}" for sub_key, sub_value in value.items())
            elif isinstance(value, list):
                md_lines.extend(f"- {item}" for item in value)
            else:
                md_lines.append(f"- {value}")
            md_lines.append("")
        return "\n".join(md_lines)

    # Plain-text fallback (covers OutputFormat.TEXT).
    return "\n".join(f"{key}: {value}" for key, value in data.items())


def _extract_fields_for_complexity(complexity: TaskComplexity) -> list[str]:
    """Choose the extraction field names for a given complexity level.

    Every level gets the base fields; MEDIUM and HIGH add meta/images/data,
    and HIGH additionally adds scripts/forms/tables.
    """

    # Base fields only; the planner is expected to navigate toward the goal.
    baseline = ["title", "content", "links"]
    if complexity == TaskComplexity.HIGH:
        return [*baseline, "meta", "images", "data", "scripts", "forms", "tables"]
    if complexity == TaskComplexity.MEDIUM:
        return [*baseline, "meta", "images", "data"]
    return baseline


def _plan_from_site_template(
    site_template: Any,
    strategy_override: str | None = None,
    extraction_goal_override: str | None = None,
) -> dict[str, Any]:
    """Build a navigation plan from a matched site template."""

    target_urls = list(site_template.target_urls) if site_template.target_urls else []
    if not target_urls and site_template.domains:
        target_urls = [f"https://{site_template.domains[0]}"]

    return {
        "strategy": strategy_override or "intelligent_exploration",
        "target_urls": target_urls,
        "navigation_steps": list(site_template.navigation_steps) or [
            "Navigate to site and identify relevant sections",
            "Extract structured fields aligned with instructions",
        ],
        "extraction_goal": extraction_goal_override or site_template.extraction_goal,
        "output_fields": list(site_template.output_fields),
        "site_template_id": site_template.site_id,
        "site_template_name": site_template.name,
        "site_template_domains": list(site_template.domains),
    }


def _create_intelligent_navigation_plan(instructions: str, assets: list[str]) -> dict[str, Any]:
    """Derive a navigation plan from the user's instructions and assets.

    A matched site template takes priority (with GitHub- and Reddit-specific
    "trending" strategies when the instructions ask for them). Without a
    template, keyword heuristics choose between news extraction, open-ended
    exploration, and plain single-page extraction.

    NOTE(review): the returned dicts do not share one key set -- the news and
    exploration plans carry no ``site_template_*`` keys, and the exploration
    and single-page plans carry no ``output_fields``. Confirm callers read
    these keys defensively.
    """

    text = instructions.lower()
    template = match_site_template(instructions, assets)

    if template:
        # GitHub: flexible detection of "trending / top repositories" requests.
        if template.site_id == "github":
            mentions_repo = "repo" in text
            wants_trending_repos = (
                "trending" in text
                or "this week" in text
                or "this month" in text
                or ("top" in text and (mentions_repo or "project" in text))
                or (mentions_repo and any(word in text for word in ("best", "popular", "today")))
            )
            if wants_trending_repos:
                return _plan_from_site_template(
                    template,
                    strategy_override="github_trending",
                    extraction_goal_override="trending_repositories",
                )

        # Reddit: requests about trending/popular communities.
        if template.site_id == "reddit":
            community_tokens = ("trending", "popular", "community", "communities", "subreddit", "subreddits")
            for token in community_tokens:
                if token in text:
                    return _plan_from_site_template(
                        template,
                        strategy_override="reddit_trending",
                        extraction_goal_override="trending_communities",
                    )

        # Any other matched template: use its plan unmodified.
        return _plan_from_site_template(template)

    # News articles detection
    if any(word in text for word in ("news", "article", "headline")):
        return {
            "strategy": "news_extraction",
            "navigation_steps": [
                "Navigate to main news page",
                "Extract article headlines and summaries",
                "Follow article links if needed",
            ],
            "extraction_goal": "news_articles",
            "output_fields": ["headline", "summary", "publish_date", "author"],
        }

    # General search/exploration
    if any(word in text for word in ("search", "find", "explore", "all")):
        return {
            "strategy": "intelligent_exploration",
            "navigation_steps": [
                "Analyze main page for relevant navigation",
                "Follow relevant links based on instructions",
                "Extract data according to specified format",
            ],
            "extraction_goal": "custom_exploration",
        }

    # Default single-page extraction
    return {
        "strategy": "single_page",
        "navigation_steps": ["Extract content from provided URL"],
        "extraction_goal": "basic_extraction",
        "site_template_id": None,
        "site_template_name": None,
        "site_template_domains": [],
    }


def _is_url_asset(asset: str) -> bool:
    """Report whether *asset* can be normalised into an http(s) URL."""

    normalized = _coerce_url_asset(asset)
    return normalized is not None


def _looks_like_host(host: str) -> bool:
    """Return True when host resembles a real domain, localhost, or IPv4."""

    lowered = host.lower()
    if lowered == "localhost":
        return True

    if re.match(r"^\d{1,3}(?:\.\d{1,3}){3}$", lowered):
        return True

    return bool(re.match(r"^(?:[a-z0-9-]+\.)+[a-z]{2,63}$", lowered))


def _coerce_url_asset(asset: str) -> str | None:
    """Normalize URL-like asset strings (supports bare domains such as github.com)."""

    candidate = asset.strip()
    if not candidate or any(ch.isspace() for ch in candidate):
        return None

    normalized = candidate
    if not re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*://", normalized):
        normalized = f"https://{normalized}"

    parsed = urlparse(normalized)
    if parsed.scheme not in {"http", "https"} or not parsed.netloc:
        return None

    host = (parsed.hostname or "").strip().lower()
    if not host or not _looks_like_host(host):
        return None

    return normalized


def _discover_assets_for_query(query: str) -> list[str]:
    """Resolve non-URL query assets using deterministic query-aware fallbacks."""

    query_l = query.lower()
    if "gold" in query_l and ("price" in query_l or "trend" in query_l):
        return [
            "https://raw.githubusercontent.com/datasets/gold-prices/master/data/monthly.csv",
            "https://github.com/datasets/gold-prices",
        ]
    encoded = quote_plus(query)
    # r.jina.ai provides a static, text-friendly rendering of dynamic search pages.
    return [f"https://r.jina.ai/http://duckduckgo.com/?q={encoded}"]


def _fetch_text_render_markdown(url: str, timeout_seconds: int = 12) -> tuple[str, str] | None:
    """Download a text rendering of *url* through the text-render proxy.

    Returns ``(markdown, proxy_url)`` when the proxy responds with non-blank
    content. Returns ``None`` when the response is blank or the fetch fails
    with an HTTP, URL, timeout or value error (logged at debug level).
    """

    target = _coerce_url_asset(url) or url
    if "://" not in target:
        target = "https://" + target
    proxy_url = _apply_text_render_proxy(target, force=True)

    # Browser-like headers; plain text/markdown responses are preferred.
    browser_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "text/plain,text/markdown,*/*",
    }
    proxied_request = Request(proxy_url, headers=browser_headers)

    try:
        with urlopen(proxied_request, timeout=timeout_seconds) as response:
            raw = response.read()
        text = raw.decode("utf-8", errors="replace")
    except (HTTPError, URLError, TimeoutError, ValueError) as error:
        logger.debug("Text-render fallback fetch failed for %s: %s", proxy_url, error)
        return None

    if not text.strip():
        return None
    return text, proxy_url


async def _search_urls_with_mcp(query: str, max_results: int = 6) -> list[str]:
    """Use MCP search provider to discover URLs for non-URL assets.

    Args:
        query: Free-text search query.
        max_results: Maximum number of results requested from the provider.

    Returns:
        De-duplicated result URLs in provider order. Results that are not
        valid URL assets, or that contain ``example.com``, are dropped.
        Returns an empty list (and logs a warning) when the search fails.
    """

    # Named to avoid shadowing the module-level FastAPI ``router``.
    search_router = SearchEngineRouter()
    search_router.register_provider("duckduckgo", DuckDuckGoProvider(), set_default=True)

    try:
        await search_router.initialize()
        results = await search_router.search(query=query, max_results=max_results, provider="duckduckgo")
        urls: list[str] = []
        for result in results:
            # Results may be objects exposing ``.url`` or plain mappings.
            raw_url = result.url if hasattr(result, "url") else result.get("url", "")
            # Stringify once so filtering and de-duplication compare like with like.
            url = str(raw_url)
            if not _is_url_asset(url) or "example.com" in url:
                continue
            if url not in urls:
                urls.append(url)
        return urls
    except Exception as error:
        # Discovery is best-effort, but the failure should be visible in logs.
        logger.warning("MCP URL discovery failed for query %r: %s", query, error)
        return []
    finally:
        await search_router.shutdown()


def _build_recovery_queries(base_url: str, instructions: str | None) -> list[str]:
    """Assemble fallback search queries for when extraction was low-relevance.

    Up to three queries are produced, in order: ``"<host> <instructions>"``,
    the instructions alone, and ``"<host> latest trending top"``. Blank and
    duplicate entries are removed while preserving order.
    """

    resolved = _coerce_url_asset(base_url) or base_url
    if "://" not in resolved:
        resolved = f"https://{resolved}"
    host = (urlparse(resolved).hostname or "").lower()
    goal = (instructions or "").strip()

    drafts: list[str] = []
    if host and goal:
        drafts.append(f"{host} {goal}")
    if goal:
        drafts.append(goal)
    if host:
        drafts.append(f"{host} latest trending top")

    # An insertion-ordered dict doubles as an ordered set for de-duplication.
    unique: dict[str, None] = {}
    for draft in drafts:
        cleaned = draft.strip()
        if cleaned:
            unique.setdefault(cleaned, None)
    return list(unique)


def _extract_markdown_link_rows(
    markdown: str,
    source_url: str,
    output_instructions: str | None,
    instructions: str | None,
    row_limit: int,
) -> list[dict[str, Any]]:
    """Extract rows from markdown content using link patterns and line analysis."""
    
    columns = _requested_columns_from_output_instructions(output_instructions) or ["title", "link", "content"]
    keywords = _instruction_keywords(instructions, max_keywords=8)
    
    # Boilerplate patterns to filter out
    boilerplate_labels = {
        "home", "about", "contact", "contact us", "help", "search", "press",
        "copyright", "creator", "creators", "advertise", "developers", "terms",
        "privacy", "policy & safety", "sign in", "log in", "sign up", "register",
        "settings", "report history", "send feedback", "learn more", "more info",
        "test new features", "how youtube works", "nfl sunday ticket", "shorts",
        "subscriptions", "you", "playlist", "now playing", "skip navigation",
    }
    boilerplate_url_tokens = (
        "privacy", "terms", "cookie", "contact", "advertis", "copyright",
        "policy", "press", "help", "about/", "/t/", "legal", "support",
        "feedback", "settings", "account", "login", "signin", "signup",
        "ServiceLogin", "accounts.google.com",
    )
    
    candidate_rows: list[tuple[int, dict[str, Any]]] = []
    seen_titles: set[str] = set()
    seen_links: set[str] = set()
    
    # Patterns for extracting content
    # Match markdown links like [Title](URL) but NOT image links like ![Image](URL)
    # URL ends at first space, quote, or closing paren
    content_link_pattern = re.compile(r'(?<!!)\[([^\]]+)\]\((https?://[^\s"\)]+)')
    # Match complex links with embedded images: [![Image](img_url) Text](link_url)
    # This captures the text after the image and the final link
    complex_link_pattern = re.compile(r'\[!\[Image[^\]]*\]\([^\)]+\)\s*([^\]]+)\]\((https?://[^\s"\)]+)\)')
    # Match view/viewer/point counts anywhere (including "47.2K viewers", "787 points" format)
    views_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*(?:views?|viewers?|points?)', re.IGNORECASE)
    likes_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*likes?', re.IGNORECASE)
    comments_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*comments?', re.IGNORECASE)
    date_pattern = re.compile(r'\b(today|yesterday|\d+\s+(?:minutes?|hours?|days?|weeks?|months?|years?)\s+ago)\b', re.IGNORECASE)
    
    # Extract view counts from the entire document first, map them by line number
    lines = markdown.split('\n')
    line_views: dict[int, str] = {}
    for i, line in enumerate(lines):
        view_match = views_pattern.search(line)
        if view_match:
            line_views[i] = view_match.group(1)
    
    def get_nearby_metrics(line_idx: int, window: int = 5) -> dict[str, str]:
        """Get metrics from nearby lines."""
        metrics = {"views": "", "likes": "", "comments": "", "date": ""}
        for offset in range(-window, window + 1):
            check_idx = line_idx + offset
            if 0 <= check_idx < len(lines):
                check_line = lines[check_idx]
                if not metrics["views"]:
                    m = views_pattern.search(check_line)
                    if m:
                        metrics["views"] = m.group(1)
                if not metrics["likes"]:
                    m = likes_pattern.search(check_line)
                    if m:
                        metrics["likes"] = m.group(1)
                if not metrics["comments"]:
                    m = comments_pattern.search(check_line)
                    if m:
                        metrics["comments"] = m.group(1)
                if not metrics["date"]:
                    m = date_pattern.search(check_line)
                    if m:
                        metrics["date"] = m.group(1)
        return metrics
    
    # Process each line
    for i, line in enumerate(lines):
        line = line.strip()
        if not line or len(line) < 15:
            continue
        
        lowered_line = line.lower()
        
        # Skip pure navigation/boilerplate lines
        if any(label == lowered_line for label in boilerplate_labels):
            continue
        
        # First check for complex links (like Twitch format: [![Image](url) StreamerName Game Live 22K viewers](channel_url))
        complex_match = complex_link_pattern.search(line)
        if complex_match:
            embedded_text = complex_match.group(1).strip()
            link = complex_match.group(2).strip()
            
            # Parse embedded text: "StreamerName Game Live 22K 22K viewers Use the..."
            # Remove "Use the..." suffix and Hype Train info
            embedded_text = re.sub(r'\s*Use the.*$', '', embedded_text)
            embedded_text = re.sub(r'\s*Hype Train.*$', '', embedded_text, flags=re.IGNORECASE)
            # Extract viewer count first
            viewer_match = views_pattern.search(embedded_text)
            viewers = viewer_match.group(1) if viewer_match else ""
            # Remove ALL occurrences of viewer count patterns, including orphan "ers"/"viewers"
            name_game = re.sub(r'\d+(?:[.,]\d+)?[KkMmBb]?\s*(?:views?|viewers?|ers)?', '', embedded_text, flags=re.IGNORECASE)
            # Remove standalone "ers" or "viewers" that might remain
            name_game = re.sub(r'\b(?:ers|viewers?|views?)\b', '', name_game, flags=re.IGNORECASE)
            # Remove "Live" 
            name_game = re.sub(r'\bLive\b', '', name_game, flags=re.IGNORECASE)
            # Collapse whitespace
            name_game = re.sub(r'\s+', ' ', name_game).strip()
            
            # Split into name and game (heuristic: first word is name, rest is game)
            parts = name_game.split(maxsplit=1)
            streamer_name = parts[0] if parts else ""
            game = parts[1].strip() if len(parts) > 1 else ""
            
            if streamer_name and link:
                link_normalized = link.split('?')[0]
                if link_normalized not in seen_links:
                    seen_links.add(link_normalized)
                    
                    row: dict[str, Any] = {}
                    for col in columns:
                        lower_col = col.lower()
                        if lower_col in {"url", "link", "href", "channel"}:
                            row[col] = link
                        elif lower_col in {"title", "name", "streamer_name", "streamer", "username"}:
                            row[col] = streamer_name
                        elif lower_col in {"game", "category", "playing"}:
                            row[col] = game
                        elif lower_col in {"views", "view_count", "viewers", "viewer_count"}:
                            row[col] = viewers
                        else:
                            row[col] = ""
                    
                    # Streams with viewers are highly relevant
                    score = 5 if viewers else 2
                    candidate_rows.append((score, row))
                    continue  # Move to next line
        
        # Find content links (not images)
        for match in content_link_pattern.finditer(line):
            title = match.group(1).strip()
            link = match.group(2).strip()
            
            # Skip image references in title
            if title.startswith("Image ") or title.startswith("!["):
                continue
            
            # Skip very short titles (likely navigation)
            if len(title) < 5:
                continue
                
            # Skip boilerplate titles
            title_lower = title.lower()
            if title_lower in boilerplate_labels:
                continue
            
            # Skip titles that are just "#### Something" headers without real content
            clean_title = re.sub(r'^#+\s*', '', title).strip()
            if not clean_title or len(clean_title) < 5:
                continue
            
            # Skip if already seen this title or link
            title_normalized = clean_title.lower()[:50]
            link_normalized = link.split('?')[0]  # Remove query params for dedup
            if title_normalized in seen_titles:
                continue
            if link_normalized in seen_links and "watch" in link.lower():
                continue
            
            # Skip boilerplate URLs
            if any(token in link.lower() for token in boilerplate_url_tokens):
                continue
            
            # Get metrics from nearby lines
            metrics = get_nearby_metrics(i)
            
            # Calculate relevance score
            score_text = f"{clean_title} {link}".lower()
            keyword_score = sum(1 for kw in keywords if kw in score_text)
            has_content_marker = any([
                "video" in score_text,
                "music" in score_text,
                "official" in score_text,
                metrics["views"],
                metrics["likes"],
                "watch" in link.lower(),
            ])
            
            # Skip if no keyword match and no content markers
            if keywords and keyword_score == 0 and not has_content_marker:
                continue
            
            # Build row
            row: dict[str, Any] = {}
            for col in columns:
                lower_col = col.lower()
                if lower_col in {"url", "link", "href"}:
                    row[col] = link
                elif lower_col in {"title", "name", "text"}:
                    row[col] = clean_title[:160]
                elif lower_col in {"content", "summary", "description"}:
                    row[col] = clean_title[:320]
                elif lower_col in {"views", "view_count", "viewers", "points", "score", "upvotes"}:
                    row[col] = metrics["views"]
                elif lower_col in {"likes", "like_count"}:
                    row[col] = metrics["likes"]
                elif lower_col in {"comments", "comment_count"}:
                    row[col] = metrics["comments"]
                elif lower_col in {"date", "date_uploaded", "date_uplaoded", "published", "uploaded"}:
                    row[col] = metrics["date"]
                else:
                    row[col] = ""
            
            # Track seen items
            seen_titles.add(title_normalized)
            seen_links.add(link_normalized)
            
            # Calculate final score for ranking
            quality_score = keyword_score
            if metrics["views"]:
                quality_score += 3
            if metrics["likes"] or metrics["comments"]:
                quality_score += 1
            if "official" in title_lower:
                quality_score += 1
            if "watch" in link.lower():
                quality_score += 1
            
            candidate_rows.append((quality_score, row))
    
    # Also look for standalone lines with view counts (sometimes titles are separate from links)
    # But only if we haven't found enough rows with proper links
    if len(candidate_rows) < row_limit:
        for i, views in line_views.items():
            if i > 0 and len(candidate_rows) < row_limit * 2:
                prev_line = lines[i - 1].strip()
                # Check if previous line might be a title
                if len(prev_line) > 20 and not prev_line.startswith("![") and not prev_line.startswith("http"):
                    title_normalized = prev_line.lower()[:50]
                    if title_normalized not in seen_titles:
                        # Look for a nearby link
                        nearby_link = None
                        for offset in range(-3, 4):
                            check_idx = i + offset
                            if 0 <= check_idx < len(lines):
                                link_match = content_link_pattern.search(lines[check_idx])
                                if link_match and "watch" in link_match.group(2).lower():
                                    nearby_link = link_match.group(2)
                                    break
                        
                        if nearby_link:  # Only add if we found a real link
                            row = {}
                            for col in columns:
                                lower_col = col.lower()
                                if lower_col in {"title", "name", "text"}:
                                    row[col] = prev_line[:160]
                                elif lower_col in {"views", "view_count", "viewers"}:
                                    row[col] = views
                                elif lower_col in {"url", "link", "href"}:
                                    row[col] = nearby_link
                                else:
                                    row[col] = ""
                            seen_titles.add(title_normalized)
                            candidate_rows.append((2, row))  # Lower score for these
    
    # Sort by score (higher is better) and filter out items without views when we have enough with views
    candidate_rows.sort(key=lambda x: x[0], reverse=True)
    
    # Prefer rows with views
    with_views = [(score, row) for score, row in candidate_rows if row.get("views") or row.get("view_count")]
    without_views = [(score, row) for score, row in candidate_rows if not (row.get("views") or row.get("view_count"))]
    
    result = []
    for _, row in with_views[:row_limit]:
        result.append(row)
    
    # Fill remaining slots with rows without views
    remaining = row_limit - len(result)
    if remaining > 0:
        for _, row in without_views[:remaining]:
            result.append(row)
    
    return result


def _extract_rows_from_text_render(
    markdown: str,
    source_url: str,
    output_instructions: str | None,
    instructions: str | None,
    row_limit: int,
) -> tuple[list[dict[str, Any]], list[str]]:
    """Pull tabular rows out of text-rendered markdown.

    Two strategies are tried in order:

    1. The dedicated markdown link extractor.
    2. Generated fallback extraction code, executed against the markdown
       parsed as HTML.

    Returns:
        ``(rows, columns)`` where ``rows`` is capped at ``row_limit``.
    """
    default_columns = _requested_columns_from_output_instructions(output_instructions)
    if not default_columns:
        default_columns = ["title", "link", "content"]

    # Strategy 1: markdown-native extraction.
    link_rows = _extract_markdown_link_rows(
        markdown=markdown,
        source_url=source_url,
        output_instructions=output_instructions,
        instructions=instructions,
        row_limit=row_limit,
    )
    if _rows_have_signal(link_rows):
        projected, _ = _enforce_requested_schema(link_rows, output_instructions)
        return projected[:row_limit], default_columns

    # Strategy 2: run the generated extraction snippet with a prepared namespace.
    # NOTE(review): the snippet runs in-process via exec(); presumably the code
    # from _fallback_extraction_code is trusted -- confirm.
    snippet = _fallback_extraction_code(output_instructions, instructions)
    namespace: dict[str, Any] = {
        "soup": BeautifulSoup(markdown, "html.parser"),
        "html": markdown,
        "url": source_url,
        "re": re,
        "urljoin": urljoin,
        "urlparse": urlparse,
        "BeautifulSoup": BeautifulSoup,
        "extracted_data": [],
    }
    try:
        exec(snippet, namespace)
        harvested = namespace.get("extracted_data", [])
    except Exception as error:
        # Best-effort: a failing snippet simply yields no rows.
        logger.debug("Fallback text-render extraction failed for %s: %s", source_url, error)
        harvested = []

    # The snippet may leave a single object instead of a list.
    if not isinstance(harvested, list):
        harvested = [harvested] if harvested else []

    rows, schema_columns = _enforce_requested_schema(harvested, output_instructions)
    return rows[:row_limit], schema_columns or default_columns


async def _search_recovery_rows(
    base_url: str,
    instructions: str | None,
    output_instructions: str | None,
    row_limit: int,
) -> tuple[list[dict[str, Any]], list[str], str | None, float]:
    """Recover rows for a low-relevance extraction by probing other pages.

    The user's own site is tried first (up to four inferred paths on the
    same host). External search is used only if no on-site candidate
    scores above 0.25.

    Returns:
        ``(rows, columns, source_url, relevance_score)`` for the best
        candidate seen; empty/``None``/``0.0`` values when nothing usable
        was found.
    """
    best_rows: list[dict[str, Any]] = []
    best_columns: list[str] = []
    best_source: str | None = None
    best_score = 0.0

    def _consider(candidate_url: str) -> None:
        """Fetch one candidate page and keep it if it beats the current best."""
        nonlocal best_rows, best_columns, best_source, best_score

        payload = _fetch_text_render_markdown(candidate_url, timeout_seconds=12)
        if not payload:
            return
        markdown, source_url = payload
        rows, columns = _extract_rows_from_text_render(
            markdown=markdown,
            source_url=source_url,
            output_instructions=output_instructions,
            instructions=instructions,
            row_limit=row_limit,
        )
        if not _rows_have_signal(rows):
            return

        score = _rows_relevance_score(rows, instructions)
        strictly_better = score > best_score
        # On an (almost) equal score, the candidate with more rows wins.
        tie_with_more_rows = abs(score - best_score) <= 0.0001 and len(rows) > len(best_rows)
        if strictly_better or tie_with_more_rows:
            best_rows, best_columns = rows, columns
            best_source, best_score = source_url, score

    # Bring the base URL into absolute form so scheme/host can be reused.
    site_url = _coerce_url_asset(base_url) or base_url
    if "://" not in site_url:
        site_url = f"https://{site_url}"
    site = urlparse(site_url)

    # Phase 1: stay on the user's domain.
    for path in _infer_navigation_paths(instructions)[:4]:
        _consider(f"{site.scheme}://{site.netloc}{path}")

    if best_score > 0.25:
        return best_rows, best_columns, best_source, best_score

    # Phase 2: last resort -- external search, at most two queries and
    # three result pages per query.
    for query in _build_recovery_queries(base_url, instructions)[:2]:
        found_urls = await _search_urls_with_mcp(query, max_results=5)
        if not found_urls:
            found_urls = _discover_assets_for_query(query)
        for found_url in found_urls[:3]:
            _consider(found_url)

    return best_rows, best_columns, best_source, best_score


async def _discover_reddit_communities_via_search(limit: int = 25) -> list[dict[str, Any]]:
    """Collect subreddit rows by mining search results for ``reddit.com/r/<name>`` URLs.

    Listing pseudo-communities (``popular``, ``all``, ...) are skipped and
    names are de-duplicated case-insensitively. Returns as soon as
    ``limit`` rows have been gathered.
    """
    search_queries = (
        "site:reddit.com/r popular communities",
        "reddit popular subreddits list",
        "best reddit communities technology",
    )
    reserved_names = {"popular", "all", "announcements", "new", "top", "best"}
    subreddit_pattern = re.compile(r"reddit\.com/r/([A-Za-z0-9_]+)/?", flags=re.IGNORECASE)

    known: set[str] = set()
    found: list[dict[str, Any]] = []

    for search_query in search_queries:
        result_urls = await _search_urls_with_mcp(search_query, max_results=18)
        for result_url in result_urls:
            hit = subreddit_pattern.search(result_url)
            if hit is None:
                continue

            name = hit.group(1)
            key = name.lower()
            if key in reserved_names or key in known:
                continue
            known.add(key)

            label = f"r/{name}"
            found.append(
                {
                    "subreddit": label,
                    "title": label,
                    "subscribers": 0,
                    "active_users": 0,
                    "url": f"https://www.reddit.com/r/{name}/",
                    "description": "Discovered via search fallback",
                }
            )
            if len(found) >= limit:
                return found

    return found


def _fallback_reddit_communities_static(limit: int = 25) -> list[dict[str, Any]]:
    """Provide deterministic Reddit community rows when direct fetch is unavailable."""

    names = [
        "AskReddit",
        "funny",
        "gaming",
        "worldnews",
        "todayilearned",
        "science",
        "movies",
        "technology",
        "pics",
        "news",
        "aww",
        "sports",
        "Music",
        "books",
        "food",
        "dataisbeautiful",
        "MachineLearning",
        "programming",
        "python",
        "javascript",
        "learnprogramming",
        "wallstreetbets",
        "explainlikeimfive",
        "history",
        "space",
    ]
    rows: list[dict[str, Any]] = []
    for name in names[:limit]:
        rows.append(
            {
                "subreddit": f"r/{name}",
                "title": f"r/{name}",
                "subscribers": 0,
                "active_users": 0,
                "url": f"https://www.reddit.com/r/{name}/",
                "description": "Static fallback community entry",
            }
        )
    return rows


def _fetch_reddit_communities(limit: int = 25) -> tuple[list[dict[str, Any]], str]:
    """Return static fallback community rows plus the label ``"static_fallback"``.

    Kept as a separate seam so tests and optional monkeypatch overrides
    have a single function to replace.
    """
    communities = _fallback_reddit_communities_static(limit)
    provenance = "static_fallback"
    return communities, provenance


async def _resolve_assets(
    assets: list[str],
    enabled_plugins: list[str],
) -> tuple[list[str], list[dict[str, Any]]]:
    """Turn user-supplied assets into a de-duplicated list of URLs to scrape.

    Assets that already look like URLs are used directly. Anything else is
    treated as a search query, and the outcome of that search is recorded.

    Returns:
        ``(urls, discoveries)`` where each discovery is
        ``{"query": ..., "resolved_urls": [...]}``.

    ``enabled_plugins`` is accepted but not read by this function.
    """
    resolved_urls: list[str] = []
    discovery_log: list[dict[str, Any]] = []

    def _remember(found_url: str) -> None:
        # Preserve first-seen order while avoiding duplicates.
        if found_url not in resolved_urls:
            resolved_urls.append(found_url)

    for raw_asset in assets:
        query = raw_asset.strip()
        if not query:
            continue

        direct_url = _coerce_url_asset(query)
        if direct_url:
            _remember(direct_url)
            continue

        # Not a URL: search first, then fall back to local discovery.
        hits: list[str] = await _search_urls_with_mcp(query, max_results=8)
        if not hits:
            hits = _discover_assets_for_query(query)

        for hit in hits or ():
            _remember(hit)
        discovery_log.append({"query": query, "resolved_urls": hits if hits else []})

    return resolved_urls, discovery_log


def _normalize_month(value: Any) -> str | None:
    """Normalize date-like values to YYYY-MM."""

    if value is None:
        return None
    text = str(value).strip()
    if not text:
        return None
    match = re.match(r"^(\d{4})[-/](\d{1,2})", text)
    if not match:
        return None
    year = int(match.group(1))
    month = int(match.group(2))
    if month < 1 or month > 12:
        return None
    return f"{year:04d}-{month:02d}"


def _parse_price(value: Any) -> float | None:
    """Parse a numeric price from text."""

    if value is None:
        return None
    text = str(value).strip().replace(",", "")
    try:
        return float(text)
    except ValueError:
        return None


def _build_gold_dataset_rows(
    extracted_data: dict[str, Any],
    from_month: str = "2016-01",
) -> list[dict[str, Any]]:
    """Build normalized monthly gold-price rows from extracted source data."""

    rows: list[dict[str, Any]] = []
    for source_url, payload in extracted_data.items():
        if not isinstance(payload, dict):
            continue
        data_rows = payload.get("data")
        if not isinstance(data_rows, list):
            continue

        for entry in data_rows:
            if not isinstance(entry, dict):
                continue
            date_value = (
                entry.get("Date")
                or entry.get("date")
                or entry.get("Month")
                or entry.get("month")
            )
            price_value = (
                entry.get("Price")
                or entry.get("price")
                or entry.get("Close")
                or entry.get("close")
                or entry.get("Value")
                or entry.get("value")
            )
            month = _normalize_month(date_value)
            price = _parse_price(price_value)
            if not month or price is None:
                continue
            if month < from_month:
                continue
            rows.append(
                {
                    "month": month,
                    "gold_price_usd": price,
                    "source_link": source_url,
                }
            )

    dedup: dict[str, dict[str, Any]] = {}
    for row in rows:
        dedup[row["month"]] = row
    ordered = [dedup[key] for key in sorted(dedup.keys())]
    return ordered


def _should_run_python_sandbox(request: ScrapeRequest, extracted_data: dict[str, Any]) -> bool:
    """Decide whether sandbox analysis should run for current scrape output."""

    if request.python_code:
        return True
    if not isinstance(extracted_data, dict) or not extracted_data:
        return False

    if isinstance(extracted_data.get("rows"), list) and len(extracted_data.get("rows", [])) > 0:
        return True

    for value in extracted_data.values():
        if not isinstance(value, dict):
            continue
        if isinstance(value.get("data"), list) and len(value.get("data", [])) > 0:
            return True
        if isinstance(value.get("tables"), list) and len(value.get("tables", [])) > 0:
            return True

    return False


async def _store_url_memory(
    session_id: str,
    url: str,
    extracted: dict[str, Any],
    memory_manager: MemoryManager,
) -> None:
    """Persist one URL's extraction result in short-term and long-term memory.

    The short-term entry keeps the raw dict. The long-term entry stores a
    JSON string (non-serialisable values are stringified) together with
    session/url/source metadata.
    """
    key_prefix = f"scrape:{session_id}"

    await memory_manager.store(
        key=f"{key_prefix}:url:{url}",
        value=extracted,
        memory_type=MemoryType.SHORT_TERM,
        tags=["scrape", "url"],
    )

    # Serialised only after the short-term write has completed.
    serialized = json.dumps(extracted, default=str)
    long_term_metadata = {"session_id": session_id, "url": url, "source": "scrape"}
    await memory_manager.store(
        key=f"{key_prefix}:lt:{url}",
        value=serialized,
        memory_type=MemoryType.LONG_TERM,
        metadata=long_term_metadata,
    )


async def scrape_url(
    session: dict[str, Any],
    session_id: str,
    url: str,
    settings: Settings,
    request: ScrapeRequest,
    memory_manager: MemoryManager,
    enabled_plugins: list[str],
) -> AsyncGenerator[dict[str, Any], None]:
    """Scrape a single URL and yield progress events.

    Drives one environment episode: initialize, navigate to ``url``, then
    issue one EXTRACT_FIELD action per field chosen for
    ``request.complexity``. Every step is passed through ``_record_step``
    and its return value is yielded to the caller.

    Args:
        session: Mutable session state; ``session["errors"]`` is appended
            to on failure and the dict is handed to ``_record_step`` and
            ``_write_session_artifact``.
        session_id: Identifier used to build the episode id and task id.
        url: Page to scrape.
        settings: Passed to ``create_environment``.
        request: Supplies ``complexity``, ``max_steps`` and ``instructions``.
        memory_manager: Not referenced in this function body.
        enabled_plugins: Not referenced in this function body.

    Yields:
        The value returned by ``_record_step`` for each step, or an
        ``{"type": "error", "data": {...}}`` dict if an exception occurs.

    NOTE(review): ``extracted`` and ``total_reward`` are accumulated but
    never returned or stored here -- confirm whether callers rely on the
    per-step ``extracted_data`` events instead.
    """

    # Unique per-call id so concurrent scrapes of one session don't collide.
    episode_id = f"{session_id}-{uuid.uuid4().hex[:8]}"

    try:
        env = create_environment(episode_id, settings)
        await env.reset(task_id=f"scrape_{session_id}")

        # Step 0: announce initialization.
        step_num = 0
        yield _record_step(
            session,
            ScrapeStep(
                step_number=step_num,
                action="initialize",
                url=url,
                status="completed",
                message=f"Initialized scraping for {url}",
                timestamp=_now_iso(),
            ),
        )

        # Step 1: navigate to the target URL.
        step_num += 1
        step_start = time.time()
        navigate_action = Action(
            action_type=ActionType.NAVIGATE,
            parameters={"url": url},
            reasoning=f"Navigate to target URL: {url}",
        )
        # env.step returns a 6-tuple; only observation, reward and info are used here.
        nav_observation, reward, _, _, _, nav_info = await env.step(navigate_action)
        nav_result = nav_info.get("action_result", {})
        nav_success = bool(nav_result.get("success"))
        nav_error = nav_result.get("error")
        bypassed_tls = bool(nav_result.get("tls_verification_bypassed"))
        navigate_message = f"Navigated to {url}"
        if bypassed_tls:
            navigate_message = f"{navigate_message} (TLS verification bypassed after certificate failure)"
        yield _record_step(
            session,
            ScrapeStep(
                step_number=step_num,
                action="navigate",
                url=url,
                status="completed" if nav_success else "failed",
                message=navigate_message if nav_success else f"Failed to navigate: {nav_error or 'unknown error'}",
                reward=reward,
                duration_ms=(time.time() - step_start) * 1000,
                timestamp=_now_iso(),
            ),
        )

        # Keep the raw page as a session artifact when HTML came back. Only
        # abort when there is neither HTML nor a successful navigation.
        if nav_observation.page_html:
            source_name = _safe_artifact_name(urlparse(url).netloc or url)
            _write_session_artifact(
                session,
                f"{source_name}_source.txt",
                nav_observation.page_html,
            )
        elif not nav_success:
            session["errors"].append(f"{url}: {nav_error or 'navigation failed'}")
            return

        extracted: dict[str, Any] = {}
        total_reward = reward
        fields_to_extract = _extract_fields_for_complexity(request.complexity)

        # One extract action per field, bounded by the request's step budget.
        for field_name in fields_to_extract:
            if step_num >= request.max_steps:
                break

            step_num += 1
            step_start = time.time()
            # Emit a "running" event before the (possibly slow) extraction.
            yield _record_step(
                session,
                ScrapeStep(
                    step_number=step_num,
                    action="extract",
                    url=url,
                    status="running",
                    message=f"Extracting {field_name}...",
                    timestamp=_now_iso(),
                ),
            )

            extract_action = Action(
                action_type=ActionType.EXTRACT_FIELD,
                parameters={"field_name": field_name},
                reasoning=f"Extract {field_name} using: {request.instructions}",
            )
            observation, reward, _, terminated, truncated, _ = await env.step(extract_action)
            total_reward += reward

            # The observation lists the fields extracted so far; pick out
            # the entry for the field just requested.
            if observation.extracted_so_far:
                for extracted_field in observation.extracted_so_far:
                    if extracted_field.field_name == field_name:
                        extracted[field_name] = extracted_field.value
                        break

            # Same step number as the "running" event, now marked completed.
            yield _record_step(
                session,
                ScrapeStep(
                    step_number=step_num,
                    action="extract",
                    url=url,
                    status="completed",
                    message=f"Extracted {field_name}",
                    reward=reward,
                    extracted_data={field_name: extracted.get(field_name)},
                    duration_ms=(time.time() - step_start) * 1000,
                    timestamp=_now_iso(),
                ),
            )

            # Stop as soon as the environment reports the episode is over.
            if terminated or truncated:
                break

    except Exception as exc:
        # Record the failure on the session and surface it as an error event
        # instead of propagating to the consumer of the generator.
        error_message = f"{url}: {exc}"
        session["errors"].append(error_message)
        logger.exception("Error scraping URL", extra={"url": url, "session_id": session_id})
        yield {
            "type": "error",
            "data": {
                "url": url,
                "error": str(exc),
                "timestamp": _now_iso(),
            },
        }
    finally:
        # Always release the episode's environment, including on early return.
        remove_environment(episode_id)


def _agentic_live_llm_enabled() -> bool:
    """Return True when live LLM calls should be used for agentic planning/extraction."""

    if os.getenv("SCRAPERL_DISABLE_LIVE_LLM") == "1":
        return False
    if os.getenv("PYTEST_CURRENT_TEST"):
        return False
    return True


def _apply_text_render_proxy(url: str, force: bool = False) -> str:
    """Return ``url`` in absolute form, optionally routed through ``r.jina.ai``.

    A missing scheme defaults to ``https://``. URLs that already point at
    the text renderer are returned untouched. Other URLs are wrapped only
    when ``force`` is true.
    """
    target = _coerce_url_asset(url) or url
    if "://" not in target:
        target = f"https://{target}"

    already_proxied = target.startswith(
        ("https://r.jina.ai/http://", "https://r.jina.ai/https://")
    )
    if already_proxied or not force:
        return target

    # The wrapped URL is always written with an http:// inner scheme.
    _, _, remainder = target.partition("://")
    return f"https://r.jina.ai/http://{remainder}"


def _infer_navigation_paths(instructions: str | None) -> list[str]:
    """Infer common navigation paths based on user intent - works generically across sites."""
    
    if not instructions:
        return ["/"]  # Default to homepage
    
    instruction_text = instructions.lower()
    paths: list[str] = []
    
    # Trending/popular intent - common paths across many sites
    # Include "/" (homepage) because many sites show top content on homepage
    if any(token in instruction_text for token in ("trending", "popular", "top", "hot", "best")):
        paths.extend([
            "/",  # Homepage often shows top/trending content (HN, Reddit, etc.)
            "/trending",
            "/popular",
            "/explore",
            "/top",
            "/hot",
            "/discover",
        ])
    
    # Latest/new/recent intent
    if any(token in instruction_text for token in ("latest", "new", "recent", "today")):
        paths.extend([
            "/new",
            "/latest",
            "/recent",
            "/feed/new",
        ])
    
    # Category-specific paths based on content type mentioned
    if "music" in instruction_text or "song" in instruction_text:
        paths.extend(["/feed/trending?bp=4gINGgt5dG1hX2NoYXJ0cw%3D%3D", "/music", "/charts"])
    if "video" in instruction_text:
        paths.extend(["/feed/trending", "/videos"])
    if "game" in instruction_text or "gaming" in instruction_text:
        paths.extend(["/gaming", "/feed/trending?bp=4gIcGhpnYW1pbmdfY29ycHVzX21vc3RfcG9wdWxhcg%3D%3D"])
    if "news" in instruction_text:
        paths.extend(["/news", "/feed/news"])
    if "movie" in instruction_text or "film" in instruction_text:
        paths.extend(["/feed/trending?bp=4gIKGgh0cmFpbGVycw%3D%3D", "/movies"])
    
    # Dedupe while preserving order
    seen: set[str] = set()
    unique_paths: list[str] = []
    for path in paths:
        if path not in seen:
            seen.add(path)
            unique_paths.append(path)
    
    return unique_paths


def _build_search_navigation_url(base_url: str, instructions: str | None) -> str | None:
    """Build a search URL when direct navigation paths don't exist - generic across sites."""
    
    if not instructions:
        return None
    
    parsed = urlparse(base_url)
    host = (parsed.hostname or "").lower()
    
    # Extract search terms from instructions
    keywords = _instruction_keywords(instructions, max_keywords=6)
    if not keywords:
        return None
    
    query_text = "+".join(keywords)
    
    # Common search URL patterns across sites (generic, not site-specific)
    search_patterns = [
        f"{parsed.scheme}://{parsed.netloc}/search?q={query_text}",
        f"{parsed.scheme}://{parsed.netloc}/results?search_query={query_text}",
        f"{parsed.scheme}://{parsed.netloc}/search?query={query_text}",
        f"{parsed.scheme}://{parsed.netloc}/?s={query_text}",
    ]
    
    return search_patterns[0] if search_patterns else None


def _fallback_navigation_url(
    base_url: str,
    instructions: str,
    navigation_plan: dict[str, Any],
) -> str:
    """Pick a navigation URL deterministically, without consulting an LLM.

    Candidates are tried in this order, and the first hit is returned
    after passing through ``_apply_text_render_proxy``:

    1. A valid URL from ``navigation_plan["target_urls"]`` that matches a
       ranked ("trending", ...) or search intent in the instructions.
    2. The first path inferred from the instructions, on the base host.
    3. An on-site search URL, only for explicit search phrasing.
    4. The normalized base URL itself.
    """

    def _mentions(haystack: str, needles: tuple[str, ...]) -> bool:
        return any(needle in haystack for needle in needles)

    site_url = _coerce_url_asset(base_url) or base_url
    if "://" not in site_url:
        site_url = f"https://{site_url}"

    site = urlparse(site_url)
    wanted = (instructions or "").lower()

    # 1. Template/plan hints: keep only entries that are real URL strings.
    hinted_urls = [
        hint
        for hint in (navigation_plan.get("target_urls") or [])
        if isinstance(hint, str) and _is_url_asset(hint)
    ]

    def _first_hint(needles: tuple[str, ...]) -> str | None:
        for hint in hinted_urls:
            if _mentions(hint.lower(), needles):
                return hint
        return None

    if hinted_urls:
        if _mentions(wanted, ("trending", "popular", "top", "latest")):
            ranked_hint = _first_hint(("trending", "popular", "explore", "discover", "new"))
            if ranked_hint:
                return _apply_text_render_proxy(ranked_hint)

        if _mentions(wanted, ("search", "query", "lookup")):
            search_hint = _first_hint(("search", "query"))
            if search_hint:
                return _apply_text_render_proxy(search_hint)

    # 2. Direct site pages (trending, hot, ...) take precedence over search.
    inferred_paths = _infer_navigation_paths(instructions)
    if inferred_paths:
        return _apply_text_render_proxy(f"{site.scheme}://{site.netloc}{inferred_paths[0]}")

    # 3. On-site search, only when the user explicitly asked to search.
    if _mentions(wanted, ("search for", "find ", "looking for", "search:")):
        search_url = _build_search_navigation_url(site_url, instructions)
        if search_url:
            return _apply_text_render_proxy(search_url)

    # 4. Nothing better: trust the base page.
    return _apply_text_render_proxy(site_url)


def _requested_columns_from_output_instructions(output_instructions: str | None) -> list[str]:
    """Extract requested output columns from instructions like 'csv of username, repo, stars'."""

    if not output_instructions:
        return []

    cleaned = output_instructions.strip()
    cleaned = re.sub(r"^(?:csv|json|table)\s+of\s+", "", cleaned, flags=re.IGNORECASE)
    cleaned = cleaned.replace(" and ", ", ")
    columns: list[str] = []
    for piece in cleaned.split(","):
        candidate = re.sub(r"[^A-Za-z0-9_]+", " ", piece).strip().lower().replace(" ", "_")
        if candidate and candidate not in columns:
            columns.append(candidate)
    return columns


def _enforce_requested_schema(
    rows: list[dict[str, Any]],
    output_instructions: str | None,
) -> tuple[list[dict[str, Any]], list[str]]:
    """Project extracted rows onto the columns requested in the output instructions.

    Args:
        rows: Extracted rows. Entries that are not dicts are ignored, since
            extraction code may hand back arbitrary objects.
        output_instructions: Free text the requested columns are parsed from.

    Returns:
        ``(rows, columns)``.

        * No columns requested: the dict rows unchanged, with columns
          inferred from the first dict row (``[]`` when there is none).
          The input list object itself is returned when nothing was dropped.
        * Columns requested: every dict row reduced to exactly those
          columns (missing values become ``""``). If no dict rows exist, a
          single placeholder row of empty strings is returned so the
          schema is still visible.
    """
    requested_columns = _requested_columns_from_output_instructions(output_instructions)
    dict_rows = [row for row in rows if isinstance(row, dict)]

    if not requested_columns:
        # Previously this read rows[0].keys() and raised AttributeError when
        # the first row was not a dict.
        inferred = list(dict_rows[0]) if dict_rows else []
        if len(dict_rows) == len(rows):
            return rows, inferred
        return dict_rows, inferred

    normalized_rows: list[dict[str, Any]] = [
        {column: row.get(column, "") for column in requested_columns}
        for row in dict_rows
    ]
    if not normalized_rows:
        normalized_rows = [{column: "" for column in requested_columns}]

    return normalized_rows, requested_columns


def _requested_row_limit(instructions: str | None, default_limit: int = 25) -> int:
    """Extract a requested row limit (e.g., 'top 5') from instructions."""

    if not instructions:
        return default_limit
    text = instructions.lower()
    match = re.search(r"\btop\s+(\d{1,3})\b", text) or re.search(
        r"\b(\d{1,3})\s+(?:rows|items|results|entries|records|repos|frameworks)\b",
        text,
    )
    if not match:
        return default_limit
    value = int(match.group(1))
    if value < 1:
        return default_limit
    return min(value, 100)


def _instruction_keywords(instructions: str | None, max_keywords: int = 8) -> list[str]:
    """Extract semantic keywords from user instructions for relevance checks."""

    if not instructions:
        return []
    tokens = re.findall(r"[a-zA-Z]{3,}", instructions.lower())
    stop_words = {
        "get",
        "give",
        "show",
        "find",
        "extract",
        "with",
        "from",
        "this",
        "that",
        "what",
        "where",
        "when",
        "which",
        "return",
        "output",
        "format",
        "data",
        "list",
        "site",
        "website",
        "page",
        "entries",
        "results",
        "items",
        "records",
        "details",
        "about",
        "across",
        "into",
        "only",
        "please",
        "the",
        "and",
    }
    keywords: list[str] = []
    for token in tokens:
        if token in stop_words:
            continue
        if token not in keywords:
            keywords.append(token)
        if len(keywords) >= max_keywords:
            break
    return keywords


def _rows_have_signal(rows: list[dict[str, Any]]) -> bool:
    """Return True when extracted rows contain at least one non-empty value."""

    for row in rows:
        if not isinstance(row, dict):
            continue
        for value in row.values():
            if value is None:
                continue
            if isinstance(value, str):
                if value.strip():
                    return True
            elif value:
                return True
    return False


def _rows_relevance_score(rows: list[dict[str, Any]], instructions: str | None) -> float:
    """Score row relevance against instruction keywords (0-1)."""

    if not rows:
        return 0.0
    keywords = _instruction_keywords(instructions, max_keywords=8)
    if not keywords:
        return 1.0

    row_scores: list[float] = []
    for row in rows:
        if not isinstance(row, dict):
            continue
        joined = " ".join(
            str(value).lower()
            for value in row.values()
            if value is not None and str(value).strip()
        )
        if not joined:
            continue
        hits = sum(1 for keyword in keywords if keyword in joined)
        row_scores.append(hits / len(keywords))

    if not row_scores:
        return 0.0

    row_scores.sort(reverse=True)
    top_n = max(1, min(3, len(row_scores)))
    return sum(row_scores[:top_n]) / top_n


def _parse_column_names(output_instructions: str | None) -> list[str]:
    """Parse column names from output instructions.
    
    Examples:
        "csv of title, points" -> ["title", "points"]
        "json with heading and description" -> ["heading", "description"]
        "title, url, views" -> ["title", "url", "views"]
    """
    if not output_instructions:
        return []
    
    # Remove common prefixes
    text = output_instructions.lower()
    for prefix in ["csv of ", "json of ", "json with ", "fields: "]:
        if text.startswith(prefix):
            text = text[len(prefix):]
            break
    
    # Split on commas and clean
    columns = [col.strip() for col in text.split(",")]
    
    # Also try splitting on "and" if no commas found
    if len(columns) == 1 and " and " in columns[0]:
        columns = [col.strip() for col in columns[0].split(" and ")]
    
    return [col for col in columns if col]


def _fallback_extraction_code(output_instructions: str | None, instructions: str | None = None) -> str:
    """Build deterministic extraction code when live LLM code generation is unavailable.

    The returned string is Python source. It is rendered from an f-string
    template, with the requested columns, instruction keywords and a category
    hint (the first keyword, title-cased) embedded as ``repr`` literals.

    The generated script references ``soup``, ``url``, ``re``, ``urljoin``
    and ``urlparse`` without defining them, so whatever executes it must put
    those names in scope (presumably the Python sandbox; verify against the
    caller). When run, the script:

    1. Walks every ``a[href]`` anchor, skipping fragment/mailto/javascript
       links, boilerplate labels and URLs, and anchors that match no keyword
       and carry no views/likes/comments/date metric.
    2. Falls back to scanning visible text lines (at most 40 candidates) when
       no anchor produced a candidate row.
    3. Sorts candidates by the first requested metric column if there is one,
       otherwise by keyword score when keywords exist.
    4. Assigns up to 25 rows to ``extracted_data``; if nothing was found, a
       single row of empty strings keyed by the requested columns.

    Args:
        output_instructions: Description of the desired output columns; when
            no columns can be derived, ``title``, ``url`` and ``content`` are
            used.
        instructions: Free-form user instructions used for keyword scoring.

    Returns:
        The generated extraction script as a string.
    """

    columns = _requested_columns_from_output_instructions(output_instructions) or [
        "title",
        "url",
        "content",
    ]
    keywords = _instruction_keywords(instructions, max_keywords=8)
    category_hint = keywords[0].title() if keywords else ""
    columns_literal = repr(columns)
    keywords_literal = repr(keywords)
    category_hint_literal = repr(category_hint)
    # NOTE: this is an f-string, so every literal brace that must survive into
    # the generated script (set/dict literals, regex quantifiers) is doubled.
    # Single braces are reserved for the three *_literal substitutions.
    return f"""
columns = {columns_literal}
keywords = {keywords_literal}
category_hint = {category_hint_literal}
rows = []
candidate_rows = []
seen = set()
anchors = soup.select("a[href]")
noise_fragments = [
    "javascript is disabled",
    "please enable javascript",
    "skip to main content",
    "press enter to activate",
    "toggle navigation",
    "close menu",
    "open menu",
    "cookie settings",
]
boilerplate_labels = {{
    "home",
    "about",
    "contact",
    "contact us",
    "help",
    "search",
    "press",
    "copyright",
    "creator",
    "creators",
    "advertise",
    "developers",
    "terms",
    "privacy",
    "policy & safety",
    "how youtube works",
    "test new features",
    "nfl sunday ticket",
    "sign in",
    "log in",
    "sign up",
    "register",
    "settings",
    "report history",
    "send feedback",
    "learn more",
    "more info",
}}
boilerplate_url_tokens = (
    "privacy",
    "terms",
    "cookie",
    "contact",
    "advertis",
    "copyright",
    "policy",
    "press",
    "help",
    "about/",
    "/t/",
    "legal",
    "support",
    "feedback",
    "settings",
    "account",
    "login",
    "signin",
    "signup",
    "creators/",
    "howyoutubeworks",
)
ranked_intent = bool(re.search(r"\\b(top|trending|popular|latest|today|best)\\b", " ".join(keywords), re.IGNORECASE))

def _extract_metric(text, patterns):
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1)
    return ""

def _compact(value, limit):
    return re.sub(r"\\s+", " ", value).strip()[:limit]

def _metric_numeric(raw):
    normalized = str(raw or "").strip().lower().replace(",", "")
    if not normalized:
        return 0.0
    multiplier = 1.0
    if normalized.endswith("k"):
        multiplier = 1000.0
        normalized = normalized[:-1]
    elif normalized.endswith("m"):
        multiplier = 1000000.0
        normalized = normalized[:-1]
    try:
        return float(normalized) * multiplier
    except ValueError:
        return 0.0

for anchor in anchors:
    href = (anchor.get("href") or "").strip()
    text = anchor.get_text(" ", strip=True)
    if not href and not text:
        continue
    if href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
        continue
    full_href = urljoin(url, href)
    if not full_href.startswith("http"):
        continue
    if full_href.count("/") <= 2:
        continue

    parsed_href = urlparse(full_href)
    path_parts = [part for part in parsed_href.path.split("/") if part]
    slug_value = path_parts[-1].replace("-", " ").replace("_", " ").strip() if path_parts else ""
    container = anchor.find_parent(["article", "tr", "li", "div"])
    container_text = container.get_text(" ", strip=True) if container else text
    stars_value = _extract_metric(container_text, [r"([0-9][0-9,\\.kKmM]*)\\s*(?:stars?|star)\\b"])
    forks_value = _extract_metric(container_text, [r"([0-9][0-9,\\.kKmM]*)\\s*(?:forks?|fork)\\b"])
    views_value = _extract_metric(
        container_text,
        [r"([0-9][0-9,\\.kKmM]*)\\s*(?:views?|viewers?|watching|plays?)\\b"],
    )
    likes_value = _extract_metric(container_text, [r"([0-9][0-9,\\.kKmM]*)\\s*(?:likes?|thumbs\\s*up)\\b"])
    comments_value = _extract_metric(container_text, [r"([0-9][0-9,\\.kKmM]*)\\s*(?:comments?|replies)\\b"])
    date_value = _extract_metric(
        container_text,
        [
            r"\\b(today|yesterday|\\d+\\s+(?:minutes?|hours?|days?|weeks?|months?|years?)\\s+ago)\\b",
            r"\\b(\\d{{4}}[-/]\\d{{1,2}}[-/]\\d{{1,2}})\\b",
            r"\\b(\\d{{1,2}}\\s+[A-Za-z]{{3,9}}\\s+\\d{{4}})\\b",
        ],
    )
    category_from_url = ""
    if len(path_parts) >= 3 and path_parts[0].lower() in {{"category", "tags", "topic", "topics", "genres", "genre"}}:
        category_from_url = path_parts[1].replace("-", " ").replace("_", " ").strip().title()

    label = (text or container_text).strip()
    if not label:
        continue
    lowered_label = label.lower()
    lowered_href = full_href.lower()
    if any(fragment in lowered_label for fragment in noise_fragments):
        continue
    if lowered_label in boilerplate_labels:
        continue
    if any(token in lowered_href for token in boilerplate_url_tokens):
        continue
    if len(label) > 180 or len(label.split()) > 22:
        continue
    if label.lower() in {{
        "main page", "home", "about", "contact", "help", "search", "read", "talk",
        "view source", "view history", "contents", "current events", "special pages",
    }}:
        continue

    score_text = " ".join([label, container_text, full_href]).lower()
    keyword_score = sum(1 for keyword in keywords if keyword in score_text)
    has_engagement_metric = any([views_value, likes_value, comments_value, date_value])
    if keywords and keyword_score == 0 and not has_engagement_metric:
        continue
    content_text = (container_text or label).strip()
    lowered_content_text = content_text.lower()
    if (
        len(content_text) > 220
        or " menu " in lowered_content_text
        or "dropdown" in lowered_content_text
        or "press enter to" in lowered_content_text
    ):
        content_text = label

    row = {{}}
    for column in columns:
        lower = column.lower()
        if lower in {{"url", "link", "href"}}:
            row[column] = full_href
        elif lower in {{"title", "name", "text"}}:
            row[column] = _compact(label, 160)
        elif lower in {{"content", "summary", "description"}}:
            row[column] = _compact(content_text, 320)
        elif lower in {{"streamer", "channel", "creator", "username", "user", "owner"}}:
            row[column] = _compact(slug_value or label, 120)
        elif lower in {{"repo", "repository", "repo_name"}}:
            row[column] = path_parts[1] if len(path_parts) >= 2 else _compact(slug_value, 120)
        elif lower in {{"stars", "star", "star_count"}}:
            row[column] = stars_value
        elif lower in {{"forks", "fork", "fork_count"}}:
            row[column] = forks_value
        elif lower in {{"views", "view_count", "viewers", "viewer_count", "watchers", "watching"}}:
            row[column] = views_value
        elif lower in {{"likes", "like_count"}}:
            row[column] = likes_value
        elif lower in {{"comments", "comment_count"}}:
            row[column] = comments_value
        elif lower in {{"date", "date_uploaded", "date_uplaoded", "published", "uploaded", "upload_date"}}:
            row[column] = date_value
        elif lower in {{"category", "game", "topic"}}:
            row[column] = category_from_url or category_hint
        else:
            row[column] = ""

    row_key = tuple(row.get(column, "") for column in columns)
    if row_key in seen:
        continue
    seen.add(row_key)

    if any(value for value in row.values()):
        quality_score = keyword_score
        if views_value:
            quality_score += 2
        if likes_value or comments_value:
            quality_score += 1
        candidate_rows.append((quality_score, row))

if not candidate_rows:
    raw_lines = [line.strip() for line in soup.get_text("\\n").splitlines() if line and line.strip()]
    for line in raw_lines:
        if len(line) < 15:
            continue
        lowered_line = line.lower()
        if any(fragment in lowered_line for fragment in noise_fragments):
            continue
        if len(line) > 260:
            continue
        if lowered_line.startswith(("title:", "url source:", "markdown content:")):
            continue
        if re.match(r"^\\*\\s+\\[(all|images|videos|news|maps|shopping)\\]", lowered_line):
            continue
        if re.match(r"^\\[[^\\]]+\\]\\(https?://duckduckgo\\.com/", lowered_line):
            continue
        if lowered_line in {{"privacy", "terms", "advertising", "about duckduckgo"}}:
            continue
        if lowered_line.startswith("![image"):
            continue
        if lowered_line in boilerplate_labels:
            continue
        keyword_score = sum(1 for keyword in keywords if keyword in lowered_line)
        views_value = _extract_metric(line, [r"([0-9][0-9,\\.kKmM]*)\\s*(?:views?|viewers?|watching|plays?)\\b"])
        likes_value = _extract_metric(line, [r"([0-9][0-9,\\.kKmM]*)\\s*(?:likes?|thumbs\\s*up)\\b"])
        comments_value = _extract_metric(line, [r"([0-9][0-9,\\.kKmM]*)\\s*(?:comments?|replies)\\b"])
        date_value = _extract_metric(
            line,
            [
                r"\\b(today|yesterday|\\d+\\s+(?:minutes?|hours?|days?|weeks?|months?|years?)\\s+ago)\\b",
                r"\\b(\\d{{4}}[-/]\\d{{1,2}}[-/]\\d{{1,2}})\\b",
                r"\\b(\\d{{1,2}}\\s+[A-Za-z]{{3,9}}\\s+\\d{{4}})\\b",
            ],
        )
        markdown_link_match = re.search(r"\\[([^\\]]+)\\]\\((https?://[^\\)]+)\\)", line)
        plain_link_match = re.search(r"https?://[^\\s\\)]+", line)
        if markdown_link_match:
            line_title = markdown_link_match.group(1).strip()
            line_link = markdown_link_match.group(2).strip()
        else:
            line_title = line.strip()
            line_link = plain_link_match.group(0).strip() if plain_link_match else url

        if ranked_intent and keywords and keyword_score == 0 and not any([views_value, likes_value, comments_value]):
            continue

        row = {{}}
        for column in columns:
            lower = column.lower()
            if lower in {{"url", "link", "href"}}:
                row[column] = line_link
            elif lower in {{"title", "name", "text"}}:
                row[column] = _compact(line_title, 160)
            elif lower in {{"content", "summary", "description"}}:
                row[column] = _compact(line, 320)
            elif lower in {{"streamer", "channel", "creator", "username", "user", "owner"}}:
                row[column] = _compact(line_title, 120)
            elif lower in {{"views", "view_count", "viewers", "viewer_count", "watchers", "watching"}}:
                row[column] = views_value
            elif lower in {{"likes", "like_count"}}:
                row[column] = likes_value
            elif lower in {{"comments", "comment_count"}}:
                row[column] = comments_value
            elif lower in {{"date", "date_uploaded", "date_uplaoded", "published", "uploaded", "upload_date"}}:
                row[column] = date_value
            elif lower in {{"category", "game", "topic"}}:
                row[column] = category_hint
            else:
                row[column] = ""
        row_key = tuple(row.get(column, "") for column in columns)
        if row_key in seen:
            continue
        seen.add(row_key)
        quality_score = max(keyword_score, 1)
        if views_value:
            quality_score += 2
        if likes_value or comments_value:
            quality_score += 1
        candidate_rows.append((quality_score, row))
        if len(candidate_rows) >= 40:
            break

ranking_column = next(
    (
        column
        for column in columns
        if column.lower() in {{
            "views",
            "view_count",
            "viewers",
            "viewer_count",
            "watchers",
            "watching",
            "likes",
            "like_count",
            "comments",
            "comment_count",
            "stars",
            "star_count",
            "forks",
            "fork_count",
        }}
    ),
    None,
)

if ranking_column:
    candidate_rows.sort(key=lambda pair: (_metric_numeric(pair[1].get(ranking_column, "")), pair[0]), reverse=True)
elif keywords:
    candidate_rows.sort(key=lambda pair: pair[0], reverse=True)

for _, row in candidate_rows:
    rows.append(row)
    if len(rows) >= 25:
        break

if not rows:
    rows = [{{column: "" for column in columns}}]

extracted_data = rows
"""


async def _scrape_with_agentic_llm(
    session: dict[str, Any],
    session_id: str,
    env,
    request: ScrapeRequest,
    navigation_plan: dict[str, Any],
    url: str,
    step_num: int,
    total_reward: float,
    model_router: SmartModelRouter,
) -> AsyncGenerator[dict[str, Any], None]:
    """Truly agentic scraping using LLM to decide navigation and extraction.
    
    This function uses the LLM to:
    1. Decide where to navigate based on instructions + template hints
    2. Analyze the HTML content
    3. Generate extraction code dynamically
    4. Format output according to output_instructions
    
    Templates serve as reference hints only, not rigid execution scripts.
    """
    
    # Get template hint if available (for reference only)
    template_hint = ""
    if navigation_plan.get("matched_template"):
        template = navigation_plan["matched_template"]
        template_hint = f"""
SITE TEMPLATE HINT (reference only, not mandatory):
- Domain: {template.get('domain', 'N/A')}
- Strategies: {', '.join(template.get('strategies', []))}
- Suggested output fields: {', '.join(template.get('output_fields', []))}
- Typical patterns: {template.get('patterns', 'N/A')}
"""
    
    # Step 1: Ask LLM to decide navigation strategy
    step_num += 1
    navigation_prompt = f"""You are a web scraping agent. Analyze the user's request and decide where to navigate.

USER REQUEST:
- Assets: {request.assets}
- Target: {url}
- Instructions: {request.instructions or 'Extract all relevant data'}
- Desired output format: {request.output_format.value}
- Output instructions: {request.output_instructions or 'All available data'}

{template_hint}

TASK: Decide the best URL to navigate to accomplish this task. Consider:
- If the user wants trending/popular content, should you go to a trending page?
- If the user wants specific data, do you need to navigate to a specific section?
- Use site template hints only as references, never as rigid rules.
- Return ONLY the URL to navigate to, nothing else.

URL:"""

    live_llm_enabled = _agentic_live_llm_enabled()
    target_url = _fallback_navigation_url(url, request.instructions, navigation_plan)
    navigation_mode = "heuristic"
    if live_llm_enabled:
        try:
            nav_response = await asyncio.wait_for(
                model_router.complete(
                    messages=[{"role": "user", "content": navigation_prompt}],
                    task_type=TaskType.REASONING,
                    model=request.model,
                ),
                timeout=12,
            )
            candidate = nav_response.content.strip()
            if candidate:
                if not candidate.startswith("http"):
                    if "://" not in url:
                        candidate = f"https://{url}/{candidate.lstrip('/')}"
                    else:
                        parsed = urlparse(url)
                        candidate = f"{parsed.scheme}://{parsed.netloc}/{candidate.lstrip('/')}"
                target_url = candidate
                navigation_mode = "llm"
        except Exception as e:
            logger.warning("LLM navigation decision failed, using heuristic fallback: %s", e)
    target_url = _apply_text_render_proxy(target_url)
    
    # Tool call: LLM navigation planning
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="complete",
            message=f"llm.plan_navigation() → {target_url}",
            extracted_data={
                "tool_name": "llm.plan_navigation",
                "tool_description": "LLM decides optimal navigation URL based on instructions",
                "parameters": {"instructions": request.instructions, "base_url": url},
                "result": target_url,
                "mode": navigation_mode,
            },
            reward=0.15,
            timestamp=_now_iso(),
        ),
    )
    total_reward += 0.15

    # Validate URL before navigation
    step_num += 1
    is_valid_target = _is_url_asset(target_url)
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="complete",
            message=f"validate.url(url='{target_url}') → {'valid' if is_valid_target else 'invalid'}",
            extracted_data={
                "tool_name": "validate.url",
                "tool_description": "Validate and normalize navigation URL",
                "parameters": {"url": target_url},
                "result": {
                    "valid": is_valid_target,
                    "normalized_url": _coerce_url_asset(target_url) or target_url,
                },
            },
            reward=0.05 if is_valid_target else 0.0,
            timestamp=_now_iso(),
        ),
    )
    total_reward += 0.05 if is_valid_target else 0.0
    
    # Step 2: Navigate to the decided URL
    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="running",
            message=f"browser.navigate(url='{target_url}')",
            extracted_data={
                "tool_name": "browser.navigate",
                "tool_description": "Navigate browser to target URL",
                "parameters": {"url": target_url, "wait_for": "page_load"},
            },
            timestamp=_now_iso(),
        ),
    )
    
    navigate_action = Action(
        action_type=ActionType.NAVIGATE,
        parameters={"url": target_url},
        reasoning=f"Navigate to {target_url} based on LLM's decision",
    )
    
    nav_obs, nav_reward, _, _, _, nav_info = await env.step(navigate_action)
    total_reward += nav_reward
    
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="complete",
            message=f"browser.navigate() → Success",
            extracted_data={
                "tool_name": "browser.navigate",
                "tool_description": "Navigate browser to target URL",
                "parameters": {"url": target_url},
                "result": {"status_code": nav_obs.page_html is not None},
            },
            reward=nav_reward,
            timestamp=_now_iso(),
        ),
    )
    
    if not nav_obs.page_html:
        logger.error("Navigation failed - no HTML received")
        return
    
    # Step 3: Parse HTML
    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="running",
            message="html.parse(html=page_content)",
            extracted_data={
                "tool_name": "html.parse",
                "tool_description": "Parse HTML into DOM structure",
                "parameters": {"content_length": len(nav_obs.page_html)},
            },
            timestamp=_now_iso(),
        ),
    )
    
    soup = BeautifulSoup(nav_obs.page_html, "html.parser")
    total_reward += 0.1
    
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="complete",
            message="html.parse() → DOM ready",
            extracted_data={
                "tool_name": "html.parse",
                "tool_description": "Parse HTML into DOM structure",
                "result": {"elements_count": len(soup.find_all())},
            },
            reward=0.1,
            timestamp=_now_iso(),
        ),
    )
    
    # Extract links for tool visibility and fallback processing
    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="running",
            message="extract.urls(html)",
            extracted_data={
                "tool_name": "extract.urls",
                "tool_description": "Extract hyperlinks from parsed HTML",
                "parameters": {"scope": "document"},
            },
            timestamp=_now_iso(),
        ),
    )
    extracted_links: list[str] = []
    for anchor in soup.find_all("a", href=True):
        href = str(anchor.get("href", "")).strip()
        if not href:
            continue
        if href.startswith("/"):
            href = f"{target_url.rstrip('/')}{href}"
        if href not in extracted_links:
            extracted_links.append(href)
        if len(extracted_links) >= 200:
            break
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="complete",
            message=f"extract.urls() → {len(extracted_links)} links",
            extracted_data={
                "tool_name": "extract.urls",
                "result": {"count": len(extracted_links), "sample": extracted_links[:5]},
            },
            reward=0.05,
            timestamp=_now_iso(),
        ),
    )
    total_reward += 0.05

    # Extract emails for tool visibility and fallback processing
    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="running",
            message="extract.emails(html)",
            extracted_data={
                "tool_name": "extract.emails",
                "tool_description": "Extract email addresses from page content",
                "parameters": {"pattern": "email regex"},
            },
            timestamp=_now_iso(),
        ),
    )
    extracted_emails = sorted(set(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", nav_obs.page_html)))
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="complete",
            message=f"extract.emails() → {len(extracted_emails)} emails",
            extracted_data={
                "tool_name": "extract.emails",
                "result": {"count": len(extracted_emails), "sample": extracted_emails[:5]},
            },
            reward=0.05,
            timestamp=_now_iso(),
        ),
    )
    total_reward += 0.05

    # Extract quick structural fields
    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="running",
            message="html.extract(fields=['title','content','links'])",
            extracted_data={
                "tool_name": "html.extract",
                "tool_description": "Extract key structural fields for downstream processing",
                "parameters": {"fields": ["title", "content", "links"]},
            },
            timestamp=_now_iso(),
        ),
    )
    page_title = soup.title.get_text(strip=True) if soup.title else ""
    page_content = soup.get_text(" ", strip=True)
    quick_extract = {
        "title": page_title,
        "content": page_content[:2000],
        "links": extracted_links[:100],
    }
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="complete",
            message="html.extract() → fields ready",
            extracted_data={
                "tool_name": "html.extract",
                "result": {
                    "title_length": len(page_title),
                    "content_length": len(quick_extract["content"]),
                    "link_count": len(quick_extract["links"]),
                },
            },
            reward=0.05,
            timestamp=_now_iso(),
        ),
    )
    total_reward += 0.05

    # Step 4: Ask LLM to generate extraction code
    step_num += 1
    
    # Get a larger sample of the HTML for LLM analysis (first 15000 chars to include content)
    html_sample = nav_obs.page_html[:15000]
    
    # === AGENT TOOL CALLING: runtime-selected, registry-backed ===
    agent_tool_calls = []
    tool_call_results = []
    tool_observations = ""

    if live_llm_enabled:
        try:
            from app.agents.tool_caller import AgentToolCaller, ToolExecutor, summarize_tool_results

            tool_caller = AgentToolCaller(model_router)
            executor = ToolExecutor()

            agent_tool_calls = await tool_caller.decide_tools(
                task_description=(
                    f"Extract {request.output_instructions or 'data'} from page content. "
                    f"User instructions: {request.instructions}"
                ),
                context={
                    "url": target_url,
                    "html_length": len(nav_obs.page_html),
                    "instructions": request.instructions,
                    "output_format": request.output_format.value,
                    "tools_used": [],
                },
                model=request.model,
                max_tools=6,
            )

            if agent_tool_calls:
                tool_decision_step = _record_step(
                    session,
                    ScrapeStep(
                        step_number=len(session["steps"]),
                        action="agent_decision",
                        status="completed",
                        message=f"Agent selected {len(agent_tool_calls)} runtime tools",
                        reward=0.1,
                        extracted_data={
                            "tool_calls": [
                                {
                                    "tool": tool_call.tool_name,
                                    "params": tool_call.parameters,
                                    "reasoning": tool_call.reasoning,
                                }
                                for tool_call in agent_tool_calls
                            ],
                        },
                        timestamp=_now_iso(),
                    ),
                )
                yield tool_decision_step

            tool_context = {
                "soup": BeautifulSoup(nav_obs.page_html, "html.parser"),
                "html": nav_obs.page_html,
                "url": target_url,
                "instructions": request.instructions or "",
            }

            for tool_call in agent_tool_calls:
                result = await executor.execute_tool_call(tool_call, tool_context)
                tool_call_results.append(result)

                if result.success and isinstance(result.result, dict):
                    for context_key in ("rows", "text", "data"):
                        if context_key in result.result:
                            tool_context[context_key] = result.result[context_key]

                tool_exec_step = _record_step(
                    session,
                    ScrapeStep(
                        step_number=len(session["steps"]),
                        action="tool_call",
                        status="completed" if result.success else "failed",
                        message=f"Tool {result.tool_name}: {'ok' if result.success else 'failed'}",
                        reward=0.05 if result.success else -0.02,
                        extracted_data={
                            "tool": result.tool_name,
                            "success": result.success,
                            "result_preview": str(result.result)[:200] if result.result is not None else None,
                            "error": result.error,
                            "duration_ms": result.duration_ms,
                        },
                        timestamp=_now_iso(),
                    ),
                )
                yield tool_exec_step

            if tool_call_results:
                tool_observations = summarize_tool_results(tool_call_results)

        except Exception as e:
            logger.warning("Agent tool calling failed: %s", e)

    extraction_prompt = f"""You are a web scraping expert. Generate Python code to extract data from HTML.

USER REQUEST:
- Assets: {request.assets}
- Instructions: {request.instructions or 'Extract all relevant data'}
- Output format: {request.output_format.value}
- Output instructions: {request.output_instructions or 'All available data'}

HTML SAMPLE (first 15000 chars):
```html
{html_sample}
```

{template_hint}

AGENT TOOL OBSERVATIONS (runtime execution, not hardcoded):
{tool_observations or "No additional tool observations collected."}

TASK: Generate Python code using BeautifulSoup to extract the requested data.

REQUIREMENTS:
1. The `soup` variable is already provided as a BeautifulSoup object
2. Extract data matching the user's output_instructions: "{request.output_instructions}"
3. Return `extracted_data` as a list of dictionaries
4. Column names MUST exactly match: {_parse_column_names(request.output_instructions) if request.output_instructions else []}
5. Handle missing data gracefully (use empty string "" for missing fields)
6. Extract ACTUAL text content from HTML elements, not empty strings
7. Look for the most relevant elements containing the requested data
8. If data appears in different formats (e.g., "123 points" or "123"), extract just the number
9. Do not include extra columns that were not requested

EXAMPLE OUTPUT FORMAT:
extracted_data = [
    {{"username": "google", "repo": "tensorflow", "stars": "12345", "forks": "6789"}},
    {{"username": "microsoft", "repo": "vscode", "stars": "11111", "forks": "2222"}},
]

Return ONLY executable Python code, no explanations or markdown:"""

    extraction_code = _fallback_extraction_code(
        request.output_instructions,
        request.instructions,
    )
    codegen_mode = "heuristic"
    if live_llm_enabled:
        try:
            code_response = await asyncio.wait_for(
                model_router.complete(
                    messages=[{"role": "user", "content": extraction_prompt}],
                    task_type=TaskType.CODE,
                    model=request.model,
                    temperature=0.3,
                ),
                timeout=12,
            )
            candidate_code = code_response.content.strip()
            if "```python" in candidate_code:
                candidate_code = candidate_code.split("```python")[1].split("```")[0].strip()
            elif "```" in candidate_code:
                candidate_code = candidate_code.split("```")[1].split("```")[0].strip()
            if candidate_code:
                extraction_code = candidate_code
                codegen_mode = "llm"
        except Exception as e:
            logger.warning("LLM code generation failed, using heuristic extraction code: %s", e)

    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="complete",
            message=f"{'llm' if codegen_mode == 'llm' else 'agent.fallback'}.generate_extraction_code() → {len(extraction_code)} chars",
            extracted_data={
                "tool_name": "llm.generate_extraction_code",
                "tool_description": "Generate extraction code from page context and requested output schema",
                "parameters": {
                    "html_sample_length": len(html_sample),
                    "instructions": request.instructions,
                    "output_format": request.output_format.value,
                },
                "result": {"code_length": len(extraction_code), "mode": codegen_mode},
            },
            reward=0.2 if codegen_mode == "llm" else 0.05,
            timestamp=_now_iso(),
        ),
    )
    total_reward += 0.2 if codegen_mode == "llm" else 0.05
    
    # Step 5: Execute generated code in sandbox
    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="running",
            message="sandbox.execute(code=llm_generated_code)",
            extracted_data={
                "tool_name": "sandbox.execute",
                "tool_description": "Execute LLM-generated extraction code in sandboxed Python environment",
                "parameters": {"code_length": len(extraction_code), "timeout": 30},
            },
            timestamp=_now_iso(),
        ),
    )
    
    # Prepare execution context
    sandbox_globals = {
        "soup": soup,
        "html": nav_obs.page_html,
        "url": target_url,
        "re": re,
        "urljoin": urljoin,
        "urlparse": urlparse,
        "BeautifulSoup": BeautifulSoup,
        "extracted_data": [],  # LLM code should populate this
    }
    output_columns: list[str] = []
    execution_mode = codegen_mode
    
    try:
        # Execute the LLM-generated code
        exec(extraction_code, sandbox_globals)
        extracted_data = sandbox_globals.get("extracted_data", [])
        
        if not isinstance(extracted_data, list):
            extracted_data = [extracted_data] if extracted_data else []
        extracted_data, output_columns = _enforce_requested_schema(
            extracted_data,
            request.output_instructions,
        )
        requested_limit = _requested_row_limit(request.instructions, default_limit=25)
        extracted_data = extracted_data[:requested_limit]
        relevance_score = _rows_relevance_score(extracted_data, request.instructions)

        if not _rows_have_signal(extracted_data):
            if codegen_mode == "llm":
                try:
                    heuristic_code = _fallback_extraction_code(
                        request.output_instructions,
                        request.instructions,
                    )
                    heuristic_globals = {
                        **sandbox_globals,
                        "extracted_data": [],
                    }
                    exec(heuristic_code, heuristic_globals)
                    heuristic_data = heuristic_globals.get("extracted_data", [])
                    if not isinstance(heuristic_data, list):
                        heuristic_data = [heuristic_data] if heuristic_data else []
                    heuristic_data, heuristic_columns = _enforce_requested_schema(
                        heuristic_data,
                        request.output_instructions,
                    )
                    heuristic_data = heuristic_data[:requested_limit]
                    if _rows_have_signal(heuristic_data):
                        extracted_data = heuristic_data
                        output_columns = heuristic_columns or output_columns
                        execution_mode = "llm_with_heuristic_recovery"
                except Exception as recovery_error:
                    logger.warning("Heuristic recovery after empty LLM extraction failed: %s", recovery_error)

            if not _rows_have_signal(extracted_data):
                text_render_payload = _fetch_text_render_markdown(target_url, timeout_seconds=12)
                if text_render_payload:
                    text_markdown, text_render_url = text_render_payload
                    try:
                        text_data, text_columns = _extract_rows_from_text_render(
                            markdown=text_markdown,
                            source_url=text_render_url,
                            output_instructions=request.output_instructions,
                            instructions=request.instructions,
                            row_limit=requested_limit,
                        )
                        if _rows_have_signal(text_data):
                            extracted_data = text_data
                            output_columns = text_columns or output_columns
                            execution_mode = "text_render_recovery"
                            target_url = text_render_url
                    except Exception as text_recovery_error:
                        logger.warning("Text-render recovery after empty extraction failed: %s", text_recovery_error)

        relevance_score = _rows_relevance_score(extracted_data, request.instructions)
        recovery_keywords = _instruction_keywords(request.instructions, max_keywords=8)
        
        # Only attempt recovery if we have NO useful signal from the user's specified site
        # If we have data with signal, trust the user's site - don't go to external search
        if not _rows_have_signal(extracted_data) and recovery_keywords:
            step_num += 1
            yield _record_step(
                session,
                ScrapeStep(
                    step_number=step_num,
                    action="tool_call",
                    url=target_url,
                    status="running",
                    message="agent.recover_relevance(query)",
                    extracted_data={
                        "tool_name": "agent.recover_relevance",
                        "tool_description": "Search-guided relevance recovery for empty extraction output",
                        "parameters": {
                            "keywords": recovery_keywords,
                            "baseline_relevance": round(relevance_score, 3),
                        },
                    },
                    timestamp=_now_iso(),
                ),
            )

            recovered_rows, recovered_columns, recovered_source, recovered_score = await _search_recovery_rows(
                base_url=url,
                instructions=request.instructions,
                output_instructions=request.output_instructions,
                row_limit=requested_limit,
            )
            # Only use recovery data if it's significantly better AND provides signal
            improved = _rows_have_signal(recovered_rows) and recovered_score > 0.3 and len(recovered_rows) >= 3
            if improved:
                extracted_data = recovered_rows
                output_columns = recovered_columns or output_columns
                target_url = recovered_source or target_url
                execution_mode = "search_recovery"
                relevance_score = recovered_score

            yield _record_step(
                session,
                ScrapeStep(
                    step_number=step_num,
                    action="tool_call",
                    url=target_url,
                    status="complete",
                    message=(
                        f"agent.recover_relevance() → {'improved' if improved else 'no_change'} "
                        f"({relevance_score:.2f})"
                    ),
                    extracted_data={
                        "tool_name": "agent.recover_relevance",
                        "result": {
                            "improved": improved,
                            "relevance": round(relevance_score, 3),
                            "recovered_rows": len(recovered_rows),
                            "source": recovered_source,
                        },
                    },
                    reward=0.1 if improved else 0.0,
                    timestamp=_now_iso(),
                ),
            )
            if improved:
                total_reward += 0.1
        
        has_signal = _rows_have_signal(extracted_data)
        exec_reward = 0.5 if has_signal else 0.1
        total_reward += exec_reward
        
        yield _record_step(
            session,
            ScrapeStep(
                step_number=step_num,
                action="tool_call",
                url=target_url,
                status="complete",
                message=f"sandbox.execute() → Extracted {len(extracted_data)} items",
                extracted_data={
                    "tool_name": "sandbox.execute",
                    "tool_description": "Execute extraction code in sandbox",
                    "result": {
                        "items_extracted": len(extracted_data),
                        "has_signal": has_signal,
                        "relevance_score": round(relevance_score, 3),
                        "mode": execution_mode,
                        "columns": output_columns,
                        "sample": extracted_data[:2] if extracted_data else [],
                    },
                },
                reward=exec_reward,
                timestamp=_now_iso(),
            ),
        )
        
    except Exception as e:
        logger.error(f"Extraction code execution failed: {e}")
        logger.error(f"Generated code was:\n{extraction_code}")
        # Fallback: basic extraction
        extracted_data = [{
            "url": target_url,
            "title": soup.find("title").get_text() if soup.find("title") else "",
            "error": f"Extraction failed: {str(e)}",
        }]
        extracted_data, output_columns = _enforce_requested_schema(
            extracted_data,
            request.output_instructions,
        )
        requested_limit = _requested_row_limit(request.instructions, default_limit=25)
        extracted_data = extracted_data[:requested_limit]
        total_reward += 0.05
        
        yield _record_step(
            session,
            ScrapeStep(
                step_number=step_num,
                action="tool_call",
                url=target_url,
                status="complete",
                message=f"sandbox.execute() → Failed: {str(e)[:100]}",
                extracted_data={
                    "tool_name": "sandbox.execute",
                    "tool_description": "Execute extraction code (failed)",
                    "result": {"error": str(e)},
                },
                reward=0.05,
                timestamp=_now_iso(),
            ),
        )
    
    # Step 6: Format output according to requested format
    step_num += 1
    
    if request.output_format == OutputFormat.CSV:
        tool_name = "csv.generate"
        tool_desc = "Generate CSV output from extracted data"
    elif request.output_format == OutputFormat.JSON:
        tool_name = "json.dumps"
        tool_desc = "Format extracted data as JSON"
    else:
        tool_name = "data.format"
        tool_desc = "Format extracted data"
    
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="running",
            message=f"{tool_name}(data=extracted_items)",
            extracted_data={
                "tool_name": tool_name,
                "tool_description": tool_desc,
                "parameters": {"item_count": len(extracted_data)},
            },
            timestamp=_now_iso(),
        ),
    )
    
    # Store extracted data in session
    if request.output_format == OutputFormat.CSV and extracted_data:
        existing_rows: list[dict[str, Any]] = []
        existing_sources: list[str] = []
        existing_payload = session.get("extracted_data")
        if isinstance(existing_payload, dict):
            if isinstance(existing_payload.get("rows"), list):
                existing_rows = [row for row in existing_payload["rows"] if isinstance(row, dict)]
            if isinstance(existing_payload.get("sources"), list):
                existing_sources = [str(value) for value in existing_payload["sources"]]

        merged_rows = [*existing_rows, *extracted_data]
        fieldnames = output_columns or list(extracted_data[0].keys())

        deduped_rows: list[dict[str, Any]] = []
        seen_keys: set[tuple[str, ...]] = set()
        for row in merged_rows:
            normalized_row = {field: str(row.get(field, "")) for field in fieldnames}
            row_key = tuple(normalized_row[field] for field in fieldnames)
            if row_key in seen_keys:
                continue
            seen_keys.add(row_key)
            deduped_rows.append(normalized_row)

        requested_limit = _requested_row_limit(request.instructions, default_limit=25)
        deduped_rows = deduped_rows[:requested_limit]

        output_buffer = io.StringIO()
        writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(deduped_rows)

        merged_sources = [*existing_sources]
        if target_url not in merged_sources:
            merged_sources.append(target_url)
        
        session["extracted_data"] = {
            "csv_output": output_buffer.getvalue(),
            "rows": deduped_rows,
            "columns": fieldnames,
            "row_count": len(deduped_rows),
            "sources": merged_sources,
        }
    else:
        current_payload = session.get("extracted_data")
        merged_payload: dict[str, Any] = {}
        if isinstance(current_payload, dict) and "csv_output" not in current_payload:
            merged_payload.update(current_payload)
        merged_payload[target_url] = extracted_data
        session["extracted_data"] = merged_payload
    
    total_reward += 0.1
    
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="complete",
            message=f"{tool_name}() → Output ready",
            extracted_data={
                "tool_name": tool_name,
                "tool_description": tool_desc,
                "result": {"format": request.output_format.value, "size": len(extracted_data)},
            },
            reward=0.1,
            timestamp=_now_iso(),
        ),
    )
    
    # Final completion
    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="complete",
            url=target_url,
            status="complete",
            message=f"Agentic scraping complete: {len(extracted_data)} items extracted",
            extracted_data={"item_count": len(extracted_data)},
            reward=total_reward,
            timestamp=_now_iso(),
        ),
    )


async def scrape_url_intelligently(
    session: dict[str, Any],
    session_id: str,
    url: str,
    settings: Settings,
    request: ScrapeRequest,
    memory_manager: MemoryManager,
    enabled_plugins: list[str],
    navigation_plan: dict[str, Any],
) -> AsyncGenerator[dict[str, Any], None]:
    """Scrape a single URL by delegating to the agentic LLM pipeline.

    A fresh environment is created under a unique episode id, reset, and
    handed to ``_scrape_with_agentic_llm`` together with the model router.
    Every event produced by that pipeline is re-yielded unchanged.

    This generator does not raise for ordinary failures: any ``Exception``
    is logged and appended to ``session["errors"]``, after which iteration
    simply ends. If no model router is available, an error is recorded and
    nothing is yielded.

    Args:
        session: Mutable session state; must contain an ``"errors"`` list.
        session_id: Identifier used to derive the episode id and task id.
        url: Target URL passed through to the agentic pipeline.
        settings: Application settings used to create the environment.
        request: The originating scrape request.
        memory_manager: Accepted for interface compatibility; not used in
            this function body.
        enabled_plugins: Accepted for interface compatibility; not used in
            this function body.
        navigation_plan: Navigation plan forwarded to the agentic pipeline.

    Yields:
        The step events produced by ``_scrape_with_agentic_llm``.
    """
    episode_id = f"{session_id}-{uuid.uuid4().hex[:8]}"
    env = None

    try:
        # Resolve the router before creating the environment so that a
        # missing router does not leave an abandoned environment behind.
        model_router = get_model_router()
        if not model_router:
            logger.error("Model router not available")
            session["errors"].append("Model router not initialized")
            return

        env = create_environment(episode_id, settings)
        await env.reset(task_id=f"scrape_{session_id}")

        # The pipeline starts counting steps and accumulating reward from zero.
        initial_step = 0
        initial_reward = 0.0

        async for event in _scrape_with_agentic_llm(
            session,
            session_id,
            env,
            request,
            navigation_plan,
            url,
            initial_step,
            initial_reward,
            model_router,
        ):
            yield event

    except Exception as exc:
        # logger.exception keeps the traceback, which the plain error log lost.
        logger.exception("Intelligent scraping failed for %s: %s", url, exc)
        session["errors"].append(f"Scraping failed: {exc}")
    finally:
        # The episode id is generated locally, so this is the only place that
        # can release the environment; without it each scraped URL leaves one
        # behind. Runs on success, failure, and early generator close alike.
        # NOTE(review): assumes remove_environment(episode_id) is the
        # synchronous counterpart of create_environment and that nothing
        # needs this environment after the pipeline finishes - confirm
        # against app.api.deps.
        if env is not None:
            try:
                remove_environment(episode_id)
            except Exception as cleanup_error:
                # Cleanup is best-effort; never let it mask the scrape result.
                logger.warning(
                    "Failed to remove environment %s: %s",
                    episode_id,
                    cleanup_error,
                )
        
async def scrape_stream(
    session_id: str,
    request: ScrapeRequest,
    settings: Settings,
    memory_manager: MemoryManager,
) -> AsyncGenerator[str, None]:
    """Stream scraping progress as SSE events and websocket broadcasts."""

    enabled_plugins, missing_plugins = _resolve_enabled_plugins(request.enable_plugins)
    session = create_session(session_id, request, enabled_plugins)
    python_plugin_ids = {
        "mcp-python-sandbox",
        "proc-python",
        "proc-pandas",
        "proc-numpy",
        "proc-bs4",
    }
    if missing_plugins:
        session["errors"].append(f"Unavailable plugins ignored: {', '.join(missing_plugins)}")

    manager = get_connection_manager()
    start_time = time.time()

    init_event = {"type": "init", "session_id": session_id}
    await manager.broadcast(init_event, session_id)
    yield _sse_event(init_event)

    # Create intelligent navigation plan based on instructions
    navigation_plan = _create_intelligent_navigation_plan(request.instructions, request.assets)
    
    plugin_event = _record_step(
        session,
        ScrapeStep(
            step_number=0,
            action="plugins",
            status="completed",
            message=(
                f"Enabled plugins: {enabled_plugins}" if enabled_plugins else "No plugins enabled"
            ),
            reward=0.1 if enabled_plugins else 0.0,  # Small reward for plugin setup
            extracted_data={
                "requested": request.enable_plugins, 
                "enabled": enabled_plugins, 
                "missing": missing_plugins,
                "navigation_strategy": navigation_plan["strategy"],
                "extraction_goal": navigation_plan["extraction_goal"],
                "site_template_id": navigation_plan.get("site_template_id"),
                "site_template_name": navigation_plan.get("site_template_name"),
                "site_template_domains": navigation_plan.get("site_template_domains", []),
            },
            timestamp=_now_iso(),
        ),
    )
    await manager.broadcast(plugin_event, session_id)
    yield _sse_event(plugin_event)

    resolved_assets, discoveries = await _resolve_assets(request.assets, enabled_plugins)
    if not resolved_assets:
        resolved_assets = request.assets
    session["resolved_assets"] = resolved_assets

    if discoveries:
        discovery_event = _record_step(
            session,
            ScrapeStep(
                step_number=1,
                action="mcp_search",
                status="completed",
                message="Resolved non-URL assets using search/discovery plugin logic",
                reward=0.2,  # Reward for successful discovery
                extracted_data={"discoveries": discoveries, "resolved_assets": resolved_assets},
                timestamp=_now_iso(),
            ),
        )
        await manager.broadcast(discovery_event, session_id)
        yield _sse_event(discovery_event)

    planner_site_template = match_site_template(request.instructions, resolved_assets)
    planner_template_payload = (
        serialize_site_template(planner_site_template) if planner_site_template else None
    )

    if request.enable_memory:
        try:
            await memory_manager.store(
                key=f"scrape:{session_id}:request",
                value={
                    "assets": request.assets,
                    "resolved_assets": resolved_assets,
                    "instructions": request.instructions,
                    "output_instructions": request.output_instructions,
                    "complexity": request.complexity.value,
                },
                memory_type=MemoryType.SHORT_TERM,
                tags=["scrape", "request"],
            )
            _write_session_json_artifact(
                session,
                "memory_request.json",
                {
                    "assets": request.assets,
                    "resolved_assets": resolved_assets,
                    "instructions": request.instructions,
                    "output_instructions": request.output_instructions,
                    "selected_agents": request.selected_agents,
                    "enabled_plugins": enabled_plugins,
                },
            )
        except Exception as exc:
            message = f"Failed to store request memory: {exc}"
            session["errors"].append(message)
            memory_error = {"type": "error", "data": {"url": None, "error": message, "timestamp": _now_iso()}}
            await manager.broadcast(memory_error, session_id)
            yield _sse_event(memory_error)

    planner_event = _record_step(
        session,
        ScrapeStep(
            step_number=len(session["steps"]) + 1,
            action="planner",
            status="completed",
            message=f"Planner created execution plan for {len(resolved_assets)} assets",
            reward=0.15,  # Reward for planning
            extracted_data={
                "assets": resolved_assets,
                "instructions": request.instructions,
                "output_instructions": request.output_instructions,
                "site_template": planner_template_payload,
            },
            timestamp=_now_iso(),
        ),
    )
    await manager.broadcast(planner_event, session_id)
    yield _sse_event(planner_event)

    if any(plugin_id in enabled_plugins for plugin_id in python_plugin_ids):
        planner_payload = {
            "phase": "planner",
            "instructions": request.instructions,
            "output_instructions": request.output_instructions,
            "resolved_assets": resolved_assets,
            "selected_agents": request.selected_agents,
            "site_template": planner_template_payload,
        }
        planner_code = (
            "result = {"
            "'phase': payload.get('phase'), "
            "'asset_count': len(payload.get('resolved_assets') or []), "
            "'selected_agents': payload.get('selected_agents') or [], "
            "'site_template_id': (payload.get('site_template') or {}).get('site_id'), "
            "'site_strategy': (payload.get('site_template') or {}).get('default_strategy')"
            "}"
        )
        
        # Tool call: sandbox.execute (planner)
        sandbox_tool_event = _record_step(
            session,
            ScrapeStep(
                step_number=len(session["steps"]) + 1,
                action="tool_call",
                status="running",
                message="sandbox.execute(code='planner_analysis')",
                extracted_data={
                    "tool_name": "sandbox.execute",
                    "tool_description": "Execute Python code in isolated sandbox environment",
                    "parameters": {
                        "code_type": "planner_analysis",
                        "imports": ["json"],
                        "payload_keys": list(planner_payload.keys()),
                    },
                },
                timestamp=_now_iso(),
            ),
        )
        await manager.broadcast(sandbox_tool_event, session_id)
        yield _sse_event(sandbox_tool_event)
        
        try:
            planner_sandbox = await asyncio.to_thread(
                execute_python_sandbox,
                planner_code,
                planner_payload,
                session_id=session_id,
                timeout_seconds=15,
            )
        except Exception as exc:
            planner_sandbox = SandboxExecutionResult(
                success=False,
                output=None,
                error=f"Planner sandbox setup failed: {exc}",
            )

        # Tool call result
        sandbox_result_event = _record_step(
            session,
            ScrapeStep(
                step_number=len(session["steps"]),
                action="tool_call",
                status="completed" if planner_sandbox.success else "failed",
                message=f"sandbox.execute() → {'success' if planner_sandbox.success else 'failed'}",
                reward=0.05 if planner_sandbox.success else 0.0,
                extracted_data={
                    "tool_name": "sandbox.execute",
                    "result": {
                        "success": planner_sandbox.success,
                        "output_keys": list(planner_sandbox.output.keys()) if planner_sandbox.output else [],
                        "error": planner_sandbox.error,
                    },
                },
                timestamp=_now_iso(),
            ),
        )
        await manager.broadcast(sandbox_result_event, session_id)
        yield _sse_event(sandbox_result_event)

        if planner_sandbox.success and planner_sandbox.output is not None:
            planner_python_event = _record_step(
                session,
                ScrapeStep(
                    step_number=len(session["steps"]) + 1,
                    action="planner_python",
                    status="completed",
                    message="Planner agent executed sandbox Python code",
                    reward=0.1,  # Reward for sandbox execution
                    extracted_data=planner_sandbox.output,
                    timestamp=_now_iso(),
                ),
            )
            await manager.broadcast(planner_python_event, session_id)
            yield _sse_event(planner_python_event)
        else:
            session["errors"].append(planner_sandbox.error or "Planner sandbox execution failed")

    # Tool call: url.parse (validate and parse URLs)
    url_parse_event = _create_tool_call_step(
        session,
        "url.parse",
        "Parse and validate target URLs",
        {"urls": resolved_assets, "count": len(resolved_assets)},
        status="running",
    )
    await manager.broadcast(url_parse_event, session_id)
    yield _sse_event(url_parse_event)
    
    parsed_urls = []
    for url in resolved_assets:
        parsed = urlparse(url)
        parsed_urls.append({
            "url": url,
            "scheme": parsed.scheme,
            "domain": parsed.netloc,
            "path": parsed.path,
        })
    
    url_parse_result = _create_tool_call_step(
        session,
        "url.parse",
        "Parse and validate target URLs",
        {"urls": resolved_assets},
        status="completed",
        result={"parsed": len(parsed_urls), "domains": list(set(p["domain"] for p in parsed_urls))},
        reward=0.05,
    )
    await manager.broadcast(url_parse_result, session_id)
    yield _sse_event(url_parse_result)

    for idx, url in enumerate(resolved_assets):
        session["current_url_index"] = idx
        url_navigation_plan = _create_intelligent_navigation_plan(request.instructions, [url])
        url_site_template = match_site_template(request.instructions, [url])
        url_template_payload = serialize_site_template(url_site_template) if url_site_template else None

        if url_template_payload:
            site_template_event = _record_step(
                session,
                ScrapeStep(
                    step_number=len(session["steps"]) + 1,
                    action="site_template",
                    url=url,
                    status="completed",
                    message=f"Navigator loaded site template: {url_template_payload['name']}",
                    reward=0.05,
                    extracted_data={
                        "site_id": url_template_payload["site_id"],
                        "strategy": url_navigation_plan["strategy"],
                        "domains": url_template_payload["domains"],
                    },
                    timestamp=_now_iso(),
                ),
            )
            await manager.broadcast(site_template_event, session_id)
            yield _sse_event(site_template_event)

        navigator_event = _record_step(
            session,
            ScrapeStep(
                step_number=len(session["steps"]) + 1,
                action="navigator",
                url=url,
                status="running",
                message=(
                    f"Navigator selected source {idx + 1}/{len(resolved_assets)} "
                    f"({url_navigation_plan['strategy']})"
                ),
                reward=0.05,  # Small reward for navigator selection
                extracted_data={
                    "site_template_id": url_navigation_plan.get("site_template_id"),
                    "site_template_name": url_navigation_plan.get("site_template_name"),
                },
                timestamp=_now_iso(),
            ),
        )
        await manager.broadcast(navigator_event, session_id)
        yield _sse_event(navigator_event)

        if any(plugin_id in enabled_plugins for plugin_id in python_plugin_ids):
            navigator_payload = {
                "phase": "navigator",
                "url": url,
                "index": idx,
                "total": len(resolved_assets),
                "site_template": url_template_payload,
                "navigation_strategy": url_navigation_plan["strategy"],
            }
            navigator_code = (
                "result = {"
                "'phase': payload.get('phase'), "
                "'selected_url': payload.get('url'), "
                "'progress': f\"{payload.get('index', 0) + 1}/{payload.get('total', 0)}\", "
                "'site_template_id': (payload.get('site_template') or {}).get('site_id'), "
                "'strategy': payload.get('navigation_strategy')"
                "}"
            )
            
            # Tool call: sandbox.execute (navigator)
            nav_sandbox_tool_event = _record_step(
                session,
                ScrapeStep(
                    step_number=len(session["steps"]) + 1,
                    action="tool_call",
                    url=url,
                    status="running",
                    message="sandbox.execute(code='navigator_analysis')",
                    extracted_data={
                        "tool_name": "sandbox.execute",
                        "tool_description": "Execute navigator analysis in sandbox",
                        "parameters": {
                            "code_type": "navigator_analysis",
                            "imports": ["json"],
                            "url": url,
                        },
                    },
                    timestamp=_now_iso(),
                ),
            )
            await manager.broadcast(nav_sandbox_tool_event, session_id)
            yield _sse_event(nav_sandbox_tool_event)
            
            try:
                navigator_sandbox = await asyncio.to_thread(
                    execute_python_sandbox,
                    navigator_code,
                    navigator_payload,
                    session_id=session_id,
                    timeout_seconds=15,
                )
            except Exception as exc:
                navigator_sandbox = SandboxExecutionResult(
                    success=False,
                    output=None,
                    error=f"Navigator sandbox setup failed: {exc}",
                )

            # Tool call result
            nav_sandbox_result_event = _record_step(
                session,
                ScrapeStep(
                    step_number=len(session["steps"]),
                    action="tool_call",
                    url=url,
                    status="completed" if navigator_sandbox.success else "failed",
                    message=f"sandbox.execute() → {'success' if navigator_sandbox.success else 'failed'}",
                    reward=0.05 if navigator_sandbox.success else 0.0,
                    extracted_data={
                        "tool_name": "sandbox.execute",
                        "result": {
                            "success": navigator_sandbox.success,
                            "output_keys": list(navigator_sandbox.output.keys()) if navigator_sandbox.output else [],
                        },
                    },
                    timestamp=_now_iso(),
                ),
            )
            await manager.broadcast(nav_sandbox_result_event, session_id)
            yield _sse_event(nav_sandbox_result_event)

            if navigator_sandbox.success and navigator_sandbox.output is not None:
                navigator_python_event = _record_step(
                    session,
                    ScrapeStep(
                        step_number=len(session["steps"]) + 1,
                        action="navigator_python",
                        url=url,
                        status="completed",
                        message="Navigator agent executed sandbox Python code",
                        reward=0.1,  # Reward for sandbox navigation
                        extracted_data=navigator_sandbox.output,
                        timestamp=_now_iso(),
                    ),
                )
                await manager.broadcast(navigator_python_event, session_id)
                yield _sse_event(navigator_python_event)
            else:
                session["errors"].append(navigator_sandbox.error or "Navigator sandbox execution failed")

        url_start_event = {"type": "url_start", "url": url, "index": idx, "total": len(resolved_assets)}
        await manager.broadcast(url_start_event, session_id)
        yield _sse_event(url_start_event)

        async for update in scrape_url_intelligently(
            session,
            session_id,
            url,
            settings,
            request,
            memory_manager,
            enabled_plugins,
            url_navigation_plan,
        ):
            await manager.broadcast(update, session_id)
            yield _sse_event(update)

        url_done_event = {"type": "url_complete", "url": url, "index": idx}
        await manager.broadcast(url_done_event, session_id)
        yield _sse_event(url_done_event)

    instruction_text = f"{request.instructions} {request.output_instructions} {' '.join(request.assets)}".lower()
    if "gold" in instruction_text and ("price" in instruction_text or "trend" in instruction_text):
        gold_rows = _build_gold_dataset_rows(session["extracted_data"], from_month="2016-01")
        if gold_rows:
            source_links = sorted({row["source_link"] for row in gold_rows})
            session["extracted_data"] = {
                "dataset_name": "gold_prices_monthly",
                "description": "Monthly gold prices in USD from 2016 onward",
                "columns": ["month", "gold_price_usd", "source_link"],
                "rows": gold_rows,
                "row_count": len(gold_rows),
                "from_month": "2016-01",
                "to_month": gold_rows[-1]["month"],
                "source_links": source_links,
            }
            quality_status = "completed" if len(gold_rows) >= 100 else "partial"
            quality_message = (
                f"Verifier assembled monthly gold dataset with {len(gold_rows)} rows"
                if quality_status == "completed"
                else f"Verifier assembled only {len(gold_rows)} rows; expected >= 100"
            )

            quality_event = _record_step(
                session,
                ScrapeStep(
                    step_number=len(session["steps"]) + 1,
                    action="verifier",
                    status=quality_status,
                    message=quality_message,
                    extracted_data={
                        "row_count": len(gold_rows),
                        "sources": source_links,
                    },
                    timestamp=_now_iso(),
                ),
            )
            await manager.broadcast(quality_event, session_id)
            yield _sse_event(quality_event)
        else:
            quality_event = _record_step(
                session,
                ScrapeStep(
                    step_number=len(session["steps"]) + 1,
                    action="verifier",
                    status="partial",
                    message="Verifier could not assemble monthly gold rows from resolved sources",
                    extracted_data={"row_count": 0, "sources": []},
                    timestamp=_now_iso(),
                ),
            )
            await manager.broadcast(quality_event, session_id)
            yield _sse_event(quality_event)

    if (
        any(plugin_id in enabled_plugins for plugin_id in python_plugin_ids)
        and _should_run_python_sandbox(request, session["extracted_data"])
    ):
        extracted_payload = session["extracted_data"]
        dataset_rows: list[dict[str, Any]] = []
        source_links: list[str] = []
        html_samples: dict[str, str] = {}

        if isinstance(extracted_payload, dict):
            if isinstance(extracted_payload.get("rows"), list):
                dataset_rows = [
                    row for row in extracted_payload.get("rows", []) if isinstance(row, dict)
                ]
            if isinstance(extracted_payload.get("source_links"), list):
                source_links = [str(link) for link in extracted_payload.get("source_links", [])]

            for source, payload in extracted_payload.items():
                if isinstance(payload, dict) and isinstance(payload.get("content"), str):
                    html_samples[str(source)] = payload.get("content", "")

        # Tool call: extract.urls (find URLs in content)
        if html_samples:
            extract_urls_event = _create_tool_call_step(
                session,
                "extract.urls",
                "Extract URLs from HTML content",
                {"sources": len(html_samples), "total_bytes": sum(len(h) for h in html_samples.values())},
                status="running",
            )
            await manager.broadcast(extract_urls_event, session_id)
            yield _sse_event(extract_urls_event)
            
            all_urls = []
            for html in html_samples.values():
                all_urls.extend(re.findall(r'href=["\']([^"\']+)["\']', html[:50000]))  # Limit search
            
            extract_urls_result = _create_tool_call_step(
                session,
                "extract.urls",
                "Extract URLs from HTML content",
                {"sources": len(html_samples)},
                status="completed",
                result={"urls_found": len(all_urls), "unique": len(set(all_urls))},
                reward=0.05,
            )
            await manager.broadcast(extract_urls_result, session_id)
            yield _sse_event(extract_urls_result)
            
            # Tool call: extract.emails (find emails in content)
            extract_emails_event = _create_tool_call_step(
                session,
                "extract.emails",
                "Extract email addresses from HTML content",
                {"sources": len(html_samples)},
                status="running",
            )
            await manager.broadcast(extract_emails_event, session_id)
            yield _sse_event(extract_emails_event)
            
            all_emails = []
            email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
            for html in html_samples.values():
                all_emails.extend(re.findall(email_pattern, html[:50000]))
            
            extract_emails_result = _create_tool_call_step(
                session,
                "extract.emails",
                "Extract email addresses from HTML content",
                {"sources": len(html_samples)},
                status="completed",
                result={"emails_found": len(all_emails), "unique": len(set(all_emails))},
                reward=0.02,
            )
            await manager.broadcast(extract_emails_result, session_id)
            yield _sse_event(extract_emails_result)

        analysis_payload = {
            "instructions": request.instructions,
            "output_instructions": request.output_instructions,
            "dataset_rows": dataset_rows,
            "source_links": source_links,
            "html_samples": html_samples,
            "extracted_data": extracted_payload,
        }

        sandbox_code = request.python_code or DEFAULT_ANALYSIS_CODE
        
        # Tool call: pandas.DataFrame (data analysis)
        pandas_tool_event = _record_step(
            session,
            ScrapeStep(
                step_number=len(session["steps"]) + 1,
                action="tool_call",
                status="running",
                message="pandas.DataFrame(rows)",
                extracted_data={
                    "tool_name": "pandas.DataFrame",
                    "tool_description": "Create DataFrame from extracted dataset rows",
                    "parameters": {
                        "row_count": len(dataset_rows),
                        "source_count": len(source_links),
                    },
                },
                timestamp=_now_iso(),
            ),
        )
        await manager.broadcast(pandas_tool_event, session_id)
        yield _sse_event(pandas_tool_event)
        
        # Tool call: bs4.BeautifulSoup (HTML analysis)
        if html_samples:
            bs4_tool_event = _record_step(
                session,
                ScrapeStep(
                    step_number=len(session["steps"]) + 1,
                    action="tool_call",
                    status="running",
                    message=f"bs4.BeautifulSoup(html, 'html.parser') × {len(html_samples)}",
                    extracted_data={
                        "tool_name": "bs4.BeautifulSoup",
                        "tool_description": "Parse HTML samples for link analysis",
                        "parameters": {
                            "parser": "html.parser",
                            "sample_count": len(html_samples),
                            "total_bytes": sum(len(h) for h in html_samples.values()),
                        },
                    },
                    timestamp=_now_iso(),
                ),
            )
            await manager.broadcast(bs4_tool_event, session_id)
            yield _sse_event(bs4_tool_event)
        
        # Tool call: sandbox.execute (analysis)
        analysis_sandbox_event = _record_step(
            session,
            ScrapeStep(
                step_number=len(session["steps"]) + 1,
                action="tool_call",
                status="running",
                message="sandbox.execute(code='data_analysis')",
                extracted_data={
                    "tool_name": "sandbox.execute",
                    "tool_description": "Run comprehensive data analysis in sandbox",
                    "parameters": {
                        "imports": ["pandas", "numpy", "bs4", "json"],
                        "dataset_rows": len(dataset_rows),
                        "html_samples": len(html_samples),
                        "custom_code": bool(request.python_code),
                    },
                },
                timestamp=_now_iso(),
            ),
        )
        await manager.broadcast(analysis_sandbox_event, session_id)
        yield _sse_event(analysis_sandbox_event)
        
        try:
            sandbox_result = await asyncio.to_thread(
                execute_python_sandbox,
                sandbox_code,
                analysis_payload,
                session_id=session_id,
                timeout_seconds=25,
            )
        except Exception as exc:
            sandbox_result = SandboxExecutionResult(
                success=False,
                output=None,
                error=f"Sandbox setup failed: {exc}",
                stderr="",
            )
        
        # Tool call result: sandbox.execute
        sandbox_exec_result_event = _record_step(
            session,
            ScrapeStep(
                step_number=len(session["steps"]),
                action="tool_call",
                status="completed" if sandbox_result.success else "failed",
                message=f"sandbox.execute() → {'analysis complete' if sandbox_result.success else 'failed'}",
                reward=0.1 if sandbox_result.success else 0.0,
                extracted_data={
                    "tool_name": "sandbox.execute",
                    "result": {
                        "success": sandbox_result.success,
                        "output_keys": list(sandbox_result.output.keys()) if sandbox_result.output else [],
                        "error": sandbox_result.error if not sandbox_result.success else None,
                    },
                },
                timestamp=_now_iso(),
            ),
        )
        await manager.broadcast(sandbox_exec_result_event, session_id)
        yield _sse_event(sandbox_exec_result_event)

        if sandbox_result.success and sandbox_result.output is not None:
            if isinstance(session["extracted_data"], dict):
                session["extracted_data"]["python_analysis"] = sandbox_result.output
            else:
                session["extracted_data"] = {
                    "result": session["extracted_data"],
                    "python_analysis": sandbox_result.output,
                }

            sandbox_event = _record_step(
                session,
                ScrapeStep(
                    step_number=len(session["steps"]) + 1,
                    action="python_sandbox",
                    status="completed",
                    message="Sandboxed Python plugin executed successfully",
                    extracted_data={"analysis_keys": sorted(sandbox_result.output.keys())},
                    timestamp=_now_iso(),
                ),
            )
            await manager.broadcast(sandbox_event, session_id)
            yield _sse_event(sandbox_event)
        else:
            error = sandbox_result.error or "Sandboxed Python execution failed"
            session["errors"].append(error)
            sandbox_event = _record_step(
                session,
                ScrapeStep(
                    step_number=len(session["steps"]) + 1,
                    action="python_sandbox",
                    status="failed",
                    message=error,
                    extracted_data={"stderr": sandbox_result.stderr[:500]},
                    timestamp=_now_iso(),
                ),
            )
            await manager.broadcast(sandbox_event, session_id)
            yield _sse_event(sandbox_event)

    duration = time.time() - start_time
    
    # Tool call: json.dumps (output formatting)
    json_format_event = _record_step(
        session,
        ScrapeStep(
            step_number=len(session["steps"]) + 1,
            action="tool_call",
            status="running",
            message=f"json.dumps(data, format='{request.output_format.value}')",
            extracted_data={
                "tool_name": "json.dumps",
                "tool_description": f"Format extracted data as {request.output_format.value.upper()}",
                "parameters": {
                    "output_format": request.output_format.value,
                    "data_keys": list(session["extracted_data"].keys()) if isinstance(session["extracted_data"], dict) else ["data"],
                },
            },
            timestamp=_now_iso(),
        ),
    )
    await manager.broadcast(json_format_event, session_id)
    yield _sse_event(json_format_event)
    
    output = await format_output(
        session["extracted_data"],
        request.output_format,
        request.output_instructions,
    )
    
    json_format_result_event = _record_step(
        session,
        ScrapeStep(
            step_number=len(session["steps"]),
            action="tool_call",
            status="completed",
            message=f"json.dumps() → {len(output)} bytes",
            reward=0.05,
            extracted_data={
                "tool_name": "json.dumps",
                "result": {
                    "output_length": len(output),
                    "format": request.output_format.value,
                },
            },
            timestamp=_now_iso(),
        ),
    )
    await manager.broadcast(json_format_result_event, session_id)
    yield _sse_event(json_format_result_event)
    
    output_ext = request.output_format.value
    _write_session_artifact(session, f"final_output.{output_ext}", output)
    _write_session_json_artifact(session, "final_extracted_data.json", session["extracted_data"])

    if request.enable_memory:
        # Tool call: memory.store
        memory_store_event = _record_step(
            session,
            ScrapeStep(
                step_number=len(session["steps"]) + 1,
                action="tool_call",
                status="running",
                message="memory.store(key='summary', type='LONG_TERM')",
                extracted_data={
                    "tool_name": "memory.store",
                    "tool_description": "Store scrape summary in long-term memory",
                    "parameters": {
                        "key": f"scrape:{session_id}:summary",
                        "memory_type": "LONG_TERM",
                        "output_length": len(output),
                    },
                },
                timestamp=_now_iso(),
            ),
        )
        await manager.broadcast(memory_store_event, session_id)
        yield _sse_event(memory_store_event)
        
        try:
            await memory_manager.store(
                key=f"scrape:{session_id}:summary",
                value=output,
                memory_type=MemoryType.LONG_TERM,
                metadata={
                    "session_id": session_id,
                    "complexity": request.complexity.value,
                    "provider": request.provider,
                    "model": request.model,
                },
            )
            _write_session_artifact(session, "memory_summary.txt", output)
            
            # Tool call result: memory.store
            memory_store_result_event = _record_step(
                session,
                ScrapeStep(
                    step_number=len(session["steps"]),
                    action="tool_call",
                    status="completed",
                    message="memory.store() → stored",
                    reward=0.05,
                    extracted_data={
                        "tool_name": "memory.store",
                        "result": {"stored": True, "key": f"scrape:{session_id}:summary"},
                    },
                    timestamp=_now_iso(),
                ),
            )
            await manager.broadcast(memory_store_result_event, session_id)
            yield _sse_event(memory_store_result_event)
        except Exception as exc:
            session["errors"].append(f"Failed to store summary memory: {exc}")
            memory_store_fail_event = _record_step(
                session,
                ScrapeStep(
                    step_number=len(session["steps"]),
                    action="tool_call",
                    status="failed",
                    message=f"memory.store() → {str(exc)[:50]}",
                    extracted_data={
                        "tool_name": "memory.store",
                        "result": {"stored": False, "error": str(exc)[:100]},
                    },
                    timestamp=_now_iso(),
                ),
            )
            await manager.broadcast(memory_store_fail_event, session_id)
            yield _sse_event(memory_store_fail_event)

    response = ScrapeResponse(
        session_id=session_id,
        status="completed" if not session["errors"] else "partial",
        total_steps=len(session["steps"]),
        total_reward=session["total_reward"],
        extracted_data=session["extracted_data"],
        output=output,
        output_format=request.output_format,
        duration_seconds=duration,
        urls_processed=len(resolved_assets),
        errors=session["errors"],
        enabled_plugins=enabled_plugins,
        requested_plugins=request.enable_plugins,
        selected_agents=request.selected_agents,
        memory_enabled=request.enable_memory,
        sandbox_artifacts=_list_session_artifacts(session),
    )

    complete_event = {"type": "complete", "data": response.model_dump()}
    await manager.broadcast(complete_event, session_id)
    yield _sse_event(complete_event)

    session["status"] = response.status
    session["duration"] = duration


@router.post("/stream")
async def scrape_with_stream(
    request: ScrapeRequest,
    settings: SettingsDep,
    memory_manager: MemoryManagerDep,
) -> StreamingResponse:
    """Launch a scrape and return its progress as a server-sent event stream.

    The session id is taken from the request when provided, otherwise a fresh
    UUID4 string is generated; either way it is echoed back in the
    ``X-Session-Id`` response header.

    Raises:
        HTTPException: 400 when the request carries no assets, 409 when
            ``get_session`` already knows the chosen session id.
    """
    if not request.assets:
        raise HTTPException(status_code=400, detail="At least one asset URL is required")

    session_id = request.session_id
    if not session_id:
        session_id = str(uuid.uuid4())

    already_registered = get_session(session_id)
    if already_registered:
        raise HTTPException(status_code=409, detail=f"Session {session_id} already exists")

    event_source = scrape_stream(session_id, request, settings, memory_manager)
    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Session-Id": session_id,
    }
    return StreamingResponse(
        event_source,
        media_type="text/event-stream",
        headers=sse_headers,
    )


@router.post("/")
async def scrape_sync(
    request: ScrapeRequest,
    settings: SettingsDep,
    memory_manager: MemoryManagerDep,
    background_tasks: BackgroundTasks,
) -> dict[str, Any]:
    """Queue a scrape as a background task and return its session id right away.

    The background task consumes every event produced by ``scrape_stream`` and
    discards it. If the stream raises, the exception is logged and
    ``update_session`` is called with a ``failed`` status and the error text.

    Raises:
        HTTPException: 400 when the request carries no assets, 409 when
            ``get_session`` already knows the chosen session id.
    """
    asset_urls = request.assets
    if not asset_urls:
        raise HTTPException(status_code=400, detail="At least one asset URL is required")

    session_id = request.session_id
    if not session_id:
        session_id = str(uuid.uuid4())

    if get_session(session_id):
        raise HTTPException(status_code=409, detail=f"Session {session_id} already exists")

    async def _drain_stream() -> None:
        # Events are only needed for their side effects on the session, so
        # each one is dropped as soon as it is produced.
        try:
            async for _event in scrape_stream(session_id, request, settings, memory_manager):
                continue
        except Exception as exc:
            logger.exception("Background scrape failed", extra={"session_id": session_id})
            failure_patch = {"status": "failed", "errors": [str(exc)]}
            update_session(session_id, failure_patch)

    background_tasks.add_task(_drain_stream)

    acknowledgement: dict[str, Any] = {
        "session_id": session_id,
        "status": "started",
        "message": f"Scraping {len(request.assets)} URLs",
        "assets": request.assets,
        "selected_agents": request.selected_agents,
    }
    return acknowledgement


@router.get("/sessions")
async def list_sessions() -> dict[str, Any]:
    """Summarise every session currently held in ``_active_sessions``.

    Each summary reports the session's status, URL count (resolved assets when
    present and non-empty, otherwise the request's assets), current URL index,
    accumulated reward and number of recorded steps.
    """
    summaries: list[dict[str, Any]] = []
    for sid, state in _active_sessions.items():
        summary = {
            "session_id": sid,
            "status": state["status"],
            "urls_count": len(state.get("resolved_assets") or state["request"].assets),
            "current_index": state.get("current_url_index", 0),
            "total_reward": state["total_reward"],
            "steps": len(state["steps"]),
        }
        summaries.append(summary)

    return {"sessions": summaries, "count": len(summaries)}


@router.get("/{session_id}/status")
async def get_scrape_status(session_id: str) -> dict[str, Any]:
    """Report a progress snapshot for a single scrape session.

    ``duration`` is the live elapsed time while the session status is
    ``running``; otherwise it is the duration stored on the session, or 0.0
    when none has been stored.

    Raises:
        HTTPException: 404 when ``get_session`` returns nothing for the id.
    """
    state = get_session(session_id)
    if not state:
        raise HTTPException(status_code=404, detail="Session not found")

    if state["status"] == "running":
        elapsed = time.time() - state["start_time"]
    else:
        elapsed = state.get("duration", 0.0)

    snapshot: dict[str, Any] = {
        "session_id": session_id,
        "status": state["status"],
        "current_url_index": state.get("current_url_index", 0),
        "total_urls": len(state.get("resolved_assets") or state["request"].assets),
        "total_reward": state["total_reward"],
        "extracted_count": len(state["extracted_data"]),
        "steps_count": len(state["steps"]),
        "errors": state["errors"],
        "enabled_plugins": state.get("enabled_plugins", []),
        "selected_agents": state["request"].selected_agents,
        "sandbox_artifacts": _list_session_artifacts(state),
        "duration": elapsed,
    }
    return snapshot


@router.get("/{session_id}/sandbox/files")
async def list_sandbox_files(session_id: str) -> dict[str, Any]:
    """Enumerate the regular files in a session's sandbox directory.

    Returns the file names and sizes sorted by name. A session without a
    sandbox directory, or whose directory no longer exists, yields an empty
    listing rather than an error.

    Raises:
        HTTPException: 404 when ``get_session`` returns nothing for the id.
    """
    state = get_session(session_id)
    if not state:
        raise HTTPException(status_code=404, detail="Session not found")

    configured_dir = state.get("sandbox_dir")
    root = Path(configured_dir) if configured_dir else None
    if root is None or not root.exists():
        return {"session_id": session_id, "files": [], "count": 0}

    # Sub-directories and other non-file entries are left out of the listing.
    listing: list[dict[str, Any]] = sorted(
        (
            {"name": entry.name, "size_bytes": entry.stat().st_size}
            for entry in root.iterdir()
            if entry.is_file()
        ),
        key=lambda record: record["name"],
    )
    return {"session_id": session_id, "files": listing, "count": len(listing)}


@router.get("/{session_id}/sandbox/files/{file_name}")
async def read_sandbox_file(session_id: str, file_name: str) -> dict[str, Any]:
    """Return the text content of one file from a session's sandbox directory.

    Only the final path component of ``file_name`` is used, and the file is
    served only when its resolved location sits directly inside the sandbox
    directory. Content is decoded as UTF-8 with undecodable bytes dropped.

    Raises:
        HTTPException: 404 when the session is unknown, the session has no
            sandbox directory, or no such regular file exists inside it.
    """

    session = get_session(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    sandbox_dir = session.get("sandbox_dir")
    if not sandbox_dir:
        raise HTTPException(status_code=404, detail="Sandbox not available for session")

    # Drop any directory components from the client-supplied name.
    safe_name = Path(file_name).name
    file_path = Path(sandbox_dir) / safe_name
    if not file_path.exists() or not file_path.is_file():
        raise HTTPException(status_code=404, detail="Sandbox file not found")

    # is_file() and read_text() follow symlinks, so a link placed inside the
    # sandbox could otherwise expose any file the server process can read.
    # Serve only entries whose real location is directly in the sandbox.
    sandbox_root = Path(sandbox_dir).resolve()
    if file_path.resolve().parent != sandbox_root:
        raise HTTPException(status_code=404, detail="Sandbox file not found")

    content = file_path.read_text(encoding="utf-8", errors="ignore")
    return {
        "session_id": session_id,
        "file_name": safe_name,
        "size_bytes": file_path.stat().st_size,
        "content": content,
    }


@router.get("/{session_id}/result")
async def get_scrape_result(session_id: str) -> ScrapeResponse:
    """Build the final ``ScrapeResponse`` for a session that is no longer running.

    The output text is re-rendered from the session's extracted data with
    ``format_output``, using the format and output instructions of the
    original request.

    Raises:
        HTTPException: 404 when the session is unknown, 400 while its status
            is still ``running``.
    """
    state = get_session(session_id)
    if not state:
        raise HTTPException(status_code=404, detail="Session not found")

    still_running = state["status"] == "running"
    if still_running:
        raise HTTPException(status_code=400, detail="Scraping still in progress")

    original_request: ScrapeRequest = state["request"]

    # The elapsed-time fallback is computed up front, before the lookup.
    elapsed_fallback = time.time() - state["start_time"]
    duration = state.get("duration", elapsed_fallback)

    rendered = await format_output(
        state["extracted_data"],
        original_request.output_format,
        original_request.output_instructions,
    )

    response_fields: dict[str, Any] = {
        "session_id": session_id,
        "status": state["status"],
        "total_steps": len(state["steps"]),
        "total_reward": state["total_reward"],
        "extracted_data": state["extracted_data"],
        "output": rendered,
        "output_format": original_request.output_format,
        "duration_seconds": duration,
        "urls_processed": len(state.get("resolved_assets") or original_request.assets),
        "errors": state["errors"],
        "enabled_plugins": state.get("enabled_plugins", []),
        "requested_plugins": original_request.enable_plugins,
        "selected_agents": original_request.selected_agents,
        "memory_enabled": original_request.enable_memory,
        "sandbox_artifacts": _list_session_artifacts(state),
    }
    return ScrapeResponse(**response_fields)


@router.delete("/{session_id}")
async def cancel_scrape(session_id: str) -> dict[str, str]:
    """Flag a scrape session as cancelled.

    The session's current status is not inspected here; any known session is
    passed to ``update_session`` with a ``cancelled`` status.

    Raises:
        HTTPException: 404 when ``get_session`` returns nothing for the id.
    """
    if not get_session(session_id):
        raise HTTPException(status_code=404, detail="Session not found")

    new_status = "cancelled"
    update_session(session_id, {"status": new_status})
    return {"status": new_status, "session_id": session_id}


@router.delete("/{session_id}/cleanup")
async def cleanup_scrape(session_id: str) -> dict[str, str]:
    """Remove a session via ``remove_session`` and confirm the removal.

    Raises:
        HTTPException: 404 when ``remove_session`` reports nothing was removed.
    """
    if remove_session(session_id):
        return {"status": "removed", "session_id": session_id}

    raise HTTPException(status_code=404, detail="Session not found")