Spaces:
Sleeping
Sleeping
Commit ·
f946069
1
Parent(s): 4ece098
feat: Replace hardcoded scraping with truly agentic LLM-driven approach
Browse files
- Add _scrape_with_agentic_llm() function that uses model router for all decisions
- LLM now decides navigation URLs based on user instructions + template hints
- LLM generates BeautifulSoup extraction code dynamically from HTML + instructions
- Execute generated code in sandbox for flexible data extraction
- Templates serve as reference hints only, not rigid execution scripts
- Works even without templates (pure agentic mode)
- Output columns now driven by user's output_instructions
- Replace all hardcoded strategy routing (github_trending, reddit_trending, etc) with single agentic path
- backend/app/api/routes/scrape.py +470 -28
backend/app/api/routes/scrape.py
CHANGED
|
@@ -29,9 +29,11 @@ from app.config import Settings
|
|
| 29 |
from app.api.deps import (
|
| 30 |
MemoryManagerDep,
|
| 31 |
SettingsDep,
|
|
|
|
| 32 |
create_environment,
|
| 33 |
remove_environment,
|
| 34 |
)
|
|
|
|
| 35 |
from app.api.routes.plugins import PLUGIN_REGISTRY
|
| 36 |
from app.api.routes.websocket import get_connection_manager
|
| 37 |
from app.core.action import Action, ActionType
|
|
@@ -949,6 +951,445 @@ async def scrape_url(
|
|
| 949 |
remove_environment(episode_id)
|
| 950 |
|
| 951 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 952 |
async def scrape_url_intelligently(
|
| 953 |
session: dict[str, Any],
|
| 954 |
session_id: str,
|
|
@@ -959,7 +1400,15 @@ async def scrape_url_intelligently(
|
|
| 959 |
enabled_plugins: list[str],
|
| 960 |
navigation_plan: dict[str, Any],
|
| 961 |
) -> AsyncGenerator[dict[str, Any], None]:
|
| 962 |
-
"""Intelligent scraping
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 963 |
|
| 964 |
episode_id = f"{session_id}-{uuid.uuid4().hex[:8]}"
|
| 965 |
|
|
@@ -967,36 +1416,29 @@ async def scrape_url_intelligently(
|
|
| 967 |
env = create_environment(episode_id, settings)
|
| 968 |
await env.reset(task_id=f"scrape_{session_id}")
|
| 969 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 970 |
step_num = 0
|
| 971 |
total_reward = 0.0
|
| 972 |
|
| 973 |
-
#
|
| 974 |
-
|
| 975 |
-
|
| 976 |
-
|
| 977 |
-
|
| 978 |
-
|
| 979 |
-
|
| 980 |
-
|
| 981 |
-
|
| 982 |
-
|
| 983 |
-
|
| 984 |
-
|
| 985 |
-
|
| 986 |
-
|
| 987 |
-
# General exploration strategy
|
| 988 |
-
elif navigation_plan["strategy"] == "intelligent_exploration":
|
| 989 |
-
async for event in _scrape_with_exploration(
|
| 990 |
-
session, session_id, env, request, navigation_plan, url, step_num, total_reward
|
| 991 |
-
):
|
| 992 |
-
yield event
|
| 993 |
-
|
| 994 |
-
# Default single page
|
| 995 |
-
else:
|
| 996 |
-
async for event in _scrape_single_page(
|
| 997 |
-
session, session_id, env, request, url, step_num, total_reward
|
| 998 |
-
):
|
| 999 |
-
yield event
|
| 1000 |
|
| 1001 |
except Exception as exc:
|
| 1002 |
logger.error(f"Intelligent scraping failed for {url}: {exc}")
|
|
|
|
| 29 |
from app.api.deps import (
|
| 30 |
MemoryManagerDep,
|
| 31 |
SettingsDep,
|
| 32 |
+
get_model_router,
|
| 33 |
create_environment,
|
| 34 |
remove_environment,
|
| 35 |
)
|
| 36 |
+
from app.models.router import SmartModelRouter, TaskType
|
| 37 |
from app.api.routes.plugins import PLUGIN_REGISTRY
|
| 38 |
from app.api.routes.websocket import get_connection_manager
|
| 39 |
from app.core.action import Action, ActionType
|
|
|
|
| 951 |
remove_environment(episode_id)
|
| 952 |
|
| 953 |
|
| 954 |
+
def _agentic_resolve_target_url(llm_reply: str, base_url: str) -> str:
    """Turn a free-form LLM reply into a navigable http(s) URL, or return ``base_url``.

    An absolute http(s) URL anywhere in the reply wins. Otherwise the first
    path-looking token is resolved against the root of ``base_url``. Anything
    that does not end up as an http(s) URL with a host falls back to
    ``base_url`` unchanged.
    """
    import re
    from urllib.parse import urljoin, urlparse

    absolute = re.search(r"https?://[^\s\"'`<>]+", llm_reply, re.IGNORECASE)
    if absolute:
        candidate = absolute.group(0).rstrip(".,;")
    else:
        tokens = llm_reply.strip().strip("`\"'<>").split()
        if not tokens:
            # Empty reply: keep the user's URL instead of inventing the site root.
            return base_url
        # Prefer a token that already looks like a path ("/trending").
        path = next((tok for tok in tokens if tok.startswith("/")), tokens[0])
        base = base_url if "://" in base_url else f"https://{base_url}"
        # Root-anchored on purpose: relative replies are treated as site paths.
        candidate = urljoin(base, "/" + path.lstrip("/"))

    parsed = urlparse(candidate)
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        return base_url
    return candidate


def _agentic_strip_code_fences(text: str) -> str:
    """Return the code inside the first markdown fence of ``text`` (```python preferred).

    Text without any fence is returned stripped. A language tag on the fence
    line (``py``, ``python3`` ...) is dropped so it is not executed as code.
    """
    stripped = text.strip()
    if "```" not in stripped:
        return stripped
    marker = "```python" if "```python" in stripped else "```"
    body = stripped.split(marker, 1)[1].split("```", 1)[0]
    tag, newline, rest = body.partition("\n")
    if newline and tag.strip().isalnum():
        body = rest
    return body.strip()


def _agentic_rows_to_csv(rows: list[Any]) -> tuple[str, list[dict[str, Any]], list[str]]:
    """Serialize ``rows`` to CSV text; return ``(csv_text, records, columns)``.

    Non-dict rows are wrapped as ``{"value": row}``. Columns are the union of
    all keys in first-seen order, so a row with an extra key cannot make
    ``csv.DictWriter`` raise ``ValueError``; missing cells are written empty.
    """
    records = [row if isinstance(row, dict) else {"value": row} for row in rows]
    columns = list(dict.fromkeys(key for record in records for key in record))
    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=columns)
    writer.writeheader()
    writer.writerows(records)
    return buffer.getvalue(), records, columns


async def _scrape_with_agentic_llm(
    session: dict[str, Any],
    session_id: str,
    env,
    request: ScrapeRequest,
    navigation_plan: dict[str, Any],
    url: str,
    step_num: int,
    total_reward: float,
    model_router: SmartModelRouter,
) -> AsyncGenerator[dict[str, Any], None]:
    """Scrape ``url`` by letting the LLM choose the page and write the extraction code.

    Pipeline (each stage yields the event returned by ``_record_step``):
      1. Ask the model which URL to visit (falls back to ``url`` on any failure).
      2. Navigate there through ``env.step``; stop if no HTML comes back.
      3. Parse the HTML with BeautifulSoup.
      4. Ask the model for BeautifulSoup extraction code
         (falls back to ``DEFAULT_ANALYSIS_CODE`` on failure).
      5. Execute that code and read back ``extracted_data``.
      6. Store the result in ``session["extracted_data"]`` (CSV payload or
         ``{target_url: rows}``) and emit a final "complete" step.

    ``navigation_plan["matched_template"]``, when present, is only injected
    into the prompts as a hint. ``session_id`` is accepted for signature
    parity with the caller and is not used here.
    """
    target_url = url

    def make_step(status, message, payload, reward=None, action="tool_call"):
        # Builds and records one step using the *current* step_num/target_url.
        # ``reward`` is only passed when given so ScrapeStep's own default applies.
        fields = {
            "step_number": step_num,
            "action": action,
            "url": target_url,
            "status": status,
            "message": message,
            "extracted_data": payload,
            "timestamp": _now_iso(),
        }
        if reward is not None:
            fields["reward"] = reward
        return _record_step(session, ScrapeStep(**fields))

    # Template hint (reference only) shared by both prompts.
    template_hint = ""
    template = navigation_plan.get("matched_template")
    if template:
        template_hint = f"""
SITE TEMPLATE HINT (reference only, not mandatory):
- Domain: {template.get('domain', 'N/A')}
- Strategies: {', '.join(map(str, template.get('strategies', [])))}
- Suggested output fields: {', '.join(map(str, template.get('output_fields', [])))}
- Typical patterns: {template.get('patterns', 'N/A')}
"""

    # Step 1: Ask LLM to decide navigation strategy
    step_num += 1
    navigation_prompt = f"""You are a web scraping agent. Analyze the user's request and decide where to navigate.

USER REQUEST:
- Target: {url}
- Instructions: {request.instructions or 'Extract all relevant data'}
- Desired output format: {request.output_format.value}
- Output instructions: {request.output_instructions or 'All available data'}

{template_hint}

TASK: Decide the best URL to navigate to accomplish this task. Consider:
- If the user wants trending/popular content, should you go to a trending page?
- If the user wants specific data, do you need to navigate to a specific section?
- Return ONLY the URL to navigate to, nothing else.

URL:"""

    try:
        nav_response = await model_router.complete(
            messages=[{"role": "user", "content": navigation_prompt}],
            task_type=TaskType.REASONING,
            model=request.model,
        )
        # Models often wrap the URL in prose/quotes; extract and validate it.
        target_url = _agentic_resolve_target_url(nav_response.content or "", url)
    except Exception as e:
        logger.error(f"LLM navigation decision failed: {e}")
        target_url = url  # Fall back to original URL

    yield make_step(
        "complete",
        f"llm.plan_navigation() → {target_url}",
        {
            "tool_name": "llm.plan_navigation",
            "tool_description": "LLM decides optimal navigation URL based on instructions",
            "parameters": {"instructions": request.instructions, "base_url": url},
            "result": target_url,
        },
        reward=0.15,
    )
    total_reward += 0.15

    # Step 2: Navigate to the decided URL
    step_num += 1
    yield make_step(
        "running",
        f"browser.navigate(url='{target_url}')",
        {
            "tool_name": "browser.navigate",
            "tool_description": "Navigate browser to target URL",
            "parameters": {"url": target_url, "wait_for": "page_load"},
        },
    )

    navigate_action = Action(
        action_type=ActionType.NAVIGATE,
        parameters={"url": target_url},
        reasoning=f"Navigate to {target_url} based on LLM's decision",
    )
    nav_obs, nav_reward, _, _, _, nav_info = await env.step(navigate_action)
    total_reward += nav_reward
    page_html = nav_obs.page_html

    yield make_step(
        "complete",
        "browser.navigate() → Success" if page_html else "browser.navigate() → No HTML received",
        {
            "tool_name": "browser.navigate",
            "tool_description": "Navigate browser to target URL",
            "parameters": {"url": target_url},
            # NOTE: key name kept for consumers; the value is a bool, not an HTTP code.
            "result": {"status_code": page_html is not None},
        },
        reward=nav_reward,
    )

    if not page_html:
        logger.error("Navigation failed - no HTML received")
        # Surface the failure to the session instead of ending silently.
        session.setdefault("errors", []).append(
            f"Navigation failed - no HTML received from {target_url}"
        )
        return

    # Step 3: Parse HTML
    step_num += 1
    yield make_step(
        "running",
        "html.parse(html=page_content)",
        {
            "tool_name": "html.parse",
            "tool_description": "Parse HTML into DOM structure",
            "parameters": {"content_length": len(page_html)},
        },
    )

    soup = BeautifulSoup(page_html, "html.parser")
    total_reward += 0.1

    yield make_step(
        "complete",
        "html.parse() → DOM ready",
        {
            "tool_name": "html.parse",
            "tool_description": "Parse HTML into DOM structure",
            "result": {"elements_count": len(soup.find_all())},
        },
        reward=0.1,
    )

    # Step 4: Ask LLM to generate extraction code
    step_num += 1

    # Only a prefix of the page is sent to the model (first 5000 chars).
    html_sample = page_html[:5000]

    extraction_prompt = f"""You are a web scraping expert. Generate Python code to extract data from HTML.

USER REQUEST:
- Instructions: {request.instructions or 'Extract all relevant data'}
- Output format: {request.output_format.value}
- Output instructions: {request.output_instructions or 'All available data'}

HTML SAMPLE (first 5000 chars):
```html
{html_sample}
```

{template_hint}

TASK: Generate Python code using BeautifulSoup to extract the requested data. The code should:
1. Parse the HTML (soup is already provided as `soup` variable)
2. Extract data matching the user's output_instructions
3. Return a list of dictionaries with the exact columns specified in output_instructions
4. Handle missing data gracefully

REQUIREMENTS:
- Return ONLY executable Python code, no explanations
- Use `soup` variable (already a BeautifulSoup object)
- Return `extracted_data` as a list of dictionaries
- Column names MUST match what the user requested in output_instructions
- Example: if user wants "csv of username, repo, stars", return dicts with keys: username, repo, stars

CODE:"""

    try:
        code_response = await model_router.complete(
            messages=[{"role": "user", "content": extraction_prompt}],
            task_type=TaskType.CODE,
            model=request.model,
        )
        extraction_code = _agentic_strip_code_fences(code_response.content or "")
        codegen_error = None
    except Exception as e:
        logger.error(f"LLM code generation failed: {e}")
        extraction_code = DEFAULT_ANALYSIS_CODE  # Fallback to default extraction
        codegen_error = str(e)

    if codegen_error is None:
        yield make_step(
            "complete",
            f"llm.generate_extraction_code() → {len(extraction_code)} chars",
            {
                "tool_name": "llm.generate_extraction_code",
                "tool_description": "LLM generates BeautifulSoup extraction code based on HTML and instructions",
                "parameters": {
                    "html_sample_length": len(html_sample),
                    "instructions": request.instructions,
                    "output_format": request.output_format.value,
                },
                "result": {"code_length": len(extraction_code)},
            },
            reward=0.2,
        )
        total_reward += 0.2
    else:
        # Make the fallback visible in the step stream rather than only in logs.
        yield make_step(
            "complete",
            f"llm.generate_extraction_code() → Failed, using default extraction: {codegen_error[:100]}",
            {
                "tool_name": "llm.generate_extraction_code",
                "tool_description": "LLM code generation (failed, default extraction code used)",
                "result": {"error": codegen_error},
            },
        )

    # Step 5: Execute generated code
    step_num += 1
    yield make_step(
        "running",
        "sandbox.execute(code=llm_generated_code)",
        {
            "tool_name": "sandbox.execute",
            "tool_description": "Execute LLM-generated extraction code in sandboxed Python environment",
            "parameters": {"code_length": len(extraction_code), "timeout": 30},
        },
    )

    sandbox_globals = {
        "soup": soup,
        "html": page_html,
        "url": target_url,
        "BeautifulSoup": BeautifulSoup,
        "extracted_data": [],  # LLM code should populate this
    }

    try:
        # SECURITY(review): this exec() runs model-generated code (influenced by
        # untrusted page HTML and user instructions) in-process with full
        # builtins. It is NOT a sandbox, and the advertised 30s timeout is not
        # enforced; it also blocks the event loop while running. Move this to an
        # isolated, resource-limited executor before exposing it to untrusted use.
        exec(extraction_code, sandbox_globals)
        extracted_data = sandbox_globals.get("extracted_data", [])

        if not isinstance(extracted_data, list):
            extracted_data = [extracted_data] if extracted_data else []

        exec_reward = 0.5 if extracted_data else 0.1
        total_reward += exec_reward

        yield make_step(
            "complete",
            f"sandbox.execute() → Extracted {len(extracted_data)} items",
            {
                "tool_name": "sandbox.execute",
                "tool_description": "Execute extraction code in sandbox",
                "result": {
                    "items_extracted": len(extracted_data),
                    "sample": extracted_data[:2] if extracted_data else [],
                },
            },
            reward=exec_reward,
        )

    except Exception as e:
        logger.error(f"Extraction code execution failed: {e}")
        # Fallback: basic extraction
        title_tag = soup.find("title")
        extracted_data = [{
            "url": target_url,
            "title": title_tag.get_text() if title_tag else "",
            "error": f"Extraction failed: {str(e)}",
        }]
        total_reward += 0.05

        yield make_step(
            "complete",
            f"sandbox.execute() → Failed: {str(e)[:100]}",
            {
                "tool_name": "sandbox.execute",
                "tool_description": "Execute extraction code (failed)",
                "result": {"error": str(e)},
            },
            reward=0.05,
        )

    # Step 6: Format output according to requested format
    step_num += 1

    if request.output_format == OutputFormat.CSV:
        tool_name = "csv.generate"
        tool_desc = "Generate CSV output from extracted data"
    elif request.output_format == OutputFormat.JSON:
        tool_name = "json.dumps"
        tool_desc = "Format extracted data as JSON"
    else:
        tool_name = "data.format"
        tool_desc = "Format extracted data"

    yield make_step(
        "running",
        f"{tool_name}(data=extracted_items)",
        {
            "tool_name": tool_name,
            "tool_description": tool_desc,
            "parameters": {"item_count": len(extracted_data)},
        },
    )

    # Store extracted data in session
    if request.output_format == OutputFormat.CSV and extracted_data:
        csv_text, extracted_data, columns = _agentic_rows_to_csv(extracted_data)
        session["extracted_data"] = {
            "csv_output": csv_text,
            "rows": extracted_data,
            "columns": columns,
            "row_count": len(extracted_data),
        }
    else:
        session["extracted_data"] = {
            target_url: extracted_data
        }

    total_reward += 0.1

    yield make_step(
        "complete",
        f"{tool_name}() → Output ready",
        {
            "tool_name": tool_name,
            "tool_description": tool_desc,
            "result": {"format": request.output_format.value, "size": len(extracted_data)},
        },
        reward=0.1,
    )

    # Final completion
    step_num += 1
    yield make_step(
        "complete",
        f"Agentic scraping complete: {len(extracted_data)} items extracted",
        {"item_count": len(extracted_data)},
        reward=total_reward,
        action="complete",
    )
|
| 1391 |
+
|
| 1392 |
+
|
| 1393 |
async def scrape_url_intelligently(
|
| 1394 |
session: dict[str, Any],
|
| 1395 |
session_id: str,
|
|
|
|
| 1400 |
enabled_plugins: list[str],
|
| 1401 |
navigation_plan: dict[str, Any],
|
| 1402 |
) -> AsyncGenerator[dict[str, Any], None]:
|
| 1403 |
+
"""Intelligent scraping using agentic LLM-driven approach.
|
| 1404 |
+
|
| 1405 |
+
This function uses LLM to make ALL decisions:
|
| 1406 |
+
- Navigation: Where to go based on instructions
|
| 1407 |
+
- Extraction: What data to extract and how
|
| 1408 |
+
- Formatting: How to present the results
|
| 1409 |
+
|
| 1410 |
+
Templates serve as reference hints only, NOT rigid scripts.
|
| 1411 |
+
"""
|
| 1412 |
|
| 1413 |
episode_id = f"{session_id}-{uuid.uuid4().hex[:8]}"
|
| 1414 |
|
|
|
|
| 1416 |
env = create_environment(episode_id, settings)
|
| 1417 |
await env.reset(task_id=f"scrape_{session_id}")
|
| 1418 |
|
| 1419 |
+
# Get model router
|
| 1420 |
+
model_router = get_model_router()
|
| 1421 |
+
if not model_router:
|
| 1422 |
+
logger.error("Model router not available")
|
| 1423 |
+
session["errors"].append("Model router not initialized")
|
| 1424 |
+
return
|
| 1425 |
+
|
| 1426 |
step_num = 0
|
| 1427 |
total_reward = 0.0
|
| 1428 |
|
| 1429 |
+
# ALWAYS use agentic approach - no hardcoded strategies
|
| 1430 |
+
async for event in _scrape_with_agentic_llm(
|
| 1431 |
+
session,
|
| 1432 |
+
session_id,
|
| 1433 |
+
env,
|
| 1434 |
+
request,
|
| 1435 |
+
navigation_plan,
|
| 1436 |
+
url,
|
| 1437 |
+
step_num,
|
| 1438 |
+
total_reward,
|
| 1439 |
+
model_router,
|
| 1440 |
+
):
|
| 1441 |
+
yield event
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1442 |
|
| 1443 |
except Exception as exc:
|
| 1444 |
logger.error(f"Intelligent scraping failed for {url}: {exc}")
|