NeerajCodz committed on
Commit
f946069
·
1 Parent(s): 4ece098

feat: Replace hardcoded scraping with truly agentic LLM-driven approach

Browse files

- Add _scrape_with_agentic_llm() function that uses model router for all decisions
- LLM now decides navigation URLs based on user instructions + template hints
- LLM generates BeautifulSoup extraction code dynamically from HTML + instructions
- Execute generated code in sandbox for flexible data extraction
- Templates serve as reference hints only, not rigid execution scripts
- Works even without templates (pure agentic mode)
- Output columns now driven by user's output_instructions
- Replace all hardcoded strategy routing (github_trending, reddit_trending, etc.) with a single agentic path

Files changed (1) hide show
  1. backend/app/api/routes/scrape.py +470 -28
backend/app/api/routes/scrape.py CHANGED
@@ -29,9 +29,11 @@ from app.config import Settings
29
  from app.api.deps import (
30
  MemoryManagerDep,
31
  SettingsDep,
 
32
  create_environment,
33
  remove_environment,
34
  )
 
35
  from app.api.routes.plugins import PLUGIN_REGISTRY
36
  from app.api.routes.websocket import get_connection_manager
37
  from app.core.action import Action, ActionType
@@ -949,6 +951,445 @@ async def scrape_url(
949
  remove_environment(episode_id)
950
 
951
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
952
  async def scrape_url_intelligently(
953
  session: dict[str, Any],
954
  session_id: str,
@@ -959,7 +1400,15 @@ async def scrape_url_intelligently(
959
  enabled_plugins: list[str],
960
  navigation_plan: dict[str, Any],
961
  ) -> AsyncGenerator[dict[str, Any], None]:
962
- """Intelligent scraping that follows navigation plan."""
 
 
 
 
 
 
 
 
963
 
964
  episode_id = f"{session_id}-{uuid.uuid4().hex[:8]}"
965
 
@@ -967,36 +1416,29 @@ async def scrape_url_intelligently(
967
  env = create_environment(episode_id, settings)
968
  await env.reset(task_id=f"scrape_{session_id}")
969
 
 
 
 
 
 
 
 
970
  step_num = 0
971
  total_reward = 0.0
972
 
973
- # GitHub trending strategy
974
- if navigation_plan["strategy"] == "github_trending":
975
- async for event in _scrape_github_trending(
976
- session, session_id, env, request, navigation_plan, step_num, total_reward
977
- ):
978
- yield event
979
-
980
- # Reddit popular/trending communities strategy
981
- elif navigation_plan["strategy"] == "reddit_trending":
982
- async for event in _scrape_reddit_trending(
983
- session, session_id, env, request, url, step_num, total_reward
984
- ):
985
- yield event
986
-
987
- # General exploration strategy
988
- elif navigation_plan["strategy"] == "intelligent_exploration":
989
- async for event in _scrape_with_exploration(
990
- session, session_id, env, request, navigation_plan, url, step_num, total_reward
991
- ):
992
- yield event
993
-
994
- # Default single page
995
- else:
996
- async for event in _scrape_single_page(
997
- session, session_id, env, request, url, step_num, total_reward
998
- ):
999
- yield event
1000
 
1001
  except Exception as exc:
1002
  logger.error(f"Intelligent scraping failed for {url}: {exc}")
 
29
  from app.api.deps import (
30
  MemoryManagerDep,
31
  SettingsDep,
32
+ get_model_router,
33
  create_environment,
34
  remove_environment,
35
  )
36
+ from app.models.router import SmartModelRouter, TaskType
37
  from app.api.routes.plugins import PLUGIN_REGISTRY
38
  from app.api.routes.websocket import get_connection_manager
39
  from app.core.action import Action, ActionType
 
951
  remove_environment(episode_id)
952
 
953
 
954
def _agentic_resolve_target_url(raw: Any, base_url: str) -> str:
    """Turn the LLM's navigation answer into an absolute http(s) URL.

    Returns ``base_url`` unchanged when the answer is empty or cannot be
    turned into a URL with an http/https scheme and a host.
    """
    import re
    from urllib.parse import urljoin

    text = raw if isinstance(raw, str) else ""

    # Models frequently wrap the URL in quotes/backticks or prefix it with
    # "URL:"; prefer the first explicit http(s) URL found anywhere in the reply.
    explicit = re.search(r"https?://[^\s'\"`<>)\]]+", text, re.IGNORECASE)
    if explicit:
        candidate = explicit.group(0).rstrip(".,;")
    else:
        # Otherwise treat the first token of the first non-empty line as a
        # path relative to the site root.
        candidate = ""
        for line in text.splitlines():
            tokens = line.strip().strip("`'\"<>").split()
            if tokens:
                candidate = tokens[0].strip("`'\"<>")
                break
        if not candidate:
            return base_url
        root = base_url if "://" in base_url else f"https://{base_url}"
        candidate = urljoin(root, "/" + candidate.lstrip("/"))

    parsed = urlparse(candidate)
    if parsed.scheme.lower() not in ("http", "https") or not parsed.netloc:
        return base_url
    return candidate


def _agentic_strip_code_fences(text: Any) -> str:
    """Return the Python source contained in an LLM reply.

    Prefers a fenced block tagged ``python``/``py``, then any fenced block
    (closed or truncated), and finally the raw reply when no fence exists.
    """
    import re

    body = text.strip() if isinstance(text, str) else ""
    tagged = re.search(
        r"```(?:python|py)[^\n`]*\n(.*?)(?:```|\Z)", body, re.DOTALL | re.IGNORECASE
    )
    fenced = tagged or re.search(r"```[^\n`]*\n(.*?)(?:```|\Z)", body, re.DOTALL)
    if fenced:
        return fenced.group(1).strip()
    if "```" in body:
        # Single-line fence such as ```code``` (no newline after the opener).
        return body.split("```")[1].strip()
    return body


def _agentic_normalize_rows(value: Any) -> list[dict[str, Any]]:
    """Coerce whatever the generated code produced into a list of dict rows.

    Non-dict items are wrapped as ``{"value": item}`` so downstream code that
    calls ``.keys()`` on rows cannot fail.
    """
    if isinstance(value, (list, tuple)):
        items = list(value)
    elif value:
        items = [value]
    else:
        items = []
    return [item if isinstance(item, dict) else {"value": item} for item in items]


def _agentic_rows_to_csv(rows: list[dict[str, Any]]) -> tuple[list[Any], str]:
    """Serialize dict rows to CSV text; return ``(columns, csv_text)``.

    Columns are the union of all row keys in first-seen order, so a row with
    an extra key no longer makes ``csv.DictWriter`` raise ``ValueError``.
    Missing values are written as empty strings.
    """
    columns = list(dict.fromkeys(key for row in rows for key in row))
    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=columns, restval="")
    writer.writeheader()
    writer.writerows(rows)
    return columns, buffer.getvalue()


async def _scrape_with_agentic_llm(
    session: dict[str, Any],
    session_id: str,
    env,
    request: ScrapeRequest,
    navigation_plan: dict[str, Any],
    url: str,
    step_num: int,
    total_reward: float,
    model_router: SmartModelRouter,
) -> AsyncGenerator[dict[str, Any], None]:
    """Scrape ``url`` with the LLM deciding navigation and extraction.

    Pipeline (each stage is reported through ``_record_step``):
      1. Ask the LLM which URL to visit (template is passed as a hint only).
      2. Navigate there via ``env.step``.
      3. Parse the returned HTML with BeautifulSoup.
      4. Ask the LLM for BeautifulSoup extraction code.
      5. Execute that code and collect ``extracted_data``.
      6. Store the result in ``session["extracted_data"]`` (CSV or raw rows).

    Args:
        session: Mutable session dict; receives ``extracted_data`` and, on
            navigation failure, an entry in ``errors``.
        session_id: Identifier of the scrape session (not used in the body).
        env: Environment whose ``step`` performs the navigation action.
        request: Scrape request (instructions, output format, model).
        navigation_plan: Plan dict; only ``matched_template`` is read.
        url: Base URL supplied by the user.
        step_num: Step counter to continue from (local copy; the caller's
            value is not updated).
        total_reward: Reward accumulated so far (local copy).
        model_router: Router used for both LLM completions.

    Yields:
        Whatever ``_record_step`` returns for each recorded step.
    """
    # Template hint for the prompts (reference only, never executed).
    template_hint = ""
    template = navigation_plan.get("matched_template")
    if template:
        strategies = ", ".join(str(s) for s in (template.get("strategies") or []))
        output_fields = ", ".join(str(f) for f in (template.get("output_fields") or []))
        template_hint = f"""
SITE TEMPLATE HINT (reference only, not mandatory):
- Domain: {template.get('domain', 'N/A')}
- Strategies: {strategies}
- Suggested output fields: {output_fields}
- Typical patterns: {template.get('patterns', 'N/A')}
"""

    # ------------------------------------------------------------------
    # Step 1: ask the LLM where to navigate
    # ------------------------------------------------------------------
    step_num += 1
    navigation_prompt = f"""You are a web scraping agent. Analyze the user's request and decide where to navigate.

USER REQUEST:
- Target: {url}
- Instructions: {request.instructions or 'Extract all relevant data'}
- Desired output format: {request.output_format.value}
- Output instructions: {request.output_instructions or 'All available data'}

{template_hint}

TASK: Decide the best URL to navigate to accomplish this task. Consider:
- If the user wants trending/popular content, should you go to a trending page?
- If the user wants specific data, do you need to navigate to a specific section?
- Return ONLY the URL to navigate to, nothing else.

URL:"""

    try:
        nav_response = await model_router.complete(
            messages=[{"role": "user", "content": navigation_prompt}],
            task_type=TaskType.REASONING,
            model=request.model,
        )
        # Sanitize the reply: strip wrappers, resolve relative paths, and
        # reject anything that is not an http(s) URL with a host.
        target_url = _agentic_resolve_target_url(nav_response.content, url)
    except Exception as e:
        logger.error(f"LLM navigation decision failed: {e}")
        target_url = url  # Fall back to the URL the user gave us

    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="complete",
            message=f"llm.plan_navigation() → {target_url}",
            extracted_data={
                "tool_name": "llm.plan_navigation",
                "tool_description": "LLM decides optimal navigation URL based on instructions",
                "parameters": {"instructions": request.instructions, "base_url": url},
                "result": target_url,
            },
            reward=0.15,
            timestamp=_now_iso(),
        ),
    )
    total_reward += 0.15

    # ------------------------------------------------------------------
    # Step 2: navigate to the chosen URL
    # ------------------------------------------------------------------
    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="running",
            message=f"browser.navigate(url='{target_url}')",
            extracted_data={
                "tool_name": "browser.navigate",
                "tool_description": "Navigate browser to target URL",
                "parameters": {"url": target_url, "wait_for": "page_load"},
            },
            timestamp=_now_iso(),
        ),
    )

    navigate_action = Action(
        action_type=ActionType.NAVIGATE,
        parameters={"url": target_url},
        reasoning=f"Navigate to {target_url} based on LLM's decision",
    )

    nav_obs, nav_reward, _, _, _, nav_info = await env.step(navigate_action)
    total_reward += nav_reward

    page_html = nav_obs.page_html
    page_loaded = bool(page_html)

    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="complete",
            # Report the real outcome instead of an unconditional "Success".
            message=(
                "browser.navigate() → Success"
                if page_loaded
                else "browser.navigate() → Failed: no HTML received"
            ),
            extracted_data={
                "tool_name": "browser.navigate",
                "tool_description": "Navigate browser to target URL",
                "parameters": {"url": target_url},
                # NOTE: despite the key name this is a boolean "got HTML"
                # flag, not an HTTP status; key kept for existing consumers.
                "result": {"status_code": page_loaded},
            },
            reward=nav_reward,
            timestamp=_now_iso(),
        ),
    )

    if not page_loaded:
        logger.error("Navigation failed - no HTML received")
        # Surface the failure to the session instead of ending silently.
        session.setdefault("errors", []).append(
            f"Navigation failed - no HTML received from {target_url}"
        )
        return

    # ------------------------------------------------------------------
    # Step 3: parse the HTML
    # ------------------------------------------------------------------
    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="running",
            message="html.parse(html=page_content)",
            extracted_data={
                "tool_name": "html.parse",
                "tool_description": "Parse HTML into DOM structure",
                "parameters": {"content_length": len(page_html)},
            },
            timestamp=_now_iso(),
        ),
    )

    soup = BeautifulSoup(page_html, "html.parser")
    total_reward += 0.1

    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="complete",
            message="html.parse() → DOM ready",
            extracted_data={
                "tool_name": "html.parse",
                "tool_description": "Parse HTML into DOM structure",
                "result": {"elements_count": len(soup.find_all())},
            },
            reward=0.1,
            timestamp=_now_iso(),
        ),
    )

    # ------------------------------------------------------------------
    # Step 4: ask the LLM for extraction code
    # ------------------------------------------------------------------
    step_num += 1

    # Only a prefix of the page is sent to keep the prompt small.
    html_sample = page_html[:5000]

    extraction_prompt = f"""You are a web scraping expert. Generate Python code to extract data from HTML.

USER REQUEST:
- Instructions: {request.instructions or 'Extract all relevant data'}
- Output format: {request.output_format.value}
- Output instructions: {request.output_instructions or 'All available data'}

HTML SAMPLE (first 5000 chars):
```html
{html_sample}
```

{template_hint}

TASK: Generate Python code using BeautifulSoup to extract the requested data. The code should:
1. Parse the HTML (soup is already provided as `soup` variable)
2. Extract data matching the user's output_instructions
3. Return a list of dictionaries with the exact columns specified in output_instructions
4. Handle missing data gracefully

REQUIREMENTS:
- Return ONLY executable Python code, no explanations
- Use `soup` variable (already a BeautifulSoup object)
- Return `extracted_data` as a list of dictionaries
- Column names MUST match what the user requested in output_instructions
- Example: if user wants "csv of username, repo, stars", return dicts with keys: username, repo, stars

CODE:"""

    codegen_error = None
    try:
        code_response = await model_router.complete(
            messages=[{"role": "user", "content": extraction_prompt}],
            task_type=TaskType.CODE,
            model=request.model,
        )
        extraction_code = _agentic_strip_code_fences(code_response.content)
        if not extraction_code:
            # An empty reply is as useless as a failed call; use the fallback.
            raise ValueError("LLM returned no extraction code")
    except Exception as e:
        logger.error(f"LLM code generation failed: {e}")
        codegen_error = e
        extraction_code = DEFAULT_ANALYSIS_CODE  # Fallback to default extraction

    if codegen_error is None:
        yield _record_step(
            session,
            ScrapeStep(
                step_number=step_num,
                action="tool_call",
                url=target_url,
                status="complete",
                message=f"llm.generate_extraction_code() → {len(extraction_code)} chars",
                extracted_data={
                    "tool_name": "llm.generate_extraction_code",
                    "tool_description": "LLM generates BeautifulSoup extraction code based on HTML and instructions",
                    "parameters": {
                        "html_sample_length": len(html_sample),
                        "instructions": request.instructions,
                        "output_format": request.output_format.value,
                    },
                    "result": {"code_length": len(extraction_code)},
                },
                reward=0.2,
                timestamp=_now_iso(),
            ),
        )
        total_reward += 0.2
    else:
        # Record the failure so the step stream has no unexplained gap.
        yield _record_step(
            session,
            ScrapeStep(
                step_number=step_num,
                action="tool_call",
                url=target_url,
                status="complete",
                message=(
                    "llm.generate_extraction_code() → Failed: "
                    f"{str(codegen_error)[:100]} (using default extraction code)"
                ),
                extracted_data={
                    "tool_name": "llm.generate_extraction_code",
                    "tool_description": "LLM code generation (failed, default code used)",
                    "result": {"error": str(codegen_error)},
                },
                timestamp=_now_iso(),
            ),
        )

    # ------------------------------------------------------------------
    # Step 5: execute the extraction code
    # ------------------------------------------------------------------
    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="running",
            message="sandbox.execute(code=llm_generated_code)",
            extracted_data={
                "tool_name": "sandbox.execute",
                "tool_description": "Execute LLM-generated extraction code in sandboxed Python environment",
                "parameters": {"code_length": len(extraction_code), "timeout": 30},
            },
            timestamp=_now_iso(),
        ),
    )

    # Names made available to the generated code.
    sandbox_globals = {
        "soup": soup,
        "html": page_html,
        "url": target_url,
        "BeautifulSoup": BeautifulSoup,
        "extracted_data": [],  # LLM code should populate this
    }

    exec_error = None
    try:
        # SECURITY(review): this is NOT a sandbox. ``exec`` runs model-written
        # code in-process with full builtins (imports, file and network
        # access), and the code is derived from untrusted page HTML and user
        # instructions. It also blocks the event loop and has no timeout,
        # although the step above advertises ``timeout: 30``. Move this to an
        # isolated, resource-limited subprocess before exposing it publicly.
        exec(extraction_code, sandbox_globals)
        extracted_data = _agentic_normalize_rows(sandbox_globals.get("extracted_data"))
    except (Exception, SystemExit) as e:
        # SystemExit is included so generated code calling exit() cannot
        # terminate the scrape.
        logger.error(f"Extraction code execution failed: {e}")
        exec_error = e

    if exec_error is None:
        exec_reward = 0.5 if extracted_data else 0.1
        total_reward += exec_reward

        yield _record_step(
            session,
            ScrapeStep(
                step_number=step_num,
                action="tool_call",
                url=target_url,
                status="complete",
                message=f"sandbox.execute() → Extracted {len(extracted_data)} items",
                extracted_data={
                    "tool_name": "sandbox.execute",
                    "tool_description": "Execute extraction code in sandbox",
                    "result": {
                        "items_extracted": len(extracted_data),
                        "sample": extracted_data[:2],
                    },
                },
                reward=exec_reward,
                timestamp=_now_iso(),
            ),
        )
    else:
        # Fallback: basic page metadata plus the error text.
        title_tag = soup.find("title")
        extracted_data = [{
            "url": target_url,
            "title": title_tag.get_text() if title_tag else "",
            "error": f"Extraction failed: {str(exec_error)}",
        }]
        total_reward += 0.05

        yield _record_step(
            session,
            ScrapeStep(
                step_number=step_num,
                action="tool_call",
                url=target_url,
                status="complete",
                message=f"sandbox.execute() → Failed: {str(exec_error)[:100]}",
                extracted_data={
                    "tool_name": "sandbox.execute",
                    "tool_description": "Execute extraction code (failed)",
                    "result": {"error": str(exec_error)},
                },
                reward=0.05,
                timestamp=_now_iso(),
            ),
        )

    # ------------------------------------------------------------------
    # Step 6: format the output
    # ------------------------------------------------------------------
    step_num += 1

    if request.output_format == OutputFormat.CSV:
        tool_name = "csv.generate"
        tool_desc = "Generate CSV output from extracted data"
    elif request.output_format == OutputFormat.JSON:
        tool_name = "json.dumps"
        tool_desc = "Format extracted data as JSON"
    else:
        tool_name = "data.format"
        tool_desc = "Format extracted data"

    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="running",
            message=f"{tool_name}(data=extracted_items)",
            extracted_data={
                "tool_name": tool_name,
                "tool_description": tool_desc,
                "parameters": {"item_count": len(extracted_data)},
            },
            timestamp=_now_iso(),
        ),
    )

    # Store extracted data in the session. CSV is only produced when there is
    # at least one row; otherwise rows are stored keyed by the visited URL.
    if request.output_format == OutputFormat.CSV and extracted_data:
        columns, csv_text = _agentic_rows_to_csv(extracted_data)
        session["extracted_data"] = {
            "csv_output": csv_text,
            "rows": extracted_data,
            "columns": columns,
            "row_count": len(extracted_data),
        }
    else:
        session["extracted_data"] = {
            target_url: extracted_data
        }

    total_reward += 0.1

    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="tool_call",
            url=target_url,
            status="complete",
            message=f"{tool_name}() → Output ready",
            extracted_data={
                "tool_name": tool_name,
                "tool_description": tool_desc,
                "result": {"format": request.output_format.value, "size": len(extracted_data)},
            },
            reward=0.1,
            timestamp=_now_iso(),
        ),
    )

    # Final completion event carries the total reward accumulated here.
    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="complete",
            url=target_url,
            status="complete",
            message=f"Agentic scraping complete: {len(extracted_data)} items extracted",
            extracted_data={"item_count": len(extracted_data)},
            reward=total_reward,
            timestamp=_now_iso(),
        ),
    )
1391
+
1392
+
1393
  async def scrape_url_intelligently(
1394
  session: dict[str, Any],
1395
  session_id: str,
 
1400
  enabled_plugins: list[str],
1401
  navigation_plan: dict[str, Any],
1402
  ) -> AsyncGenerator[dict[str, Any], None]:
1403
+ """Intelligent scraping using agentic LLM-driven approach.
1404
+
1405
+ This function uses LLM to make ALL decisions:
1406
+ - Navigation: Where to go based on instructions
1407
+ - Extraction: What data to extract and how
1408
+ - Formatting: How to present the results
1409
+
1410
+ Templates serve as reference hints only, NOT rigid scripts.
1411
+ """
1412
 
1413
  episode_id = f"{session_id}-{uuid.uuid4().hex[:8]}"
1414
 
 
1416
  env = create_environment(episode_id, settings)
1417
  await env.reset(task_id=f"scrape_{session_id}")
1418
 
1419
+ # Get model router
1420
+ model_router = get_model_router()
1421
+ if not model_router:
1422
+ logger.error("Model router not available")
1423
+ session["errors"].append("Model router not initialized")
1424
+ return
1425
+
1426
  step_num = 0
1427
  total_reward = 0.0
1428
 
1429
+ # ALWAYS use agentic approach - no hardcoded strategies
1430
+ async for event in _scrape_with_agentic_llm(
1431
+ session,
1432
+ session_id,
1433
+ env,
1434
+ request,
1435
+ navigation_plan,
1436
+ url,
1437
+ step_num,
1438
+ total_reward,
1439
+ model_router,
1440
+ ):
1441
+ yield event
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1442
 
1443
  except Exception as exc:
1444
  logger.error(f"Intelligent scraping failed for {url}: {exc}")