NeerajCodz Copilot committed on
Commit
bfec523
·
1 Parent(s): 0a43df3

fix: enforce template-hint agentic flow and strict output schema

Browse files

- keep templates as navigation/extraction hints instead of hard dependencies
- remove site-specific fallback navigation branches in favor of plan target URL hints
- include assets and output format explicitly in agent prompts
- enforce requested output columns by projecting extracted rows to output instructions schema
- preserve deterministic fallback behavior for offline/test execution without hardcoded scrapers
- keep regression coverage green for scrape API suite

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Files changed (1) hide show
  1. backend/app/api/routes/scrape.py +57 -20
backend/app/api/routes/scrape.py CHANGED
@@ -1018,30 +1018,28 @@ def _fallback_navigation_url(
1018
  instructions: str,
1019
  navigation_plan: dict[str, Any],
1020
  ) -> str:
1021
- """Derive a deterministic navigation URL when LLM planning is unavailable."""
1022
 
1023
  normalized = _coerce_url_asset(base_url) or base_url
1024
  if "://" not in normalized:
1025
  normalized = f"https://{normalized}"
1026
 
1027
- parsed = urlparse(normalized)
1028
- host = (parsed.netloc or parsed.path).lower()
1029
  instruction_text = (instructions or "").lower()
1030
- strategy = str(navigation_plan.get("strategy") or "").lower()
1031
-
1032
- if "github.com" in host and (
1033
- strategy == "github_trending"
1034
- or "trending" in instruction_text
1035
- or ("top" in instruction_text and "repo" in instruction_text)
1036
- ):
1037
- return f"{parsed.scheme}://{parsed.netloc}/trending"
1038
-
1039
- if "reddit.com" in host and (
1040
- strategy == "reddit_trending"
1041
- or "trending" in instruction_text
1042
- or "communit" in instruction_text
1043
- ):
1044
- return f"{parsed.scheme}://{parsed.netloc}/r/popular/"
1045
 
1046
  return normalized
1047
 
@@ -1063,6 +1061,31 @@ def _requested_columns_from_output_instructions(output_instructions: str | None)
1063
  return columns
1064
 
1065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1066
  def _fallback_extraction_code(output_instructions: str | None) -> str:
1067
  """Build deterministic extraction code when live LLM code generation is unavailable."""
1068
 
@@ -1175,6 +1198,7 @@ SITE TEMPLATE HINT (reference only, not mandatory):
1175
  navigation_prompt = f"""You are a web scraping agent. Analyze the user's request and decide where to navigate.
1176
 
1177
  USER REQUEST:
 
1178
  - Target: {url}
1179
  - Instructions: {request.instructions or 'Extract all relevant data'}
1180
  - Desired output format: {request.output_format.value}
@@ -1185,6 +1209,7 @@ USER REQUEST:
1185
  TASK: Decide the best URL to navigate to accomplish this task. Consider:
1186
  - If the user wants trending/popular content, should you go to a trending page?
1187
  - If the user wants specific data, do you need to navigate to a specific section?
 
1188
  - Return ONLY the URL to navigate to, nothing else.
1189
 
1190
  URL:"""
@@ -1494,6 +1519,7 @@ URL:"""
1494
  extraction_prompt = f"""You are a web scraping expert. Generate Python code to extract data from HTML.
1495
 
1496
  USER REQUEST:
 
1497
  - Instructions: {request.instructions or 'Extract all relevant data'}
1498
  - Output format: {request.output_format.value}
1499
  - Output instructions: {request.output_instructions or 'All available data'}
@@ -1514,6 +1540,7 @@ REQUIREMENTS:
1514
  4. Column names MUST exactly match: {request.output_instructions.replace('csv of ', '').split(', ') if request.output_instructions else []}
1515
  5. Handle missing data gracefully (use empty string "" for missing fields)
1516
  6. Extract username and repo separately if they appear together (e.g., "user/repo")
 
1517
 
1518
  EXAMPLE OUTPUT FORMAT:
1519
  extracted_data = [
@@ -1600,6 +1627,7 @@ Return ONLY executable Python code, no explanations or markdown:"""
1600
  "BeautifulSoup": BeautifulSoup,
1601
  "extracted_data": [], # LLM code should populate this
1602
  }
 
1603
 
1604
  try:
1605
  # Execute the LLM-generated code
@@ -1608,6 +1636,10 @@ Return ONLY executable Python code, no explanations or markdown:"""
1608
 
1609
  if not isinstance(extracted_data, list):
1610
  extracted_data = [extracted_data] if extracted_data else []
 
 
 
 
1611
 
1612
  exec_reward = 0.5 if extracted_data else 0.1
1613
  total_reward += exec_reward
@@ -1625,6 +1657,7 @@ Return ONLY executable Python code, no explanations or markdown:"""
1625
  "tool_description": "Execute extraction code in sandbox",
1626
  "result": {
1627
  "items_extracted": len(extracted_data),
 
1628
  "sample": extracted_data[:2] if extracted_data else [],
1629
  },
1630
  },
@@ -1642,6 +1675,10 @@ Return ONLY executable Python code, no explanations or markdown:"""
1642
  "title": soup.find("title").get_text() if soup.find("title") else "",
1643
  "error": f"Extraction failed: {str(e)}",
1644
  }]
 
 
 
 
1645
  total_reward += 0.05
1646
 
1647
  yield _record_step(
@@ -1697,7 +1734,7 @@ Return ONLY executable Python code, no explanations or markdown:"""
1697
  # Generate CSV output
1698
  output_buffer = io.StringIO()
1699
  if extracted_data:
1700
- fieldnames = list(extracted_data[0].keys())
1701
  writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
1702
  writer.writeheader()
1703
  writer.writerows(extracted_data)
@@ -1705,7 +1742,7 @@ Return ONLY executable Python code, no explanations or markdown:"""
1705
  session["extracted_data"] = {
1706
  "csv_output": output_buffer.getvalue(),
1707
  "rows": extracted_data,
1708
- "columns": list(extracted_data[0].keys()) if extracted_data else [],
1709
  "row_count": len(extracted_data),
1710
  }
1711
  else:
 
1018
  instructions: str,
1019
  navigation_plan: dict[str, Any],
1020
  ) -> str:
1021
+ """Derive a deterministic navigation URL using plan/template hints when LLM is unavailable."""
1022
 
1023
  normalized = _coerce_url_asset(base_url) or base_url
1024
  if "://" not in normalized:
1025
  normalized = f"https://{normalized}"
1026
 
 
 
1027
  instruction_text = (instructions or "").lower()
1028
+ plan_targets = navigation_plan.get("target_urls") or []
1029
+ valid_targets = [target for target in plan_targets if isinstance(target, str) and _is_url_asset(target)]
1030
+ if valid_targets:
1031
+ if any(token in instruction_text for token in ("trending", "popular", "top", "latest")):
1032
+ keyword_target = next(
1033
+ (
1034
+ target
1035
+ for target in valid_targets
1036
+ if any(token in target.lower() for token in ("trending", "popular", "explore", "discover", "new"))
1037
+ ),
1038
+ None,
1039
+ )
1040
+ if keyword_target:
1041
+ return keyword_target
1042
+ return valid_targets[0]
1043
 
1044
  return normalized
1045
 
 
1061
  return columns
1062
 
1063
 
1064
+ def _enforce_requested_schema(
1065
+ rows: list[dict[str, Any]],
1066
+ output_instructions: str | None,
1067
+ ) -> tuple[list[dict[str, Any]], list[str]]:
1068
+ """Project extracted rows onto requested columns from output instructions."""
1069
+
1070
+ requested_columns = _requested_columns_from_output_instructions(output_instructions)
1071
+ if not requested_columns:
1072
+ if not rows:
1073
+ return rows, []
1074
+ inferred = list(rows[0].keys())
1075
+ return rows, inferred
1076
+
1077
+ normalized_rows: list[dict[str, Any]] = []
1078
+ for row in rows:
1079
+ if not isinstance(row, dict):
1080
+ continue
1081
+ normalized_rows.append({column: row.get(column, "") for column in requested_columns})
1082
+
1083
+ if not normalized_rows:
1084
+ normalized_rows = [{column: "" for column in requested_columns}]
1085
+
1086
+ return normalized_rows, requested_columns
1087
+
1088
+
1089
  def _fallback_extraction_code(output_instructions: str | None) -> str:
1090
  """Build deterministic extraction code when live LLM code generation is unavailable."""
1091
 
 
1198
  navigation_prompt = f"""You are a web scraping agent. Analyze the user's request and decide where to navigate.
1199
 
1200
  USER REQUEST:
1201
+ - Assets: {request.assets}
1202
  - Target: {url}
1203
  - Instructions: {request.instructions or 'Extract all relevant data'}
1204
  - Desired output format: {request.output_format.value}
 
1209
  TASK: Decide the best URL to navigate to accomplish this task. Consider:
1210
  - If the user wants trending/popular content, should you go to a trending page?
1211
  - If the user wants specific data, do you need to navigate to a specific section?
1212
+ - Use site template hints only as references, never as rigid rules.
1213
  - Return ONLY the URL to navigate to, nothing else.
1214
 
1215
  URL:"""
 
1519
  extraction_prompt = f"""You are a web scraping expert. Generate Python code to extract data from HTML.
1520
 
1521
  USER REQUEST:
1522
+ - Assets: {request.assets}
1523
  - Instructions: {request.instructions or 'Extract all relevant data'}
1524
  - Output format: {request.output_format.value}
1525
  - Output instructions: {request.output_instructions or 'All available data'}
 
1540
  4. Column names MUST exactly match: {request.output_instructions.replace('csv of ', '').split(', ') if request.output_instructions else []}
1541
  5. Handle missing data gracefully (use empty string "" for missing fields)
1542
  6. Extract username and repo separately if they appear together (e.g., "user/repo")
1543
+ 7. Do not include extra columns that were not requested
1544
 
1545
  EXAMPLE OUTPUT FORMAT:
1546
  extracted_data = [
 
1627
  "BeautifulSoup": BeautifulSoup,
1628
  "extracted_data": [], # LLM code should populate this
1629
  }
1630
+ output_columns: list[str] = []
1631
 
1632
  try:
1633
  # Execute the LLM-generated code
 
1636
 
1637
  if not isinstance(extracted_data, list):
1638
  extracted_data = [extracted_data] if extracted_data else []
1639
+ extracted_data, output_columns = _enforce_requested_schema(
1640
+ extracted_data,
1641
+ request.output_instructions,
1642
+ )
1643
 
1644
  exec_reward = 0.5 if extracted_data else 0.1
1645
  total_reward += exec_reward
 
1657
  "tool_description": "Execute extraction code in sandbox",
1658
  "result": {
1659
  "items_extracted": len(extracted_data),
1660
+ "columns": output_columns,
1661
  "sample": extracted_data[:2] if extracted_data else [],
1662
  },
1663
  },
 
1675
  "title": soup.find("title").get_text() if soup.find("title") else "",
1676
  "error": f"Extraction failed: {str(e)}",
1677
  }]
1678
+ extracted_data, output_columns = _enforce_requested_schema(
1679
+ extracted_data,
1680
+ request.output_instructions,
1681
+ )
1682
  total_reward += 0.05
1683
 
1684
  yield _record_step(
 
1734
  # Generate CSV output
1735
  output_buffer = io.StringIO()
1736
  if extracted_data:
1737
+ fieldnames = output_columns or list(extracted_data[0].keys())
1738
  writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
1739
  writer.writeheader()
1740
  writer.writerows(extracted_data)
 
1742
  session["extracted_data"] = {
1743
  "csv_output": output_buffer.getvalue(),
1744
  "rows": extracted_data,
1745
+ "columns": fieldnames if extracted_data else [],
1746
  "row_count": len(extracted_data),
1747
  }
1748
  else: