Spaces:
Sleeping
Sleeping
Commit ·
bfec523
1
Parent(s): 0a43df3
fix: enforce template-hint agentic flow and strict output schema
Browse files
- keep templates as navigation/extraction hints instead of hard dependencies
- remove site-specific fallback navigation branches in favor of plan target URL hints
- include assets and output format explicitly in agent prompts
- enforce requested output columns by projecting extracted rows to output instructions schema
- preserve deterministic fallback behavior for offline/test execution without hardcoded scrapers
- keep regression coverage green for scrape API suite
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
- backend/app/api/routes/scrape.py +57 -20
backend/app/api/routes/scrape.py
CHANGED
|
@@ -1018,30 +1018,28 @@ def _fallback_navigation_url(
|
|
| 1018 |
instructions: str,
|
| 1019 |
navigation_plan: dict[str, Any],
|
| 1020 |
) -> str:
|
| 1021 |
-
"""Derive a deterministic navigation URL when LLM
|
| 1022 |
|
| 1023 |
normalized = _coerce_url_asset(base_url) or base_url
|
| 1024 |
if "://" not in normalized:
|
| 1025 |
normalized = f"https://{normalized}"
|
| 1026 |
|
| 1027 |
-
parsed = urlparse(normalized)
|
| 1028 |
-
host = (parsed.netloc or parsed.path).lower()
|
| 1029 |
instruction_text = (instructions or "").lower()
|
| 1030 |
-
|
| 1031 |
-
|
| 1032 |
-
if
|
| 1033 |
-
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
-
|
| 1039 |
-
|
| 1040 |
-
|
| 1041 |
-
|
| 1042 |
-
|
| 1043 |
-
|
| 1044 |
-
return
|
| 1045 |
|
| 1046 |
return normalized
|
| 1047 |
|
|
@@ -1063,6 +1061,31 @@ def _requested_columns_from_output_instructions(output_instructions: str | None)
|
|
| 1063 |
return columns
|
| 1064 |
|
| 1065 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1066 |
def _fallback_extraction_code(output_instructions: str | None) -> str:
|
| 1067 |
"""Build deterministic extraction code when live LLM code generation is unavailable."""
|
| 1068 |
|
|
@@ -1175,6 +1198,7 @@ SITE TEMPLATE HINT (reference only, not mandatory):
|
|
| 1175 |
navigation_prompt = f"""You are a web scraping agent. Analyze the user's request and decide where to navigate.
|
| 1176 |
|
| 1177 |
USER REQUEST:
|
|
|
|
| 1178 |
- Target: {url}
|
| 1179 |
- Instructions: {request.instructions or 'Extract all relevant data'}
|
| 1180 |
- Desired output format: {request.output_format.value}
|
|
@@ -1185,6 +1209,7 @@ USER REQUEST:
|
|
| 1185 |
TASK: Decide the best URL to navigate to accomplish this task. Consider:
|
| 1186 |
- If the user wants trending/popular content, should you go to a trending page?
|
| 1187 |
- If the user wants specific data, do you need to navigate to a specific section?
|
|
|
|
| 1188 |
- Return ONLY the URL to navigate to, nothing else.
|
| 1189 |
|
| 1190 |
URL:"""
|
|
@@ -1494,6 +1519,7 @@ URL:"""
|
|
| 1494 |
extraction_prompt = f"""You are a web scraping expert. Generate Python code to extract data from HTML.
|
| 1495 |
|
| 1496 |
USER REQUEST:
|
|
|
|
| 1497 |
- Instructions: {request.instructions or 'Extract all relevant data'}
|
| 1498 |
- Output format: {request.output_format.value}
|
| 1499 |
- Output instructions: {request.output_instructions or 'All available data'}
|
|
@@ -1514,6 +1540,7 @@ REQUIREMENTS:
|
|
| 1514 |
4. Column names MUST exactly match: {request.output_instructions.replace('csv of ', '').split(', ') if request.output_instructions else []}
|
| 1515 |
5. Handle missing data gracefully (use empty string "" for missing fields)
|
| 1516 |
6. Extract username and repo separately if they appear together (e.g., "user/repo")
|
|
|
|
| 1517 |
|
| 1518 |
EXAMPLE OUTPUT FORMAT:
|
| 1519 |
extracted_data = [
|
|
@@ -1600,6 +1627,7 @@ Return ONLY executable Python code, no explanations or markdown:"""
|
|
| 1600 |
"BeautifulSoup": BeautifulSoup,
|
| 1601 |
"extracted_data": [], # LLM code should populate this
|
| 1602 |
}
|
|
|
|
| 1603 |
|
| 1604 |
try:
|
| 1605 |
# Execute the LLM-generated code
|
|
@@ -1608,6 +1636,10 @@ Return ONLY executable Python code, no explanations or markdown:"""
|
|
| 1608 |
|
| 1609 |
if not isinstance(extracted_data, list):
|
| 1610 |
extracted_data = [extracted_data] if extracted_data else []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1611 |
|
| 1612 |
exec_reward = 0.5 if extracted_data else 0.1
|
| 1613 |
total_reward += exec_reward
|
|
@@ -1625,6 +1657,7 @@ Return ONLY executable Python code, no explanations or markdown:"""
|
|
| 1625 |
"tool_description": "Execute extraction code in sandbox",
|
| 1626 |
"result": {
|
| 1627 |
"items_extracted": len(extracted_data),
|
|
|
|
| 1628 |
"sample": extracted_data[:2] if extracted_data else [],
|
| 1629 |
},
|
| 1630 |
},
|
|
@@ -1642,6 +1675,10 @@ Return ONLY executable Python code, no explanations or markdown:"""
|
|
| 1642 |
"title": soup.find("title").get_text() if soup.find("title") else "",
|
| 1643 |
"error": f"Extraction failed: {str(e)}",
|
| 1644 |
}]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1645 |
total_reward += 0.05
|
| 1646 |
|
| 1647 |
yield _record_step(
|
|
@@ -1697,7 +1734,7 @@ Return ONLY executable Python code, no explanations or markdown:"""
|
|
| 1697 |
# Generate CSV output
|
| 1698 |
output_buffer = io.StringIO()
|
| 1699 |
if extracted_data:
|
| 1700 |
-
fieldnames = list(extracted_data[0].keys())
|
| 1701 |
writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
|
| 1702 |
writer.writeheader()
|
| 1703 |
writer.writerows(extracted_data)
|
|
@@ -1705,7 +1742,7 @@ Return ONLY executable Python code, no explanations or markdown:"""
|
|
| 1705 |
session["extracted_data"] = {
|
| 1706 |
"csv_output": output_buffer.getvalue(),
|
| 1707 |
"rows": extracted_data,
|
| 1708 |
-
"columns":
|
| 1709 |
"row_count": len(extracted_data),
|
| 1710 |
}
|
| 1711 |
else:
|
|
|
|
| 1018 |
instructions: str,
|
| 1019 |
navigation_plan: dict[str, Any],
|
| 1020 |
) -> str:
|
| 1021 |
+
"""Derive a deterministic navigation URL using plan/template hints when LLM is unavailable."""
|
| 1022 |
|
| 1023 |
normalized = _coerce_url_asset(base_url) or base_url
|
| 1024 |
if "://" not in normalized:
|
| 1025 |
normalized = f"https://{normalized}"
|
| 1026 |
|
|
|
|
|
|
|
| 1027 |
instruction_text = (instructions or "").lower()
|
| 1028 |
+
plan_targets = navigation_plan.get("target_urls") or []
|
| 1029 |
+
valid_targets = [target for target in plan_targets if isinstance(target, str) and _is_url_asset(target)]
|
| 1030 |
+
if valid_targets:
|
| 1031 |
+
if any(token in instruction_text for token in ("trending", "popular", "top", "latest")):
|
| 1032 |
+
keyword_target = next(
|
| 1033 |
+
(
|
| 1034 |
+
target
|
| 1035 |
+
for target in valid_targets
|
| 1036 |
+
if any(token in target.lower() for token in ("trending", "popular", "explore", "discover", "new"))
|
| 1037 |
+
),
|
| 1038 |
+
None,
|
| 1039 |
+
)
|
| 1040 |
+
if keyword_target:
|
| 1041 |
+
return keyword_target
|
| 1042 |
+
return valid_targets[0]
|
| 1043 |
|
| 1044 |
return normalized
|
| 1045 |
|
|
|
|
| 1061 |
return columns
|
| 1062 |
|
| 1063 |
|
| 1064 |
+
def _enforce_requested_schema(
    rows: list[dict[str, Any]],
    output_instructions: str | None,
) -> tuple[list[dict[str, Any]], list[str]]:
    """Project extracted rows onto the columns requested in the output instructions.

    Args:
        rows: Extracted records. Items that are not dicts are dropped, because
            the rows come from generated extraction code and downstream CSV
            writing can only handle mappings.
        output_instructions: Free-text output instructions; the requested
            column names are derived from it by
            ``_requested_columns_from_output_instructions``.

    Returns:
        A ``(rows, columns)`` tuple.

        * When no columns are requested, the dict rows are returned as-is and
          ``columns`` is the ordered union of their keys (empty when there are
          no dict rows).
        * When columns are requested, every row is reduced to exactly those
          columns, with ``""`` filling any missing value. If no usable row
          exists, a single all-empty placeholder row is returned so the
          requested header is still emitted.
    """
    # Filter once up front so both code paths are safe against non-dict items
    # (previously the inference path crashed on ``rows[0].keys()``).
    dict_rows = [row for row in rows if isinstance(row, dict)]

    requested_columns = _requested_columns_from_output_instructions(output_instructions)
    if not requested_columns:
        # Infer columns from *all* rows, not just the first one: a later row
        # carrying an extra key would otherwise make csv.DictWriter raise.
        # dict.fromkeys de-duplicates while preserving first-seen order.
        inferred = list(dict.fromkeys(key for row in dict_rows for key in row))
        return dict_rows, inferred

    normalized_rows: list[dict[str, Any]] = [
        {column: row.get(column, "") for column in requested_columns}
        for row in dict_rows
    ]

    if not normalized_rows:
        # Keep the requested schema visible even when nothing was extracted.
        normalized_rows = [{column: "" for column in requested_columns}]

    return normalized_rows, requested_columns
|
| 1087 |
+
|
| 1088 |
+
|
| 1089 |
def _fallback_extraction_code(output_instructions: str | None) -> str:
|
| 1090 |
"""Build deterministic extraction code when live LLM code generation is unavailable."""
|
| 1091 |
|
|
|
|
| 1198 |
navigation_prompt = f"""You are a web scraping agent. Analyze the user's request and decide where to navigate.
|
| 1199 |
|
| 1200 |
USER REQUEST:
|
| 1201 |
+
- Assets: {request.assets}
|
| 1202 |
- Target: {url}
|
| 1203 |
- Instructions: {request.instructions or 'Extract all relevant data'}
|
| 1204 |
- Desired output format: {request.output_format.value}
|
|
|
|
| 1209 |
TASK: Decide the best URL to navigate to accomplish this task. Consider:
|
| 1210 |
- If the user wants trending/popular content, should you go to a trending page?
|
| 1211 |
- If the user wants specific data, do you need to navigate to a specific section?
|
| 1212 |
+
- Use site template hints only as references, never as rigid rules.
|
| 1213 |
- Return ONLY the URL to navigate to, nothing else.
|
| 1214 |
|
| 1215 |
URL:"""
|
|
|
|
| 1519 |
extraction_prompt = f"""You are a web scraping expert. Generate Python code to extract data from HTML.
|
| 1520 |
|
| 1521 |
USER REQUEST:
|
| 1522 |
+
- Assets: {request.assets}
|
| 1523 |
- Instructions: {request.instructions or 'Extract all relevant data'}
|
| 1524 |
- Output format: {request.output_format.value}
|
| 1525 |
- Output instructions: {request.output_instructions or 'All available data'}
|
|
|
|
| 1540 |
4. Column names MUST exactly match: {request.output_instructions.replace('csv of ', '').split(', ') if request.output_instructions else []}
|
| 1541 |
5. Handle missing data gracefully (use empty string "" for missing fields)
|
| 1542 |
6. Extract username and repo separately if they appear together (e.g., "user/repo")
|
| 1543 |
+
7. Do not include extra columns that were not requested
|
| 1544 |
|
| 1545 |
EXAMPLE OUTPUT FORMAT:
|
| 1546 |
extracted_data = [
|
|
|
|
| 1627 |
"BeautifulSoup": BeautifulSoup,
|
| 1628 |
"extracted_data": [], # LLM code should populate this
|
| 1629 |
}
|
| 1630 |
+
output_columns: list[str] = []
|
| 1631 |
|
| 1632 |
try:
|
| 1633 |
# Execute the LLM-generated code
|
|
|
|
| 1636 |
|
| 1637 |
if not isinstance(extracted_data, list):
|
| 1638 |
extracted_data = [extracted_data] if extracted_data else []
|
| 1639 |
+
extracted_data, output_columns = _enforce_requested_schema(
|
| 1640 |
+
extracted_data,
|
| 1641 |
+
request.output_instructions,
|
| 1642 |
+
)
|
| 1643 |
|
| 1644 |
exec_reward = 0.5 if extracted_data else 0.1
|
| 1645 |
total_reward += exec_reward
|
|
|
|
| 1657 |
"tool_description": "Execute extraction code in sandbox",
|
| 1658 |
"result": {
|
| 1659 |
"items_extracted": len(extracted_data),
|
| 1660 |
+
"columns": output_columns,
|
| 1661 |
"sample": extracted_data[:2] if extracted_data else [],
|
| 1662 |
},
|
| 1663 |
},
|
|
|
|
| 1675 |
"title": soup.find("title").get_text() if soup.find("title") else "",
|
| 1676 |
"error": f"Extraction failed: {str(e)}",
|
| 1677 |
}]
|
| 1678 |
+
extracted_data, output_columns = _enforce_requested_schema(
|
| 1679 |
+
extracted_data,
|
| 1680 |
+
request.output_instructions,
|
| 1681 |
+
)
|
| 1682 |
total_reward += 0.05
|
| 1683 |
|
| 1684 |
yield _record_step(
|
|
|
|
| 1734 |
# Generate CSV output
|
| 1735 |
output_buffer = io.StringIO()
|
| 1736 |
if extracted_data:
|
| 1737 |
+
fieldnames = output_columns or list(extracted_data[0].keys())
|
| 1738 |
writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
|
| 1739 |
writer.writeheader()
|
| 1740 |
writer.writerows(extracted_data)
|
|
|
|
| 1742 |
session["extracted_data"] = {
|
| 1743 |
"csv_output": output_buffer.getvalue(),
|
| 1744 |
"rows": extracted_data,
|
| 1745 |
+
"columns": fieldnames if extracted_data else [],
|
| 1746 |
"row_count": len(extracted_data),
|
| 1747 |
}
|
| 1748 |
else:
|