Spaces:
Sleeping
Sleeping
Commit ·
fc5088a
1
Parent(s): 922337a
feat: prioritize user-site extraction over external search, improve metric patterns for HN-style sites
Browse files- backend/app/api/routes/scrape.py +67 -26
backend/app/api/routes/scrape.py
CHANGED
|
@@ -707,8 +707,8 @@ def _extract_markdown_link_rows(
|
|
| 707 |
# Match complex links with embedded images: [![Image ...](image_url) Text](link_url)
|
| 708 |
# This captures the text after the image and the final link
|
| 709 |
complex_link_pattern = re.compile(r'\[!\[Image[^\]]*\]\([^\)]+\)\s*([^\]]+)\]\((https?://[^\s"\)]+)\)')
|
| 710 |
-
# Match view/viewer counts anywhere (including "47.2K viewers" format)
|
| 711 |
-
views_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*(?:views?|viewers?)', re.IGNORECASE)
|
| 712 |
likes_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*likes?', re.IGNORECASE)
|
| 713 |
comments_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*comments?', re.IGNORECASE)
|
| 714 |
date_pattern = re.compile(r'\b(today|yesterday|\d+\s+(?:minutes?|hours?|days?|weeks?|months?|years?)\s+ago)\b', re.IGNORECASE)
|
|
@@ -873,7 +873,7 @@ def _extract_markdown_link_rows(
|
|
| 873 |
row[col] = clean_title[:160]
|
| 874 |
elif lower_col in {"content", "summary", "description"}:
|
| 875 |
row[col] = clean_title[:320]
|
| 876 |
-
elif lower_col in {"views", "view_count", "viewers"}:
|
| 877 |
row[col] = metrics["views"]
|
| 878 |
elif lower_col in {"likes", "like_count"}:
|
| 879 |
row[col] = metrics["likes"]
|
|
@@ -1012,20 +1012,61 @@ async def _search_recovery_rows(
|
|
| 1012 |
output_instructions: str | None,
|
| 1013 |
row_limit: int,
|
| 1014 |
) -> tuple[list[dict[str, Any]], list[str], str | None, float]:
|
| 1015 |
-
"""Search-guided generic recovery for low-relevance extraction results.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1016 |
|
| 1017 |
best_rows: list[dict[str, Any]] = []
|
| 1018 |
best_columns: list[str] = []
|
| 1019 |
best_source: str | None = None
|
| 1020 |
best_score = 0.0
|
| 1021 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1022 |
queries = _build_recovery_queries(base_url, instructions)
|
| 1023 |
-
for query in queries[:
|
| 1024 |
-
discovered_urls = await _search_urls_with_mcp(query, max_results=
|
| 1025 |
if not discovered_urls:
|
| 1026 |
discovered_urls = _discover_assets_for_query(query)
|
| 1027 |
|
| 1028 |
-
for candidate_url in discovered_urls[:
|
| 1029 |
text_payload = _fetch_text_render_markdown(candidate_url, timeout_seconds=12)
|
| 1030 |
if not text_payload:
|
| 1031 |
continue
|
|
@@ -1468,15 +1509,16 @@ def _infer_navigation_paths(instructions: str | None) -> list[str]:
|
|
| 1468 |
"""Infer common navigation paths based on user intent - works generically across sites."""
|
| 1469 |
|
| 1470 |
if not instructions:
|
| 1471 |
-
return []
|
| 1472 |
|
| 1473 |
instruction_text = instructions.lower()
|
| 1474 |
paths: list[str] = []
|
| 1475 |
|
| 1476 |
# Trending/popular intent - common paths across many sites
|
|
|
|
| 1477 |
if any(token in instruction_text for token in ("trending", "popular", "top", "hot", "best")):
|
| 1478 |
paths.extend([
|
| 1479 |
-
"/
|
| 1480 |
"/trending",
|
| 1481 |
"/popular",
|
| 1482 |
"/explore",
|
|
@@ -1551,10 +1593,11 @@ def _fallback_navigation_url(
|
|
| 1551 |
) -> str:
|
| 1552 |
"""Derive a deterministic navigation URL using plan/template hints when LLM is unavailable.
|
| 1553 |
|
| 1554 |
-
|
| 1555 |
1. Template target URLs (if available)
|
| 1556 |
-
2.
|
| 1557 |
-
3.
|
|
|
|
| 1558 |
"""
|
| 1559 |
|
| 1560 |
normalized = _coerce_url_asset(base_url) or base_url
|
|
@@ -1590,28 +1633,22 @@ def _fallback_navigation_url(
|
|
| 1590 |
if search_target:
|
| 1591 |
return _apply_text_render_proxy(search_target)
|
| 1592 |
|
| 1593 |
-
# 2.
|
| 1594 |
-
#
|
| 1595 |
-
ranked_intent = any(token in instruction_text for token in ("trending", "popular", "top", "best", "music", "video"))
|
| 1596 |
-
if ranked_intent:
|
| 1597 |
-
search_url = _build_search_navigation_url(normalized, instructions)
|
| 1598 |
-
if search_url:
|
| 1599 |
-
return _apply_text_render_proxy(search_url)
|
| 1600 |
-
|
| 1601 |
-
# 3. Try direct navigation paths as fallback
|
| 1602 |
inferred_paths = _infer_navigation_paths(instructions)
|
| 1603 |
if inferred_paths:
|
| 1604 |
best_path = inferred_paths[0]
|
| 1605 |
inferred_url = f"{parsed.scheme}://{parsed.netloc}{best_path}"
|
| 1606 |
return _apply_text_render_proxy(inferred_url)
|
| 1607 |
|
| 1608 |
-
#
|
| 1609 |
-
search_intent = any(token in instruction_text for token in ("search", "find", "looking for"))
|
| 1610 |
if search_intent:
|
| 1611 |
search_url = _build_search_navigation_url(normalized, instructions)
|
| 1612 |
if search_url:
|
| 1613 |
return _apply_text_render_proxy(search_url)
|
| 1614 |
|
|
|
|
| 1615 |
return _apply_text_render_proxy(normalized)
|
| 1616 |
|
| 1617 |
|
|
@@ -2664,7 +2701,10 @@ Return ONLY executable Python code, no explanations or markdown:"""
|
|
| 2664 |
|
| 2665 |
relevance_score = _rows_relevance_score(extracted_data, request.instructions)
|
| 2666 |
recovery_keywords = _instruction_keywords(request.instructions, max_keywords=8)
|
| 2667 |
-
|
|
|
|
|
|
|
|
|
|
| 2668 |
step_num += 1
|
| 2669 |
yield _record_step(
|
| 2670 |
session,
|
|
@@ -2676,7 +2716,7 @@ Return ONLY executable Python code, no explanations or markdown:"""
|
|
| 2676 |
message="agent.recover_relevance(query)",
|
| 2677 |
extracted_data={
|
| 2678 |
"tool_name": "agent.recover_relevance",
|
| 2679 |
-
"tool_description": "Search-guided relevance recovery for
|
| 2680 |
"parameters": {
|
| 2681 |
"keywords": recovery_keywords,
|
| 2682 |
"baseline_relevance": round(relevance_score, 3),
|
|
@@ -2692,7 +2732,8 @@ Return ONLY executable Python code, no explanations or markdown:"""
|
|
| 2692 |
output_instructions=request.output_instructions,
|
| 2693 |
row_limit=requested_limit,
|
| 2694 |
)
|
| 2695 |
-
|
|
|
|
| 2696 |
if improved:
|
| 2697 |
extracted_data = recovered_rows
|
| 2698 |
output_columns = recovered_columns or output_columns
|
|
|
|
| 707 |
# Match complex links with embedded images: [![Image ...](image_url) Text](link_url)
|
| 708 |
# This captures the text after the image and the final link
|
| 709 |
complex_link_pattern = re.compile(r'\[!\[Image[^\]]*\]\([^\)]+\)\s*([^\]]+)\]\((https?://[^\s"\)]+)\)')
|
| 710 |
+
# Match view/viewer/point counts anywhere (including "47.2K viewers", "787 points" format)
|
| 711 |
+
views_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*(?:views?|viewers?|points?)', re.IGNORECASE)
|
| 712 |
likes_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*likes?', re.IGNORECASE)
|
| 713 |
comments_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*comments?', re.IGNORECASE)
|
| 714 |
date_pattern = re.compile(r'\b(today|yesterday|\d+\s+(?:minutes?|hours?|days?|weeks?|months?|years?)\s+ago)\b', re.IGNORECASE)
|
|
|
|
| 873 |
row[col] = clean_title[:160]
|
| 874 |
elif lower_col in {"content", "summary", "description"}:
|
| 875 |
row[col] = clean_title[:320]
|
| 876 |
+
elif lower_col in {"views", "view_count", "viewers", "points", "score", "upvotes"}:
|
| 877 |
row[col] = metrics["views"]
|
| 878 |
elif lower_col in {"likes", "like_count"}:
|
| 879 |
row[col] = metrics["likes"]
|
|
|
|
| 1012 |
output_instructions: str | None,
|
| 1013 |
row_limit: int,
|
| 1014 |
) -> tuple[list[dict[str, Any]], list[str], str | None, float]:
|
| 1015 |
+
"""Search-guided generic recovery for low-relevance extraction results.
|
| 1016 |
+
|
| 1017 |
+
IMPORTANT: Prioritize the user's specified site - try alternative paths on the same domain
|
| 1018 |
+
before resorting to external search engines.
|
| 1019 |
+
"""
|
| 1020 |
|
| 1021 |
best_rows: list[dict[str, Any]] = []
|
| 1022 |
best_columns: list[str] = []
|
| 1023 |
best_source: str | None = None
|
| 1024 |
best_score = 0.0
|
| 1025 |
|
| 1026 |
+
# Normalize the base URL
|
| 1027 |
+
normalized = _coerce_url_asset(base_url) or base_url
|
| 1028 |
+
if "://" not in normalized:
|
| 1029 |
+
normalized = f"https://{normalized}"
|
| 1030 |
+
parsed = urlparse(normalized)
|
| 1031 |
+
|
| 1032 |
+
# FIRST: Try alternative paths on the SAME SITE (stay on user's specified domain)
|
| 1033 |
+
alternative_paths = _infer_navigation_paths(instructions)
|
| 1034 |
+
for alt_path in alternative_paths[:4]:
|
| 1035 |
+
alt_url = f"{parsed.scheme}://{parsed.netloc}{alt_path}"
|
| 1036 |
+
text_payload = _fetch_text_render_markdown(alt_url, timeout_seconds=12)
|
| 1037 |
+
if not text_payload:
|
| 1038 |
+
continue
|
| 1039 |
+
markdown, source_url = text_payload
|
| 1040 |
+
rows, columns = _extract_rows_from_text_render(
|
| 1041 |
+
markdown=markdown,
|
| 1042 |
+
source_url=source_url,
|
| 1043 |
+
output_instructions=output_instructions,
|
| 1044 |
+
instructions=instructions,
|
| 1045 |
+
row_limit=row_limit,
|
| 1046 |
+
)
|
| 1047 |
+
if not _rows_have_signal(rows):
|
| 1048 |
+
continue
|
| 1049 |
+
score = _rows_relevance_score(rows, instructions)
|
| 1050 |
+
if score > best_score or (
|
| 1051 |
+
abs(score - best_score) <= 0.0001 and len(rows) > len(best_rows)
|
| 1052 |
+
):
|
| 1053 |
+
best_rows = rows
|
| 1054 |
+
best_columns = columns
|
| 1055 |
+
best_source = source_url
|
| 1056 |
+
best_score = score
|
| 1057 |
+
|
| 1058 |
+
# If we found good data on the user's site, return it
|
| 1059 |
+
if best_score > 0.25:
|
| 1060 |
+
return best_rows, best_columns, best_source, best_score
|
| 1061 |
+
|
| 1062 |
+
# SECOND: Only as a last resort, try external search (DuckDuckGo)
|
| 1063 |
queries = _build_recovery_queries(base_url, instructions)
|
| 1064 |
+
for query in queries[:2]:
|
| 1065 |
+
discovered_urls = await _search_urls_with_mcp(query, max_results=5)
|
| 1066 |
if not discovered_urls:
|
| 1067 |
discovered_urls = _discover_assets_for_query(query)
|
| 1068 |
|
| 1069 |
+
for candidate_url in discovered_urls[:3]:
|
| 1070 |
text_payload = _fetch_text_render_markdown(candidate_url, timeout_seconds=12)
|
| 1071 |
if not text_payload:
|
| 1072 |
continue
|
|
|
|
| 1509 |
"""Infer common navigation paths based on user intent - works generically across sites."""
|
| 1510 |
|
| 1511 |
if not instructions:
|
| 1512 |
+
return ["/"] # Default to homepage
|
| 1513 |
|
| 1514 |
instruction_text = instructions.lower()
|
| 1515 |
paths: list[str] = []
|
| 1516 |
|
| 1517 |
# Trending/popular intent - common paths across many sites
|
| 1518 |
+
# Include "/" (homepage) because many sites show top content on homepage
|
| 1519 |
if any(token in instruction_text for token in ("trending", "popular", "top", "hot", "best")):
|
| 1520 |
paths.extend([
|
| 1521 |
+
"/", # Homepage often shows top/trending content (HN, Reddit, etc.)
|
| 1522 |
"/trending",
|
| 1523 |
"/popular",
|
| 1524 |
"/explore",
|
|
|
|
| 1593 |
) -> str:
|
| 1594 |
"""Derive a deterministic navigation URL using plan/template hints when LLM is unavailable.
|
| 1595 |
|
| 1596 |
+
Strategy: Prioritize DIRECT SITE ACCESS over search when user specifies a site.
|
| 1597 |
1. Template target URLs (if available)
|
| 1598 |
+
2. Inferred navigation paths (trending, popular, etc.)
|
| 1599 |
+
3. Search only for EXPLICIT search intent
|
| 1600 |
+
4. Return the base URL (trust the site content)
|
| 1601 |
"""
|
| 1602 |
|
| 1603 |
normalized = _coerce_url_asset(base_url) or base_url
|
|
|
|
| 1633 |
if search_target:
|
| 1634 |
return _apply_text_render_proxy(search_target)
|
| 1635 |
|
| 1636 |
+
# 2. Try direct navigation paths FIRST (trending, hot, etc.)
|
| 1637 |
+
# These are direct site pages, not search queries
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1638 |
inferred_paths = _infer_navigation_paths(instructions)
|
| 1639 |
if inferred_paths:
|
| 1640 |
best_path = inferred_paths[0]
|
| 1641 |
inferred_url = f"{parsed.scheme}://{parsed.netloc}{best_path}"
|
| 1642 |
return _apply_text_render_proxy(inferred_url)
|
| 1643 |
|
| 1644 |
+
# 3. Only use site-internal search for EXPLICIT search intents
|
| 1645 |
+
search_intent = any(token in instruction_text for token in ("search for", "find ", "looking for", "search:"))
|
| 1646 |
if search_intent:
|
| 1647 |
search_url = _build_search_navigation_url(normalized, instructions)
|
| 1648 |
if search_url:
|
| 1649 |
return _apply_text_render_proxy(search_url)
|
| 1650 |
|
| 1651 |
+
# 4. Return the base URL - trust the site content (homepage often has what user wants)
|
| 1652 |
return _apply_text_render_proxy(normalized)
|
| 1653 |
|
| 1654 |
|
|
|
|
| 2701 |
|
| 2702 |
relevance_score = _rows_relevance_score(extracted_data, request.instructions)
|
| 2703 |
recovery_keywords = _instruction_keywords(request.instructions, max_keywords=8)
|
| 2704 |
+
|
| 2705 |
+
# Only attempt recovery if we have NO useful signal from the user's specified site
|
| 2706 |
+
# If we have data with signal, trust the user's site - don't go to external search
|
| 2707 |
+
if not _rows_have_signal(extracted_data) and recovery_keywords:
|
| 2708 |
step_num += 1
|
| 2709 |
yield _record_step(
|
| 2710 |
session,
|
|
|
|
| 2716 |
message="agent.recover_relevance(query)",
|
| 2717 |
extracted_data={
|
| 2718 |
"tool_name": "agent.recover_relevance",
|
| 2719 |
+
"tool_description": "Search-guided relevance recovery for empty extraction output",
|
| 2720 |
"parameters": {
|
| 2721 |
"keywords": recovery_keywords,
|
| 2722 |
"baseline_relevance": round(relevance_score, 3),
|
|
|
|
| 2732 |
output_instructions=request.output_instructions,
|
| 2733 |
row_limit=requested_limit,
|
| 2734 |
)
|
| 2735 |
+
# Only use recovery data if it's significantly better AND provides signal
|
| 2736 |
+
improved = _rows_have_signal(recovered_rows) and recovered_score > 0.3 and len(recovered_rows) >= 3
|
| 2737 |
if improved:
|
| 2738 |
extracted_data = recovered_rows
|
| 2739 |
output_columns = recovered_columns or output_columns
|