NeerajCodz committed on
Commit
fc5088a
·
1 Parent(s): 922337a

feat: prioritize user-site extraction over external search, improve metric patterns for HN-style sites

Browse files
Files changed (1) hide show
  1. backend/app/api/routes/scrape.py +67 -26
backend/app/api/routes/scrape.py CHANGED
@@ -707,8 +707,8 @@ def _extract_markdown_link_rows(
707
  # Match complex links with embedded images: [![Image](img_url) Text](link_url)
708
  # This captures the text after the image and the final link
709
  complex_link_pattern = re.compile(r'\[!\[Image[^\]]*\]\([^\)]+\)\s*([^\]]+)\]\((https?://[^\s"\)]+)\)')
710
- # Match view/viewer counts anywhere (including "47.2K viewers" format)
711
- views_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*(?:views?|viewers?)', re.IGNORECASE)
712
  likes_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*likes?', re.IGNORECASE)
713
  comments_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*comments?', re.IGNORECASE)
714
  date_pattern = re.compile(r'\b(today|yesterday|\d+\s+(?:minutes?|hours?|days?|weeks?|months?|years?)\s+ago)\b', re.IGNORECASE)
@@ -873,7 +873,7 @@ def _extract_markdown_link_rows(
873
  row[col] = clean_title[:160]
874
  elif lower_col in {"content", "summary", "description"}:
875
  row[col] = clean_title[:320]
876
- elif lower_col in {"views", "view_count", "viewers"}:
877
  row[col] = metrics["views"]
878
  elif lower_col in {"likes", "like_count"}:
879
  row[col] = metrics["likes"]
@@ -1012,20 +1012,61 @@ async def _search_recovery_rows(
1012
  output_instructions: str | None,
1013
  row_limit: int,
1014
  ) -> tuple[list[dict[str, Any]], list[str], str | None, float]:
1015
- """Search-guided generic recovery for low-relevance extraction results."""
 
 
 
 
1016
 
1017
  best_rows: list[dict[str, Any]] = []
1018
  best_columns: list[str] = []
1019
  best_source: str | None = None
1020
  best_score = 0.0
1021
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1022
  queries = _build_recovery_queries(base_url, instructions)
1023
- for query in queries[:3]:
1024
- discovered_urls = await _search_urls_with_mcp(query, max_results=8)
1025
  if not discovered_urls:
1026
  discovered_urls = _discover_assets_for_query(query)
1027
 
1028
- for candidate_url in discovered_urls[:5]:
1029
  text_payload = _fetch_text_render_markdown(candidate_url, timeout_seconds=12)
1030
  if not text_payload:
1031
  continue
@@ -1468,15 +1509,16 @@ def _infer_navigation_paths(instructions: str | None) -> list[str]:
1468
  """Infer common navigation paths based on user intent - works generically across sites."""
1469
 
1470
  if not instructions:
1471
- return []
1472
 
1473
  instruction_text = instructions.lower()
1474
  paths: list[str] = []
1475
 
1476
  # Trending/popular intent - common paths across many sites
 
1477
  if any(token in instruction_text for token in ("trending", "popular", "top", "hot", "best")):
1478
  paths.extend([
1479
- "/feed/trending",
1480
  "/trending",
1481
  "/popular",
1482
  "/explore",
@@ -1551,10 +1593,11 @@ def _fallback_navigation_url(
1551
  ) -> str:
1552
  """Derive a deterministic navigation URL using plan/template hints when LLM is unavailable.
1553
 
1554
- Uses intelligent path inference that works generically across sites:
1555
  1. Template target URLs (if available)
1556
- 2. For top/trending/popular requests: PREFER SEARCH URLs (work without auth)
1557
- 3. Direct path navigation as fallback
 
1558
  """
1559
 
1560
  normalized = _coerce_url_asset(base_url) or base_url
@@ -1590,28 +1633,22 @@ def _fallback_navigation_url(
1590
  if search_target:
1591
  return _apply_text_render_proxy(search_target)
1592
 
1593
- # 2. For "top/trending/popular" queries, PREFER SEARCH URLs
1594
- # Search results typically work without authentication and show actual content
1595
- ranked_intent = any(token in instruction_text for token in ("trending", "popular", "top", "best", "music", "video"))
1596
- if ranked_intent:
1597
- search_url = _build_search_navigation_url(normalized, instructions)
1598
- if search_url:
1599
- return _apply_text_render_proxy(search_url)
1600
-
1601
- # 3. Try direct navigation paths as fallback
1602
  inferred_paths = _infer_navigation_paths(instructions)
1603
  if inferred_paths:
1604
  best_path = inferred_paths[0]
1605
  inferred_url = f"{parsed.scheme}://{parsed.netloc}{best_path}"
1606
  return _apply_text_render_proxy(inferred_url)
1607
 
1608
- # 4. For explicit search intents, build a search URL
1609
- search_intent = any(token in instruction_text for token in ("search", "find", "looking for"))
1610
  if search_intent:
1611
  search_url = _build_search_navigation_url(normalized, instructions)
1612
  if search_url:
1613
  return _apply_text_render_proxy(search_url)
1614
 
 
1615
  return _apply_text_render_proxy(normalized)
1616
 
1617
 
@@ -2664,7 +2701,10 @@ Return ONLY executable Python code, no explanations or markdown:"""
2664
 
2665
  relevance_score = _rows_relevance_score(extracted_data, request.instructions)
2666
  recovery_keywords = _instruction_keywords(request.instructions, max_keywords=8)
2667
- if _rows_have_signal(extracted_data) and recovery_keywords and relevance_score < 0.22:
 
 
 
2668
  step_num += 1
2669
  yield _record_step(
2670
  session,
@@ -2676,7 +2716,7 @@ Return ONLY executable Python code, no explanations or markdown:"""
2676
  message="agent.recover_relevance(query)",
2677
  extracted_data={
2678
  "tool_name": "agent.recover_relevance",
2679
- "tool_description": "Search-guided relevance recovery for low-signal extraction output",
2680
  "parameters": {
2681
  "keywords": recovery_keywords,
2682
  "baseline_relevance": round(relevance_score, 3),
@@ -2692,7 +2732,8 @@ Return ONLY executable Python code, no explanations or markdown:"""
2692
  output_instructions=request.output_instructions,
2693
  row_limit=requested_limit,
2694
  )
2695
- improved = _rows_have_signal(recovered_rows) and recovered_score > (relevance_score + 0.05)
 
2696
  if improved:
2697
  extracted_data = recovered_rows
2698
  output_columns = recovered_columns or output_columns
 
707
  # Match complex links with embedded images: [![Image](img_url) Text](link_url)
708
  # This captures the text after the image and the final link
709
  complex_link_pattern = re.compile(r'\[!\[Image[^\]]*\]\([^\)]+\)\s*([^\]]+)\]\((https?://[^\s"\)]+)\)')
710
+ # Match view/viewer/point counts anywhere (including "47.2K viewers", "787 points" format)
711
+ views_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*(?:views?|viewers?|points?)', re.IGNORECASE)
712
  likes_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*likes?', re.IGNORECASE)
713
  comments_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*comments?', re.IGNORECASE)
714
  date_pattern = re.compile(r'\b(today|yesterday|\d+\s+(?:minutes?|hours?|days?|weeks?|months?|years?)\s+ago)\b', re.IGNORECASE)
 
873
  row[col] = clean_title[:160]
874
  elif lower_col in {"content", "summary", "description"}:
875
  row[col] = clean_title[:320]
876
+ elif lower_col in {"views", "view_count", "viewers", "points", "score", "upvotes"}:
877
  row[col] = metrics["views"]
878
  elif lower_col in {"likes", "like_count"}:
879
  row[col] = metrics["likes"]
 
1012
  output_instructions: str | None,
1013
  row_limit: int,
1014
  ) -> tuple[list[dict[str, Any]], list[str], str | None, float]:
1015
+ """Search-guided generic recovery for low-relevance extraction results.
1016
+
1017
+ IMPORTANT: Prioritize the user's specified site - try alternative paths on the same domain
1018
+ before resorting to external search engines.
1019
+ """
1020
 
1021
  best_rows: list[dict[str, Any]] = []
1022
  best_columns: list[str] = []
1023
  best_source: str | None = None
1024
  best_score = 0.0
1025
 
1026
+ # Normalize the base URL
1027
+ normalized = _coerce_url_asset(base_url) or base_url
1028
+ if "://" not in normalized:
1029
+ normalized = f"https://{normalized}"
1030
+ parsed = urlparse(normalized)
1031
+
1032
+ # FIRST: Try alternative paths on the SAME SITE (stay on user's specified domain)
1033
+ alternative_paths = _infer_navigation_paths(instructions)
1034
+ for alt_path in alternative_paths[:4]:
1035
+ alt_url = f"{parsed.scheme}://{parsed.netloc}{alt_path}"
1036
+ text_payload = _fetch_text_render_markdown(alt_url, timeout_seconds=12)
1037
+ if not text_payload:
1038
+ continue
1039
+ markdown, source_url = text_payload
1040
+ rows, columns = _extract_rows_from_text_render(
1041
+ markdown=markdown,
1042
+ source_url=source_url,
1043
+ output_instructions=output_instructions,
1044
+ instructions=instructions,
1045
+ row_limit=row_limit,
1046
+ )
1047
+ if not _rows_have_signal(rows):
1048
+ continue
1049
+ score = _rows_relevance_score(rows, instructions)
1050
+ if score > best_score or (
1051
+ abs(score - best_score) <= 0.0001 and len(rows) > len(best_rows)
1052
+ ):
1053
+ best_rows = rows
1054
+ best_columns = columns
1055
+ best_source = source_url
1056
+ best_score = score
1057
+
1058
+ # If we found good data on the user's site, return it
1059
+ if best_score > 0.25:
1060
+ return best_rows, best_columns, best_source, best_score
1061
+
1062
+ # SECOND: Only as last resort, try external search (duckduckgo)
1063
  queries = _build_recovery_queries(base_url, instructions)
1064
+ for query in queries[:2]:
1065
+ discovered_urls = await _search_urls_with_mcp(query, max_results=5)
1066
  if not discovered_urls:
1067
  discovered_urls = _discover_assets_for_query(query)
1068
 
1069
+ for candidate_url in discovered_urls[:3]:
1070
  text_payload = _fetch_text_render_markdown(candidate_url, timeout_seconds=12)
1071
  if not text_payload:
1072
  continue
 
1509
  """Infer common navigation paths based on user intent - works generically across sites."""
1510
 
1511
  if not instructions:
1512
+ return ["/"] # Default to homepage
1513
 
1514
  instruction_text = instructions.lower()
1515
  paths: list[str] = []
1516
 
1517
  # Trending/popular intent - common paths across many sites
1518
+ # Include "/" (homepage) because many sites show top content on homepage
1519
  if any(token in instruction_text for token in ("trending", "popular", "top", "hot", "best")):
1520
  paths.extend([
1521
+ "/", # Homepage often shows top/trending content (HN, Reddit, etc.)
1522
  "/trending",
1523
  "/popular",
1524
  "/explore",
 
1593
  ) -> str:
1594
  """Derive a deterministic navigation URL using plan/template hints when LLM is unavailable.
1595
 
1596
+ Strategy: Prioritize DIRECT SITE ACCESS over search when user specifies a site.
1597
  1. Template target URLs (if available)
1598
+ 2. Inferred navigation paths (trending, popular, etc.)
1599
+ 3. Search only for EXPLICIT search intent
1600
+ 4. Return the base URL (trust the site content)
1601
  """
1602
 
1603
  normalized = _coerce_url_asset(base_url) or base_url
 
1633
  if search_target:
1634
  return _apply_text_render_proxy(search_target)
1635
 
1636
+ # 2. Try direct navigation paths FIRST (trending, hot, etc.)
1637
+ # These are direct site pages, not search queries
 
 
 
 
 
 
 
1638
  inferred_paths = _infer_navigation_paths(instructions)
1639
  if inferred_paths:
1640
  best_path = inferred_paths[0]
1641
  inferred_url = f"{parsed.scheme}://{parsed.netloc}{best_path}"
1642
  return _apply_text_render_proxy(inferred_url)
1643
 
1644
+ # 3. Only use site-internal search for EXPLICIT search intents
1645
+ search_intent = any(token in instruction_text for token in ("search for", "find ", "looking for", "search:"))
1646
  if search_intent:
1647
  search_url = _build_search_navigation_url(normalized, instructions)
1648
  if search_url:
1649
  return _apply_text_render_proxy(search_url)
1650
 
1651
+ # 4. Return the base URL - trust the site content (homepage often has what user wants)
1652
  return _apply_text_render_proxy(normalized)
1653
 
1654
 
 
2701
 
2702
  relevance_score = _rows_relevance_score(extracted_data, request.instructions)
2703
  recovery_keywords = _instruction_keywords(request.instructions, max_keywords=8)
2704
+
2705
+ # Only attempt recovery if we have NO useful signal from the user's specified site
2706
+ # If we have data with signal, trust the user's site - don't go to external search
2707
+ if not _rows_have_signal(extracted_data) and recovery_keywords:
2708
  step_num += 1
2709
  yield _record_step(
2710
  session,
 
2716
  message="agent.recover_relevance(query)",
2717
  extracted_data={
2718
  "tool_name": "agent.recover_relevance",
2719
+ "tool_description": "Search-guided relevance recovery for empty extraction output",
2720
  "parameters": {
2721
  "keywords": recovery_keywords,
2722
  "baseline_relevance": round(relevance_score, 3),
 
2732
  output_instructions=request.output_instructions,
2733
  row_limit=requested_limit,
2734
  )
2735
+ # Only use recovery data if it's significantly better AND provides signal
2736
+ improved = _rows_have_signal(recovered_rows) and recovered_score > 0.3 and len(recovered_rows) >= 3
2737
  if improved:
2738
  extracted_data = recovered_rows
2739
  output_columns = recovered_columns or output_columns