NeerajCodz commited on
Commit
a04acb3
·
1 Parent(s): 82fb385

fix: intelligent navigation scraper with GitHub trending support

Browse files
Files changed (1) hide show
  1. backend/app/api/routes/scrape.py +151 -73
backend/app/api/routes/scrape.py CHANGED
@@ -18,6 +18,7 @@ from pathlib import Path
18
  from typing import Any, AsyncGenerator
19
  from urllib.parse import quote_plus, urlparse
20
 
 
21
  from fastapi import APIRouter, BackgroundTasks, HTTPException
22
  from fastapi.responses import StreamingResponse
23
  from pydantic import BaseModel, Field
@@ -45,6 +46,11 @@ logger = logging.getLogger(__name__)
45
  router = APIRouter(prefix="/scrape", tags=["Scraping"])
46
 
47
 
 
 
 
 
 
48
  class OutputFormat(str, Enum):
49
  """Supported output formats."""
50
 
@@ -719,6 +725,25 @@ async def scrape_url(
719
  ),
720
  )
721
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
  async def scrape_url_intelligently(
723
  session: dict[str, Any],
724
  session_id: str,
@@ -742,21 +767,24 @@ async def scrape_url_intelligently(
742
 
743
  # GitHub trending strategy
744
  if navigation_plan["strategy"] == "github_trending":
745
- yield from _scrape_github_trending(
746
  session, session_id, env, request, navigation_plan, step_num, total_reward
747
- )
 
748
 
749
  # General exploration strategy
750
  elif navigation_plan["strategy"] == "intelligent_exploration":
751
- yield from _scrape_with_exploration(
752
  session, session_id, env, request, navigation_plan, url, step_num, total_reward
753
- )
 
754
 
755
  # Default single page
756
  else:
757
- yield from _scrape_single_page(
758
  session, session_id, env, request, url, step_num, total_reward
759
- )
 
760
 
761
  except Exception as exc:
762
  logger.error(f"Intelligent scraping failed for {url}: {exc}")
@@ -909,93 +937,143 @@ async def _scrape_single_page(
909
  ) -> AsyncGenerator[dict[str, Any], None]:
910
  """Fallback to original single-page scraping."""
911
 
912
- # Use the original scrape_url logic for single pages
913
- async for result in scrape_url(session, session_id, url, get_settings(), request, None, []):
914
- yield result
915
- step_num += 1
916
- yield _record_step(
917
- session,
918
- ScrapeStep(
919
- step_number=step_num,
920
- action="extractor_python",
921
- url=url,
922
- status="completed",
923
- message="Extractor agent ran sandbox Python analysis",
924
- extracted_data=phase_result.output,
925
- timestamp=_now_iso(),
926
- ),
927
- )
928
- else:
929
- session["errors"].append(phase_result.error or "Extractor sandbox analysis failed")
930
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
931
  step_num += 1
932
- extracted_count = len([name for name in fields_to_extract if name in extracted])
933
- verification_score = (
934
- extracted_count / len(fields_to_extract)
935
- if fields_to_extract
936
- else 0.0
937
- )
938
  yield _record_step(
939
  session,
940
  ScrapeStep(
941
  step_number=step_num,
942
- action="verify",
943
  url=url,
944
- status="completed",
945
- message=f"Verifier checked extraction completeness ({extracted_count}/{len(fields_to_extract)})",
946
- reward=verification_score,
947
- extracted_data={"coverage": verification_score},
948
  timestamp=_now_iso(),
949
  ),
950
  )
951
-
952
- step_num += 1
953
- done_action = Action(
954
- action_type=ActionType.DONE,
955
- parameters={"success": True},
956
- reasoning="Extraction complete",
957
  )
958
- _, reward, _, _, _, _ = await env.step(done_action)
959
  total_reward += reward
 
 
 
 
 
 
 
960
  yield _record_step(
961
  session,
962
  ScrapeStep(
963
  step_number=step_num,
964
- action="complete",
965
  url=url,
966
  status="completed",
967
- message=f"Completed scraping {url}",
968
- reward=total_reward,
969
- extracted_data=extracted,
970
  timestamp=_now_iso(),
971
  ),
972
  )
973
-
974
- session["total_reward"] += total_reward
975
- session["extracted_data"][url] = extracted
976
- _write_session_json_artifact(
977
- session,
978
- f"{_safe_artifact_name(urlparse(url).netloc or url)}_extracted.json",
979
- extracted,
980
- )
981
-
982
- if request.enable_memory:
983
- await _store_url_memory(session_id, url, extracted, memory_manager)
984
-
985
- except Exception as exc:
986
- error_message = f"{url}: {exc}"
987
- session["errors"].append(error_message)
988
- logger.exception("Error scraping URL", extra={"url": url, "session_id": session_id})
989
- yield {
990
- "type": "error",
991
- "data": {
992
- "url": url,
993
- "error": str(exc),
994
- "timestamp": _now_iso(),
995
- },
996
- }
997
- finally:
998
- remove_environment(episode_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
999
 
1000
 
1001
  async def _scrape_with_exploration(
 
18
  from typing import Any, AsyncGenerator
19
  from urllib.parse import quote_plus, urlparse
20
 
21
+ from bs4 import BeautifulSoup
22
  from fastapi import APIRouter, BackgroundTasks, HTTPException
23
  from fastapi.responses import StreamingResponse
24
  from pydantic import BaseModel, Field
 
46
  router = APIRouter(prefix="/scrape", tags=["Scraping"])
47
 
48
 
49
+ def parse_html(html: str) -> BeautifulSoup:
50
+ """Parse HTML string into BeautifulSoup object."""
51
+ return BeautifulSoup(html, "html.parser")
52
+
53
+
54
  class OutputFormat(str, Enum):
55
  """Supported output formats."""
56
 
 
725
  ),
726
  )
727
 
728
+ if terminated or truncated:
729
+ break
730
+
731
+ except Exception as exc:
732
+ error_message = f"{url}: {exc}"
733
+ session["errors"].append(error_message)
734
+ logger.exception("Error scraping URL", extra={"url": url, "session_id": session_id})
735
+ yield {
736
+ "type": "error",
737
+ "data": {
738
+ "url": url,
739
+ "error": str(exc),
740
+ "timestamp": _now_iso(),
741
+ },
742
+ }
743
+ finally:
744
+ remove_environment(episode_id)
745
+
746
+
747
  async def scrape_url_intelligently(
748
  session: dict[str, Any],
749
  session_id: str,
 
767
 
768
  # GitHub trending strategy
769
  if navigation_plan["strategy"] == "github_trending":
770
+ async for event in _scrape_github_trending(
771
  session, session_id, env, request, navigation_plan, step_num, total_reward
772
+ ):
773
+ yield event
774
 
775
  # General exploration strategy
776
  elif navigation_plan["strategy"] == "intelligent_exploration":
777
+ async for event in _scrape_with_exploration(
778
  session, session_id, env, request, navigation_plan, url, step_num, total_reward
779
+ ):
780
+ yield event
781
 
782
  # Default single page
783
  else:
784
+ async for event in _scrape_single_page(
785
  session, session_id, env, request, url, step_num, total_reward
786
+ ):
787
+ yield event
788
 
789
  except Exception as exc:
790
  logger.error(f"Intelligent scraping failed for {url}: {exc}")
 
937
  ) -> AsyncGenerator[dict[str, Any], None]:
938
  """Fallback to original single-page scraping."""
939
 
940
+ # Navigate to URL
941
+ step_num += 1
942
+ yield _record_step(
943
+ session,
944
+ ScrapeStep(
945
+ step_number=step_num,
946
+ action="navigate",
947
+ url=url,
948
+ status="running",
949
+ message=f"Navigating to {url}...",
950
+ timestamp=_now_iso(),
951
+ ),
952
+ )
953
+
954
+ navigate_action = Action(
955
+ action_type=ActionType.NAVIGATE,
956
+ parameters={"url": url},
957
+ reasoning=f"Navigate to target URL: {url}",
958
+ )
959
+ nav_obs, reward, _, _, _, nav_info = await env.step(navigate_action)
960
+ total_reward += reward
961
+
962
+ nav_success = nav_info.get("action_result", {}).get("success", bool(nav_obs.page_html))
963
+
964
+ yield _record_step(
965
+ session,
966
+ ScrapeStep(
967
+ step_number=step_num,
968
+ action="navigate",
969
+ url=url,
970
+ status="completed" if nav_success else "failed",
971
+ message=f"Navigated to {url}" if nav_success else "Navigation failed",
972
+ reward=reward,
973
+ timestamp=_now_iso(),
974
+ ),
975
+ )
976
+
977
+ if not nav_success or not nav_obs.page_html:
978
+ session["errors"].append(f"Failed to navigate to {url}")
979
+ return
980
+
981
+ # Extract fields
982
+ extracted = {}
983
+ fields_to_extract = _extract_fields_for_complexity(request.complexity)
984
+
985
+ for field_name in fields_to_extract:
986
  step_num += 1
 
 
 
 
 
 
987
  yield _record_step(
988
  session,
989
  ScrapeStep(
990
  step_number=step_num,
991
+ action="extract",
992
  url=url,
993
+ status="running",
994
+ message=f"Extracting {field_name}...",
 
 
995
  timestamp=_now_iso(),
996
  ),
997
  )
998
+
999
+ extract_action = Action(
1000
+ action_type=ActionType.EXTRACT_FIELD,
1001
+ parameters={"field_name": field_name},
1002
+ reasoning=f"Extract {field_name} from page",
 
1003
  )
1004
+ obs, reward, _, _, _, _ = await env.step(extract_action)
1005
  total_reward += reward
1006
+
1007
+ if obs.extracted_so_far:
1008
+ for ef in obs.extracted_so_far:
1009
+ if ef.field_name == field_name:
1010
+ extracted[field_name] = ef.value
1011
+ break
1012
+
1013
  yield _record_step(
1014
  session,
1015
  ScrapeStep(
1016
  step_number=step_num,
1017
+ action="extract",
1018
  url=url,
1019
  status="completed",
1020
+ message=f"Extracted {field_name}",
1021
+ reward=reward,
1022
+ extracted_data={field_name: extracted.get(field_name)},
1023
  timestamp=_now_iso(),
1024
  ),
1025
  )
1026
+
1027
+ # Verification step
1028
+ step_num += 1
1029
+ extracted_count = len([f for f in fields_to_extract if f in extracted])
1030
+ verification_score = extracted_count / len(fields_to_extract) if fields_to_extract else 0.0
1031
+
1032
+ yield _record_step(
1033
+ session,
1034
+ ScrapeStep(
1035
+ step_number=step_num,
1036
+ action="verify",
1037
+ url=url,
1038
+ status="completed",
1039
+ message=f"Verifier checked extraction completeness ({extracted_count}/{len(fields_to_extract)})",
1040
+ reward=verification_score,
1041
+ extracted_data={"coverage": verification_score},
1042
+ timestamp=_now_iso(),
1043
+ ),
1044
+ )
1045
+
1046
+ # Complete
1047
+ step_num += 1
1048
+ done_action = Action(
1049
+ action_type=ActionType.DONE,
1050
+ parameters={"success": True},
1051
+ reasoning="Extraction complete",
1052
+ )
1053
+ _, reward, _, _, _, _ = await env.step(done_action)
1054
+ total_reward += reward
1055
+
1056
+ yield _record_step(
1057
+ session,
1058
+ ScrapeStep(
1059
+ step_number=step_num,
1060
+ action="complete",
1061
+ url=url,
1062
+ status="completed",
1063
+ message=f"Completed scraping {url}",
1064
+ reward=total_reward,
1065
+ extracted_data=extracted,
1066
+ timestamp=_now_iso(),
1067
+ ),
1068
+ )
1069
+
1070
+ session["total_reward"] += total_reward
1071
+ session["extracted_data"][url] = extracted
1072
+ _write_session_json_artifact(
1073
+ session,
1074
+ f"{_safe_artifact_name(urlparse(url).netloc or url)}_extracted.json",
1075
+ extracted,
1076
+ )
1077
 
1078
 
1079
  async def _scrape_with_exploration(