Soham Waghmare committed on
Commit
f7da48c
·
1 Parent(s): 56e3a38

fix: scrape error handling

Browse files
Files changed (1) hide show
  1. backend/scraper.py +22 -13
backend/scraper.py CHANGED
@@ -87,9 +87,10 @@ class CrawlForAIScraper:
87
  if not search_results:
88
  self.logger.info("Performing DuckDuckGo search as fallback...")
89
  self.logger.warning("No search results found.")
90
- search_results = self._duckduckgo_search(query)
91
- break
92
 
 
 
93
  self.logger.info(f"Found {len(search_results)} results")
94
  return search_results
95
 
@@ -97,22 +98,30 @@ class CrawlForAIScraper:
97
  self.logger.error(f"Google search error: {str(e)}", exc_info=True)
98
  raise
99
 
100
- def _duckduckgo_search(self, query: str) -> List[str]:
101
  self.logger.info("Performing DuckDuckGo search...")
102
  try:
103
  encoded_query = quote_plus(query)
104
- url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
105
-
106
- response = self.session.get(
107
- url,
108
- headers={
109
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
110
- },
111
- timeout=10,
 
 
 
 
 
 
 
 
 
112
  )
113
- response.raise_for_status()
114
 
115
- soup = BeautifulSoup(response.text, "html.parser")
116
  search_results = []
117
 
118
  # DuckDuckGo search results are in elements with class 'result__url'
 
87
  if not search_results:
88
  self.logger.info("Performing DuckDuckGo search as fallback...")
89
  self.logger.warning("No search results found.")
90
+ search_results = await self._duckduckgo_search(query)
 
91
 
92
+ if not search_results:
93
+ raise Exception("No results found")
94
  self.logger.info(f"Found {len(search_results)} results")
95
  return search_results
96
 
 
98
  self.logger.error(f"Google search error: {str(e)}", exc_info=True)
99
  raise
100
 
101
+ async def _duckduckgo_search(self, query: str) -> List[str]:
102
  self.logger.info("Performing DuckDuckGo search...")
103
  try:
104
  encoded_query = quote_plus(query)
105
+ search_uri = f"https://html.duckduckgo.com/html/?q={encoded_query}"
106
+
107
+ # response = self.session.get(
108
+ # url,
109
+ # headers={
110
+ # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
111
+ # },
112
+ # timeout=10,
113
+ # )
114
+ # response.raise_for_status()
115
+
116
+ result = await self.crawler.arun(
117
+ url=search_uri,
118
+ screenshot=False,
119
+ cache_mode=CacheMode.BYPASS,
120
+ delay_before_return_html=2,
121
+ scan_full_page=True,
122
  )
 
123
 
124
+ soup = BeautifulSoup(result.html, "html.parser")
125
  search_results = []
126
 
127
  # DuckDuckGo search results are in elements with class 'result__url'