Update main.py
Browse files
main.py
CHANGED
|
@@ -64,15 +64,16 @@ class BingSearch:
|
|
| 64 |
timeout=self.timeout,
|
| 65 |
impersonate=impersonate
|
| 66 |
)
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
| 68 |
|
|
|
|
| 69 |
def _selectors(self, element):
|
| 70 |
selectors = {
|
| 71 |
-
'
|
| 72 |
-
'
|
| 73 |
-
'text': 'p',
|
| 74 |
-
'links': 'ol#b_results > li.b_algo',
|
| 75 |
-
'next': 'div#b_content nav[role="navigation"] a.sb_pagN'
|
| 76 |
}
|
| 77 |
return selectors[element]
|
| 78 |
|
|
@@ -106,6 +107,7 @@ class BingSearch:
|
|
| 106 |
print(f"Error decoding Base64 string: {e}")
|
| 107 |
return resp
|
| 108 |
|
|
|
|
| 109 |
def text(
|
| 110 |
self,
|
| 111 |
keywords: str,
|
|
@@ -116,61 +118,79 @@ class BingSearch:
|
|
| 116 |
) -> List[BingSearchResult]:
|
| 117 |
if not keywords:
|
| 118 |
raise ValueError("Search keywords cannot be empty")
|
| 119 |
-
|
| 120 |
-
"on": "Strict",
|
| 121 |
-
"moderate": "Moderate",
|
| 122 |
-
"off": "Off"
|
| 123 |
-
}
|
| 124 |
-
safe = safe_map.get(safesearch.lower(), "Moderate")
|
| 125 |
fetched_results = []
|
| 126 |
fetched_links = set()
|
|
|
|
| 127 |
def fetch_page(url):
|
| 128 |
try:
|
| 129 |
resp = self.session.get(url)
|
| 130 |
resp.raise_for_status()
|
| 131 |
return resp.text
|
| 132 |
except Exception as e:
|
| 133 |
-
|
| 134 |
-
raise Exception(f"Bing search failed with status {e.response.status_code}: {str(e)}")
|
| 135 |
-
else:
|
| 136 |
-
raise Exception(f"Bing search failed: {str(e)}")
|
| 137 |
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
while len(fetched_results) < max_results
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
continue
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
break
|
| 163 |
-
if len(fetched_results) >= max_results:
|
| 164 |
-
break
|
| 165 |
-
next_page_info = self._next_page(soup)
|
| 166 |
-
if next_page_info['url']:
|
| 167 |
-
urls_to_fetch.append(next_page_info['url'])
|
| 168 |
-
sleep(self.sleep_interval)
|
| 169 |
next_page_info = self._next_page(soup)
|
| 170 |
-
|
| 171 |
-
|
|
|
|
|
|
|
| 172 |
return fetched_results[:max_results]
|
| 173 |
|
|
|
|
| 174 |
def suggestions(self, query: str, region: str = None) -> List[str]:
|
| 175 |
if not query:
|
| 176 |
raise ValueError("Search query cannot be empty")
|
|
|
|
| 64 |
timeout=self.timeout,
|
| 65 |
impersonate=impersonate
|
| 66 |
)
|
| 67 |
+
# It's good practice to set a realistic User-Agent
|
| 68 |
+
self.session.headers.update({
|
| 69 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
|
| 70 |
+
})
|
| 71 |
|
| 72 |
+
# FIX: Updated selectors to be more robust against Bing UI changes.
|
| 73 |
def _selectors(self, element):
|
| 74 |
selectors = {
|
| 75 |
+
'links': 'ol#b_results > li', # More generic selector for any list item in results
|
| 76 |
+
'next': 'a.sb_pagN' # Selector for the "Next" page button
|
|
|
|
|
|
|
|
|
|
| 77 |
}
|
| 78 |
return selectors[element]
|
| 79 |
|
|
|
|
| 107 |
print(f"Error decoding Base64 string: {e}")
|
| 108 |
return resp
|
| 109 |
|
| 110 |
+
# FIX: The entire text parsing logic is updated to handle modern Bing HTML structure.
|
| 111 |
def text(
|
| 112 |
self,
|
| 113 |
keywords: str,
|
|
|
|
| 118 |
) -> List[BingSearchResult]:
|
| 119 |
if not keywords:
|
| 120 |
raise ValueError("Search keywords cannot be empty")
|
| 121 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
fetched_results = []
|
| 123 |
fetched_links = set()
|
| 124 |
+
|
| 125 |
def fetch_page(url):
    """Fetch *url* through ``self.session`` and return the response text.

    Args:
        url: Address of the page to download.

    Returns:
        The ``text`` attribute of the response.

    Raises:
        Exception: If the request or ``raise_for_status()`` fails. The
            message is ``"Bing search failed: <original error>"`` and the
            original error is attached as ``__cause__``.
    """
    try:
        resp = self.session.get(url)
        resp.raise_for_status()
        return resp.text
    except Exception as e:
        # Same exception type and message as before; `from e` makes the
        # underlying error the explicit cause so tracebacks keep it.
        raise Exception(f"Bing search failed: {e}") from e
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
+
current_url = self._first_page(keywords)['url']
|
| 134 |
+
|
| 135 |
+
while current_url and len(fetched_results) < max_results:
|
| 136 |
+
html = fetch_page(current_url)
|
| 137 |
+
soup = BeautifulSoup(html, "html.parser")
|
| 138 |
+
|
| 139 |
+
# Use the more generic selector for result blocks
|
| 140 |
+
result_blocks = soup.select(self._selectors('links'))
|
| 141 |
+
|
| 142 |
+
for result in result_blocks:
|
| 143 |
+
# Find the title and link, which are usually in an <h2> tag
|
| 144 |
+
title_tag = result.find('h2')
|
| 145 |
+
if not title_tag:
|
| 146 |
+
continue
|
| 147 |
+
|
| 148 |
+
link_tag = title_tag.find('a')
|
| 149 |
+
if not link_tag or not link_tag.has_attr('href'):
|
| 150 |
+
continue
|
| 151 |
+
|
| 152 |
+
url_val = self._get_url(link_tag)
|
| 153 |
+
title = title_tag.get_text(strip=True)
|
| 154 |
+
|
| 155 |
+
# Find the description, often in a div with class 'b_caption'
|
| 156 |
+
desc_container = result.find('div', class_='b_caption')
|
| 157 |
+
description = ''
|
| 158 |
+
if desc_container:
|
| 159 |
+
# Find the paragraph within the caption, or use the whole caption text
|
| 160 |
+
desc_p = desc_container.find('p')
|
| 161 |
+
if desc_p:
|
| 162 |
+
description = desc_p.get_text(strip=True)
|
| 163 |
+
else:
|
| 164 |
+
description = desc_container.get_text(strip=True)
|
| 165 |
+
|
| 166 |
+
# Fallback if no 'b_caption' is found
|
| 167 |
+
if not description:
|
| 168 |
+
p_tag = result.find('p')
|
| 169 |
+
if p_tag:
|
| 170 |
+
description = p_tag.get_text(strip=True)
|
| 171 |
+
|
| 172 |
+
if url_val and title:
|
| 173 |
+
if unique and url_val in fetched_links:
|
| 174 |
continue
|
| 175 |
+
|
| 176 |
+
fetched_results.append(BingSearchResult(url=url_val, title=title, description=description))
|
| 177 |
+
fetched_links.add(url_val)
|
| 178 |
+
|
| 179 |
+
if len(fetched_results) >= max_results:
|
| 180 |
+
break
|
| 181 |
+
|
| 182 |
+
if len(fetched_results) >= max_results:
|
| 183 |
+
break
|
| 184 |
+
|
| 185 |
+
# Find the next page URL
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
next_page_info = self._next_page(soup)
|
| 187 |
+
current_url = next_page_info['url']
|
| 188 |
+
if current_url:
|
| 189 |
+
sleep(self.sleep_interval)
|
| 190 |
+
|
| 191 |
return fetched_results[:max_results]
|
| 192 |
|
| 193 |
+
|
| 194 |
def suggestions(self, query: str, region: str = None) -> List[str]:
|
| 195 |
if not query:
|
| 196 |
raise ValueError("Search query cannot be empty")
|