Update main.py
Browse files
main.py
CHANGED
|
@@ -64,14 +64,16 @@ class BingSearch:
|
|
| 64 |
timeout=self.timeout,
|
| 65 |
impersonate=impersonate
|
| 66 |
)
|
|
|
|
| 67 |
self.session.headers.update({
|
| 68 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
|
| 69 |
})
|
| 70 |
|
|
|
|
| 71 |
def _selectors(self, element):
|
| 72 |
selectors = {
|
| 73 |
-
'links': 'ol#b_results > li',
|
| 74 |
-
'next': 'a.sb_pagN'
|
| 75 |
}
|
| 76 |
return selectors[element]
|
| 77 |
|
|
@@ -134,9 +136,11 @@ class BingSearch:
|
|
| 134 |
html = fetch_page(current_url)
|
| 135 |
soup = BeautifulSoup(html, "html.parser")
|
| 136 |
|
|
|
|
| 137 |
result_blocks = soup.select(self._selectors('links'))
|
| 138 |
|
| 139 |
for result in result_blocks:
|
|
|
|
| 140 |
title_tag = result.find('h2')
|
| 141 |
if not title_tag:
|
| 142 |
continue
|
|
@@ -148,15 +152,18 @@ class BingSearch:
|
|
| 148 |
url_val = self._get_url(link_tag)
|
| 149 |
title = title_tag.get_text(strip=True)
|
| 150 |
|
|
|
|
| 151 |
desc_container = result.find('div', class_='b_caption')
|
| 152 |
description = ''
|
| 153 |
if desc_container:
|
|
|
|
| 154 |
desc_p = desc_container.find('p')
|
| 155 |
if desc_p:
|
| 156 |
description = desc_p.get_text(strip=True)
|
| 157 |
else:
|
| 158 |
description = desc_container.get_text(strip=True)
|
| 159 |
|
|
|
|
| 160 |
if not description:
|
| 161 |
p_tag = result.find('p')
|
| 162 |
if p_tag:
|
|
@@ -166,42 +173,7 @@ class BingSearch:
|
|
| 166 |
if unique and url_val in fetched_links:
|
| 167 |
continue
|
| 168 |
|
| 169 |
-
|
| 170 |
-
metadata = {}
|
| 171 |
-
|
| 172 |
-
# Parse Sitelinks
|
| 173 |
-
sitelinks_container = result.find('ul', class_='b_vlist')
|
| 174 |
-
if sitelinks_container:
|
| 175 |
-
sitelinks = []
|
| 176 |
-
for link_item in sitelinks_container.find_all('li'):
|
| 177 |
-
sitelink_tag = link_item.find('a')
|
| 178 |
-
if sitelink_tag and sitelink_tag.has_attr('href'):
|
| 179 |
-
sitelinks.append({
|
| 180 |
-
'title': sitelink_tag.get_text(strip=True),
|
| 181 |
-
'url': self._get_url(sitelink_tag)
|
| 182 |
-
})
|
| 183 |
-
if sitelinks:
|
| 184 |
-
metadata['sitelinks'] = sitelinks
|
| 185 |
-
|
| 186 |
-
# Parse Date (heuristic approach)
|
| 187 |
-
if desc_container:
|
| 188 |
-
# Date is often in a span preceding the description text
|
| 189 |
-
date_span = desc_container.find('span', class_=None)
|
| 190 |
-
if date_span:
|
| 191 |
-
date_text = date_span.get_text(strip=True)
|
| 192 |
-
# Simple check for date-like content
|
| 193 |
-
if any(word in date_text.lower() for word in ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'ago']):
|
| 194 |
-
metadata['date'] = date_text.replace('·', '').strip()
|
| 195 |
-
# --- End of Fix ---
|
| 196 |
-
|
| 197 |
-
fetched_results.append(
|
| 198 |
-
BingSearchResult(
|
| 199 |
-
url=url_val,
|
| 200 |
-
title=title,
|
| 201 |
-
description=description,
|
| 202 |
-
metadata=metadata # Pass the populated metadata
|
| 203 |
-
)
|
| 204 |
-
)
|
| 205 |
fetched_links.add(url_val)
|
| 206 |
|
| 207 |
if len(fetched_results) >= max_results:
|
|
@@ -210,6 +182,7 @@ class BingSearch:
|
|
| 210 |
if len(fetched_results) >= max_results:
|
| 211 |
break
|
| 212 |
|
|
|
|
| 213 |
next_page_info = self._next_page(soup)
|
| 214 |
current_url = next_page_info['url']
|
| 215 |
if current_url:
|
|
|
|
| 64 |
timeout=self.timeout,
|
| 65 |
impersonate=impersonate
|
| 66 |
)
|
| 67 |
+
# It's good practice to set a realistic User-Agent
|
| 68 |
self.session.headers.update({
|
| 69 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
|
| 70 |
})
|
| 71 |
|
| 72 |
+
# FIX: Updated selectors to be more robust against Bing UI changes.
|
| 73 |
def _selectors(self, element):
|
| 74 |
selectors = {
|
| 75 |
+
'links': 'ol#b_results > li', # More generic selector for any list item in results
|
| 76 |
+
'next': 'a.sb_pagN' # Selector for the "Next" page button
|
| 77 |
}
|
| 78 |
return selectors[element]
|
| 79 |
|
|
|
|
| 136 |
html = fetch_page(current_url)
|
| 137 |
soup = BeautifulSoup(html, "html.parser")
|
| 138 |
|
| 139 |
+
# Use the more generic selector for result blocks
|
| 140 |
result_blocks = soup.select(self._selectors('links'))
|
| 141 |
|
| 142 |
for result in result_blocks:
|
| 143 |
+
# Find the title and link, which are usually in an <h2> tag
|
| 144 |
title_tag = result.find('h2')
|
| 145 |
if not title_tag:
|
| 146 |
continue
|
|
|
|
| 152 |
url_val = self._get_url(link_tag)
|
| 153 |
title = title_tag.get_text(strip=True)
|
| 154 |
|
| 155 |
+
# Find the description, often in a div with class 'b_caption'
|
| 156 |
desc_container = result.find('div', class_='b_caption')
|
| 157 |
description = ''
|
| 158 |
if desc_container:
|
| 159 |
+
# Find the paragraph within the caption, or use the whole caption text
|
| 160 |
desc_p = desc_container.find('p')
|
| 161 |
if desc_p:
|
| 162 |
description = desc_p.get_text(strip=True)
|
| 163 |
else:
|
| 164 |
description = desc_container.get_text(strip=True)
|
| 165 |
|
| 166 |
+
# Fallback if no 'b_caption' is found
|
| 167 |
if not description:
|
| 168 |
p_tag = result.find('p')
|
| 169 |
if p_tag:
|
|
|
|
| 173 |
if unique and url_val in fetched_links:
|
| 174 |
continue
|
| 175 |
|
| 176 |
+
fetched_results.append(BingSearchResult(url=url_val, title=title, description=description))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
fetched_links.add(url_val)
|
| 178 |
|
| 179 |
if len(fetched_results) >= max_results:
|
|
|
|
| 182 |
if len(fetched_results) >= max_results:
|
| 183 |
break
|
| 184 |
|
| 185 |
+
# Find the next page URL
|
| 186 |
next_page_info = self._next_page(soup)
|
| 187 |
current_url = next_page_info['url']
|
| 188 |
if current_url:
|