Update main.py
Browse files
main.py
CHANGED
|
@@ -64,16 +64,14 @@ class BingSearch:
|
|
| 64 |
timeout=self.timeout,
|
| 65 |
impersonate=impersonate
|
| 66 |
)
|
| 67 |
-
# It's good practice to set a realistic User-Agent
|
| 68 |
self.session.headers.update({
|
| 69 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
|
| 70 |
})
|
| 71 |
|
| 72 |
-
# FIX: Updated selectors to be more robust against Bing UI changes.
|
| 73 |
def _selectors(self, element):
|
| 74 |
selectors = {
|
| 75 |
-
'links': 'ol#b_results > li', # More
|
| 76 |
-
'next': 'a.sb_pagN'
|
| 77 |
}
|
| 78 |
return selectors[element]
|
| 79 |
|
|
@@ -136,11 +134,9 @@ class BingSearch:
|
|
| 136 |
html = fetch_page(current_url)
|
| 137 |
soup = BeautifulSoup(html, "html.parser")
|
| 138 |
|
| 139 |
-
# Use the more generic selector for result blocks
|
| 140 |
result_blocks = soup.select(self._selectors('links'))
|
| 141 |
|
| 142 |
for result in result_blocks:
|
| 143 |
-
# Find the title and link, which are usually in an <h2> tag
|
| 144 |
title_tag = result.find('h2')
|
| 145 |
if not title_tag:
|
| 146 |
continue
|
|
@@ -152,18 +148,15 @@ class BingSearch:
|
|
| 152 |
url_val = self._get_url(link_tag)
|
| 153 |
title = title_tag.get_text(strip=True)
|
| 154 |
|
| 155 |
-
# Find the description, often in a div with class 'b_caption'
|
| 156 |
desc_container = result.find('div', class_='b_caption')
|
| 157 |
description = ''
|
| 158 |
if desc_container:
|
| 159 |
-
# Find the paragraph within the caption, or use the whole caption text
|
| 160 |
desc_p = desc_container.find('p')
|
| 161 |
if desc_p:
|
| 162 |
description = desc_p.get_text(strip=True)
|
| 163 |
else:
|
| 164 |
description = desc_container.get_text(strip=True)
|
| 165 |
|
| 166 |
-
# Fallback if no 'b_caption' is found
|
| 167 |
if not description:
|
| 168 |
p_tag = result.find('p')
|
| 169 |
if p_tag:
|
|
@@ -173,7 +166,42 @@ class BingSearch:
|
|
| 173 |
if unique and url_val in fetched_links:
|
| 174 |
continue
|
| 175 |
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
fetched_links.add(url_val)
|
| 178 |
|
| 179 |
if len(fetched_results) >= max_results:
|
|
@@ -182,7 +210,6 @@ class BingSearch:
|
|
| 182 |
if len(fetched_results) >= max_results:
|
| 183 |
break
|
| 184 |
|
| 185 |
-
# Find the next page URL
|
| 186 |
next_page_info = self._next_page(soup)
|
| 187 |
current_url = next_page_info['url']
|
| 188 |
if current_url:
|
|
|
|
| 64 |
timeout=self.timeout,
|
| 65 |
impersonate=impersonate
|
| 66 |
)
|
|
|
|
| 67 |
self.session.headers.update({
|
| 68 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
|
| 69 |
})
|
| 70 |
|
|
|
|
| 71 |
def _selectors(self, element):
|
| 72 |
selectors = {
|
| 73 |
+
'links': 'ol#b_results > li.b_algo', # More specific selector for organic results
|
| 74 |
+
'next': 'a.sb_pagN'
|
| 75 |
}
|
| 76 |
return selectors[element]
|
| 77 |
|
|
|
|
| 134 |
html = fetch_page(current_url)
|
| 135 |
soup = BeautifulSoup(html, "html.parser")
|
| 136 |
|
|
|
|
| 137 |
result_blocks = soup.select(self._selectors('links'))
|
| 138 |
|
| 139 |
for result in result_blocks:
|
|
|
|
| 140 |
title_tag = result.find('h2')
|
| 141 |
if not title_tag:
|
| 142 |
continue
|
|
|
|
| 148 |
url_val = self._get_url(link_tag)
|
| 149 |
title = title_tag.get_text(strip=True)
|
| 150 |
|
|
|
|
| 151 |
desc_container = result.find('div', class_='b_caption')
|
| 152 |
description = ''
|
| 153 |
if desc_container:
|
|
|
|
| 154 |
desc_p = desc_container.find('p')
|
| 155 |
if desc_p:
|
| 156 |
description = desc_p.get_text(strip=True)
|
| 157 |
else:
|
| 158 |
description = desc_container.get_text(strip=True)
|
| 159 |
|
|
|
|
| 160 |
if not description:
|
| 161 |
p_tag = result.find('p')
|
| 162 |
if p_tag:
|
|
|
|
| 166 |
if unique and url_val in fetched_links:
|
| 167 |
continue
|
| 168 |
|
| 169 |
+
# --- FIXED: Metadata Parsing Logic ---
|
| 170 |
+
metadata = {}
|
| 171 |
+
|
| 172 |
+
# Parse Sitelinks
|
| 173 |
+
sitelinks_container = result.find('ul', class_='b_vlist')
|
| 174 |
+
if sitelinks_container:
|
| 175 |
+
sitelinks = []
|
| 176 |
+
for link_item in sitelinks_container.find_all('li'):
|
| 177 |
+
sitelink_tag = link_item.find('a')
|
| 178 |
+
if sitelink_tag and sitelink_tag.has_attr('href'):
|
| 179 |
+
sitelinks.append({
|
| 180 |
+
'title': sitelink_tag.get_text(strip=True),
|
| 181 |
+
'url': self._get_url(sitelink_tag)
|
| 182 |
+
})
|
| 183 |
+
if sitelinks:
|
| 184 |
+
metadata['sitelinks'] = sitelinks
|
| 185 |
+
|
| 186 |
+
# Parse Date (heuristic approach)
|
| 187 |
+
if desc_container:
|
| 188 |
+
# Date is often in a span preceding the description text
|
| 189 |
+
date_span = desc_container.find('span', class_=None)
|
| 190 |
+
if date_span:
|
| 191 |
+
date_text = date_span.get_text(strip=True)
|
| 192 |
+
# Simple check for date-like content
|
| 193 |
+
if any(word in date_text.lower() for word in ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'ago']):
|
| 194 |
+
metadata['date'] = date_text.replace('·', '').strip()
|
| 195 |
+
# --- End of Fix ---
|
| 196 |
+
|
| 197 |
+
fetched_results.append(
|
| 198 |
+
BingSearchResult(
|
| 199 |
+
url=url_val,
|
| 200 |
+
title=title,
|
| 201 |
+
description=description,
|
| 202 |
+
metadata=metadata # Pass the populated metadata
|
| 203 |
+
)
|
| 204 |
+
)
|
| 205 |
fetched_links.add(url_val)
|
| 206 |
|
| 207 |
if len(fetched_results) >= max_results:
|
|
|
|
| 210 |
if len(fetched_results) >= max_results:
|
| 211 |
break
|
| 212 |
|
|
|
|
| 213 |
next_page_info = self._next_page(soup)
|
| 214 |
current_url = next_page_info['url']
|
| 215 |
if current_url:
|