Greff3 committed on
Commit
c374d16
verified
1 Parent(s): 78a19b1

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +11 -38
main.py CHANGED
@@ -64,14 +64,16 @@ class BingSearch:
64
  timeout=self.timeout,
65
  impersonate=impersonate
66
  )
 
67
  self.session.headers.update({
68
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
69
  })
70
 
 
71
  def _selectors(self, element):
72
  selectors = {
73
- 'links': 'ol#b_results > li.b_algo', # More specific selector for organic results
74
- 'next': 'a.sb_pagN'
75
  }
76
  return selectors[element]
77
 
@@ -134,9 +136,11 @@ class BingSearch:
134
  html = fetch_page(current_url)
135
  soup = BeautifulSoup(html, "html.parser")
136
 
 
137
  result_blocks = soup.select(self._selectors('links'))
138
 
139
  for result in result_blocks:
 
140
  title_tag = result.find('h2')
141
  if not title_tag:
142
  continue
@@ -148,15 +152,18 @@ class BingSearch:
148
  url_val = self._get_url(link_tag)
149
  title = title_tag.get_text(strip=True)
150
 
 
151
  desc_container = result.find('div', class_='b_caption')
152
  description = ''
153
  if desc_container:
 
154
  desc_p = desc_container.find('p')
155
  if desc_p:
156
  description = desc_p.get_text(strip=True)
157
  else:
158
  description = desc_container.get_text(strip=True)
159
 
 
160
  if not description:
161
  p_tag = result.find('p')
162
  if p_tag:
@@ -166,42 +173,7 @@ class BingSearch:
166
  if unique and url_val in fetched_links:
167
  continue
168
 
169
- # --- FIXED: Metadata Parsing Logic ---
170
- metadata = {}
171
-
172
- # Parse Sitelinks
173
- sitelinks_container = result.find('ul', class_='b_vlist')
174
- if sitelinks_container:
175
- sitelinks = []
176
- for link_item in sitelinks_container.find_all('li'):
177
- sitelink_tag = link_item.find('a')
178
- if sitelink_tag and sitelink_tag.has_attr('href'):
179
- sitelinks.append({
180
- 'title': sitelink_tag.get_text(strip=True),
181
- 'url': self._get_url(sitelink_tag)
182
- })
183
- if sitelinks:
184
- metadata['sitelinks'] = sitelinks
185
-
186
- # Parse Date (heuristic approach)
187
- if desc_container:
188
- # Date is often in a span preceding the description text
189
- date_span = desc_container.find('span', class_=None)
190
- if date_span:
191
- date_text = date_span.get_text(strip=True)
192
- # Simple check for date-like content
193
- if any(word in date_text.lower() for word in ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'ago']):
194
- metadata['date'] = date_text.replace('路', '').strip()
195
- # --- End of Fix ---
196
-
197
- fetched_results.append(
198
- BingSearchResult(
199
- url=url_val,
200
- title=title,
201
- description=description,
202
- metadata=metadata # Pass the populated metadata
203
- )
204
- )
205
  fetched_links.add(url_val)
206
 
207
  if len(fetched_results) >= max_results:
@@ -210,6 +182,7 @@ class BingSearch:
210
  if len(fetched_results) >= max_results:
211
  break
212
 
 
213
  next_page_info = self._next_page(soup)
214
  current_url = next_page_info['url']
215
  if current_url:
 
64
  timeout=self.timeout,
65
  impersonate=impersonate
66
  )
67
+ # It's good practice to set a realistic User-Agent
68
  self.session.headers.update({
69
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
70
  })
71
 
72
+ # FIX: Updated selectors to be more robust against Bing UI changes.
73
  def _selectors(self, element):
74
  selectors = {
75
+ 'links': 'ol#b_results > li', # More generic selector for any list item in results
76
+ 'next': 'a.sb_pagN' # Selector for the "Next" page button
77
  }
78
  return selectors[element]
79
 
 
136
  html = fetch_page(current_url)
137
  soup = BeautifulSoup(html, "html.parser")
138
 
139
+ # Use the more generic selector for result blocks
140
  result_blocks = soup.select(self._selectors('links'))
141
 
142
  for result in result_blocks:
143
+ # Find the title and link, which are usually in an <h2> tag
144
  title_tag = result.find('h2')
145
  if not title_tag:
146
  continue
 
152
  url_val = self._get_url(link_tag)
153
  title = title_tag.get_text(strip=True)
154
 
155
+ # Find the description, often in a div with class 'b_caption'
156
  desc_container = result.find('div', class_='b_caption')
157
  description = ''
158
  if desc_container:
159
+ # Find the paragraph within the caption, or use the whole caption text
160
  desc_p = desc_container.find('p')
161
  if desc_p:
162
  description = desc_p.get_text(strip=True)
163
  else:
164
  description = desc_container.get_text(strip=True)
165
 
166
+ # Fallback if no 'b_caption' is found
167
  if not description:
168
  p_tag = result.find('p')
169
  if p_tag:
 
173
  if unique and url_val in fetched_links:
174
  continue
175
 
176
+ fetched_results.append(BingSearchResult(url=url_val, title=title, description=description))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  fetched_links.add(url_val)
178
 
179
  if len(fetched_results) >= max_results:
 
182
  if len(fetched_results) >= max_results:
183
  break
184
 
185
+ # Find the next page URL
186
  next_page_info = self._next_page(soup)
187
  current_url = next_page_info['url']
188
  if current_url: