Greff3 committed on
Commit
78a19b1
verified
1 Parent(s): 7bf67aa

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +38 -11
main.py CHANGED
@@ -64,16 +64,14 @@ class BingSearch:
64
  timeout=self.timeout,
65
  impersonate=impersonate
66
  )
67
- # It's good practice to set a realistic User-Agent
68
  self.session.headers.update({
69
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
70
  })
71
 
72
- # FIX: Updated selectors to be more robust against Bing UI changes.
73
  def _selectors(self, element):
74
  selectors = {
75
- 'links': 'ol#b_results > li', # More generic selector for any list item in results
76
- 'next': 'a.sb_pagN' # Selector for the "Next" page button
77
  }
78
  return selectors[element]
79
 
@@ -136,11 +134,9 @@ class BingSearch:
136
  html = fetch_page(current_url)
137
  soup = BeautifulSoup(html, "html.parser")
138
 
139
- # Use the more generic selector for result blocks
140
  result_blocks = soup.select(self._selectors('links'))
141
 
142
  for result in result_blocks:
143
- # Find the title and link, which are usually in an <h2> tag
144
  title_tag = result.find('h2')
145
  if not title_tag:
146
  continue
@@ -152,18 +148,15 @@ class BingSearch:
152
  url_val = self._get_url(link_tag)
153
  title = title_tag.get_text(strip=True)
154
 
155
- # Find the description, often in a div with class 'b_caption'
156
  desc_container = result.find('div', class_='b_caption')
157
  description = ''
158
  if desc_container:
159
- # Find the paragraph within the caption, or use the whole caption text
160
  desc_p = desc_container.find('p')
161
  if desc_p:
162
  description = desc_p.get_text(strip=True)
163
  else:
164
  description = desc_container.get_text(strip=True)
165
 
166
- # Fallback if no 'b_caption' is found
167
  if not description:
168
  p_tag = result.find('p')
169
  if p_tag:
@@ -173,7 +166,42 @@ class BingSearch:
173
  if unique and url_val in fetched_links:
174
  continue
175
 
176
- fetched_results.append(BingSearchResult(url=url_val, title=title, description=description))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  fetched_links.add(url_val)
178
 
179
  if len(fetched_results) >= max_results:
@@ -182,7 +210,6 @@ class BingSearch:
182
  if len(fetched_results) >= max_results:
183
  break
184
 
185
- # Find the next page URL
186
  next_page_info = self._next_page(soup)
187
  current_url = next_page_info['url']
188
  if current_url:
 
64
  timeout=self.timeout,
65
  impersonate=impersonate
66
  )
 
67
  self.session.headers.update({
68
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
69
  })
70
 
 
71
  def _selectors(self, element):
72
  selectors = {
73
+ 'links': 'ol#b_results > li.b_algo', # More specific selector for organic results
74
+ 'next': 'a.sb_pagN'
75
  }
76
  return selectors[element]
77
 
 
134
  html = fetch_page(current_url)
135
  soup = BeautifulSoup(html, "html.parser")
136
 
 
137
  result_blocks = soup.select(self._selectors('links'))
138
 
139
  for result in result_blocks:
 
140
  title_tag = result.find('h2')
141
  if not title_tag:
142
  continue
 
148
  url_val = self._get_url(link_tag)
149
  title = title_tag.get_text(strip=True)
150
 
 
151
  desc_container = result.find('div', class_='b_caption')
152
  description = ''
153
  if desc_container:
 
154
  desc_p = desc_container.find('p')
155
  if desc_p:
156
  description = desc_p.get_text(strip=True)
157
  else:
158
  description = desc_container.get_text(strip=True)
159
 
 
160
  if not description:
161
  p_tag = result.find('p')
162
  if p_tag:
 
166
  if unique and url_val in fetched_links:
167
  continue
168
 
169
+ # --- FIXED: Metadata Parsing Logic ---
170
+ metadata = {}
171
+
172
+ # Parse Sitelinks
173
+ sitelinks_container = result.find('ul', class_='b_vlist')
174
+ if sitelinks_container:
175
+ sitelinks = []
176
+ for link_item in sitelinks_container.find_all('li'):
177
+ sitelink_tag = link_item.find('a')
178
+ if sitelink_tag and sitelink_tag.has_attr('href'):
179
+ sitelinks.append({
180
+ 'title': sitelink_tag.get_text(strip=True),
181
+ 'url': self._get_url(sitelink_tag)
182
+ })
183
+ if sitelinks:
184
+ metadata['sitelinks'] = sitelinks
185
+
186
+ # Parse Date (heuristic approach)
187
+ if desc_container:
188
+ # Date is often in a span preceding the description text
189
+ date_span = desc_container.find('span', class_=None)
190
+ if date_span:
191
+ date_text = date_span.get_text(strip=True)
192
+ # Simple check for date-like content
193
+ if any(word in date_text.lower() for word in ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'ago']):
194
+ metadata['date'] = date_text.replace('·', '').strip()
195
+ # --- End of Fix ---
196
+
197
+ fetched_results.append(
198
+ BingSearchResult(
199
+ url=url_val,
200
+ title=title,
201
+ description=description,
202
+ metadata=metadata # Pass the populated metadata
203
+ )
204
+ )
205
  fetched_links.add(url_val)
206
 
207
  if len(fetched_results) >= max_results:
 
210
  if len(fetched_results) >= max_results:
211
  break
212
 
 
213
  next_page_info = self._next_page(soup)
214
  current_url = next_page_info['url']
215
  if current_url: