Update main.py
Browse files
main.py
CHANGED
|
@@ -64,14 +64,16 @@ class BingSearch:
|
|
| 64 |
timeout=self.timeout,
|
| 65 |
impersonate=impersonate
|
| 66 |
)
|
|
|
|
| 67 |
self.session.headers.update({
|
| 68 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
|
| 69 |
})
|
| 70 |
|
|
|
|
| 71 |
def _selectors(self, element):
|
| 72 |
selectors = {
|
| 73 |
-
'links': 'ol#b_results > li',
|
| 74 |
-
'next': 'a.sb_pagN'
|
| 75 |
}
|
| 76 |
return selectors[element]
|
| 77 |
|
|
@@ -134,9 +136,11 @@ class BingSearch:
|
|
| 134 |
html = fetch_page(current_url)
|
| 135 |
soup = BeautifulSoup(html, "html.parser")
|
| 136 |
|
|
|
|
| 137 |
result_blocks = soup.select(self._selectors('links'))
|
| 138 |
|
| 139 |
for result in result_blocks:
|
|
|
|
| 140 |
title_tag = result.find('h2')
|
| 141 |
if not title_tag:
|
| 142 |
continue
|
|
@@ -148,15 +152,18 @@ class BingSearch:
|
|
| 148 |
url_val = self._get_url(link_tag)
|
| 149 |
title = title_tag.get_text(strip=True)
|
| 150 |
|
|
|
|
| 151 |
desc_container = result.find('div', class_='b_caption')
|
| 152 |
description = ''
|
| 153 |
if desc_container:
|
|
|
|
| 154 |
desc_p = desc_container.find('p')
|
| 155 |
if desc_p:
|
| 156 |
description = desc_p.get_text(strip=True)
|
| 157 |
else:
|
| 158 |
description = desc_container.get_text(strip=True)
|
| 159 |
|
|
|
|
| 160 |
if not description:
|
| 161 |
p_tag = result.find('p')
|
| 162 |
if p_tag:
|
|
@@ -166,42 +173,7 @@ class BingSearch:
|
|
| 166 |
if unique and url_val in fetched_links:
|
| 167 |
continue
|
| 168 |
|
| 169 |
-
|
| 170 |
-
metadata = {}
|
| 171 |
-
|
| 172 |
-
# Parse Sitelinks
|
| 173 |
-
sitelinks_container = result.find('ul', class_='b_vlist')
|
| 174 |
-
if sitelinks_container:
|
| 175 |
-
sitelinks = []
|
| 176 |
-
for link_item in sitelinks_container.find_all('li'):
|
| 177 |
-
sitelink_tag = link_item.find('a')
|
| 178 |
-
if sitelink_tag and sitelink_tag.has_attr('href'):
|
| 179 |
-
sitelinks.append({
|
| 180 |
-
'title': sitelink_tag.get_text(strip=True),
|
| 181 |
-
'url': self._get_url(sitelink_tag)
|
| 182 |
-
})
|
| 183 |
-
if sitelinks:
|
| 184 |
-
metadata['sitelinks'] = sitelinks
|
| 185 |
-
|
| 186 |
-
# Parse Date (heuristic approach)
|
| 187 |
-
if desc_container:
|
| 188 |
-
# Date is often in a span preceding the description text
|
| 189 |
-
date_span = desc_container.find('span', class_=None)
|
| 190 |
-
if date_span:
|
| 191 |
-
date_text = date_span.get_text(strip=True)
|
| 192 |
-
# Simple check for date-like content
|
| 193 |
-
if any(word in date_text.lower() for word in ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'ago']):
|
| 194 |
-
metadata['date'] = date_text.replace('·', '').strip()
|
| 195 |
-
# --- End of Fix ---
|
| 196 |
-
|
| 197 |
-
fetched_results.append(
|
| 198 |
-
BingSearchResult(
|
| 199 |
-
url=url_val,
|
| 200 |
-
title=title,
|
| 201 |
-
description=description,
|
| 202 |
-
metadata=metadata # Pass the populated metadata
|
| 203 |
-
)
|
| 204 |
-
)
|
| 205 |
fetched_links.add(url_val)
|
| 206 |
|
| 207 |
if len(fetched_results) >= max_results:
|
|
@@ -210,6 +182,7 @@ class BingSearch:
|
|
| 210 |
if len(fetched_results) >= max_results:
|
| 211 |
break
|
| 212 |
|
|
|
|
| 213 |
next_page_info = self._next_page(soup)
|
| 214 |
current_url = next_page_info['url']
|
| 215 |
if current_url:
|
|
|
|
| 64 |
timeout=self.timeout,
|
| 65 |
impersonate=impersonate
|
| 66 |
)
|
| 67 |
+
# It's good practice to set a realistic User-Agent
|
| 68 |
self.session.headers.update({
|
| 69 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
|
| 70 |
})
|
| 71 |
|
| 72 |
+
# FIX: Updated selectors to be more robust against Bing UI changes.
|
| 73 |
def _selectors(self, element):
|
| 74 |
selectors = {
|
| 75 |
+
'links': 'ol#b_results > li', # More generic selector for any list item in results
|
| 76 |
+
'next': 'a.sb_pagN' # Selector for the "Next" page button
|
| 77 |
}
|
| 78 |
return selectors[element]
|
| 79 |
|
|
|
|
| 136 |
html = fetch_page(current_url)
|
| 137 |
soup = BeautifulSoup(html, "html.parser")
|
| 138 |
|
| 139 |
+
# Use the more generic selector for result blocks
|
| 140 |
result_blocks = soup.select(self._selectors('links'))
|
| 141 |
|
| 142 |
for result in result_blocks:
|
| 143 |
+
# Find the title and link, which are usually in an <h2> tag
|
| 144 |
title_tag = result.find('h2')
|
| 145 |
if not title_tag:
|
| 146 |
continue
|
|
|
|
| 152 |
url_val = self._get_url(link_tag)
|
| 153 |
title = title_tag.get_text(strip=True)
|
| 154 |
|
| 155 |
+
# Find the description, often in a div with class 'b_caption'
|
| 156 |
desc_container = result.find('div', class_='b_caption')
|
| 157 |
description = ''
|
| 158 |
if desc_container:
|
| 159 |
+
# Find the paragraph within the caption, or use the whole caption text
|
| 160 |
desc_p = desc_container.find('p')
|
| 161 |
if desc_p:
|
| 162 |
description = desc_p.get_text(strip=True)
|
| 163 |
else:
|
| 164 |
description = desc_container.get_text(strip=True)
|
| 165 |
|
| 166 |
+
# Fallback if no 'b_caption' is found
|
| 167 |
if not description:
|
| 168 |
p_tag = result.find('p')
|
| 169 |
if p_tag:
|
|
|
|
| 173 |
if unique and url_val in fetched_links:
|
| 174 |
continue
|
| 175 |
|
| 176 |
+
fetched_results.append(BingSearchResult(url=url_val, title=title, description=description))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
fetched_links.add(url_val)
|
| 178 |
|
| 179 |
if len(fetched_results) >= max_results:
|
|
|
|
| 182 |
if len(fetched_results) >= max_results:
|
| 183 |
break
|
| 184 |
|
| 185 |
+
# Find the next page URL
|
| 186 |
next_page_info = self._next_page(soup)
|
| 187 |
current_url = next_page_info['url']
|
| 188 |
if current_url:
|