Update main.py
Browse files
main.py
CHANGED
|
@@ -64,16 +64,14 @@ class BingSearch:
|
|
| 64 |
timeout=self.timeout,
|
| 65 |
impersonate=impersonate
|
| 66 |
)
|
| 67 |
-
# It's good practice to set a realistic User-Agent
|
| 68 |
self.session.headers.update({
|
| 69 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
|
| 70 |
})
|
| 71 |
|
| 72 |
-
# FIX: Updated selectors to be more robust against Bing UI changes.
|
| 73 |
def _selectors(self, element):
|
| 74 |
selectors = {
|
| 75 |
-
'links': 'ol#b_results > li', # More
|
| 76 |
-
'next': 'a.sb_pagN'
|
| 77 |
}
|
| 78 |
return selectors[element]
|
| 79 |
|
|
@@ -136,11 +134,9 @@ class BingSearch:
|
|
| 136 |
html = fetch_page(current_url)
|
| 137 |
soup = BeautifulSoup(html, "html.parser")
|
| 138 |
|
| 139 |
-
# Use the more generic selector for result blocks
|
| 140 |
result_blocks = soup.select(self._selectors('links'))
|
| 141 |
|
| 142 |
for result in result_blocks:
|
| 143 |
-
# Find the title and link, which are usually in an <h2> tag
|
| 144 |
title_tag = result.find('h2')
|
| 145 |
if not title_tag:
|
| 146 |
continue
|
|
@@ -152,18 +148,15 @@ class BingSearch:
|
|
| 152 |
url_val = self._get_url(link_tag)
|
| 153 |
title = title_tag.get_text(strip=True)
|
| 154 |
|
| 155 |
-
# Find the description, often in a div with class 'b_caption'
|
| 156 |
desc_container = result.find('div', class_='b_caption')
|
| 157 |
description = ''
|
| 158 |
if desc_container:
|
| 159 |
-
# Find the paragraph within the caption, or use the whole caption text
|
| 160 |
desc_p = desc_container.find('p')
|
| 161 |
if desc_p:
|
| 162 |
description = desc_p.get_text(strip=True)
|
| 163 |
else:
|
| 164 |
description = desc_container.get_text(strip=True)
|
| 165 |
|
| 166 |
-
# Fallback if no 'b_caption' is found
|
| 167 |
if not description:
|
| 168 |
p_tag = result.find('p')
|
| 169 |
if p_tag:
|
|
@@ -173,7 +166,42 @@ class BingSearch:
|
|
| 173 |
if unique and url_val in fetched_links:
|
| 174 |
continue
|
| 175 |
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
fetched_links.add(url_val)
|
| 178 |
|
| 179 |
if len(fetched_results) >= max_results:
|
|
@@ -182,7 +210,6 @@ class BingSearch:
|
|
| 182 |
if len(fetched_results) >= max_results:
|
| 183 |
break
|
| 184 |
|
| 185 |
-
# Find the next page URL
|
| 186 |
next_page_info = self._next_page(soup)
|
| 187 |
current_url = next_page_info['url']
|
| 188 |
if current_url:
|
|
|
|
| 64 |
timeout=self.timeout,
|
| 65 |
impersonate=impersonate
|
| 66 |
)
|
|
|
|
| 67 |
self.session.headers.update({
|
| 68 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
|
| 69 |
})
|
| 70 |
|
|
|
|
| 71 |
def _selectors(self, element):
|
| 72 |
selectors = {
|
| 73 |
+
'links': 'ol#b_results > li.b_algo', # More specific selector for organic results
|
| 74 |
+
'next': 'a.sb_pagN'
|
| 75 |
}
|
| 76 |
return selectors[element]
|
| 77 |
|
|
|
|
| 134 |
html = fetch_page(current_url)
|
| 135 |
soup = BeautifulSoup(html, "html.parser")
|
| 136 |
|
|
|
|
| 137 |
result_blocks = soup.select(self._selectors('links'))
|
| 138 |
|
| 139 |
for result in result_blocks:
|
|
|
|
| 140 |
title_tag = result.find('h2')
|
| 141 |
if not title_tag:
|
| 142 |
continue
|
|
|
|
| 148 |
url_val = self._get_url(link_tag)
|
| 149 |
title = title_tag.get_text(strip=True)
|
| 150 |
|
|
|
|
| 151 |
desc_container = result.find('div', class_='b_caption')
|
| 152 |
description = ''
|
| 153 |
if desc_container:
|
|
|
|
| 154 |
desc_p = desc_container.find('p')
|
| 155 |
if desc_p:
|
| 156 |
description = desc_p.get_text(strip=True)
|
| 157 |
else:
|
| 158 |
description = desc_container.get_text(strip=True)
|
| 159 |
|
|
|
|
| 160 |
if not description:
|
| 161 |
p_tag = result.find('p')
|
| 162 |
if p_tag:
|
|
|
|
| 166 |
if unique and url_val in fetched_links:
|
| 167 |
continue
|
| 168 |
|
| 169 |
+
# --- FIXED: Metadata Parsing Logic ---
|
| 170 |
+
metadata = {}
|
| 171 |
+
|
| 172 |
+
# Parse Sitelinks
|
| 173 |
+
sitelinks_container = result.find('ul', class_='b_vlist')
|
| 174 |
+
if sitelinks_container:
|
| 175 |
+
sitelinks = []
|
| 176 |
+
for link_item in sitelinks_container.find_all('li'):
|
| 177 |
+
sitelink_tag = link_item.find('a')
|
| 178 |
+
if sitelink_tag and sitelink_tag.has_attr('href'):
|
| 179 |
+
sitelinks.append({
|
| 180 |
+
'title': sitelink_tag.get_text(strip=True),
|
| 181 |
+
'url': self._get_url(sitelink_tag)
|
| 182 |
+
})
|
| 183 |
+
if sitelinks:
|
| 184 |
+
metadata['sitelinks'] = sitelinks
|
| 185 |
+
|
| 186 |
+
# Parse Date (heuristic approach)
|
| 187 |
+
if desc_container:
|
| 188 |
+
# Date is often in a span preceding the description text
|
| 189 |
+
date_span = desc_container.find('span', class_=None)
|
| 190 |
+
if date_span:
|
| 191 |
+
date_text = date_span.get_text(strip=True)
|
| 192 |
+
# Simple check for date-like content
|
| 193 |
+
if any(word in date_text.lower() for word in ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'ago']):
|
| 194 |
+
metadata['date'] = date_text.replace('·', '').strip()
|
| 195 |
+
# --- End of Fix ---
|
| 196 |
+
|
| 197 |
+
fetched_results.append(
|
| 198 |
+
BingSearchResult(
|
| 199 |
+
url=url_val,
|
| 200 |
+
title=title,
|
| 201 |
+
description=description,
|
| 202 |
+
metadata=metadata # Pass the populated metadata
|
| 203 |
+
)
|
| 204 |
+
)
|
| 205 |
fetched_links.add(url_val)
|
| 206 |
|
| 207 |
if len(fetched_results) >= max_results:
|
|
|
|
| 210 |
if len(fetched_results) >= max_results:
|
| 211 |
break
|
| 212 |
|
|
|
|
| 213 |
next_page_info = self._next_page(soup)
|
| 214 |
current_url = next_page_info['url']
|
| 215 |
if current_url:
|