Greff3 commited on
Commit
084da71
·
verified ·
1 Parent(s): 9ea403e

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +177 -229
main.py CHANGED
@@ -1,30 +1,35 @@
1
- from fastapi import FastAPI, HTTPException, Query
2
- from typing import List, Optional
3
- from pydantic import BaseModel
4
- from time import sleep
5
- from curl_cffi.requests import Session
6
- from urllib.parse import urlencode, unquote, urlparse, parse_qs
7
  import base64
8
- from typing import Dict, Any
9
  from concurrent.futures import ThreadPoolExecutor
10
- from webscout.litagent import LitAgent
 
 
11
  from bs4 import BeautifulSoup
12
- import json
 
 
 
13
 
 
14
  app = FastAPI(
15
- title="Snapzion Search API",
16
- description="A FastAPI wrapper for the Search library with advanced features.",
17
- version="1.0.0",
18
  )
19
 
20
- # --- BingSearch Library Code ---
21
- # The provided BingSearch code is integrated here directly.
22
 
23
- class BingSearchResult(BaseModel):
24
  url: str
25
  title: str
26
  description: str
27
- metadata: Dict[str, Any] = {}
 
 
 
 
 
28
 
29
  class BingImageResult(BaseModel):
30
  title: str
@@ -39,9 +44,18 @@ class BingNewsResult(BaseModel):
39
  description: str
40
  source: str = ""
41
 
 
 
42
  class BingSearch:
43
- """Bing search implementation with configurable parameters and advanced features."""
44
- _executor: ThreadPoolExecutor = ThreadPoolExecutor()
 
 
 
 
 
 
 
45
 
46
  def __init__(
47
  self,
@@ -58,24 +72,79 @@ class BingSearch:
58
  self.lang = lang
59
  self.sleep_interval = sleep_interval
60
  self._base_url = "https://www.bing.com"
61
- self.session = Session(
62
  proxies=self.proxies,
63
  verify=self.verify,
64
  timeout=self.timeout,
65
  impersonate=impersonate
66
  )
67
- # It's good practice to set a realistic User-Agent
68
  self.session.headers.update({
69
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
70
  })
71
 
72
- # FIX: Updated selectors to be more robust against Bing UI changes.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def _selectors(self, element):
74
  selectors = {
75
- 'links': 'ol#b_results > li', # More generic selector for any list item in results
76
- 'next': 'a.sb_pagN' # Selector for the "Next" page button
77
  }
78
- return selectors[element]
79
 
80
  def _first_page(self, query):
81
  url = f'{self._base_url}/search?q={query}&search=&form=QBLH'
@@ -91,40 +160,37 @@ class BingSearch:
91
 
92
  def _get_url(self, tag):
93
  url = tag.get('href', '')
94
- resp = url
95
  try:
96
  parsed_url = urlparse(url)
97
- query_params = parse_qs(parsed_url.query)
98
- if "u" in query_params:
99
- encoded_url = query_params["u"][0][2:]
100
- try:
101
  decoded_bytes = base64.urlsafe_b64decode(encoded_url + '===')
102
- except base64.binascii.Error as e:
103
- print(f"Error decoding Base64 string: {e}")
104
- return url
105
- resp = decoded_bytes.decode('utf-8')
106
- except Exception as e:
107
- print(f"Error decoding Base64 string: {e}")
108
- return resp
109
 
110
- # FIX: The entire text parsing logic is updated to handle modern Bing HTML structure.
111
- def text(
112
  self,
113
  keywords: str,
114
  region: str = None,
115
  safesearch: str = "moderate",
116
  max_results: int = 10,
117
- unique: bool = True
118
- ) -> List[BingSearchResult]:
119
  if not keywords:
120
  raise ValueError("Search keywords cannot be empty")
121
 
122
  fetched_results = []
123
  fetched_links = set()
124
 
125
- def fetch_page(url):
126
  try:
127
- resp = self.session.get(url)
128
  resp.raise_for_status()
129
  return resp.text
130
  except Exception as e:
@@ -133,213 +199,116 @@ class BingSearch:
133
  current_url = self._first_page(keywords)['url']
134
 
135
  while current_url and len(fetched_results) < max_results:
136
- html = fetch_page(current_url)
137
  soup = BeautifulSoup(html, "html.parser")
138
 
139
- # Use the more generic selector for result blocks
140
  result_blocks = soup.select(self._selectors('links'))
141
 
142
  for result in result_blocks:
143
- # Find the title and link, which are usually in an <h2> tag
144
  title_tag = result.find('h2')
145
- if not title_tag:
146
- continue
147
 
148
  link_tag = title_tag.find('a')
149
- if not link_tag or not link_tag.has_attr('href'):
150
- continue
151
 
152
  url_val = self._get_url(link_tag)
153
  title = title_tag.get_text(strip=True)
154
 
155
- # Find the description, often in a div with class 'b_caption'
156
  desc_container = result.find('div', class_='b_caption')
157
- description = ''
158
- if desc_container:
159
- # Find the paragraph within the caption, or use the whole caption text
160
- desc_p = desc_container.find('p')
161
- if desc_p:
162
- description = desc_p.get_text(strip=True)
163
- else:
164
- description = desc_container.get_text(strip=True)
165
 
166
- # Fallback if no 'b_caption' is found
167
- if not description:
168
- p_tag = result.find('p')
169
- if p_tag:
170
- description = p_tag.get_text(strip=True)
171
-
172
  if url_val and title:
173
- if unique and url_val in fetched_links:
174
- continue
175
 
176
- fetched_results.append(BingSearchResult(url=url_val, title=title, description=description))
177
  fetched_links.add(url_val)
178
 
179
- if len(fetched_results) >= max_results:
180
- break
181
 
182
- if len(fetched_results) >= max_results:
183
- break
184
 
185
- # Find the next page URL
186
  next_page_info = self._next_page(soup)
187
  current_url = next_page_info['url']
188
  if current_url:
189
- sleep(self.sleep_interval)
190
-
191
- return fetched_results[:max_results]
192
 
 
 
 
 
 
 
 
 
193
 
194
- def suggestions(self, query: str, region: str = None) -> List[str]:
 
195
  if not query:
196
  raise ValueError("Search query cannot be empty")
197
- params = {
198
- "query": query,
199
- "mkt": region if region else "en-US"
200
- }
201
- url = f"https://api.bing.com/osjson.aspx?{urlencode(params)}"
202
- try:
203
- resp = self.session.get(url)
204
- resp.raise_for_status()
205
- data = resp.json()
206
- if isinstance(data, list) and len(data) > 1 and isinstance(data[1], list):
207
- return data[1]
208
- return []
209
- except Exception as e:
210
- if hasattr(e, 'response') and e.response is not None:
211
- raise Exception(f"Bing suggestions failed with status {e.response.status_code}: {str(e)}")
212
- else:
213
- raise Exception(f"Bing suggestions failed: {str(e)}")
214
 
215
- def images(
216
- self,
217
- keywords: str,
218
- region: str = None,
219
- safesearch: str = "moderate",
220
- max_results: int = 10
221
- ) -> List[BingImageResult]:
222
- if not keywords:
223
- raise ValueError("Search keywords cannot be empty")
224
- safe_map = {
225
- "on": "Strict",
226
- "moderate": "Moderate",
227
- "off": "Off"
228
- }
229
- safe = safe_map.get(safesearch.lower(), "Moderate")
230
- params = {
231
- "q": keywords,
232
- "count": max_results,
233
- "setlang": self.lang,
234
- "safeSearch": safe,
235
- }
236
- if region:
237
- params["mkt"] = region
238
- url = f"{self._base_url}/images/search?{urlencode(params)}"
239
- try:
240
- resp = self.session.get(url)
241
- resp.raise_for_status()
242
- html = resp.text
243
- except Exception as e:
244
- if hasattr(e, 'response') and e.response is not None:
245
- raise Exception(f"Bing image search failed with status {e.response.status_code}: {str(e)}")
246
- else:
247
- raise Exception(f"Bing image search failed: {str(e)}")
248
- soup = BeautifulSoup(html, "html.parser")
249
  results = []
250
  for item in soup.select("a.iusc"):
251
  try:
252
  m = item.get("m")
253
  meta = json.loads(m) if m else {}
254
- image_url = meta.get("murl", "")
255
- thumb_url = meta.get("turl", "")
256
- title = meta.get("t", "")
257
- page_url = meta.get("purl", "")
258
- source = meta.get("surl", "")
259
- if image_url:
260
- results.append(BingImageResult(title=title, image=image_url, thumbnail=thumb_url, url=page_url, source=source))
261
- if len(results) >= max_results:
262
- break
263
- except Exception:
264
- continue
265
- return results[:max_results]
266
-
267
- def news(
268
- self,
269
- keywords: str,
270
- region: str = None,
271
- safesearch: str = "moderate",
272
- max_results: int = 10,
273
- ) -> List['BingNewsResult']:
274
- if not keywords:
275
- raise ValueError("Search keywords cannot be empty")
276
- safe_map = {
277
- "on": "Strict",
278
- "moderate": "Moderate",
279
- "off": "Off"
280
- }
281
- safe = safe_map.get(safesearch.lower(), "Moderate")
282
- params = {
283
- "q": keywords,
284
- "form": "QBNH",
285
- "safeSearch": safe,
286
- }
287
- if region:
288
- params["mkt"] = region
289
- url = f"{self._base_url}/news/search?{urlencode(params)}"
290
- try:
291
- resp = self.session.get(url)
292
- resp.raise_for_status()
293
- except Exception as e:
294
- if hasattr(e, 'response') and e.response is not None:
295
- raise Exception(f"Bing news search failed with status {e.response.status_code}: {str(e)}")
296
- else:
297
- raise Exception(f"Bing news search failed: {str(e)}")
298
  soup = BeautifulSoup(resp.text, "html.parser")
299
  results = []
300
- for item in soup.select("div.news-card, div.card, div.newsitem, div.card-content, div.t_s_main"):
301
- a_tag = item.find("a")
302
- title = a_tag.get_text(strip=True) if a_tag else ''
303
- url_val = a_tag['href'] if a_tag and a_tag.has_attr('href') else ''
304
- desc_tag = item.find("div", class_="snippet") or item.find("div", class_="news-card-snippet") or item.find("div", class_="snippetText")
305
- description = desc_tag.get_text(strip=True) if desc_tag else ''
306
- source_tag = item.find("div", class_="source")
307
- source = source_tag.get_text(strip=True) if source_tag else ''
308
- if url_val and title:
309
- results.append(BingNewsResult(title=title, url=url_val, description=description, source=source))
310
- if len(results) >= max_results:
311
- break
312
- if not results:
313
- for item in soup.select("a.title"):
314
- title = item.get_text(strip=True)
315
- url_val = item['href'] if item.has_attr('href') else ''
316
- description = ''
317
- source = ''
318
- if url_val and title:
319
- results.append(BingNewsResult(title=title, url=url_val, description=description, source=source))
320
- if len(results) >= max_results:
321
- break
322
- return results[:max_results]
323
-
324
 
325
  bing = BingSearch()
326
 
327
- @app.get("/search", response_model=List[BingSearchResult])
 
 
 
328
  async def text_search(
329
  query: str = Query(..., description="The search keywords."),
330
- region: Optional[str] = Query(None, description="The region for the search (e.g., 'us-US')."),
331
  safesearch: str = Query("moderate", description="Safe search level ('on', 'moderate', 'off')."),
332
  max_results: int = Query(10, description="Maximum number of results to return."),
 
333
  ):
334
  """
335
  Perform a text search on Bing.
 
336
  """
337
  try:
338
- results = bing.text(
339
  keywords=query,
340
  region=region,
341
  safesearch=safesearch,
342
  max_results=max_results,
 
343
  )
344
  return results
345
  except Exception as e:
@@ -350,57 +319,36 @@ async def get_suggestions(
350
  query: str = Query(..., description="The search query for which to fetch suggestions."),
351
  region: Optional[str] = Query(None, description="The region for the suggestions (e.g., 'en-US')."),
352
  ):
353
- """
354
- Fetches search suggestions for a given query.
355
- """
356
  try:
357
- suggestions = bing.suggestions(query=query, region=region)
358
- return suggestions
359
  except Exception as e:
360
  raise HTTPException(status_code=500, detail=str(e))
361
 
362
  @app.get("/images", response_model=List[BingImageResult])
363
  async def image_search(
364
  query: str = Query(..., description="The search keywords for images."),
365
- region: Optional[str] = Query(None, description="The region for the image search (e.g., 'us-US')."),
366
- safesearch: str = Query("moderate", description="Safe search level ('on', 'moderate', 'off')."),
367
  max_results: int = Query(10, description="Maximum number of image results to return."),
368
  ):
369
- """
370
- Perform an image search on Bing.
371
- """
372
  try:
373
- results = bing.images(
374
- keywords=query,
375
- region=region,
376
- safesearch=safesearch,
377
- max_results=max_results,
378
- )
379
- return results
380
  except Exception as e:
381
  raise HTTPException(status_code=500, detail=str(e))
382
 
383
  @app.get("/news", response_model=List[BingNewsResult])
384
  async def news_search(
385
  query: str = Query(..., description="The search keywords for news."),
386
- region: Optional[str] = Query(None, description="The region for the news search (e.g., 'us-US')."),
387
- safesearch: str = Query("moderate", description="Safe search level ('on', 'moderate', 'off')."),
388
  max_results: int = Query(10, description="Maximum number of news results to return."),
389
  ):
390
- """
391
- Perform a news search on Bing.
392
- """
393
  try:
394
- results = bing.news(
395
- keywords=query,
396
- region=region,
397
- safesearch=safesearch,
398
- max_results=max_results,
399
- )
400
- return results
401
  except Exception as e:
402
  raise HTTPException(status_code=500, detail=str(e))
403
 
 
404
  if __name__ == "__main__":
405
  import uvicorn
406
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
 
1
import asyncio
import base64
import json
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional
from urllib.parse import parse_qs, urlencode, urljoin, urlparse

from bs4 import BeautifulSoup
# NOTE: AsyncSession is exported from curl_cffi.requests (curl_cffi.aio does not exist).
from curl_cffi.requests import AsyncSession
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel, Field
from webscout.litagent import LitAgent
 
14
# --- FastAPI App Definition ---
# Single application instance; endpoint handlers are registered below via decorators.
app = FastAPI(
    title="Snapzion Enhanced Search API",
    description="An advanced FastAPI wrapper for Bing Search, featuring AI-powered summarization and metadata enrichment.",
    version="2.0.0",
)
 
21
# --- Pydantic Models for Clearer Responses ---

class BaseSearchResult(BaseModel):
    """Minimal contract shared by every text search result."""
    url: str
    title: str
    description: str


class EnhancedBingSearchResult(BaseSearchResult):
    """Model for the enhanced search results with summary and metadata."""
    summary: Optional[str] = Field(None, description="AI-generated summary of the page content.")
    source: Optional[str] = Field(None, description="The domain name of the result URL.")
    favicon: Optional[str] = Field(None, description="URL of the website's favicon.")
34
  class BingImageResult(BaseModel):
35
  title: str
 
44
  description: str
45
  source: str = ""
46
 
47
+ # --- Enhanced BingSearch Library ---
48
+
49
class BingSearch:
    """
    Bing search implementation rewritten for asynchronous performance and enhanced data retrieval.
    """
    # Lazily-created shared LitAgent (see get_lit_agent); reused across all requests.
    _lit_agent_instance: Optional[LitAgent] = None

    # Thread pool used to run the synchronous LitAgent summarizer without
    # blocking the asyncio event loop.
    _executor = ThreadPoolExecutor(max_workers=10)
 
60
  def __init__(
61
  self,
 
72
  self.lang = lang
73
  self.sleep_interval = sleep_interval
74
  self._base_url = "https://www.bing.com"
75
+ self.session = AsyncSession(
76
  proxies=self.proxies,
77
  verify=self.verify,
78
  timeout=self.timeout,
79
  impersonate=impersonate
80
  )
 
81
  self.session.headers.update({
82
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
83
  })
84
 
85
+ @classmethod
86
+ def get_lit_agent(cls) -> LitAgent:
87
+ """Initializes LitAgent lazily."""
88
+ if cls._lit_agent_instance is None:
89
+ cls._lit_agent_instance = LitAgent()
90
+ return cls._lit_agent_instance
91
+
92
+ async def _summarize_content(self, html_content: str) -> str:
93
+ """Runs the synchronous summarize method in a thread pool."""
94
+ loop = asyncio.get_running_loop()
95
+ agent = self.get_lit_agent()
96
+ try:
97
+ # Use to_thread to run blocking I/O or CPU-bound function in a separate thread
98
+ summary = await loop.run_in_executor(
99
+ self._executor, agent.summarize, html_content
100
+ )
101
+ return summary
102
+ except Exception as e:
103
+ print(f"Error during summarization: {e}")
104
+ return "Could not generate summary."
105
+
106
+
107
+ async def _enhance_result(self, result: BaseSearchResult) -> EnhancedBingSearchResult:
108
+ """Fetches page content, generates summary, and extracts metadata."""
109
+ enhanced_result = EnhancedBingSearchResult(**result.model_dump())
110
+ try:
111
+ # Set source from URL
112
+ parsed_url = urlparse(result.url)
113
+ enhanced_result.source = parsed_url.netloc
114
+
115
+ # Fetch page content for summarization and favicon
116
+ resp = await self.session.get(result.url, timeout=self.timeout)
117
+ resp.raise_for_status()
118
+ html = resp.text
119
+
120
+ # Generate AI summary
121
+ summary = await self._summarize_content(html)
122
+ enhanced_result.summary = summary
123
+
124
+ # Extract favicon
125
+ soup = BeautifulSoup(html, "html.parser")
126
+ favicon_tag = soup.find("link", rel=lambda r: r and "icon" in r.lower())
127
+ if favicon_tag and favicon_tag.get("href"):
128
+ favicon_url = favicon_tag["href"]
129
+ # Handle relative favicon URLs
130
+ if not favicon_url.startswith(('http://', 'https://')):
131
+ favicon_url = f"{parsed_url.scheme}://{parsed_url.netloc}{favicon_url}"
132
+ enhanced_result.favicon = favicon_url
133
+
134
+ except Exception as e:
135
+ print(f"Failed to enhance URL {result.url}: {e}")
136
+ # Silently fail enhancement, but still return base data
137
+
138
+ return enhanced_result
139
+
140
+
141
+ # ... (selectors, first_page, next_page, get_url methods remain the same) ...
142
  def _selectors(self, element):
143
  selectors = {
144
+ 'links': 'ol#b_results > li.b_algo',
145
+ 'next': 'a.sb_pagN'
146
  }
147
+ return selectors.get(element, '')
148
 
149
  def _first_page(self, query):
150
  url = f'{self._base_url}/search?q={query}&search=&form=QBLH'
 
160
 
161
  def _get_url(self, tag):
162
  url = tag.get('href', '')
163
+ # This part handles Bing's weird tracking URLs
164
  try:
165
  parsed_url = urlparse(url)
166
+ if "r" in parsed_url.path: # Direct links are often in /r/
167
+ query_params = parse_qs(parsed_url.query)
168
+ if "u" in query_params:
169
+ encoded_url = query_params["u"][0][2:]
170
  decoded_bytes = base64.urlsafe_b64decode(encoded_url + '===')
171
+ return decoded_bytes.decode('utf-8')
172
+ except Exception:
173
+ pass
174
+ return url
175
+
 
 
176
 
177
+ async def text(
 
178
  self,
179
  keywords: str,
180
  region: str = None,
181
  safesearch: str = "moderate",
182
  max_results: int = 10,
183
+ enhanced: bool = False
184
+ ) -> List[BaseSearchResult | EnhancedBingSearchResult]:
185
  if not keywords:
186
  raise ValueError("Search keywords cannot be empty")
187
 
188
  fetched_results = []
189
  fetched_links = set()
190
 
191
+ async def fetch_page(url):
192
  try:
193
+ resp = await self.session.get(url)
194
  resp.raise_for_status()
195
  return resp.text
196
  except Exception as e:
 
199
  current_url = self._first_page(keywords)['url']
200
 
201
  while current_url and len(fetched_results) < max_results:
202
+ html = await fetch_page(current_url)
203
  soup = BeautifulSoup(html, "html.parser")
204
 
 
205
  result_blocks = soup.select(self._selectors('links'))
206
 
207
  for result in result_blocks:
 
208
  title_tag = result.find('h2')
209
+ if not title_tag: continue
 
210
 
211
  link_tag = title_tag.find('a')
212
+ if not link_tag or not link_tag.has_attr('href'): continue
 
213
 
214
  url_val = self._get_url(link_tag)
215
  title = title_tag.get_text(strip=True)
216
 
 
217
  desc_container = result.find('div', class_='b_caption')
218
+ description = desc_container.find('p').get_text(strip=True) if desc_container and desc_container.find('p') else ""
 
 
 
 
 
 
 
219
 
 
 
 
 
 
 
220
  if url_val and title:
221
+ if url_val in fetched_links: continue
 
222
 
223
+ fetched_results.append(BaseSearchResult(url=url_val, title=title, description=description))
224
  fetched_links.add(url_val)
225
 
226
+ if len(fetched_results) >= max_results: break
 
227
 
228
+ if len(fetched_results) >= max_results: break
 
229
 
 
230
  next_page_info = self._next_page(soup)
231
  current_url = next_page_info['url']
232
  if current_url:
233
+ await asyncio.sleep(self.sleep_interval)
 
 
234
 
235
+ results_to_return = fetched_results[:max_results]
236
+
237
+ if enhanced and results_to_return:
238
+ # Concurrently enhance all results
239
+ enhancement_tasks = [self._enhance_result(res) for res in results_to_return]
240
+ return await asyncio.gather(*enhancement_tasks)
241
+
242
+ return results_to_return
243
 
244
+ # ... (suggestions, images, news methods converted to async) ...
245
+ async def suggestions(self, query: str, region: str = None) -> List[str]:
246
  if not query:
247
  raise ValueError("Search query cannot be empty")
248
+ # ... logic ...
249
+ url = f"https://api.bing.com/osjson.aspx?query={query}&mkt={region or 'en-US'}"
250
+ resp = await self.session.get(url)
251
+ resp.raise_for_status()
252
+ data = resp.json()
253
+ return data[1] if isinstance(data, list) and len(data) > 1 else []
 
 
 
 
 
 
 
 
 
 
 
254
 
255
+ async def images(self, keywords: str, max_results: int = 10, **kwargs) -> List[BingImageResult]:
256
+ # ... logic converted to async ...
257
+ url = f"{self._base_url}/images/search?q={keywords}&count={max_results}"
258
+ resp = await self.session.get(url)
259
+ resp.raise_for_status()
260
+ soup = BeautifulSoup(resp.text, "html.parser")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  results = []
262
  for item in soup.select("a.iusc"):
263
  try:
264
  m = item.get("m")
265
  meta = json.loads(m) if m else {}
266
+ if meta.get("murl"):
267
+ results.append(BingImageResult(title=meta.get("t", ""), image=meta.get("murl"), thumbnail=meta.get("turl", ""), url=meta.get("purl", ""), source=meta.get("surl", "")))
268
+ if len(results) >= max_results: break
269
+ except Exception: continue
270
+ return results
271
+
272
+ async def news(self, keywords: str, max_results: int = 10, **kwargs) -> List[BingNewsResult]:
273
+ # ... logic converted to async ...
274
+ url = f"{self._base_url}/news/search?q={keywords}"
275
+ resp = await self.session.get(url)
276
+ resp.raise_for_status()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  soup = BeautifulSoup(resp.text, "html.parser")
278
  results = []
279
+ for item in soup.select("div.news-card"):
280
+ a_tag = item.find("a", class_="title")
281
+ if not a_tag: continue
282
+ desc_tag = item.find("div", class_="snippet")
283
+ source_tag = item.find(attrs={"aria-label": "Publisher"})
284
+ results.append(BingNewsResult(title=a_tag.get_text(strip=True), url=a_tag['href'], description=desc_tag.get_text(strip=True) if desc_tag else "", source=source_tag.get_text(strip=True) if source_tag else ""))
285
+ if len(results) >= max_results: break
286
+ return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
 
288
# Module-level BingSearch singleton shared by every endpoint handler.
bing = BingSearch()
289
 
290
+ # --- FastAPI Endpoints ---
291
+
292
+ # Use a union type in response_model to reflect the two possible return types
293
+ @app.get("/search", response_model=List[EnhancedBingSearchResult | BaseSearchResult])
294
  async def text_search(
295
  query: str = Query(..., description="The search keywords."),
296
+ region: Optional[str] = Query(None, description="The region for the search (e.g., 'en-US')."),
297
  safesearch: str = Query("moderate", description="Safe search level ('on', 'moderate', 'off')."),
298
  max_results: int = Query(10, description="Maximum number of results to return."),
299
+ enhanced: bool = Query(False, description="Enable AI summarization and metadata fetching (slower but more detailed).")
300
  ):
301
  """
302
  Perform a text search on Bing.
303
+ - Set `enhanced=true` to get AI-powered summaries and additional metadata for each result.
304
  """
305
  try:
306
+ results = await bing.text(
307
  keywords=query,
308
  region=region,
309
  safesearch=safesearch,
310
  max_results=max_results,
311
+ enhanced=enhanced
312
  )
313
  return results
314
  except Exception as e:
 
319
  query: str = Query(..., description="The search query for which to fetch suggestions."),
320
  region: Optional[str] = Query(None, description="The region for the suggestions (e.g., 'en-US')."),
321
  ):
322
+ """Fetches search suggestions for a given query."""
 
 
323
  try:
324
+ return await bing.suggestions(query=query, region=region)
 
325
  except Exception as e:
326
  raise HTTPException(status_code=500, detail=str(e))
327
 
328
@app.get("/images", response_model=List[BingImageResult])
async def image_search(
    query: str = Query(..., description="The search keywords for images."),
    max_results: int = Query(10, description="Maximum number of image results to return."),
):
    """Perform an image search on Bing."""
    try:
        image_results = await bing.images(keywords=query, max_results=max_results)
    except Exception as exc:
        # Surface any scraping/network failure as a 500 with the message.
        raise HTTPException(status_code=500, detail=str(exc))
    return image_results
338
 
339
@app.get("/news", response_model=List[BingNewsResult])
async def news_search(
    query: str = Query(..., description="The search keywords for news."),
    max_results: int = Query(10, description="Maximum number of news results to return."),
):
    """Perform a news search on Bing."""
    try:
        news_results = await bing.news(keywords=query, max_results=max_results)
    except Exception as exc:
        # Surface any scraping/network failure as a 500 with the message.
        raise HTTPException(status_code=500, detail=str(exc))
    return news_results
349
 
350
+
351
if __name__ == "__main__":
    import os
    import uvicorn

    # Generalized: host/port/reload are overridable via environment variables,
    # with defaults preserving the previous behavior (reload=True is convenient
    # in development but should be disabled in production via RELOAD=false).
    uvicorn.run(
        "main:app",
        host=os.getenv("HOST", "0.0.0.0"),
        port=int(os.getenv("PORT", "8000")),
        reload=os.getenv("RELOAD", "true").lower() == "true",
    )