Greff3 committed
Commit 5a6ba72 · verified · 1 Parent(s): 084da71

Update main.py

Files changed (1)
  1. main.py +34 -74
main.py CHANGED
@@ -3,10 +3,11 @@ import base64
 import json
 from concurrent.futures import ThreadPoolExecutor
 from typing import Any, Dict, List, Optional
-from urllib.parse import urlparse
+from urllib.parse import parse_qs, urlparse

 from bs4 import BeautifulSoup
-from curl_cffi.aio import AsyncSession  # Using the async version
+# This import will work correctly after running `pip install --upgrade curl_cffi`
+from curl_cffi.aio import AsyncSession
 from fastapi import FastAPI, HTTPException, Query
 from pydantic import BaseModel, Field
 from webscout.litagent import LitAgent
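Note: the added comment claims that upgrading curl_cffi makes `from curl_cffi.aio import AsyncSession` work, but in the curl_cffi releases I'm aware of, AsyncSession is exported from curl_cffi.requests, not curl_cffi.aio. A guarded import is a safer sketch (assuming one of the two paths exists in the installed version):

    # Hedged import: try the path this commit uses, then fall back to the
    # location curl_cffi itself documents.
    try:
        from curl_cffi.aio import AsyncSession
    except ImportError:
        from curl_cffi.requests import AsyncSession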
@@ -15,7 +16,7 @@ from webscout.litagent import LitAgent
 app = FastAPI(
     title="Snapzion Enhanced Search API",
     description="An advanced FastAPI wrapper for Bing Search, featuring AI-powered summarization and metadata enrichment.",
-    version="2.0.0",
+    version="2.0.1",  # Version bump
 )

 # --- Pydantic Models for Clearer Responses ---
@@ -50,13 +51,9 @@ class BingSearch:
     """
     Bing search implementation rewritten for asynchronous performance and enhanced data retrieval.
     """
-    # LitAgent is a singleton to reuse its model
     _lit_agent_instance: Optional[LitAgent] = None
-
-    # Run synchronous LitAgent in a thread pool to not block the event loop
     _executor = ThreadPoolExecutor(max_workers=10)

-
     def __init__(
         self,
         timeout: int = 10,
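Note: get_lit_agent() is called later in this diff but its definition is not part of the change. A lazy singleton accessor consistent with the _lit_agent_instance attribute above could look like this (a sketch, not the repository's actual definition):

    @classmethod
    def get_lit_agent(cls) -> LitAgent:
        # Create the shared LitAgent on first use so its model loads only once.
        if cls._lit_agent_instance is None:
            cls._lit_agent_instance = LitAgent()
        return cls._lit_agent_instance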
@@ -94,7 +91,6 @@ class BingSearch:
         loop = asyncio.get_running_loop()
         agent = self.get_lit_agent()
         try:
-            # Use to_thread to run blocking I/O or CPU-bound function in a separate thread
             summary = await loop.run_in_executor(
                 self._executor, agent.summarize, html_content
             )
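Note: the deleted comment mentioned to_thread. On Python 3.9+ the same off-loading can be written without managing an explicit executor, at the cost of using the default thread pool instead of the class-level one:

    # Equivalent sketch with asyncio.to_thread (Python 3.9+):
    summary = await asyncio.to_thread(agent.summarize, html_content)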
@@ -108,37 +104,29 @@ class BingSearch:
         """Fetches page content, generates summary, and extracts metadata."""
         enhanced_result = EnhancedBingSearchResult(**result.model_dump())
         try:
-            # Set source from URL
             parsed_url = urlparse(result.url)
             enhanced_result.source = parsed_url.netloc

-            # Fetch page content for summarization and favicon
             resp = await self.session.get(result.url, timeout=self.timeout)
             resp.raise_for_status()
             html = resp.text

-            # Generate AI summary
             summary = await self._summarize_content(html)
             enhanced_result.summary = summary

-            # Extract favicon
             soup = BeautifulSoup(html, "html.parser")
             favicon_tag = soup.find("link", rel=lambda r: r and "icon" in r.lower())
             if favicon_tag and favicon_tag.get("href"):
                 favicon_url = favicon_tag["href"]
-                # Handle relative favicon URLs
-                if not favicon_url.startswith(('http://', 'https://')):
-                    favicon_url = f"{parsed_url.scheme}://{parsed_url.netloc}{favicon_url}"
+                if not favicon_url.startswith(('http://', 'https://', '//')):
+                    favicon_url = f"{parsed_url.scheme}://{parsed_url.netloc}{'/' if not favicon_url.startswith('/') else ''}{favicon_url}"
+                elif favicon_url.startswith('//'):
+                    favicon_url = f"{parsed_url.scheme}:{favicon_url}"
                 enhanced_result.favicon = favicon_url
-
         except Exception as e:
             print(f"Failed to enhance URL {result.url}: {e}")
-            # Silently fail enhancement, but still return base data
-
         return enhanced_result


-    # ... (selectors, first_page, next_page, get_url methods remain the same) ...
     def _selectors(self, element):
         selectors = {
             'links': 'ol#b_results > li.b_algo',
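Note: the new branches resolve relative, root-relative, and protocol-relative favicon hrefs by hand. urllib.parse.urljoin covers all of these cases in one call; a sketch that should be behaviorally equivalent:

    from urllib.parse import urljoin

    # Resolves "icon.png", "/icon.png", and "//cdn.example.com/icon.png"
    # against the page URL, yielding an absolute favicon URL in each case.
    favicon_url = urljoin(result.url, favicon_tag["href"])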
@@ -153,34 +141,16 @@ class BingSearch:
     def _next_page(self, soup):
         selector = self._selectors('next')
         next_page_tag = soup.select_one(selector)
-        url = None
         if next_page_tag and next_page_tag.get('href'):
-            url = self._base_url + next_page_tag['href']
-        return {'url': url, 'data': None}
+            return {'url': self._base_url + next_page_tag['href'], 'data': None}
+        return {'url': None, 'data': None}

     def _get_url(self, tag):
-        url = tag.get('href', '')
-        # This part handles Bing's weird tracking URLs
-        try:
-            parsed_url = urlparse(url)
-            if "r" in parsed_url.path:  # Direct links are often in /r/
-                query_params = parse_qs(parsed_url.query)
-                if "u" in query_params:
-                    encoded_url = query_params["u"][0][2:]
-                    decoded_bytes = base64.urlsafe_b64decode(encoded_url + '===')
-                    return decoded_bytes.decode('utf-8')
-        except Exception:
-            pass
-        return url
+        # A more direct approach that is often sufficient
+        return tag.get('href', '')

-
     async def text(
-        self,
-        keywords: str,
-        region: str = None,
-        safesearch: str = "moderate",
-        max_results: int = 10,
-        enhanced: bool = False
+        self, keywords: str, max_results: int = 10, enhanced: bool = False, **kwargs
     ) -> List[BaseSearchResult | EnhancedBingSearchResult]:
         if not keywords:
             raise ValueError("Search keywords cannot be empty")
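Note: the rewritten _get_url drops the base64 decoding of Bing's tracking redirects, so results may now surface bing.com redirect links instead of final URLs. If that matters, the deleted logic could survive as a standalone helper (hypothetical name resolve_bing_redirect, reconstructed from the removed lines):

    import base64
    from urllib.parse import parse_qs, urlparse

    def resolve_bing_redirect(url: str) -> str:
        # Bing wraps targets as ...?...&u=a1<urlsafe-base64>; decode when present.
        try:
            params = parse_qs(urlparse(url).query)
            if "u" in params:
                encoded = params["u"][0][2:]  # drop the "a1" prefix, as the old code did
                return base64.urlsafe_b64decode(encoded + "===").decode("utf-8")
        except Exception:
            pass
        return url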
@@ -215,14 +185,16 @@ class BingSearch:
                 title = title_tag.get_text(strip=True)

                 desc_container = result.find('div', class_='b_caption')
-                description = desc_container.find('p').get_text(strip=True) if desc_container and desc_container.find('p') else ""
-
+                description = ""
+                if desc_container:
+                    p_tag = desc_container.find('p')
+                    if p_tag:
+                        description = p_tag.get_text(strip=True)
+
                 if url_val and title:
                     if url_val in fetched_links: continue
-
                     fetched_results.append(BaseSearchResult(url=url_val, title=title, description=description))
                     fetched_links.add(url_val)
-
                     if len(fetched_results) >= max_results: break

             if len(fetched_results) >= max_results: break
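Note: the five-line replacement fixes the old ternary's double call to find('p'). A CSS selector keeps it just as defensive in two lines (sketch; select_one returns None when nothing matches):

    p_tag = result.select_one('div.b_caption p')
    description = p_tag.get_text(strip=True) if p_tag else ""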
@@ -235,25 +207,22 @@ class BingSearch:
         results_to_return = fetched_results[:max_results]

         if enhanced and results_to_return:
-            # Concurrently enhance all results
             enhancement_tasks = [self._enhance_result(res) for res in results_to_return]
             return await asyncio.gather(*enhancement_tasks)

         return results_to_return

-    # ... (suggestions, images, news methods converted to async) ...
-    async def suggestions(self, query: str, region: str = None) -> List[str]:
-        if not query:
-            raise ValueError("Search query cannot be empty")
-        # ... logic ...
-        url = f"https://api.bing.com/osjson.aspx?query={query}&mkt={region or 'en-US'}"
+    async def suggestions(self, query: str, **kwargs) -> List[str]:
+        if not query: raise ValueError("Query cannot be empty")
+        region = kwargs.get('region', 'en-US')
+        url = f"https://api.bing.com/osjson.aspx?query={query}&mkt={region}"
         resp = await self.session.get(url)
         resp.raise_for_status()
         data = resp.json()
         return data[1] if isinstance(data, list) and len(data) > 1 else []

     async def images(self, keywords: str, max_results: int = 10, **kwargs) -> List[BingImageResult]:
-        # ... logic converted to async ...
+        if not keywords: raise ValueError("Keywords cannot be empty")
         url = f"{self._base_url}/images/search?q={keywords}&count={max_results}"
         resp = await self.session.get(url)
         resp.raise_for_status()
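Note: suggestions and images interpolate raw user input into the query string, so spaces and characters such as '&' pass through unescaped. urllib.parse.urlencode is the standard remedy; a sketch for the suggestions URL built above:

    from urllib.parse import urlencode

    # Percent-encodes spaces, '&', '=', and non-ASCII input before they reach the URL.
    url = "https://api.bing.com/osjson.aspx?" + urlencode({"query": query, "mkt": region})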
@@ -261,8 +230,9 @@ class BingSearch:
         results = []
         for item in soup.select("a.iusc"):
             try:
-                m = item.get("m")
-                meta = json.loads(m) if m else {}
+                m_data = item.get("m")
+                if not m_data: continue
+                meta = json.loads(m_data)
                 if meta.get("murl"):
                     results.append(BingImageResult(title=meta.get("t", ""), image=meta.get("murl"), thumbnail=meta.get("turl", ""), url=meta.get("purl", ""), source=meta.get("surl", "")))
                 if len(results) >= max_results: break
@@ -270,7 +240,7 @@ class BingSearch:
         return results

     async def news(self, keywords: str, max_results: int = 10, **kwargs) -> List[BingNewsResult]:
-        # ... logic converted to async ...
+        if not keywords: raise ValueError("Keywords cannot be empty")
         url = f"{self._base_url}/news/search?q={keywords}"
         resp = await self.session.get(url)
         resp.raise_for_status()
  resp.raise_for_status()
@@ -278,7 +248,7 @@ class BingSearch:
278
  results = []
279
  for item in soup.select("div.news-card"):
280
  a_tag = item.find("a", class_="title")
281
- if not a_tag: continue
282
  desc_tag = item.find("div", class_="snippet")
283
  source_tag = item.find(attrs={"aria-label": "Publisher"})
284
  results.append(BingNewsResult(title=a_tag.get_text(strip=True), url=a_tag['href'], description=desc_tag.get_text(strip=True) if desc_tag else "", source=source_tag.get_text(strip=True) if source_tag else ""))
@@ -289,12 +259,9 @@ bing = BingSearch()

 # --- FastAPI Endpoints ---

-# Use a union type in response_model to reflect the two possible return types
-@app.get("/search", response_model=List[EnhancedBingSearchResult | BaseSearchResult])
+@app.get("/search", response_model=List[EnhancedBingSearchResult | BaseSearchResult], summary="Perform a standard or enhanced text search")
 async def text_search(
     query: str = Query(..., description="The search keywords."),
-    region: Optional[str] = Query(None, description="The region for the search (e.g., 'en-US')."),
-    safesearch: str = Query("moderate", description="Safe search level ('on', 'moderate', 'off')."),
     max_results: int = Query(10, description="Maximum number of results to return."),
     enhanced: bool = Query(False, description="Enable AI summarization and metadata fetching (slower but more detailed).")
 ):
@@ -305,8 +272,6 @@ async def text_search(
     try:
         results = await bing.text(
             keywords=query,
-            region=region,
-            safesearch=safesearch,
             max_results=max_results,
             enhanced=enhanced
         )
@@ -314,34 +279,30 @@ async def text_search(
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))

-@app.get("/suggestions", response_model=List[str])
+@app.get("/suggestions", response_model=List[str], summary="Fetch search suggestions")
 async def get_suggestions(
     query: str = Query(..., description="The search query for which to fetch suggestions."),
-    region: Optional[str] = Query(None, description="The region for the suggestions (e.g., 'en-US')."),
 ):
-    """Fetches search suggestions for a given query."""
     try:
-        return await bing.suggestions(query=query, region=region)
+        return await bing.suggestions(query=query)
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))

-@app.get("/images", response_model=List[BingImageResult])
+@app.get("/images", response_model=List[BingImageResult], summary="Search for images")
 async def image_search(
     query: str = Query(..., description="The search keywords for images."),
     max_results: int = Query(10, description="Maximum number of image results to return."),
 ):
-    """Perform an image search on Bing."""
     try:
         return await bing.images(keywords=query, max_results=max_results)
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))

-@app.get("/news", response_model=List[BingNewsResult])
+@app.get("/news", response_model=List[BingNewsResult], summary="Search for news articles")
 async def news_search(
     query: str = Query(..., description="The search keywords for news."),
     max_results: int = Query(10, description="Maximum number of news results to return."),
 ):
-    """Perform a news search on Bing."""
     try:
         return await bing.news(keywords=query, max_results=max_results)
     except Exception as e:
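Note: with region and safesearch removed end to end, the surviving endpoints take only query, max_results, and enhanced. A quick smoke test against a locally running instance (assuming uvicorn is serving on localhost:8000 and the requests package is installed):

    import requests

    resp = requests.get(
        "http://localhost:8000/search",
        params={"query": "fastapi", "max_results": 3, "enhanced": "false"},
        timeout=30,
    )
    resp.raise_for_status()
    for item in resp.json():
        print(item["title"], "->", item["url"])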
@@ -350,5 +311,4 @@ async def news_search(

 if __name__ == "__main__":
     import uvicorn
-    # Add reload=True for development convenience
     uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
 