rkihacker committed
Commit 2b7ed67 · verified · 1 Parent(s): ef5f360

Update main.py

Files changed (1):
  1. main.py +73 -204

main.py CHANGED
@@ -1,209 +1,78 @@
-from typing import List, Optional, Set
-from urllib.parse import parse_qs, unquote, urlparse
-
-import httpx
-from bs4 import BeautifulSoup
-from fastapi import FastAPI, HTTPException, Query
-
-UPSTREAM_URL = "https://lite.duckduckgo.com/lite/"
-DEFAULT_KL = "wt-wt"
-MAX_PAGES = 5
-PAGE_SIZE_HINT = 30
-MAX_RESULTS = MAX_PAGES * PAGE_SIZE_HINT
-
-app = FastAPI(
-    title="DuckDuckGo Lite Web Search Proxy",
-    description=(
-        "A tiny FastAPI wrapper that proxies search queries to DuckDuckGo Lite and returns"
-        " structured JSON results."
-    ),
-    version="1.0.0",
 )

-
-def _extract_results(html: str) -> List[dict]:
-    """Parse DuckDuckGo Lite HTML into a list of search results."""
-    soup = BeautifulSoup(html, "html.parser")
-    results: List[dict] = []
-
-    for table in soup.select("table.result"):
-        link_tag = table.select_one("td.result-link a, a.result-link")
-        if not link_tag:
-            continue
-
-        title = link_tag.get_text(strip=True)
-        url = _normalize_url(link_tag.get("href"))
-        if not title or not url:
-            continue
-
-        snippet = _extract_snippet_text(table, link_tag)
-
-        results.append({
-            "title": title,
-            "url": url,
-            "snippet": snippet,
-        })
-
-    if not results:
-        # As a fallback, try to find plain links if the expected structure changes.
-        for link_tag in soup.select("a.result-link"):
-            title = link_tag.get_text(strip=True)
-            url = _normalize_url(link_tag.get("href"))
-            if not title or not url:
-                continue
-            snippet = _extract_snippet_text(link_tag.find_parent("table") or soup, link_tag)
-            results.append({
-                "title": title,
-                "url": url,
-                "snippet": snippet,
-            })
-
-    return results
-
-
-def _extract_snippet_text(container, link_tag) -> Optional[str]:
-    """Best effort extraction of result snippet text."""
-    if not container:
-        return None
-
-    def _clean_text(tag) -> Optional[str]:
-        if not tag:
-            return None
-        text = tag.get_text(" ", strip=True)
-        return text or None
-
-    # Prefer rows that follow the link row inside the same table.
-    link_row = link_tag.find_parent("tr")
-    if link_row:
-        for sibling_row in link_row.find_next_siblings("tr"):
-            candidate = sibling_row.find("td") or sibling_row.find("div")
-            if not candidate:
-                continue
-            classes = {cls.lower() for cls in candidate.get("class", [])}
-            if not candidate.get_text(strip=True):
-                continue
-            if "result-snippet" in classes or any("snippet" in cls for cls in classes) or not candidate.find("a"):
-                text = _clean_text(candidate)
-                if text:
-                    return text
-
-    # Fallback: look for known snippet containers within the table.
-    for candidate in container.select("td.result-snippet, div.result-snippet"):
-        text = _clean_text(candidate)
-        if text:
-            return text
-
-    return None
-
-
-def _normalize_url(href: Optional[str]) -> Optional[str]:
-    """Convert protocol-relative and redirect URLs to absolute targets."""
-    if not href:
-        return None
-
-    href = href.strip()
-    if href.startswith("//"):
-        href = f"https:{href}"
-
-    parsed = urlparse(href)
-    if (
-        parsed.netloc.endswith("duckduckgo.com")
-        and parsed.path.startswith("/l")
-    ):
-        query = parse_qs(parsed.query)
-        uddg = query.get("uddg", [])
-        if uddg:
-            return unquote(uddg[0])
-
-    return href
-
-
-async def _collect_results(
-    client: httpx.AsyncClient,
-    base_params: dict,
-    headers: dict,
-    limit: Optional[int],
-) -> List[dict]:
-    collected: List[dict] = []
-    seen_urls: Set[str] = set()
-    offset = int(base_params.get("s", "0") or 0)
-    pages_fetched = 0
-
-    while True:
-        page_params = dict(base_params)
-        page_params["s"] = str(offset)
-        response = await client.get(UPSTREAM_URL, params=page_params, headers=headers)
-        response.raise_for_status()
-        page_results = _extract_results(response.text)
-
-        if not page_results:
-            break
-
-        for item in page_results:
-            url = item.get("url")
-            if url and url in seen_urls:
-                continue
-            if url:
-                seen_urls.add(url)
-            collected.append(item)
-            if limit and len(collected) >= limit:
-                return collected[:limit]
-
-        pages_fetched += 1
-        if limit is None or pages_fetched >= MAX_PAGES:
-            break
-
-        offset += len(page_results) or PAGE_SIZE_HINT
-
-    return collected
-
-
-@app.post("/lite/")
-async def search_duckduckgo_lite(
-    q: str = Query(..., description="keywords for query", min_length=1),
-    s: Optional[int] = Query(None, description="can be `0`"),
-    o: Optional[str] = Query(None, description="can be `json`"),
-    api: Optional[str] = Query(None, description="can be `d.js`"),
-    kl: Optional[str] = Query(None, description="market/locale code"),
-    bing_market: Optional[str] = Query(None, description="market/locale code"),
-    limit: Optional[int] = Query(
-        None,
-        gt=0,
-        le=MAX_RESULTS,
-        description=(
-            "Maximum number of results to return. If greater than a single page, the service"
-            " will fetch additional DuckDuckGo Lite pages up to the configured maximum."
-        ),
-    ),
 ):
-    params = {"q": q}
-    if s is not None:
-        params["s"] = str(max(s, 0))
-    if o:
-        params["o"] = o
-    if api:
-        params["api"] = api
-    params["kl"] = kl or DEFAULT_KL
-    if bing_market:
-        params["bing_market"] = bing_market

-    headers = {
-        "User-Agent": (
-            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
-            "(KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
-        )
-    }
-
-    try:
-        async with httpx.AsyncClient(timeout=httpx.Timeout(10.0)) as client:
-            results = await _collect_results(client, params, headers, limit)
-    except httpx.HTTPError as exc:
-        raise HTTPException(status_code=502, detail="Upstream search failed") from exc

-    return {
-        "query": q,
-        "kl": params.get("kl"),
-        "bing_market": params.get("bing_market"),
-        "count": len(results),
-        "results": results,
-    }
+# main.py
+from fastapi import FastAPI, Query
+from typing import List, Optional
+from pydantic import BaseModel
+import uvicorn
+
+# Paste the entire BingSearch library code here.
+# (Omitted for brevity; without it, the BingSearch name below is undefined.)
+
+app = FastAPI(title="BingSearch API", description="API for Bing search functionalities", version="1.0")
+
+bing = BingSearch(
+    timeout=10,
+    proxies=None,
+    verify=True,
+    lang="en-US",
+    sleep_interval=0.0,
+    impersonate="chrome110"
 )

+class SearchResult(BaseModel):
+    url: str
+    title: str
+    description: str
+
+class ImageResult(BaseModel):
+    title: str
+    image: str
+    thumbnail: str
+    url: str
+    source: str
+
+class NewsResult(BaseModel):
+    title: str
+    url: str
+    description: str
+    source: str
+
+@app.get("/search/text", response_model=List[SearchResult])
+def search_text(
+    keywords: str = Query(..., description="Search keywords"),
+    region: Optional[str] = Query(None, description="Region for search"),
+    safesearch: str = Query("moderate", description="Safe search level: on, moderate, off"),
+    max_results: int = Query(10, description="Maximum number of results"),
+    unique: bool = Query(True, description="Exclude duplicate URLs")
 ):
+    results = bing.text(keywords, region, safesearch, max_results, unique)
+    return [SearchResult(url=r.url, title=r.title, description=r.description) for r in results]

+@app.get("/search/suggestions", response_model=List[str])
+def get_suggestions(
+    query: str = Query(..., description="Query for suggestions"),
+    region: Optional[str] = Query(None, description="Region for suggestions")
+):
+    return bing.suggestions(query, region)
+
+@app.get("/search/images", response_model=List[ImageResult])
+def search_images(
+    keywords: str = Query(..., description="Search keywords"),
+    region: Optional[str] = Query(None, description="Region for search"),
+    safesearch: str = Query("moderate", description="Safe search level: on, moderate, off"),
+    max_results: int = Query(10, description="Maximum number of results")
+):
+    results = bing.images(keywords, region, safesearch, max_results)
+    return [ImageResult(title=r.title, image=r.image, thumbnail=r.thumbnail, url=r.url, source=r.source) for r in results]
+
+@app.get("/search/news", response_model=List[NewsResult])
+def search_news(
+    keywords: str = Query(..., description="Search keywords"),
+    region: Optional[str] = Query(None, description="Region for search"),
+    safesearch: str = Query("moderate", description="Safe search level: on, moderate, off"),
+    max_results: int = Query(10, description="Maximum number of results")
+):
+    results = bing.news(keywords, region, safesearch, max_results)
+    return [NewsResult(title=r.title, url=r.url, description=r.description, source=r.source) for r in results]
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
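As committed, the new main.py references a BingSearch class that exists only as a "paste here" comment, so importing the module raises NameError until that code is added. The stub below is a hypothetical stand-in inferred solely from the call sites in the new file (the constructor keywords and the text/suggestions/images/news calls); it is not the real library's API, and every method raises until a real implementation replaces it.

# Hypothetical BingSearch stand-in, inferred from the call sites in main.py.
# The class name and signatures mirror main.py; the result types and all
# internals are assumptions, not the real library's API.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class BingTextResult:
    url: str
    title: str
    description: str


@dataclass
class BingImageResult:
    title: str
    image: str
    thumbnail: str
    url: str
    source: str


@dataclass
class BingNewsResult:
    title: str
    url: str
    description: str
    source: str


class BingSearch:
    def __init__(self, timeout: float = 10, proxies: Optional[dict] = None,
                 verify: bool = True, lang: str = "en-US",
                 sleep_interval: float = 0.0, impersonate: str = "chrome110"):
        # Keyword arguments mirror the constructor call in main.py.
        self.timeout = timeout
        self.proxies = proxies
        self.verify = verify
        self.lang = lang
        self.sleep_interval = sleep_interval
        self.impersonate = impersonate

    def text(self, keywords: str, region: Optional[str] = None,
             safesearch: str = "moderate", max_results: int = 10,
             unique: bool = True) -> List[BingTextResult]:
        raise NotImplementedError("paste the real BingSearch implementation")

    def suggestions(self, query: str, region: Optional[str] = None) -> List[str]:
        raise NotImplementedError("paste the real BingSearch implementation")

    def images(self, keywords: str, region: Optional[str] = None,
               safesearch: str = "moderate",
               max_results: int = 10) -> List[BingImageResult]:
        raise NotImplementedError("paste the real BingSearch implementation")

    def news(self, keywords: str, region: Optional[str] = None,
             safesearch: str = "moderate",
             max_results: int = 10) -> List[BingNewsResult]:
        raise NotImplementedError("paste the real BingSearch implementation")

With a real implementation pasted in, the service starts via `python main.py` (or `uvicorn main:app`) and can be exercised with requests such as GET /search/text?keywords=python&max_results=5.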