AdarshJi commited on
Commit
1ed2217
·
verified ·
1 Parent(s): 2918e0b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +650 -527
app.py CHANGED
@@ -1,552 +1,675 @@
1
- # server.py
2
- import base64
 
 
 
 
 
 
 
 
 
 
 
 
3
  import json
4
- import logging
5
- import os
6
  import random
7
- import string
8
- import time
9
- from concurrent.futures import ThreadPoolExecutor, as_completed
10
- from dataclasses import dataclass
11
- from typing import Any, Dict, List, Optional, Tuple
12
-
13
- import cloudscraper
14
- from fastapi import Body, FastAPI, HTTPException, Request
15
- from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
16
- from fastapi.staticfiles import StaticFiles
17
- from pydantic import BaseModel, Field, validator
18
- from requests.exceptions import RequestException
19
- from urllib.parse import urlencode
20
-
21
- # ---------------------------
22
- # Config & Logging
23
- # ---------------------------
24
- BASE = "https://image-generation.perchance.org"
25
- OUT_DIR = "outputs"
26
- UPLOADS_DIR = os.path.join(OUT_DIR, "uploads")
27
- FILES_ROUTE = "/files"
28
- os.makedirs(OUT_DIR, exist_ok=True)
29
- os.makedirs(UPLOADS_DIR, exist_ok=True)
30
-
31
- logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)-7s | %(message)s")
32
- logger = logging.getLogger("perchance_server")
33
-
34
- app = FastAPI(title="Perchance Image Generator API")
35
-
36
- # serve files
37
- app.mount(FILES_ROUTE, StaticFiles(directory=OUT_DIR), name="files")
38
-
39
-
40
- # ---------------------------
41
- # Pydantic models (API)
42
- # ---------------------------
43
- class PromptItem(BaseModel):
44
- prompt: str
45
- negative: Optional[str] = ""
46
- seed: Optional[int] = -1
47
- resolution: Optional[str] = "512x768"
48
- guidance: Optional[float] = 7.0
49
- # optional per-item count: how many images to generate for this prompt
50
- count: Optional[int] = 1
51
-
52
- @validator("count")
53
- def count_min(cls, v):
54
- if v is None:
55
- return 1
56
- if v < 1:
57
- raise ValueError("count must be >= 1")
58
- return v
59
-
60
-
61
- class GenerateRequest(BaseModel):
62
- items: List[PromptItem] = Field(..., description="List of prompt objects. If you pass a single item and top-level count >1, the single prompt will be used repeatedly.")
63
- # If items list length == 1 and global_count > 1 -> generate that many variants
64
- global_count: Optional[int] = Field(1, description="When items has a single entry and you want multiple images")
65
- concurrency: Optional[int] = Field(2, description="How many concurrent generation tasks")
66
- proxies: Optional[List[str]] = None
67
- proxy_rotate: Optional[bool] = False
68
- fast: Optional[bool] = False
69
- max_wait: Optional[int] = 90
70
-
71
- @validator("global_count")
72
- def gcount_min(cls, v):
73
- if v is None:
74
- return 1
75
- if v < 1:
76
- raise ValueError("global_count must be >= 1")
77
- return v
78
-
79
-
80
- class UploadBase64Request(BaseModel):
81
- filename: Optional[str]
82
- data: str # data:image/png;base64,... or raw base64
83
-
84
-
85
- # ---------------------------
86
- # Small utility helpers
87
- # ---------------------------
88
- def safe_filename(name: str) -> str:
89
- keep = "-_.() %s%s" % (string.ascii_letters, string.digits)
90
- return "".join(c for c in name if c in keep)[:140].replace(" ", "_")
91
-
92
-
93
- def random_token(n=8):
94
- return "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(n))
95
-
96
-
97
- # ---------------------------
98
- # Proxy manager (lightweight)
99
- # ---------------------------
100
- class ProxyManager:
101
- def __init__(self, proxies: Optional[List[str]] = None, rotate: bool = False):
102
- self.sessions: List[Tuple[str, Any]] = []
103
- self.rotate = rotate
104
- self.index = 0
105
- if proxies:
106
- self.load(proxies)
107
 
108
- @staticmethod
109
- def normalize(p: str) -> str:
110
- p = p.strip()
111
- if not p:
112
- return ""
113
- if "://" not in p:
114
- return "http://" + p
115
- return p
116
-
117
- def load(self, proxies: List[str]):
118
- self.sessions = []
119
- for raw in proxies:
120
- p = self.normalize(raw)
121
- if not p:
122
- continue
123
- try:
124
- s = cloudscraper.create_scraper()
125
- s.proxies = {"http": p, "https": p}
126
- self.sessions.append((p, s))
127
- except Exception as e:
128
- logger.warning("Failed to create session for proxy %s : %s", p, e)
129
- logger.info("ProxyManager loaded %d proxies", len(self.sessions))
130
-
131
- def count(self):
132
- return len(self.sessions)
133
-
134
- def get(self) -> Tuple[Optional[str], Any]:
135
- if not self.sessions:
136
- return None, cloudscraper.create_scraper()
137
- if self.rotate:
138
- p, s = self.sessions[self.index % len(self.sessions)]
139
- self.index += 1
140
- return p, s
141
- else:
142
- return self.sessions[0]
143
-
144
-
145
- # ---------------------------
146
- # Perchance client (blocking)
147
- # ---------------------------
148
- @dataclass
149
- class PerchanceClient:
150
- base_url: str = BASE
151
- timeout: int = 30
152
- max_wait: int = 90
153
- proxy_manager: Optional[ProxyManager] = None
154
- headers: dict = None
155
-
156
- def __post_init__(self):
157
- if self.headers is None:
158
- self.headers = {
159
- "Accept": "*/*",
160
- "Content-Type": "text/plain;charset=UTF-8",
161
- "Origin": "https://image-generation.perchance.org",
162
- "Referer": "https://image-generation.perchance.org/embed",
163
- "User-Agent": "PerchanceClient/1.0",
164
- }
165
-
166
- def _choose_session(self):
167
- if self.proxy_manager and self.proxy_manager.count() > 0:
168
- return self.proxy_manager.get()
169
- return None, cloudscraper.create_scraper()
170
-
171
- def verify_user(self, thread=0):
172
- _, s = self._choose_session()
173
- url = f"{self.base_url}/api/verifyUser?thread={thread}&__cacheBust={random.random()}"
174
  try:
175
- r = s.get(url, timeout=self.timeout, headers=self.headers)
176
- r.raise_for_status()
177
- return r.json()
 
 
178
  except Exception as e:
179
- logger.debug("verify_user failed: %s", e)
180
- return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
- def get_ad_access_code(self):
183
- _, s = self._choose_session()
184
- url = f"{self.base_url}/api/getAccessCodeForAdPoweredStuff"
185
- try:
186
- r = s.get(url, timeout=self.timeout, headers=self.headers)
187
- r.raise_for_status()
188
- return r.text.strip()
189
- except Exception:
190
- return ""
191
-
192
- def post_generate(self, body: dict, params: dict):
193
- proxy, s = self._choose_session()
194
- qs = "?" + urlencode(params) if params else ""
195
- url = f"{self.base_url}/api/generate{qs}"
196
- try:
197
- r = s.post(url, headers=self.headers, data=json.dumps(body), timeout=self.timeout)
198
- r.raise_for_status()
199
- try:
200
- return r.json()
201
- except Exception:
202
- return {"status": "invalid_json", "raw": r.text}
203
- except RequestException as e:
204
- logger.warning("post_generate failed via %s: %s", proxy or "direct", e)
205
- return {"status": "fetch_failure", "error": str(e)}
206
-
207
- def await_existing_generation(self, user_key: str, timeout_seconds: int = 18):
208
- proxy, s = self._choose_session()
209
- url = f"{self.base_url}/api/awaitExistingGenerationRequest?userKey={user_key}&__cacheBust={random.random()}"
210
- try:
211
- r = s.get(url, timeout=timeout_seconds, headers=self.headers)
212
- r.raise_for_status()
213
  try:
214
- return r.json()
215
- except Exception:
216
- return {"status": "invalid_json", "raw": r.text}
217
- except Exception:
218
- return {"status": "fetch_failure"}
219
-
220
- def download_temporary_image(self, image_id: str, save_dir: str, prefix: str, max_wait: Optional[int] = None):
221
- max_wait = max_wait or self.max_wait
222
- os.makedirs(save_dir, exist_ok=True)
223
- dl_url = f"{self.base_url}/api/downloadTemporaryImage?imageId={image_id}"
224
- start = time.time()
225
- backoff = 0.5
226
- attempt = 0
227
-
228
- def looks_wait(s: str) -> bool:
229
- if not s:
230
- return False
231
- sl = s.lower()
232
- return any(k in sl for k in ("wait", "waiting", "pending", "processing"))
233
-
234
- if looks_wait(image_id):
235
- time.sleep(0.25)
236
-
237
- while True:
238
- attempt += 1
239
- proxy, s = self._choose_session()
240
- try:
241
- r = s.get(dl_url, timeout=self.timeout, headers=self.headers, stream=True)
242
- if r.status_code == 200:
243
- ct = r.headers.get("Content-Type", "")
244
- ext = ".jpg"
245
- if "png" in ct:
246
- ext = ".png"
247
- elif "webp" in ct:
248
- ext = ".webp"
249
- name = f"{prefix}_{image_id[:12]}{ext}"
250
- outpath = os.path.join(save_dir, safe_filename(name))
251
- with open(outpath, "wb") as fh:
252
- for chunk in r.iter_content(8192):
253
- if chunk:
254
- fh.write(chunk)
255
- logger.info("Saved downloaded image to %s", outpath)
256
- return outpath
257
- txt = (r.text or "").lower()
258
- elapsed = time.time() - start
259
- if r.status_code in (404, 410) or "waiting" in txt or looks_wait(image_id):
260
- if elapsed >= max_wait:
261
- raise RequestException(f"Timeout waiting for image {image_id}")
262
- time.sleep(backoff)
263
- backoff = min(backoff * 1.8, 8.0)
264
- continue
265
- r.raise_for_status()
266
- except RequestException as e:
267
- elapsed = time.time() - start
268
- if elapsed < max_wait:
269
- logger.debug("download attempt failed (%s). retrying...", e)
270
- time.sleep(backoff)
271
- backoff = min(backoff * 1.8, 8.0)
272
- continue
273
- logger.error("Failed download: %s", e)
274
- raise
275
-
276
- def generate_one(self, prompt: str, negative: str, seed: int, resolution: str, guidance: float,
277
- user_key: str, ad_access_code: str, request_id: str, max_retries: int = 4, max_wait: Optional[int] = None):
278
- params = {"userKey": user_key, "requestId": request_id, "adAccessCode": ad_access_code, "__cacheBust": random.random()}
279
- body = {
280
- "prompt": prompt,
281
- "negativePrompt": negative,
282
- "seed": seed,
283
- "resolution": resolution,
284
- "guidanceScale": guidance,
285
- "channel": "ai-text-to-image-generator",
286
- "subChannel": "private",
287
- "userKey": user_key,
288
- "adAccessCode": ad_access_code,
289
- "requestId": request_id,
290
  }
291
 
292
- attempt = 0
293
- refreshed = False
294
- while attempt < max_retries:
295
- attempt += 1
296
- result = self.post_generate(body, params)
297
- status = result.get("status")
298
- if status == "success":
299
- if result.get("imageId"):
300
- return {"imageId": result["imageId"], "seed": result.get("seed")}
301
- if result.get("imageDataUrls"):
302
- return {"inline": result["imageDataUrls"][0], "seed": result.get("seed")}
303
- return {"error": "no_image_in_success", "raw": result}
304
- elif status == "waiting_for_prev_request_to_finish":
305
- self.await_existing_generation(user_key)
306
- time.sleep(0.2 + 0.2 * random.random())
307
- continue
308
- elif status == "invalid_ad_access_code":
309
- if not refreshed:
310
- new = self.get_ad_access_code()
311
- if new:
312
- ad_access_code = new
313
- params["adAccessCode"] = new
314
- body["adAccessCode"] = new
315
- refreshed = True
316
- time.sleep(0.8)
 
 
 
 
 
 
317
  continue
318
- return {"error": "invalid_ad_access_code", "raw": result}
319
- elif status == "gen_failure" and result.get("type") == 1:
320
- time.sleep(2.0)
321
- continue
322
- else:
323
- # network or unknown - retry
324
- if status in (None, "fetch_failure", "invalid_json", "stale_request"):
325
- time.sleep(1.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  continue
327
- return {"error": "unhandled_status", "status": status, "raw": result}
328
- return {"error": "max_retries_exceeded"}
 
 
 
 
 
 
 
 
 
 
 
329
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
 
331
- # ---------------------------
332
- # API logic
333
- # ---------------------------
334
- def build_tasks_from_request(req: GenerateRequest) -> List[Dict]:
335
  """
336
- Build a flat list of generation tasks from the request payload.
337
- Each task is a dict with keys: prompt, negative, seed, resolution, guidance, request_count (1..)
 
 
 
 
 
338
  """
339
- tasks = []
340
- items = req.items
341
- if len(items) == 1 and req.global_count and req.global_count > 1:
342
- # repeat the single prompt global_count times
343
- it = items[0]
344
- for i in range(req.global_count):
345
- tasks.append({
346
- "prompt": it.prompt,
347
- "negative": it.negative,
348
- "seed": it.seed,
349
- "resolution": it.resolution,
350
- "guidance": it.guidance,
351
- "task_index": i + 1,
352
- })
353
- else:
354
- # for each item, generate item.count images
355
- for it in items:
356
- cnt = it.count or 1
357
- for i in range(cnt):
358
- tasks.append({
359
- "prompt": it.prompt,
360
- "negative": it.negative,
361
- "seed": it.seed,
362
- "resolution": it.resolution,
363
- "guidance": it.guidance,
364
- "task_index": i + 1,
365
- })
366
- return tasks
367
-
368
-
369
- def make_file_url(request: Request, filepath: str) -> str:
370
- # filepath is relative under OUT_DIR
371
- host = str(request.base_url).rstrip("/")
372
- # file path relative to OUT_DIR
373
- rel = os.path.relpath(filepath, OUT_DIR)
374
- return f"{host}{FILES_ROUTE}/{rel.replace(os.path.sep, '/')}"
375
-
376
- # Blocking worker that runs generation for a single task and returns result path or error
377
- def run_generation_task(client: PerchanceClient, task: Dict, user_key: str, ad_code: str, save_dir: str, max_wait: int) -> Dict:
378
- req_id = f"{time.time()}-{random_token(6)}"
379
- gen = client.generate_one(
380
- prompt=task["prompt"],
381
- negative=task["negative"],
382
- seed=task["seed"],
383
- resolution=task["resolution"],
384
- guidance=task["guidance"],
385
- user_key=user_key,
386
- ad_access_code=ad_code,
387
- request_id=req_id,
388
- max_retries=5,
389
- max_wait=max_wait
390
- )
391
- if not isinstance(gen, dict):
392
- return {"error": "invalid_gen_response"}
393
- if gen.get("inline"):
394
- # inline base64 dataURL -> save and return
395
- data_url = gen["inline"]
396
- header, b64 = (data_url.split(",", 1) + [""])[:2]
397
- ext = ".png" if "png" in header else ".jpg"
398
- prefix = safe_filename(task["prompt"])[:30] + "_" + random_token(4)
399
- filename = f"{prefix}{ext}"
400
- outpath = os.path.join(save_dir, filename)
401
- with open(outpath, "wb") as fh:
402
- fh.write(base64.b64decode(b64))
403
- return {"path": outpath}
404
- if gen.get("imageId"):
405
- try:
406
- prefix = safe_filename(task["prompt"])[:30] + "_" + random_token(4)
407
- outpath = client.download_temporary_image(gen["imageId"], save_dir, prefix=prefix, max_wait=max_wait)
408
- return {"path": outpath}
409
- except Exception as e:
410
- logger.exception("Download failed for imageId %s: %s", gen.get("imageId"), e)
411
- return {"error": str(e), "raw": gen}
412
- return {"error": gen.get("error", "unknown")}
 
413
 
 
 
 
 
414
 
415
- # ---------------------------
416
- # Endpoints
417
- # ---------------------------
418
- @app.post("/upload_base64")
419
- async def upload_base64(payload: UploadBase64Request):
 
 
 
 
 
 
420
  """
421
- Accept base64 data and save to uploads folder, return a URL to access it.
422
- Body: { "filename": "optional.png", "data": "data:image/png;base64,..." }
423
  """
424
- data = payload.data
425
- if "," in data:
426
- _, b64 = data.split(",", 1)
427
- else:
428
- b64 = data
 
 
 
 
429
  try:
430
- raw = base64.b64decode(b64)
431
- except Exception:
432
- raise HTTPException(status_code=400, detail="invalid base64")
433
- fn = payload.filename or f"upload_{int(time.time())}_{random_token(6)}.png"
434
- fn = safe_filename(fn)
435
- outpath = os.path.join(UPLOADS_DIR, fn)
436
- with open(outpath, "wb") as fh:
437
- fh.write(raw)
438
- url = f"{str(app.url_path_for('files'))}/{os.path.relpath(outpath, OUT_DIR).replace(os.path.sep, '/')}"
439
- # return full URL
440
- # base URL unknown in background, so return path-like and let client prefix host
441
- return {"path": outpath, "url_path": url}
442
-
443
-
444
- @app.post("/generate")
445
- async def generate_stream(request: Request, body: GenerateRequest = Body(...)):
446
- """
447
- Streaming endpoint. Sends Server-Sent-Events-like chunked responses with status updates.
448
- Each event is a JSON object on its own line.
449
 
450
- Example client usage: fetch and parse lines; each line is a JSON event.
451
- """
452
- # build tasks
453
- tasks = build_tasks_from_request(body)
454
- if not tasks:
455
- raise HTTPException(status_code=400, detail="no tasks built from request")
456
-
457
- # set up proxy manager & client
458
- proxy_mgr = ProxyManager(proxies=body.proxies, rotate=body.proxy_rotate) if body.proxies else None
459
- client = PerchanceClient(proxy_manager=proxy_mgr, max_wait=body.max_wait)
460
- # initial verify and ad code fetch
461
- verify = client.verify_user(thread=0)
462
- user_key = verify.get("userKey", "") or ""
463
- ad_code = client.get_ad_access_code() or ""
464
-
465
- concurrency = max(1, body.concurrency or 1)
466
- save_dir = OUT_DIR
467
-
468
- # We'll use a blocking ThreadPoolExecutor for generation tasks and stream results as they finish.
469
- executor = ThreadPoolExecutor(max_workers=concurrency)
470
-
471
- def event_stream():
472
- # announce start
473
- yield json.dumps({"event": "started", "total_tasks": len(tasks)}) + "\n"
474
- futures = {}
475
- for i, t in enumerate(tasks):
476
- # submit each generation task
477
- fut = executor.submit(run_generation_task, client, t, user_key, ad_code, save_dir, body.max_wait)
478
- futures[fut] = {"task_index": i + 1, "task": t}
479
- # as finished, send updates
480
- completed = 0
481
- for fut in as_completed(list(futures.keys())):
482
- meta = futures[fut]
483
- idx = meta["task_index"]
484
- t = meta["task"]
485
- try:
486
- res = fut.result()
487
- except Exception as e:
488
- logger.exception("Task raised exception")
489
- res = {"error": str(e)}
490
- completed += 1
491
- # prepare response entry
492
- if res.get("path"):
493
- url = make_file_url(request, res["path"])
494
- event = {"event": "completed", "task_index": idx, "prompt": t["prompt"], "file_url": url}
495
- else:
496
- event = {"event": "failed", "task_index": idx, "prompt": t["prompt"], "error": res.get("error"), "raw": res.get("raw")}
497
- # stream the event as JSON line
498
- yield json.dumps(event) + "\n"
499
- yield json.dumps({"event": "all_done", "completed": completed}) + "\n"
500
-
501
- return StreamingResponse(event_stream(), media_type="application/json")
502
-
503
-
504
- @app.post("/generate_sync")
505
- async def generate_sync(request: Request, body: GenerateRequest = Body(...)):
506
- """
507
- Non-streaming endpoint that blocks and returns JSON array of results (list of urls / errors).
508
  """
509
- tasks = build_tasks_from_request(body)
510
- if not tasks:
511
- raise HTTPException(status_code=400, detail="no tasks")
512
-
513
- proxy_mgr = ProxyManager(proxies=body.proxies, rotate=body.proxy_rotate) if body.proxies else None
514
- client = PerchanceClient(proxy_manager=proxy_mgr, max_wait=body.max_wait)
515
- verify = client.verify_user(thread=0)
516
- user_key = verify.get("userKey", "") or ""
517
- ad_code = client.get_ad_access_code() or ""
518
-
519
- concurrency = max(1, body.concurrency or 1)
520
- results = [None] * len(tasks)
521
- with ThreadPoolExecutor(max_workers=concurrency) as ex:
522
- futures = {ex.submit(run_generation_task, client, t, user_key, ad_code, OUT_DIR, body.max_wait): idx for idx, t in enumerate(tasks)}
523
- for fut in as_completed(futures):
524
- idx = futures[fut]
525
- try:
526
- res = fut.result()
527
- except Exception as e:
528
- res = {"error": str(e)}
529
- if res.get("path"):
530
- url = make_file_url(request, res["path"])
531
- results[idx] = {"ok": True, "url": url}
532
- else:
533
- results[idx] = {"ok": False, "error": res.get("error"), "raw": res.get("raw")}
534
- return JSONResponse({"results": results})
535
-
536
-
537
- @app.get("/", response_class=HTMLResponse)
538
- async def index():
539
- return """
540
- <html><body>
541
- <h2>Perchance Image Generator API</h2>
542
- <p>POST /generate to stream results. POST /generate_sync to get final JSON. POST /upload_base64 to upload images.</p>
543
- </body></html>
544
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
 
 
 
 
 
546
 
547
- # ---------------------------
548
- # Safety note
549
- # ---------------------------
550
- @app.on_event("startup")
551
- def startup_note():
552
- logger.info("Server ready. Files served under %s at %s/files/", OUT_DIR, "http://<host>:<port>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import asyncio
3
+ import json
4
+ from typing import List, Dict, Any
5
+
6
+ import aiohttp
7
+ from fastapi import FastAPI, HTTPException, Request
8
+ from fastapi.responses import StreamingResponse, JSONResponse
9
+ from pydantic import BaseModel
10
+ from starlette.middleware.cors import CORSMiddleware
11
+ import asyncio
12
+ import aiohttp
13
+ import urllib.parse
14
+ from bs4 import BeautifulSoup as bs
15
  import json
 
 
16
  import random
17
+ import html
18
+ from bs4 import NavigableString, Tag
19
+ from rich import print
20
+
21
+ class BingScraper:
22
+ # Rotating User-Agent list.
23
+ USER_AGENTS = [
24
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
25
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15",
26
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0",
27
+ "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36"
28
+ ]
29
+
30
+ # Optional proxy list (empty if not used for fastest performance).
31
+ PROXIES = [
32
+ # "http://proxy1.example.com:8080",
33
+ # "http://proxy2.example.com:8080",
34
+ ]
35
+
36
+ def __init__(self):
37
+ pass
38
+
39
+ @classmethod
40
+ def get_random_headers(cls):
41
+ """Return HTTP headers with a random User-Agent."""
42
+ return {
43
+ "User-Agent": random.choice(cls.USER_AGENTS),
44
+ "Accept": "text/html,application/xhtml+xml,application/xml;""q=0.9,image/avif,image/webp,*/*;q=0.8","Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8","Cache-Control": "no-cache","Pragma": "no-cache","Cookie": "MUID=12164D9908FD6B8C103F5871090F6AAA; _EDGE_V=1; MUIDB=12164D9908FD6B8C103F5871090F6AAA; SRCHD=AF=NOFORM; SRCHUID=V=2&GUID=26D35EBC20D44E418642BA10D7C14F42&dmnchg=1; ak_bmsc=17AB29B1FF4DE4D83B080A7E5CD24350~000000000000000000000000000000~YAAQNtjIFzUD3a2WAQAAnKvFuRtoko96GJe/vh01R588ZQYnYquFWtB0CzXeN5JGXZWgz7CwJqtckHuj3Z70qUhOcji4vkzhCMc/u91gnAIA0zCu7FcDeEJQgRx6n9MxhjrDAel2IIezGUgh+5ktFvDgUIO05s06PqDAtIUzuc9yTrbdAJi3iZvxFFKdGnbQkJ5krI9w3auWhY6i7JvcUPiDsbzzv0Chj1MxzRT1zdkP1B/JFtz+s5d8rUfagFpQporeRG/9gdid4qUPWvPHD6k98AdCTBYOysMHH2z9ErrD5PCO2mLK/RPrJSoqqN4d2mtnWeHNeF897PioJk0nOJw/IrseF0EgdsscKs7NVg/e3Mp27FTEIBduBRa93vvaabLMxg38; _UR=QS=0&TQS=0&Pn=0; BFBUSR=BFBHP=0; _HPVN=CS=eyJQbiI6eyJDbiI6MSwiU3QiOjAsIlFzIjowLCJQcm9kIjoiUCJ9LCJTYyI6eyJDbiI6MSwiU3QiOjAsIlFzIjowLCJQcm9kIjoiSCJ9LCJReiI6eyJDbiI6MSwiU3QiOjAsIlFzIjowLCJQcm9kIjoiVCJ9LCJBcCI6dHJ1ZSwiTXV0ZSI6dHJ1ZSwiTGFkIjoiMjAyNS0wNS0xMFQwMDowMDowMFoiLCJJb3RkIjowLCJHd2IiOjAsIlRucyI6MCwiRGZ0IjpudWxsLCJNdnMiOjAsIkZsdCI6MCwiSW1wIjoyLCJUb2JuIjowfQ==; _Rwho=u=d&ts=2025-05-10; ipv6=hit=1746877082438&t=4; _EDGE_S=F=1&SID=159E288249896C4D3A4D3D6A487B6D96&mkt=en-in; USRLOC=HS=1&ELOC=LAT=28.648426055908203|LON=77.1643295288086|N=Delhi%2C%20Delhi|ELT=1|; SRCHUSR=DOB=20250510&DS=1; MMCASM=ID=94A07BAF860D449DA2A4ABC8CAAC2538; _RwBf=r=0&ilt=9&ihpd=1&ispd=1&rc=24&rb=0&rg=200&pc=21&mtu=0&rbb=0&clo=0&v=9&l=2025-05-10T07:00:00.0000000Z&lft=0001-01-01T00:00:00.0000000&aof=0&ard=0001-01-01T00:00:00.0000000&rwdbt=0&rwflt=0&rwaul2=0&g=&o=2&p=&c=&t=0&s=0001-01-01T00:00:00.0000000+00:00&ts=2025-05-10T11:00:39.6623875+00:00&rwred=0&wls=&wlb=&wle=&ccp=&cpt=&lka=0&lkt=0&aad=0&TH=&cid=0&gb=; _SS=SID=159E288249896C4D3A4D3D6A487B6D96&R=24&RB=0&GB=0&RG=200&RP=21; 
SRCHHPGUSR=SRCHLANG=en&IG=00DEBA4A2B754BC983011302535541CA&PV=10.0.0&DM=1&BRW=S&BRH=M&CW=1094&CH=738&SCW=1177&SCH=2902&DPR=1.3&UTC=330&WTS=63882470261&PRVCW=1094&PRVCH=738&EXLTT=1&HV=1746874838&HVE=CfDJ8GtUudZcSi1Enm88WwQKtCf-s_3m7rtBIR85jW2Uv01W2IjDUasKRdncp2MkJ7Bl7PxVetzuZETt8bkyd54iRcMP8SVUsClaL2I5uvRiGiSldOKFjy7i69jYPS-egJOhCGf717H5WHFvCI4UwespMZgxZkdo8SVoBwOlx_yQKpA2qtpqV7t6wYd7etwY1FpUaA&BCML=0&BCSRLANG=&ADLT=OFF"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
+ }
47
+
48
+
49
+ @classmethod
50
+ def get_random_proxy(cls):
51
+ """Return a random proxy if available."""
52
+ if cls.PROXIES:
53
+ return random.choice(cls.PROXIES)
54
+ return None
55
+
56
+ async def fetch_html(self, url, session):
57
+ """Fetch HTML content asynchronously with minimal overhead."""
58
+ headers = self.get_random_headers()
59
+ proxy = self.get_random_proxy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  try:
61
+ async with session.get(url, headers=headers, proxy=proxy, timeout=30) as response:
62
+ if response.status != 200:
63
+ print(f"Failed to fetch {url}: HTTP {response.status}")
64
+ return None
65
+ return await response.text()
66
  except Exception as e:
67
+ print(f"Error fetching {url}: {e}")
68
+ return None
69
+
70
+ # ===== Text Search Methods =====
71
+ async def fetch_bing_text(self, query, session):
72
+ """Fetch Bing search results HTML for the given query."""
73
+ encoded_query = urllib.parse.quote_plus(query)
74
+ url = f'https://www.bing.com/search?q={encoded_query}'
75
+ return await self.fetch_html(url, session)
76
+
77
+ def parse_text_results(self, html):
78
+ """
79
+ Parse the Bing HTML using BeautifulSoup to extract:
80
+ - icon (if available)
81
+ - URL
82
+ - Title
83
+ - Abstract (or fallback text)
84
+ - Additional columns (if available)
85
+ """
86
+ soup = bs(html, 'lxml')
87
+ results = []
88
+ result_no = 0
89
+ main_containers = soup.find_all(id="b_results")
90
+ for container in main_containers:
91
+ list_results = container.find_all(class_='b_algo')
92
+ for cont in list_results:
93
+ result_no += 1
94
+ try:
95
+ icon = 'https:' + cont.find_all(class_='rms_iac')[0].get('data-src')
96
+ except Exception:
97
+ icon = None
98
+ try:
99
+ URL = cont.find_all(class_='tilk')[0].get('href')
100
+ except Exception:
101
+ continue
102
+ try:
103
+ Title = cont.find_all('h2')[0].get_text(strip=True)
104
+ except Exception:
105
+ Title = "No Title"
106
+ try:
107
+ abstract_elem = cont.find_all(class_='b_caption')
108
+ if abstract_elem and abstract_elem[0].get_text(strip=True):
109
+ Abstract = abstract_elem[0].get_text(strip=True)
110
+ else:
111
+ Abstract = cont.find_all(class_='b_algoSlug')[0].get_text(strip=True)
112
+ except Exception:
113
+ Abstract = "No Abstract"
114
+ other = []
115
+ try:
116
+ for column in cont.find_all(class_='b_rc_gb_sub_column'):
117
+ for div in column.find_all('div'):
118
+ try:
119
+ sub_title = div.find_all(class_='b_rc_gb_sub_title')[0].get_text(strip=True)
120
+ except Exception:
121
+ sub_title = "No Sub-title"
122
+ try:
123
+ sub_description = div.find_all(class_='b_rc_gb_text_wrapper')[0].get_text(strip=True)
124
+ except Exception:
125
+ sub_description = ""
126
+ if sub_description:
127
+ other.append({'Title': sub_title, 'Description': sub_description})
128
+ except Exception:
129
+ other = None
130
+ results.append({
131
+ 'no': result_no,
132
+ 'icon': icon,
133
+ 'URL': URL,
134
+ 'Title': Title,
135
+ 'Abstract': Abstract,
136
+ 'other': other
137
+ })
138
+ return results
139
+
140
+ async def search_text(self, query, session):
141
+ """Perform a text search (regular and news) and return parsed results."""
142
+ real_query = query
143
+ news_query = query + " news"
144
+ # Run both searches concurrently.
145
+ real_html_task = asyncio.create_task(self.fetch_bing_text(real_query, session))
146
+ news_html_task = asyncio.create_task(self.fetch_bing_text(news_query, session))
147
+ real_html, news_html = await asyncio.gather(real_html_task, news_html_task)
148
+ if not real_html or not news_html:
149
+ print("Failed to retrieve one or both text searches.")
150
+ return None
151
+ real_results = self.parse_text_results(real_html)
152
+ news_results = self.parse_text_results(news_html)
153
+ return {
154
+ "query_results": real_results,
155
+ "news_results": news_results
156
+ }
157
 
158
+ # ===== Video Search Methods =====
159
+ async def fetch_bing_video(self, query, session):
160
+ """
161
+ Fetch Bing video search results HTML for the given query.
162
+ The URL below targets Bing's video search endpoint.
163
+ """
164
+ encoded_query = urllib.parse.quote_plus(query)
165
+ url = f'https://www.bing.com/videos/search?q={encoded_query}&adlt=off'
166
+ return await self.fetch_html(url, session)
167
+
168
+ def parse_video_results(self, html_content):
169
+ """
170
+ Parse the Bing video search HTML to extract:
171
+ - Video URL (from the mmeta JSON)
172
+ - Title (from an element with class "mc_vtvc_title")
173
+ - Thumbnail URL (from the mmeta JSON)
174
+ - Duration (if available)
175
+ - Source (if available)
176
+ """
177
+ soup = bs(html_content, 'lxml')
178
+ video_results = []
179
+ result_no = 0
180
+
181
+ # Find all divs with class "mc_vtvc" that have an mmeta attribute.
182
+ video_divs = soup.find_all("div", class_="mc_vtvc", attrs={"mmeta": True})
183
+ for div in video_divs:
184
+ result_no += 1
185
+ mmeta_raw = div.get("mmeta")
 
 
 
186
  try:
187
+ # Unescape and parse the JSON metadata.
188
+ mmeta_json = json.loads(html.unescape(mmeta_raw))
189
+ except Exception as e:
190
+ mmeta_json = {}
191
+ print(f"Error parsing mmeta JSON: {e}")
192
+ # Retrieve video URL and thumbnail from the metadata.
193
+ video_url = mmeta_json.get("murl") or mmeta_json.get("pgurl")
194
+ thumbnail = mmeta_json.get("turl")
195
+ # Find the title; often stored in an element with class "mc_vtvc_title".
196
+ title_elem = div.find(class_="mc_vtvc_title")
197
+ title = title_elem.get_text(strip=True) if title_elem else "No Title"
198
+ # Look for duration within an element that may contain a duration value.
199
+ duration_elem = div.find(class_="mc_bc_rc")
200
+ duration = duration_elem.get_text(strip=True) if duration_elem else None
201
+ # Optionally, extract the video source if available.
202
+ source_elem = div.find(class_="mc_vtvc_meta_source")
203
+ source = source_elem.get_text(strip=True) if source_elem else None
204
+
205
+ video_results.append({
206
+ "no": result_no,
207
+ "Video URL": video_url,
208
+ "Title": title,
209
+ "Thumbnail": thumbnail,
210
+ "Duration": duration,
211
+ "Source": source,
212
+ "Meta": mmeta_json # Additional metadata if needed.
213
+ })
214
+ return video_results
215
+
216
+ async def search_video(self, query, session):
217
+ """Perform a video search and return parsed results."""
218
+ html_content = await self.fetch_bing_video(query, session)
219
+ if not html_content:
220
+ print("Failed to retrieve video search results.")
221
+ return None
222
+ video_results = self.parse_video_results(html_content)
223
+ return {
224
+ "query": query,
225
+ "video_results": video_results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  }
227
 
228
+ # ===== Image Search Methods =====
229
+ async def fetch_bing_image(self, query, session):
230
+ """Fetch Bing images search results HTML for the given query."""
231
+ encoded_query = urllib.parse.quote_plus(query)
232
+ # Using the URL structure with &ch=803.
233
+ url = f'https://www.bing.com/images/search?q={encoded_query}'
234
+ return await self.fetch_html(url, session)
235
+
236
+ def parse_image_results(self, html):
237
+ """
238
+ Parse the Bing images HTML using BeautifulSoup to extract detailed image data.
239
+ For each image result (<li> inside a <ul class="dgControl_list">), extract:
240
+ - JSON data from the 'm' attribute in <a class="iusc">
241
+ - Image resolution from the <span class="nowrap"> tag inside <div class="img_info hon">
242
+ - The hosting domain from the <div class="lnkw"> tag.
243
+ """
244
+ soup = bs(html, 'lxml')
245
+ image_results = []
246
+ # Find all <ul> elements that contain image results.
247
+ ul_lists = soup.find_all("ul", class_="dgControl_list")
248
+ for ul in ul_lists:
249
+ li_elements = ul.find_all("li")
250
+ for li in li_elements:
251
+ image_data = {}
252
+ try:
253
+ # Extract the JSON details from the anchor tag.
254
+ a_tag = li.find("a", class_="iusc")
255
+ if not a_tag:
256
+ continue
257
+ m_json_str = a_tag.get("m")
258
+ if not m_json_str:
259
  continue
260
+ try:
261
+ # Parse the JSON string.
262
+ details = json.loads(m_json_str)
263
+ except Exception as e:
264
+ print(f"JSON parsing error: {e}")
265
+ continue
266
+ # Copy all available keys from the JSON.
267
+ image_data.update(details)
268
+ # Extract additional details from the HTML.
269
+ # Resolution: from <div class="img_info hon"> -> <span class="nowrap">
270
+ img_info = li.find("div", class_="img_info")
271
+ if img_info:
272
+ resolution_span = img_info.find("span", class_="nowrap")
273
+ if resolution_span:
274
+ image_data["resolution"] = resolution_span.get_text(strip=True)
275
+ # Domain: from <div class="lnkw"> -> <a> text.
276
+ lnkw_div = li.find("div", class_="lnkw")
277
+ if lnkw_div:
278
+ domain_a = lnkw_div.find("a")
279
+ if domain_a:
280
+ image_data["domain"] = domain_a.get_text(strip=True)
281
+ image_results.append(image_data)
282
+ except Exception as e:
283
+ print(f"Error parsing an image result: {e}")
284
  continue
285
+ return image_results
286
+
287
+ async def search_image(self, query, session):
288
+ """Perform an image search and return parsed results."""
289
+ image_html = await self.fetch_bing_image(query, session)
290
+ if not image_html:
291
+ print("Failed to retrieve image search results.")
292
+ return None
293
+ image_results = self.parse_image_results(image_html)
294
+ return {
295
+ "query": query,
296
+ "image_results": image_results
297
+ }
298
 
299
+ # ===== Wikipedia Search and Extraction Methods =====
300
+ @staticmethod
301
+ def format_anchor(tag):
302
+ """
303
+ Format an <a> tag into the desired output format:
304
+ (Link Text)[Full_URL]
305
+ """
306
+ href = tag.get("href", "")
307
+ if href.startswith("/") or href.startswith("#"):
308
+ href = "https://en.wikipedia.org" + href
309
+ text = tag.get_text(strip=True)
310
+ return f"({text})[{href}]"
311
+
312
+ @classmethod
313
+ def extract_text(cls, element):
314
+ """
315
+ Recursively extract text from an element.
316
+ - For <a> tags, return the formatted version and do not process its children.
317
+ - For other tags, process only their immediate children.
318
+ """
319
+ if isinstance(element, NavigableString):
320
+ return element.strip()
321
+ if isinstance(element, Tag):
322
+ if element.name in ["style", "script", "noscript"]:
323
+ return ""
324
+ if element.name == "a":
325
+ return cls.format_anchor(element)
326
+ texts = []
327
+ for child in element.children:
328
+ t = cls.extract_text(child)
329
+ if t:
330
+ texts.append(t)
331
+ return " ".join(texts).strip()
332
+ return ""
333
+
334
+ @classmethod
335
+ def process_infobox(cls, table):
336
+ """
337
+ Process the infobox table row by row using recursive extraction.
338
+ Returns a list of strings where each line represents a row.
339
+ """
340
+ lines = []
341
+ for row in table.find_all("tr"):
342
+ header_cell = row.find("th")
343
+ data_cell = row.find("td")
344
+ header_text = cls.extract_text(header_cell) if header_cell else ""
345
+ data_text = cls.extract_text(data_cell) if data_cell else ""
346
+ if header_text and data_text:
347
+ lines.append(f"{header_text}: {data_text}")
348
+ elif header_text:
349
+ lines.append(header_text)
350
+ elif data_text:
351
+ lines.append(data_text)
352
+ return lines
353
+
354
+ @classmethod
355
+ def process_paragraph(cls, p_tag):
356
+ """
357
+ Process the paragraph (<p> tag) using recursive extraction.
358
+ """
359
+ return cls.extract_text(p_tag)
360
+
361
+ async def extract_page_content(self, url, session):
362
+ """
363
+ Fetch the Wikipedia page HTML once and extract both the infobox and a
364
+ paragraph with sufficient content.
365
+ """
366
+ html_content = await self.fetch_html(url, session)
367
+ if not html_content:
368
+ print("Failed to fetch the page.")
369
+ return None, None
370
+
371
+ soup = bs(html_content, "lxml")
372
+ # Extract infobox.
373
+ table = soup.find("table", class_="infobox")
374
+ infobox_lines = self.process_infobox(table) if table else None
375
+
376
+ # Extract a paragraph with sufficient content.
377
+ content_div = soup.find("div", class_="mw-content-ltr")
378
+ paragraph_text = None
379
+ if content_div:
380
+ paragraphs = content_div.find_all("p")
381
+ for p in paragraphs:
382
+ text = self.process_paragraph(p)
383
+ # Only choose a paragraph if it has at least 5 words (adjust threshold as needed)
384
+ if len(text.split()) >= 5:
385
+ paragraph_text = text
386
+ break
387
+
388
+ return infobox_lines, paragraph_text
389
+
390
+ async def search_wikipedia_url(self, query, session):
391
+ """
392
+ Perform a Bing search for the query and return a Wikipedia URL if found.
393
+ """
394
+ # Correct the search query by appending "wikipedia.org"
395
+ encoded_query = urllib.parse.quote_plus(query + " wikipedia.org")
396
+ bing_url = f"https://www.bing.com/search?q={encoded_query}"
397
+ html_content = await self.fetch_html(bing_url, session)
398
+ if not html_content:
399
+ return None
400
+ soup = bs(html_content, "lxml")
401
+ # Try different containers for search results.
402
+ results = soup.find_all("li", class_="b_algo") or soup.find_all("div", class_="b_algo")
403
+ for result in results:
404
+ a_tag = result.find("a", href=True)
405
+ if a_tag and "wikipedia.org" in a_tag["href"]:
406
+ return a_tag["href"]
407
+ return None
408
+
409
+ async def get_wikipedia_url_concurrently(self, query, session, num_attempts=10):
410
+ """
411
+ Launch num_attempts concurrent search tasks for a Wikipedia URL.
412
+ Return the first found Wikipedia URL and cancel the other tasks.
413
+ """
414
+ tasks = [asyncio.create_task(self.search_wikipedia_url(query, session)) for _ in range(num_attempts)]
415
+ done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
416
+ wiki_url = None
417
+ for task in done:
418
+ wiki_url = task.result()
419
+ if wiki_url:
420
+ break
421
+ for task in pending:
422
+ task.cancel()
423
+ return wiki_url
424
+
425
+ async def search_wikipedia(self, query, session):
426
+ """
427
+ Search Bing for a Wikipedia URL and extract its infobox and a paragraph.
428
+ """
429
+ wiki_url = await self.get_wikipedia_url_concurrently(query, session, num_attempts=10)
430
+ if not wiki_url:
431
+ print("Could not find a Wikipedia URL for the query.")
432
+ return None
433
+ print(f"\nFound Wikipedia URL: {wiki_url}\n")
434
+ infobox_lines, paragraph_text = await self.extract_page_content(wiki_url, session)
435
+ return {
436
+ "wikipedia_url": wiki_url,
437
+ "infobox": infobox_lines,
438
+ "paragraph": paragraph_text
439
+ }
440
 
441
# New wrapper class providing a unified API interface.
class BingScraperAPI:
    """
    A unified API exposing:
      - search(query)     regular + news text search
      - video(query)      video search
      - images(query)     image search
      - wikipedia(query)  Wikipedia infobox/paragraph extraction
    plus an async generator `fetch` that runs several providers concurrently
    and yields each (provider, result) pair as soon as it becomes available.
    """
    def __init__(self):
        self.scraper = BingScraper()

    async def search(self, query):
        """Perform a regular and news text search."""
        async with aiohttp.ClientSession() as session:
            return await self.scraper.search_text(query, session)

    async def video(self, query):
        """Perform a video search."""
        async with aiohttp.ClientSession() as session:
            return await self.scraper.search_video(query, session)

    async def images(self, query):
        """Perform an image search."""
        async with aiohttp.ClientSession() as session:
            return await self.scraper.search_image(query, session)

    async def wikipedia(self, query):
        """Perform a Wikipedia search and extract infobox and content."""
        async with aiohttp.ClientSession() as session:
            return await self.scraper.search_wikipedia(query, session)

    async def fetch(self, providers, param):
        """
        Run several providers concurrently and stream their results.

        Args:
            providers: provider names, e.g. ["search", "video", "images"].
            param: query strings; either a single query (shared by all
                providers) or exactly one query per provider.

        Yields:
            (provider_name, result) tuples in *completion* order. A provider
            that raises or exceeds the 60 s per-call timeout yields
            {"error": ...} in place of its result.

        Raises:
            ValueError: on a bad param count or an unknown provider name
                (validated before any task is scheduled, so nothing leaks).
        """
        provider_map = {
            "search": self.search,
            "video": self.video,
            "images": self.images,
            "wikipedia": self.wikipedia
        }

        # Determine query parameters to use for each provider.
        if len(param) == 1:
            params = [param[0]] * len(providers)
        elif len(param) == len(providers):
            params = param
        else:
            raise ValueError("The number of query parameters must be either 1 or equal to the number of providers.")

        # Validate up front: the previous version raised mid-scheduling,
        # leaking the tasks already created for earlier providers.
        for prov in providers:
            if prov not in provider_map:
                raise ValueError(f"Unknown provider: {prov}")

        async def _call(prov, query):
            # Carry the provider name with its result so completion order
            # cannot mis-attribute results; errors/timeouts never kill the
            # other providers.
            try:
                result = await asyncio.wait_for(provider_map[prov](query), timeout=60)
                return prov, result
            except Exception as e:
                return prov, {"error": str(e)}

        tasks = [asyncio.create_task(_call(prov, q)) for prov, q in zip(providers, params)]

        # BUGFIX: the previous implementation awaited tasks in launch order,
        # so a slow first provider delayed already-finished later ones even
        # though the docstring promised completion-order delivery.
        for finished in asyncio.as_completed(tasks):
            yield await finished
516
+
517
+
518
# FastAPI application object; all routes below register against it.
app = FastAPI(title="BingScraper API (FastAPI wrapper)")

# Allow CORS for local dev; change origins in production
# (wildcard origins/methods/headers accept requests from any site).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
527
 
528
+ # Pydantic model for request validation
529
class FetchRequest(BaseModel):
    """Request body for POST /v1/agents/websearch."""
    # Provider names to run; each must be a key of ALLOWED_PROVIDERS.
    providers: List[str]
    # Query strings: either a single shared query or one per provider.
    param: List[str]
532
 
533
+
534
# Public provider name -> coroutine method name on BingScraper.
# _run_provider resolves these with getattr on scraper.scraper, so the
# low-level methods are called directly (sharing one aiohttp session).
ALLOWED_PROVIDERS = {
    "search": "search_text",         # combined regular + news text search
    "video": "search_video",         # video results
    "images": "search_image",        # image results
    "wikipedia": "search_wikipedia"  # infobox + first-paragraph extraction
}
541
+
542
+
543
async def _run_provider(scraper: BingScraperAPI, provider: str, query: str, session: aiohttp.ClientSession, timeout: float = 30.0) -> Dict[str, Any]:
    """
    Execute one provider's low-level search coroutine on the shared session.

    Returns {"provider": ..., "result": ...} on success, or
    {"provider": ..., "error": ...} when the call fails or times out.

    Raises:
        ValueError: unknown provider name.
        RuntimeError: the mapped attribute is missing or not a coroutine.
    """
    method_name = ALLOWED_PROVIDERS.get(provider)
    if not method_name:
        raise ValueError(f"Unknown provider: {provider}")

    coro_fn = getattr(scraper.scraper, method_name, None)
    if not coro_fn or not asyncio.iscoroutinefunction(coro_fn):
        raise RuntimeError(f"Provider method not found or not async: {method_name}")

    try:
        outcome = await asyncio.wait_for(coro_fn(query, session), timeout=timeout)
    except asyncio.CancelledError:
        # Let task cancellation propagate untouched.
        raise
    except Exception as e:
        return {"provider": provider, "error": str(e)}
    return {"provider": provider, "result": outcome}
 
 
 
 
 
 
 
 
 
 
 
 
565
 
566
+
567
@app.post("/v1/agents/websearch")
async def fetch_post(payload: FetchRequest):
    """
    Run the requested providers concurrently and return all results at once.

    Body example:
        {"providers": ["search", "images"],
         "param": ["who is elon musk", "elon musk"]}

    The response maps each provider name to its result (or an error object).
    NOTE(review): duplicate provider names collapse onto one response key —
    the later finisher wins; confirm whether duplicates should be supported.
    """
    providers, param = payload.providers, payload.param

    if not providers:
        raise HTTPException(status_code=400, detail="providers must be a non-empty list")
    if not param:
        raise HTTPException(status_code=400, detail="param must be a non-empty list")

    # A single query fans out to every provider; otherwise counts must match.
    if len(param) == 1:
        params = param * len(providers)
    elif len(param) == len(providers):
        params = param
    else:
        raise HTTPException(status_code=400, detail="param length must be 1 or equal to length of providers")

    for name in providers:
        if name not in ALLOWED_PROVIDERS:
            raise HTTPException(status_code=400, detail=f"Unsupported provider: {name}")

    scraper_api = BingScraperAPI()

    # One shared aiohttp session keeps connection reuse across providers.
    async with aiohttp.ClientSession() as session:
        tasks = [
            asyncio.create_task(_run_provider(scraper_api, name, query, session))
            for name, query in zip(providers, params)
        ]

        # Collect each provider's outcome as it finishes.
        results = {}
        for finished in asyncio.as_completed(tasks):
            outcome = await finished
            name = outcome.get("provider")
            if "result" in outcome:
                results[name] = outcome["result"]
            else:
                results[name] = {"error": outcome.get("error")}

    return JSONResponse(results)
616
+
617
+
618
@app.get("/v1/agents/websearch/stream")
async def stream(request: Request, providers: str, params: str):
    """
    Stream provider results as Server-Sent Events (SSE).

    Example:
        /stream?providers=search,images&params=who%20is%20elon%20musk,elon%20musk

    Each event's data payload is JSON:
        {"provider": "<provider>", "result": <result>} (or an "error" key).

    NOTE(review): queries are comma-split, so a query containing a comma is
    treated as multiple params — confirm whether that is acceptable.
    """
    prov_list = [piece.strip() for piece in providers.split(",") if piece.strip()]
    param_list = [piece.strip() for piece in params.split(",") if piece.strip()]

    if not prov_list:
        raise HTTPException(status_code=400, detail="providers query param required")
    if not param_list:
        raise HTTPException(status_code=400, detail="params query param required")

    # A single query fans out to every provider; otherwise counts must match.
    if len(param_list) == 1:
        param_list = param_list * len(prov_list)
    elif len(param_list) != len(prov_list):
        raise HTTPException(status_code=400, detail="params must have length 1 or equal to providers length")

    for name in prov_list:
        if name not in ALLOWED_PROVIDERS:
            raise HTTPException(status_code=400, detail=f"Unsupported provider: {name}")

    scraper_api = BingScraperAPI()

    async def event_generator():
        # A single session is shared by every provider call.
        async with aiohttp.ClientSession() as session:
            tasks = [
                asyncio.create_task(_run_provider(scraper_api, name, query, session))
                for name, query in zip(prov_list, param_list)
            ]

            for finished in asyncio.as_completed(tasks):
                if await request.is_disconnected():
                    # Client went away: stop all outstanding provider calls.
                    for t in tasks:
                        t.cancel()
                    break
                outcome = await finished
                # SSE framing: "data: <json>" followed by a blank line.
                yield f"data: {json.dumps(outcome, default=str)}\n\n"

    return StreamingResponse(event_generator(), media_type="text/event-stream")
666
+
667
+
668
+ # Optional root endpoint
669
@app.get("/")
async def root():
    """Landing endpoint pointing callers at the actual API routes."""
    # BUGFIX: the previous message referenced stale paths (/fetch, /stream)
    # that do not exist; the real routes are under /v1/agents/websearch.
    return {"msg": "BingScraper FastAPI wrapper. Use POST /v1/agents/websearch or GET /v1/agents/websearch/stream"}
672
+
673
+
674
+ # To run:
675
+ # uvicorn app:app --host 0.0.0.0 --port 8000 --reload