AdarshJi commited on
Commit
45ad1f9
·
verified ·
1 Parent(s): d11ef09

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +376 -496
app.py CHANGED
@@ -1,270 +1,82 @@
1
- # app.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from __future__ import annotations
3
  import os
4
- import time
5
- import logging
6
- import threading
7
  import asyncio
8
- from typing import Optional, Dict, Any, Tuple
9
- from concurrent.futures import ThreadPoolExecutor
 
 
 
10
  from urllib.parse import quote_plus, urljoin
11
 
12
- from fastapi import FastAPI, HTTPException, Query, Body
 
13
  from pydantic import BaseModel
14
- from starlette.responses import JSONResponse
15
 
16
- from selenium import webdriver
17
- from selenium.webdriver.chrome.options import Options
18
- from selenium.webdriver.chrome.service import Service
19
- from selenium.common.exceptions import WebDriverException, SessionNotCreatedException
20
- from webdriver_manager.chrome import ChromeDriverManager
21
  from bs4 import BeautifulSoup
22
- from selenium.webdriver.common.by import By
23
- from selenium.webdriver.support.ui import WebDriverWait
24
- from selenium.webdriver.support import expected_conditions as EC
25
- from selenium.common.exceptions import TimeoutException
26
-
27
- # virtual display
28
- from pyvirtualdisplay import Display
29
-
30
- # Logging
31
- logging.basicConfig(level=logging.INFO)
32
- logger = logging.getLogger("fast_fetcher")
33
-
34
- # ---------------- BrowserManager ----------------
35
- class BrowserManager:
36
- def __init__(
37
- self,
38
- headless: bool = True,
39
- user_agent: Optional[str] = None,
40
- window_size: str = "1366,768",
41
- disable_images: bool = True,
42
- block_resource_urls: Optional[list[str]] = None,
43
- ):
44
- self.headless = headless
45
- self.user_agent = user_agent
46
- self.window_size = window_size
47
- self.disable_images = disable_images
48
- self.block_resource_urls = block_resource_urls or [
49
- "*.doubleclick.net/*",
50
- "*.google-analytics.com/*",
51
- "*.googlesyndication.com/*",
52
- "*.adservice.google.com/*",
53
- ]
54
- self._driver_lock = threading.Lock()
55
- self._driver: Optional[webdriver.Chrome] = None
56
- self._display: Optional[Display] = None
57
- self._start_driver_with_retries()
58
-
59
- def _build_options(self) -> Options:
60
- opts = Options()
61
- # If CHROME_BIN is present, point to it
62
- chrome_bin = os.environ.get("CHROME_BIN", "/usr/bin/google-chrome-stable")
63
- if os.path.exists(chrome_bin):
64
- opts.binary_location = chrome_bin
65
- logger.debug("Using chrome binary: %s", chrome_bin)
66
- else:
67
- logger.warning("Chrome binary not found at %s (will rely on system/browser manager).", chrome_bin)
68
-
69
- if self.headless:
70
- opts.add_argument("--headless=new")
71
- opts.add_argument("--headless")
72
-
73
- # container-friendly flags (and stable fallback)
74
- opts.add_argument("--no-sandbox")
75
- opts.add_argument("--disable-setuid-sandbox")
76
- opts.add_argument("--disable-dev-shm-usage")
77
- opts.add_argument("--disable-gpu")
78
- opts.add_argument("--disable-extensions")
79
- opts.add_argument("--disable-blink-features=AutomationControlled")
80
- opts.add_argument("--disable-software-rasterizer")
81
- opts.add_argument(f"--window-size={self.window_size}")
82
- opts.add_argument("--remote-debugging-port=0")
83
-
84
- if self.user_agent:
85
- opts.add_argument(f"--user-agent={self.user_agent}")
86
-
87
- if self.disable_images:
88
- prefs = {
89
- "profile.managed_default_content_settings.images": 2,
90
- "profile.managed_default_content_settings.stylesheets": 2,
91
- "profile.managed_default_content_settings.fonts": 2,
92
- }
93
- opts.add_experimental_option("prefs", prefs)
94
-
95
- opts.add_experimental_option("excludeSwitches", ["enable-logging"])
96
- opts.add_experimental_option("useAutomationExtension", False)
97
- return opts
98
-
99
- def _start_driver_with_retries(self, attempts: int = 3, delay_seconds: float = 1.0):
100
- last_exc = None
101
- for attempt in range(1, attempts + 1):
102
- try:
103
- logger.info("Starting Chrome driver (attempt %d/%d)...", attempt, attempts)
104
- self._start_driver()
105
- logger.info("Chrome driver started successfully.")
106
- return
107
- except Exception as exc:
108
- logger.exception("Failed to start driver on attempt %d: %s", attempt, exc)
109
- last_exc = exc
110
- time.sleep(delay_seconds)
111
- raise RuntimeError(f"Unable to start Chrome driver after {attempts} attempts: {last_exc}") from last_exc
112
-
113
- def _start_xvfb_if_needed(self):
114
- # If headless=False AND no DISPLAY, start Xvfb via pyvirtualdisplay
115
- if not self.headless and os.environ.get("DISPLAY", "") == "":
116
- try:
117
- logger.info("No DISPLAY found and headless=False — starting virtual X display (Xvfb).")
118
- self._display = Display(visible=0, size=(int(self.window_size.split(",")[0]), int(self.window_size.split(",")[1])))
119
- self._display.start()
120
- logger.info("Virtual X display started (DISPLAY=%s).", os.environ.get("DISPLAY"))
121
- except Exception as e:
122
- logger.exception("Failed to start virtual display: %s", e)
123
- raise
124
-
125
- def _stop_xvfb_if_started(self):
126
- if self._display:
127
- try:
128
- self._display.stop()
129
- logger.info("Virtual X display stopped.")
130
- except Exception:
131
- pass
132
- self._display = None
133
-
134
- def _start_driver(self):
135
- # start virtual display if required BEFORE launching Chrome
136
- self._start_xvfb_if_needed()
137
-
138
- opts = self._build_options()
139
-
140
- # 1) Try Selenium Manager (webdriver.Chrome(options=opts)). Selenium >=4.14 may use driver manager itself.
141
- primary_exc = None
142
- fallback_exc = None
143
- try:
144
- logger.debug("Attempting to start Chrome via Selenium Manager (webdriver.Chrome(options=opts))")
145
- self._driver = webdriver.Chrome(options=opts)
146
- # quick smoke test: ensure browser is responsive (may throw)
147
- try:
148
- self._driver.execute_script("return navigator.userAgent")
149
- except Exception as e:
150
- # browser started but died quickly
151
- raise RuntimeError("Browser started by Selenium Manager but crashed immediately.") from e
152
-
153
- self._post_start_setup()
154
- return
155
- except Exception as e_primary:
156
- primary_exc = e_primary
157
- logger.warning("Selenium Manager attempt failed: %s", e_primary)
158
-
159
- # 2) Fallback: use webdriver-manager to download driver and start with the explicit Service
160
- try:
161
- driver_path = ChromeDriverManager().install()
162
- logger.info("webdriver-manager installed chromedriver: %s", driver_path)
163
- try:
164
- os.chmod(driver_path, 0o755)
165
- except Exception:
166
- logger.debug("chmod on chromedriver failed or unnecessary.")
167
-
168
- service = Service(driver_path)
169
- self._driver = webdriver.Chrome(service=service, options=opts)
170
- self._post_start_setup()
171
- return
172
- except Exception as e_fallback:
173
- fallback_exc = e_fallback
174
- logger.exception("webdriver-manager fallback failed: %s", e_fallback)
175
-
176
- # 3) Final fallback: attempt system /usr/bin/chromedriver if available
177
- try:
178
- sys_path = "/usr/bin/chromedriver"
179
- if os.path.exists(sys_path):
180
- logger.info("Trying system chromedriver at %s", sys_path)
181
- try:
182
- os.chmod(sys_path, 0o755)
183
- except Exception:
184
- pass
185
- service = Service(sys_path)
186
- self._driver = webdriver.Chrome(service=service, options=opts)
187
- self._post_start_setup()
188
- return
189
- except Exception as e_sys:
190
- logger.exception("System chromedriver attempt failed: %s", e_sys)
191
-
192
- # If all failed, stop virtual display (if started) and raise a helpful error
193
- self._stop_xvfb_if_started()
194
- # Include both primary and fallback messages in the raised exception
195
- raise RuntimeError(f"Failed to start Chrome driver. primary_error={primary_exc}, fallback_error={fallback_exc}")
196
-
197
- def _post_start_setup(self):
198
- try:
199
- self._driver.set_page_load_timeout(60)
200
- # best-effort CDP network blocking
201
- try:
202
- self._driver.execute_cdp_cmd("Network.enable", {})
203
- if self.block_resource_urls:
204
- self._driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": self.block_resource_urls})
205
- except Exception:
206
- pass
207
- except Exception:
208
- pass
209
-
210
- def fetch_html(
211
- self,
212
- url: str,
213
- wait_seconds: Optional[float] = 10.0,
214
- wait_for_selector: Optional[str] = None,
215
- ) -> str:
216
- if self._driver is None:
217
- self._start_driver_with_retries()
218
-
219
- with self._driver_lock:
220
- driver = self._driver
221
- try:
222
- driver.get(url)
223
-
224
- if wait_for_selector and wait_seconds:
225
- try:
226
- WebDriverWait(driver, wait_seconds).until(
227
- EC.presence_of_element_located((By.CSS_SELECTOR, wait_for_selector))
228
- )
229
- except TimeoutException:
230
- pass
231
- else:
232
- if wait_seconds:
233
- try:
234
- WebDriverWait(driver, min(wait_seconds, 3)).until(
235
- lambda d: d.execute_script("return document.readyState") == "complete"
236
- )
237
- except Exception:
238
- time.sleep(0.5)
239
-
240
- return driver.page_source
241
- except WebDriverException as e:
242
- logger.exception("WebDriver exception during fetch: %s", e)
243
- # restart driver and raise
244
- try:
245
- self._safe_quit_driver()
246
- except Exception:
247
- pass
248
- self._start_driver_with_retries()
249
- raise RuntimeError(f"WebDriver error during fetch: {e}")
250
-
251
- def _safe_quit_driver(self):
252
- if self._driver:
253
- try:
254
- self._driver.quit()
255
- except Exception:
256
- pass
257
- self._driver = None
258
- # stop display if we started one
259
- self._stop_xvfb_if_started()
260
-
261
- def close(self):
262
- self._safe_quit_driver()
263
-
264
-
265
- # ---------------- EXTRACT_DATA (same as your earlier implementation) ----------------
266
- def EXTRACT_DATA(html: str) -> Dict[str, Any]:
267
- soup = BeautifulSoup(html, "html.parser")
268
  BASE_URL = "https://www.google.com"
269
 
270
  def safe_text(el):
@@ -274,311 +86,379 @@ def EXTRACT_DATA(html: str) -> Dict[str, Any]:
274
  return el.get(attr) if el and el.has_attr(attr) else ""
275
 
276
  def abs_url(url):
277
- return urljoin(BASE_URL, url) if url else ""
 
 
 
278
 
279
  def clean_thumb(src):
280
  if src and not src.startswith("data:"):
281
  return abs_url(src)
282
  return None
283
 
284
- def is_ad_element(element):
285
- for parent in element.parents:
286
- if parent.get("id") in ["tads", "tadsb"] or "ads-ad" in parent.get("class", []):
287
- return True
288
- return False
289
-
290
  web_results = []
291
  for result in soup.select(".tF2Cxc"):
292
- if is_ad_element(result):
293
- continue
294
  title_tag = result.select_one("h3")
295
  link_tag = result.select_one("a")
296
  cite_tag = result.select_one("cite")
297
  snippet_tag = result.select_one(".VwiC3b")
298
- read_more_tag = result.select_one(".vzmbzf")
299
-
300
  if title_tag and link_tag:
301
- entry = {
302
- "no": len(web_results) + 1,
303
  "title": safe_text(title_tag),
304
  "link": abs_url(safe_attr(link_tag, "href")),
305
  "displayed_url": safe_text(cite_tag),
306
  "snippet": safe_text(snippet_tag)
307
- }
308
- extra = []
309
- if read_more_tag:
310
- read_more_url = abs_url(safe_attr(read_more_tag, "href"))
311
- if read_more_url:
312
- extra.append({"read_more": read_more_url})
313
- if extra:
314
- entry["extra"] = extra
315
- web_results.append(entry)
316
 
 
317
  image_results = []
318
  for img_item in soup.select(".eA0Zlc"):
319
  img_tag = img_item.select_one("img")
320
  link_tag = img_item.select_one("a")
321
- source_tag = img_item.select_one(".s0fJje span")
322
  src = safe_attr(img_tag, "data-src") or safe_attr(img_tag, "src")
323
  thumb = clean_thumb(src)
324
  if thumb:
325
- image_results.append({
326
- "thumbnail": thumb,
327
- "alt": safe_attr(img_tag, "alt"),
328
- "source": safe_text(source_tag),
329
- "link": abs_url(safe_attr(link_tag, "href"))
330
- })
331
-
332
- video_results = []
333
- for video in soup.select(".KYaZsb"):
334
- title_tag = video.select_one(".tNxQIb.ynAwRc")
335
- link_tag = video.select_one("a.rIRoqf")
336
- thumb_img = video.select_one(".AZJdrc img")
337
- duration_tag = video.select_one(".c8rnLc")
338
- channel_tag = video.select_one(".Sg4azc span:first-child")
339
- date_tag = video.select_one(".rbYSKb span")
340
- desc_tag = video.select_one(".wNifxf .p4wth")
341
- thumb_src = safe_attr(thumb_img, "data-src") or safe_attr(thumb_img, "src")
342
- thumb = clean_thumb(thumb_src)
343
- if title_tag and link_tag:
344
- video_results.append({
345
- "title": safe_text(title_tag),
346
- "link": abs_url(safe_attr(link_tag, "href")),
347
- "thumbnail": thumb,
348
- "duration": safe_text(duration_tag),
349
- "channel": safe_text(channel_tag),
350
- "date": safe_text(date_tag),
351
- "description_snippet": safe_text(desc_tag)
352
- })
353
-
354
- news_results = []
355
- for news in soup.select(".m7jPZ"):
356
- title_tag = news.select_one(".n0jPhd")
357
- link_tag = news.select_one("a")
358
- source_tag = news.select_one(".MgUUmf span")
359
- time_tag = news.select_one(".rbYSKb span")
360
- thumb_img = news.select_one(".uhHOwf img")
361
- thumb_src = safe_attr(thumb_img, "data-src") or safe_attr(thumb_img, "src")
362
- thumb = clean_thumb(thumb_src)
363
- if title_tag and link_tag:
364
- news_results.append({
365
- "title": safe_text(title_tag),
366
- "link": abs_url(safe_attr(link_tag, "href")),
367
- "source": safe_text(source_tag),
368
- "time": safe_text(time_tag),
369
- "thumbnail": thumb
370
- })
371
-
372
- knowledge_panel = {}
373
- rhs = soup.find(id="rhs")
374
- if rhs:
375
- title_tag = rhs.select_one(".PZPZlf.ssJ7i")
376
- subtitle_tag = rhs.select_one(".iAIpCb span")
377
- if title_tag:
378
- knowledge_panel["title"] = safe_text(title_tag)
379
- if subtitle_tag:
380
- knowledge_panel["subtitle"] = safe_text(subtitle_tag)
381
-
382
- desc_tag = rhs.select_one(".kno-rdesc span")
383
- if desc_tag:
384
- knowledge_panel["description"] = safe_text(desc_tag)
385
-
386
- facts = {}
387
- for fact in rhs.select(".zloOqf"):
388
- label_tag = fact.select_one(".w8qArf")
389
- value_tag = fact.select_one(".LrzXr")
390
- if label_tag and value_tag:
391
- label = safe_text(label_tag).replace(":", "").strip()
392
- links = value_tag.find_all("a")
393
- if links and len(links) > 1:
394
- names = [safe_text(a) for a in links if safe_text(a)]
395
- if names:
396
- facts[label] = names
397
- else:
398
- text = safe_text(value_tag)
399
- if text:
400
- facts[label] = text
401
- if facts:
402
- knowledge_panel["facts"] = facts
403
-
404
- profiles = []
405
- for profile in rhs.select(".dRrfkf a"):
406
- name_tag = profile.select_one(".CtCigf")
407
- link = safe_attr(profile, "href")
408
- if name_tag and link:
409
- profiles.append({
410
- "platform": safe_text(name_tag),
411
- "link": abs_url(link)
412
- })
413
- if profiles:
414
- knowledge_panel["profiles"] = profiles
415
-
416
- if not knowledge_panel:
417
- knowledge_panel = None
418
-
419
- ai_overview = None
420
- ai_container = soup.select_one(".p2M1Qe .f5cPye")
421
- if ai_container:
422
- text = safe_text(ai_container)
423
- if text:
424
- ai_overview = text
425
-
426
- thumbnails = set()
427
- for img in soup.select("img[data-src], img[src]"):
428
- src = safe_attr(img, "data-src") or safe_attr(img, "src")
429
- clean = clean_thumb(src)
430
- if clean:
431
- thumbnails.add(clean)
432
-
433
- all_thumbnails = sorted(thumbnails) if thumbnails else None
434
 
 
435
  data = {}
436
  if web_results:
437
  data["web_results"] = web_results
438
  if image_results:
439
  data["image_results"] = image_results
440
- if video_results:
441
- data["video_results"] = video_results
442
- if news_results:
443
- data["news_results"] = news_results
444
- if knowledge_panel:
445
- data["knowledge_panel"] = knowledge_panel
446
- if ai_overview:
447
- data["ai_overview"] = ai_overview
448
- if all_thumbnails:
449
- data["all_thumbnail_urls"] = all_thumbnails
450
 
451
- return data
 
 
 
 
 
 
 
 
452
 
 
453
 
454
- # ---------------- BrowserPool and API ----------------
455
- class BrowserPool:
456
- def __init__(self, pool_size: int = 1, headless: bool = True):
457
- self.pool_size = max(1, pool_size)
458
- self.managers = [BrowserManager(headless=headless) for _ in range(self.pool_size)]
459
- self._rr_index = 0
460
- self._rr_lock = threading.Lock()
461
 
462
- def pick_manager(self) -> BrowserManager:
463
- with self._rr_lock:
464
- idx = self._rr_index
465
- self._rr_index = (self._rr_index + 1) % self.pool_size
466
- return self.managers[idx]
 
467
 
468
- def close_all(self):
469
- for m in self.managers:
470
- try:
471
- m.close()
472
- except Exception:
473
- pass
474
 
 
475
  class SimpleTTLCache:
476
  def __init__(self, ttl_seconds: int = 20):
477
  self.ttl = ttl_seconds
478
- self._cache: Dict[str, Tuple[float, Any]] = {}
479
- self._lock = threading.Lock()
480
 
481
- def get(self, key: str):
482
- with self._lock:
483
- item = self._cache.get(key)
484
  if not item:
485
  return None
486
  ts, value = item
487
  if time.time() - ts > self.ttl:
488
- del self._cache[key]
489
  return None
490
  return value
491
 
492
- def set(self, key: str, value: Any):
493
- with self._lock:
494
- self._cache[key] = (time.time(), value)
495
-
496
- class SearchRequest(BaseModel):
497
- query: Optional[str] = None
498
- url: Optional[str] = None
499
- wait_for_selector: Optional[str] = None
500
- headless: Optional[bool] = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
 
502
- app = FastAPI(title="fast_fetcher_api", version="0.1")
503
- POOL: Optional[BrowserPool] = None
504
- EXECUTOR: Optional[ThreadPoolExecutor] = None
505
- CACHE = SimpleTTLCache(ttl_seconds=25)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
506
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  @app.on_event("startup")
508
- async def startup_event():
509
- global POOL, EXECUTOR
510
- # Switch headless here to False as you asked. The BrowserManager will start an Xvfb display automatically.
511
- POOL = BrowserPool(pool_size=1, headless=False)
512
- EXECUTOR = ThreadPoolExecutor(max_workers=2)
513
- app.state.executor = EXECUTOR
514
- app.state.pool = POOL
515
- logger.info("Startup: browser pool created (size=%d).", 1)
 
516
 
517
  @app.on_event("shutdown")
518
- async def shutdown_event():
519
- global POOL, EXECUTOR
520
- if POOL:
521
- POOL.close_all()
522
- if EXECUTOR:
523
- EXECUTOR.shutdown(wait=True)
524
- logger.info("Shutdown: browsers closed and executor stopped.")
525
-
526
- def _blocking_fetch_and_extract(manager: BrowserManager, url: str, wait_for_selector: Optional[str], wait_seconds: Optional[float]):
527
- start = time.time()
528
- html = manager.fetch_html(url, wait_seconds=wait_seconds, wait_for_selector=wait_for_selector)
529
- extracted = EXTRACT_DATA(html)
530
- duration = time.time() - start
531
- return {"url": url, "duration": duration, "data": extracted}
 
 
 
 
 
 
 
 
532
 
 
 
 
533
  @app.get("/health")
534
  async def health():
535
  return {"status": "ok"}
536
 
 
 
 
 
 
 
 
 
 
 
 
 
 
537
  @app.get("/search")
538
- async def search(query: str = Query(..., min_length=1), wait_for_selector: Optional[str] = None):
539
  q = query.strip()
540
  if not q:
541
- raise HTTPException(status_code=400, detail="query parameter required")
542
 
543
  url = f"https://www.google.com/search?q={quote_plus(q)}"
544
- cache_key = f"search:{q}:{wait_for_selector}"
545
-
546
- cached = CACHE.get(cache_key)
547
  if cached:
548
  return JSONResponse(content={"cached": True, **cached})
549
 
550
- manager = app.state.pool.pick_manager()
551
- loop = asyncio.get_event_loop()
552
- fut = loop.run_in_executor(app.state.executor, _blocking_fetch_and_extract, manager, url, wait_for_selector, 5.0)
553
- result = await fut
554
- CACHE.set(cache_key, result)
555
- return JSONResponse(content={"cached": False, **result})
556
-
557
- @app.get("/fetch")
558
- async def fetch(url: str = Query(..., min_length=5), wait_for_selector: Optional[str] = None):
559
- manager = app.state.pool.pick_manager()
560
- loop = asyncio.get_event_loop()
561
- fut = loop.run_in_executor(app.state.executor, _blocking_fetch_and_extract, manager, url, wait_for_selector, 6.0)
562
- result = await fut
563
- return JSONResponse(content=result)
564
 
565
  @app.post("/search")
566
  async def post_search(body: SearchRequest = Body(...)):
567
  if not (body.query or body.url):
568
  raise HTTPException(status_code=400, detail="Either query or url must be provided")
569
- if body.url:
570
- target = body.url
571
- else:
572
- target = f"https://www.google.com/search?q={quote_plus(body.query)}"
573
-
574
- cache_key = f"search_post:{target}:{body.wait_for_selector}"
575
- cached = CACHE.get(cache_key)
576
- if cached:
577
- return JSONResponse(content={"cached": True, **cached})
578
-
579
- manager = app.state.pool.pick_manager()
580
- loop = asyncio.get_event_loop()
581
- fut = loop.run_in_executor(app.state.executor, _blocking_fetch_and_extract, manager, target, body.wait_for_selector, 6.0)
582
- result = await fut
583
- CACHE.set(cache_key, result)
584
- return JSONResponse(content={"cached": False, **result})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app_playwright.py
2
+ """
3
+ Fast concurrent search fetcher using Playwright + FastAPI.
4
+
5
+ Requirements:
6
+ pip install fastapi uvicorn playwright beautifulsoup4 pydantic
7
+ python -m playwright install chromium
8
+
9
+ Why Playwright:
10
+ - Single browser process, many lightweight contexts
11
+ - Fast request interception (block ads/fonts)
12
+ - Async API for high concurrency
13
+
14
+ Endpoints:
15
+ - GET /health
16
+ - GET /search?query=...
17
+ - GET /fetch?url=...&fast=true
18
+ - POST /search (body: { "query": "..."} or { "url": "..." })
19
+ - GET /search_pages?query=...&pages=5&concurrency=8&fast=true&ordered=false (streams NDJSON)
20
+ - GET /search_pages_aggregate?query=... (returns aggregated JSON)
21
+ Tuning via environment variables:
22
+ - PLAYWRIGHT_HEADLESS (true/false, default true)
23
+ - POOL_CONCURRENCY (default 8) -> concurrent page navigations
24
+ - PARSER_WORKERS (default 4) -> threadpool for BeautifulSoup parsing
25
+ - REQUEST_TIMEOUT_MS (default 20000) -> navigation timeout per page
26
+ """
27
+
28
  from __future__ import annotations
29
  import os
 
 
 
30
  import asyncio
31
+ import logging
32
+ import json
33
+ import time
34
+ import math
35
+ from typing import Optional, Dict, Any, List, Tuple
36
  from urllib.parse import quote_plus, urljoin
37
 
38
+ from fastapi import FastAPI, Query, Body, HTTPException
39
+ from fastapi.responses import JSONResponse, StreamingResponse
40
  from pydantic import BaseModel
 
41
 
 
 
 
 
 
42
  from bs4 import BeautifulSoup
43
+ import concurrent.futures
44
+
45
+ # Playwright async
46
+ from playwright.async_api import async_playwright, Playwright, Browser, BrowserContext, Page, Request as PWRequest, Error as PWError
47
+
48
+ # ---------------------
49
+ # Config / Logging
50
+ # ---------------------
51
+ LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO").upper()
52
+ logging.basicConfig(level=LOG_LEVEL, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
53
+ logger = logging.getLogger("fast_playwright_fetcher")
54
+
55
+ PLAYWRIGHT_HEADLESS = os.environ.get("PLAYWRIGHT_HEADLESS", "true").lower() in ("1", "true", "yes")
56
+ POOL_CONCURRENCY = int(os.environ.get("POOL_CONCURRENCY", "8")) # parallel navigations
57
+ PARSER_WORKERS = int(os.environ.get("PARSER_WORKERS", "4")) # threadpool size for BS parsing
58
+ REQUEST_TIMEOUT_MS = int(os.environ.get("REQUEST_TIMEOUT_MS", "20000")) # navigation timeout in ms
59
+ PAGE_LOAD_WAIT = os.environ.get("PAGE_LOAD_WAIT", "domcontentloaded") # "domcontentloaded" or "load"
60
+
61
+ # set of resource URL substrings to block (ads, analytics, fonts, etc.)
62
+ BLOCK_PATTERNS = [
63
+ "doubleclick.net", "google-analytics.com", "googlesyndication.com",
64
+ "adservice.google.com", "googletagmanager.com", "facebook.com",
65
+ "fonts.googleapis.com", "gstatic.com", "analytics.twitter.com",
66
+ ".woff", ".woff2", ".ttf", ".otf", "font.gstatic.com",
67
+ ]
68
+
69
+ # ---------------------
70
+ # Parsing helper (threadpool)
71
+ # ---------------------
72
+ PARSE_POOL = concurrent.futures.ThreadPoolExecutor(max_workers=PARSER_WORKERS)
73
+
74
+ def extract_data_sync(html: str) -> Dict[str, Any]:
75
+ """
76
+ Synchronous BeautifulSoup parsing (will be run inside PARSE_POOL).
77
+ Re-uses your previous EXTRACT_DATA logic (simplified/robust).
78
+ """
79
+ soup = BeautifulSoup(html or "", "html.parser")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  BASE_URL = "https://www.google.com"
81
 
82
  def safe_text(el):
 
86
  return el.get(attr) if el and el.has_attr(attr) else ""
87
 
88
  def abs_url(url):
89
+ try:
90
+ return urljoin(BASE_URL, url) if url else ""
91
+ except Exception:
92
+ return url or ""
93
 
94
  def clean_thumb(src):
95
  if src and not src.startswith("data:"):
96
  return abs_url(src)
97
  return None
98
 
99
+ # web results
 
 
 
 
 
100
  web_results = []
101
  for result in soup.select(".tF2Cxc"):
 
 
102
  title_tag = result.select_one("h3")
103
  link_tag = result.select_one("a")
104
  cite_tag = result.select_one("cite")
105
  snippet_tag = result.select_one(".VwiC3b")
 
 
106
  if title_tag and link_tag:
107
+ web_results.append({
 
108
  "title": safe_text(title_tag),
109
  "link": abs_url(safe_attr(link_tag, "href")),
110
  "displayed_url": safe_text(cite_tag),
111
  "snippet": safe_text(snippet_tag)
112
+ })
 
 
 
 
 
 
 
 
113
 
114
+ # images
115
  image_results = []
116
  for img_item in soup.select(".eA0Zlc"):
117
  img_tag = img_item.select_one("img")
118
  link_tag = img_item.select_one("a")
 
119
  src = safe_attr(img_tag, "data-src") or safe_attr(img_tag, "src")
120
  thumb = clean_thumb(src)
121
  if thumb:
122
+ image_results.append({"thumbnail": thumb, "alt": safe_attr(img_tag, "alt"), "link": abs_url(safe_attr(link_tag, "href"))})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
+ # videos/news/knowledge panels omitted for brevity but can be added similarly
125
  data = {}
126
  if web_results:
127
  data["web_results"] = web_results
128
  if image_results:
129
  data["image_results"] = image_results
 
 
 
 
 
 
 
 
 
 
130
 
131
+ # collect thumbnails
132
+ thumbnails = set()
133
+ for img in soup.select("img[data-src], img[src]"):
134
+ src = safe_attr(img, "data-src") or safe_attr(img, "src")
135
+ thumb = clean_thumb(src)
136
+ if thumb:
137
+ thumbnails.add(thumb)
138
+ if thumbnails:
139
+ data["all_thumbnail_urls"] = sorted(thumbnails)
140
 
141
+ return data
142
 
143
+ async def extract_data(html: str) -> Dict[str, Any]:
144
+ loop = asyncio.get_running_loop()
145
+ return await loop.run_in_executor(PARSE_POOL, extract_data_sync, html)
 
 
 
 
146
 
147
+ # ---------------------
148
+ # FastAPI + Playwright globals
149
+ # ---------------------
150
+ app = FastAPI(title="fast_playwright_fetcher", version="0.1")
151
+ PLAY: Optional[Playwright] = None
152
+ BROWSER: Optional[Browser] = None
153
 
154
+ # Semaphore to limit concurrency across endpoints
155
+ CONCURRENCY_SEMAPHORE = asyncio.Semaphore(POOL_CONCURRENCY)
 
 
 
 
156
 
157
+ # Simple TTL cache (in-memory)
158
  class SimpleTTLCache:
159
  def __init__(self, ttl_seconds: int = 20):
160
  self.ttl = ttl_seconds
161
+ self.store: Dict[str, Tuple[float, Any]] = {}
162
+ self._lock = asyncio.Lock()
163
 
164
+ async def get(self, key: str):
165
+ async with self._lock:
166
+ item = self.store.get(key)
167
  if not item:
168
  return None
169
  ts, value = item
170
  if time.time() - ts > self.ttl:
171
+ del self.store[key]
172
  return None
173
  return value
174
 
175
+ async def set(self, key: str, value: Any):
176
+ async with self._lock:
177
+ self.store[key] = (time.time(), value)
178
+
179
+ CACHE = SimpleTTLCache(ttl_seconds=int(os.environ.get("CACHE_TTL", "25")))
180
+
181
+ # ---------------------
182
+ # Playwright helpers
183
+ # ---------------------
184
+ async def start_playwright():
185
+ global PLAY, BROWSER
186
+ if PLAY is not None:
187
+ return
188
+ PLAY = await async_playwright().start()
189
+ BROWSER = await PLAY.chromium.launch(headless=PLAYWRIGHT_HEADLESS,
190
+ args=["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"])
191
+ logger.info("Playwright started (headless=%s)", PLAYWRIGHT_HEADLESS)
192
+
193
+ async def stop_playwright():
194
+ global PLAY, BROWSER
195
+ if BROWSER:
196
+ try:
197
+ await BROWSER.close()
198
+ except Exception:
199
+ pass
200
+ BROWSER = None
201
+ if PLAY:
202
+ try:
203
+ await PLAY.stop()
204
+ except Exception:
205
+ pass
206
+ PLAY = None
207
+ logger.info("Playwright stopped.")
208
 
209
+ def _should_block_request(url: str) -> bool:
210
+ if not url:
211
+ return False
212
+ u = url.lower()
213
+ for pat in BLOCK_PATTERNS:
214
+ if pat in u:
215
+ return True
216
+ return False
217
+
218
+ async def _fetch_with_context(url: str, wait_until: str = "domcontentloaded", timeout_ms: int = REQUEST_TIMEOUT_MS, user_agent: Optional[str] = None) -> str:
219
+ """
220
+ Create an isolated browser context, block unwanted requests, navigate, get HTML.
221
+ This is efficient: creating small contexts is cheap compared to full browser processes.
222
+ """
223
+ if BROWSER is None:
224
+ raise RuntimeError("Browser not started")
225
+
226
+ # create context with minimal resources
227
+ context: BrowserContext = await BROWSER.new_context(user_agent=user_agent or "Mozilla/5.0 (Playwright)", viewport={"width": 1366, "height": 768})
228
+ page: Page = await context.new_page()
229
+
230
+ # intercept requests to block common resources
231
+ async def route_handler(route, request: PWRequest):
232
+ try:
233
+ if _should_block_request(request.url):
234
+ await route.abort()
235
+ else:
236
+ await route.continue_()
237
+ except Exception:
238
+ # if anything fails, let the request through
239
+ try:
240
+ await route.continue_()
241
+ except Exception:
242
+ pass
243
 
244
+ try:
245
+ await page.route("**/*", route_handler)
246
+ except Exception:
247
+ # route might fail if the browser doesn't support it; ignore
248
+ pass
249
+
250
+ # navigate with retries/backoff
251
+ max_attempts = 3
252
+ backoff_base = 0.2
253
+ last_exc = None
254
+ for attempt in range(1, max_attempts + 1):
255
+ try:
256
+ # Playwright's goto uses wait_until values like "load" or "domcontentloaded"
257
+ await page.goto(url, wait_until=wait_until, timeout=timeout_ms)
258
+ content = await page.content()
259
+ return content
260
+ except PWError as e:
261
+ last_exc = e
262
+ logger.warning("Playwright navigation error (attempt %d/%d) for %s : %s", attempt, max_attempts, url, str(e))
263
+ # small backoff
264
+ await asyncio.sleep(backoff_base * attempt)
265
+ # if fatal, break and raise after attempts
266
+ except Exception as e:
267
+ last_exc = e
268
+ logger.exception("Unexpected navigation error (attempt %d/%d) for %s", attempt, max_attempts, url)
269
+ await asyncio.sleep(backoff_base * attempt)
270
+
271
+ # cleanup
272
+ try:
273
+ await page.close()
274
+ except Exception:
275
+ pass
276
+ try:
277
+ await context.close()
278
+ except Exception:
279
+ pass
280
+ raise RuntimeError(f"Failed to fetch {url} after {max_attempts} attempts: {last_exc}")
281
+
282
+ # ---------------------
283
+ # App startup/shutdown
284
+ # ---------------------
285
@app.on_event("startup")
async def on_startup():
    """Boot the shared Playwright browser and pre-warm one context.

    The warm-up fetch of about:blank is strictly best-effort: any failure is
    swallowed so startup never blocks on it.
    """
    await start_playwright()
    try:
        async with CONCURRENCY_SEMAPHORE:
            await _fetch_with_context("about:blank")
    except Exception:
        # Warm-up is an optimization, not a requirement.
        pass
    logger.info(
        "App startup complete. concurrency=%d parser_workers=%d",
        POOL_CONCURRENCY,
        PARSER_WORKERS,
    )
295
 
296
@app.on_event("shutdown")
async def on_shutdown():
    """Tear down the Playwright browser and release the parser thread pool."""
    await stop_playwright()
    # Do not wait on in-flight parse jobs; the process is exiting anyway.
    PARSE_POOL.shutdown(wait=False)
300
+
301
+ # ---------------------
302
+ # Blocking helper wrapper
303
+ # ---------------------
304
async def fetch_and_extract(url: str, wait_until: str = PAGE_LOAD_WAIT, timeout_ms: int = REQUEST_TIMEOUT_MS, user_agent: Optional[str] = None) -> Dict[str, Any]:
    """Fetch *url* under the global concurrency cap and return extracted data.

    Flow: acquire CONCURRENCY_SEMAPHORE -> render the page via Playwright ->
    hand the HTML to the BeautifulSoup extractor (which runs in the thread pool).
    Returns a dict of the form {"url": ..., "data": ...}.
    """
    async with CONCURRENCY_SEMAPHORE:
        html = await _fetch_with_context(
            url,
            wait_until=wait_until,
            timeout_ms=timeout_ms,
            user_agent=user_agent,
        )
        parsed = await extract_data(html)
    return {"url": url, "data": parsed}
312
+
313
+ # ---------------------
314
+ # Request models
315
+ # ---------------------
316
class SearchRequest(BaseModel):
    """Body for POST /search: supply either a free-text query or a direct URL."""

    # Free-text search term; turned into a Google search URL when `url` is absent.
    query: Optional[str] = None
    # Direct target URL; takes precedence over `query` when both are given.
    url: Optional[str] = None
319
 
320
+ # ---------------------
321
+ # Endpoints
322
+ # ---------------------
323
@app.get("/health")
async def health():
    """Liveness probe: always reports the service as up."""
    return {"status": "ok"}
326
 
327
@app.get("/fetch")
async def fetch(url: str = Query(..., min_length=5), fast: Optional[bool] = Query(True)):
    """Fetch a URL and return its extracted data.

    With fast=true (the default) navigation waits only for domcontentloaded;
    fast=false waits for the full load event.
    """
    strategy = "load"
    if fast:
        strategy = "domcontentloaded"
    try:
        payload = await fetch_and_extract(url, wait_until=strategy)
    except Exception as exc:
        logger.exception("Fetch error for %s", url)
        raise HTTPException(status_code=500, detail=str(exc))
    return JSONResponse(content=payload)
339
+
340
@app.get("/search")
async def search(query: str = Query(..., min_length=1), fast: Optional[bool] = Query(True)):
    """Run a Google search for *query*, serving from the cache when possible."""
    term = query.strip()
    if not term:
        raise HTTPException(status_code=400, detail="query required")

    target = f"https://www.google.com/search?q={quote_plus(term)}"
    # Cache key includes the fast flag since it changes what gets rendered.
    key = f"search:{term}:{fast}"

    hit = await CACHE.get(key)
    if hit:
        return JSONResponse(content={"cached": True, **hit})

    wait_mode = "domcontentloaded" if fast else "load"
    try:
        fresh = await fetch_and_extract(target, wait_until=wait_mode)
        await CACHE.set(key, fresh)
        return JSONResponse(content={"cached": False, **fresh})
    except Exception as exc:
        logger.exception("Search error for %s", term)
        raise HTTPException(status_code=500, detail=str(exc))
 
 
 
 
 
 
 
359
 
360
@app.post("/search")
async def post_search(body: SearchRequest = Body(...)):
    """POST variant of /search: accepts either a raw URL or a query string."""
    if not (body.query or body.url):
        raise HTTPException(status_code=400, detail="Either query or url must be provided")
    # A direct URL wins; otherwise build a Google search URL from the query.
    if body.url:
        target = body.url
    else:
        target = f"https://www.google.com/search?q={quote_plus(body.query)}"
    try:
        result = await fetch_and_extract(target, wait_until=PAGE_LOAD_WAIT)
    except Exception as exc:
        logger.exception("Post search error for %s", target)
        raise HTTPException(status_code=500, detail=str(exc))
    return JSONResponse(content=result)
371
+
372
+ # ---------------------
373
+ # Streaming / multi-page endpoints
374
+ # ---------------------
375
@app.get("/search_pages")
async def search_pages(
    query: str = Query(..., min_length=1),
    pages: int = Query(3, ge=1, le=50),
    concurrency: Optional[int] = Query(None, ge=1),
    ordered: Optional[bool] = Query(False),
    fast: Optional[bool] = Query(True),
):
    """
    Stream NDJSON results, one line per Google result page (start=0,10,20,...).

    Parameters:
      query: search term (required, non-empty after stripping).
      pages: number of result pages to fetch (clamped to 1-50).
      concurrency: per-request cap on simultaneous page fetches; the global
        POOL_CONCURRENCY semaphore still applies underneath.
      ordered: when true, emit pages in page order; otherwise fastest-first.
      fast: wait only for domcontentloaded instead of the full load event.
    """
    q = query.strip()
    if not q:
        raise HTTPException(status_code=400, detail="query required")

    pages = min(max(1, pages), 50)
    items: List[Tuple[int, str]] = [
        (i + 1, f"https://www.google.com/search?q={quote_plus(q)}&start={i * 10}")
        for i in range(pages)
    ]

    # Fix: the `concurrency` parameter used to be computed into a local and
    # then ignored. Gate each page task with a per-request semaphore so it
    # actually bounds how many fetches run at once for this request.
    gate = asyncio.Semaphore(concurrency or POOL_CONCURRENCY)

    async def _gated(page_no: int, url: str):
        async with gate:
            return await _search_page_task(page_no, url, fast)

    task_list = [asyncio.create_task(_gated(page_no, url)) for page_no, url in items]

    async def streamer():
        try:
            if ordered:
                # Preserve page order even if later pages finish first.
                for t in task_list:
                    res = await t
                    yield (json.dumps(res, ensure_ascii=False) + "\n").encode("utf-8")
            else:
                # Emit each page as soon as it completes.
                for fut in asyncio.as_completed(task_list):
                    res = await fut
                    yield (json.dumps(res, ensure_ascii=False) + "\n").encode("utf-8")
        finally:
            # Client disconnect or error: cancel whatever is still running.
            for t in task_list:
                if not t.done():
                    t.cancel()

    return StreamingResponse(streamer(), media_type="application/x-ndjson")
424
+
425
async def _search_page_task(page_no: int, url: str, fast: bool):
    """Fetch one search-result page and wrap the outcome in a JSON-safe dict.

    Never raises: failures are reported as {"ok": False, "error": ...} so a
    single bad page does not abort a multi-page stream or aggregate.
    """
    loop = asyncio.get_running_loop()
    started = loop.time()
    try:
        wait_until = "domcontentloaded" if fast else "load"
        result = await fetch_and_extract(url, wait_until=wait_until)
        # Fix: `duration` was always emitted as None; report the actual
        # wall-clock fetch time (seconds, event-loop monotonic clock).
        elapsed = round(loop.time() - started, 3)
        return {"page": page_no, "url": url, "ok": True, "duration": elapsed, "data": result.get("data")}
    except Exception as e:
        logger.exception("Error fetching page %d (%s): %s", page_no, url, e)
        return {"page": page_no, "url": url, "ok": False, "error": str(e)}
433
+
434
@app.get("/search_pages_aggregate")
async def search_pages_aggregate(
    query: str = Query(..., min_length=1),
    pages: int = Query(3, ge=1, le=50),
    concurrency: Optional[int] = Query(None, ge=1),
    fast: Optional[bool] = Query(True),
):
    """Fetch several Google result pages and return them all in one JSON body.

    Like /search_pages but non-streaming: waits for every page, then responds
    with {"pages": [...]}. `concurrency` caps simultaneous fetches for this
    request (the global semaphore still applies underneath).
    """
    q = query.strip()
    if not q:
        raise HTTPException(status_code=400, detail="query required")

    pages = min(max(1, pages), 50)
    items = [
        (i + 1, f"https://www.google.com/search?q={quote_plus(q)}&start={i * 10}")
        for i in range(pages)
    ]

    # Fix: the `concurrency` parameter used to be accepted but ignored.
    gate = asyncio.Semaphore(concurrency or POOL_CONCURRENCY)

    async def _gated(pno: int, url: str):
        async with gate:
            return await _search_page_task(pno, url, fast)

    tasks = [asyncio.create_task(_gated(pno, url)) for pno, url in items]
    # _search_page_task never raises, so gather needs no exception handling.
    results = await asyncio.gather(*tasks)
    return JSONResponse(content={"pages": results})
454
+
455
+ # ---------------------
456
+ # Run notes:
457
+ # ---------------------
458
+ # start server:
459
+ # uvicorn app:app --host 0.0.0.0 --port 8000 --workers 1
460
+ #
461
+ # Important: use a single uvicorn worker (playwright and the asyncio loop should be in one process).
462
+ # You can scale horizontally by running multiple instances behind a load balancer if needed.
463
+ #
464
+ # ---------------------