AdarshJi committed on
Commit
b7b7249
·
verified ·
1 Parent(s): 38c1c8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +494 -399
app.py CHANGED
@@ -1,78 +1,270 @@
1
- # app_playwright_xvfb.py
2
- """
3
- Playwright FastAPI fetcher (headful with Xvfb support).
4
-
5
- - Default behavior: HEADLESS = False (i.e., run "headful")
6
- - If HEADLESS=false and DISPLAY is empty, this starts Xvfb via pyvirtualdisplay.
7
- - Endpoints:
8
- /health, /fetch, /search, POST /search, /search_pages (NDJSON), /search_pages_aggregate
9
-
10
- Requirements:
11
- pip install fastapi uvicorn playwright beautifulsoup4 pydantic pyvirtualdisplay
12
- python -m playwright install chromium
13
-
14
- Note: On Hugging Face Spaces you may need the system package 'xvfb' available.
15
- """
16
  from __future__ import annotations
17
  import os
18
- import asyncio
19
- import logging
20
- import json
21
  import time
22
- from typing import Optional, Dict, Any, List, Tuple
 
 
 
 
23
  from urllib.parse import quote_plus, urljoin
24
- import concurrent.futures
25
 
26
- from fastapi import FastAPI, Query, Body, HTTPException
27
- from fastapi.responses import JSONResponse, StreamingResponse
28
  from pydantic import BaseModel
 
29
 
 
 
 
 
 
30
  from bs4 import BeautifulSoup
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- # Playwright async imports
33
- from playwright.async_api import async_playwright, Playwright, Browser, BrowserContext, Page, Request as PWRequest, Error as PWError
34
-
35
- # Try to import pyvirtualdisplay; we'll conditionally use it
36
- try:
37
- from pyvirtualdisplay import Display as XvfbDisplay
38
- except Exception:
39
- XvfbDisplay = None
40
-
41
- # ---------------------
42
- # Config / Logging
43
- # ---------------------
44
- LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO").upper()
45
- logging.basicConfig(level=LOG_LEVEL, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
46
- logger = logging.getLogger("fast_playwright_fetcher_xvfb")
47
-
48
- # By default we set headless=false per your request. You can override with env var PLAYWRIGHT_HEADLESS=true
49
- PLAYWRIGHT_HEADLESS = os.environ.get("PLAYWRIGHT_HEADLESS", "false").lower() in ("1", "true", "yes")
50
-
51
- # Concurrency and parser pool
52
- POOL_CONCURRENCY = int(os.environ.get("POOL_CONCURRENCY", "8")) # concurrent navigations
53
- PARSER_WORKERS = int(os.environ.get("PARSER_WORKERS", "4")) # BeautifulSoup threadpool
54
- REQUEST_TIMEOUT_MS = int(os.environ.get("REQUEST_TIMEOUT_MS", "20000"))
55
- PAGE_LOAD_WAIT = os.environ.get("PAGE_LOAD_WAIT", "domcontentloaded") # "domcontentloaded" or "load"
56
-
57
- # Block patterns to avoid loading ads/fonts
58
- BLOCK_PATTERNS = [
59
- "doubleclick.net", "google-analytics.com", "googlesyndication.com",
60
- "adservice.google.com", "googletagmanager.com", "facebook.com",
61
- "fonts.googleapis.com", "gstatic.com", "analytics.twitter.com",
62
- ".woff", ".woff2", ".ttf", ".otf", "font.gstatic.com",
63
- ]
64
-
65
- # Virtual display object; will be created at startup if needed.
66
- XVFB: Optional[XvfbDisplay] = None
67
-
68
- # ---------------------
69
- # Parser threadpool
70
- # ---------------------
71
- PARSE_POOL = concurrent.futures.ThreadPoolExecutor(max_workers=PARSER_WORKERS)
72
-
73
- def extract_data_sync(html: str) -> Dict[str, Any]:
74
- """Synchronous BeautifulSoup extraction (run in PARSE_POOL)."""
75
- soup = BeautifulSoup(html or "", "html.parser")
76
  BASE_URL = "https://www.google.com"
77
 
78
  def safe_text(el):
@@ -82,408 +274,311 @@ def extract_data_sync(html: str) -> Dict[str, Any]:
82
  return el.get(attr) if el and el.has_attr(attr) else ""
83
 
84
  def abs_url(url):
85
- try:
86
- return urljoin(BASE_URL, url) if url else ""
87
- except Exception:
88
- return url or ""
89
 
90
  def clean_thumb(src):
91
  if src and not src.startswith("data:"):
92
  return abs_url(src)
93
  return None
94
 
95
- # web results
 
 
 
 
 
96
  web_results = []
97
  for result in soup.select(".tF2Cxc"):
 
 
98
  title_tag = result.select_one("h3")
99
  link_tag = result.select_one("a")
100
  cite_tag = result.select_one("cite")
101
  snippet_tag = result.select_one(".VwiC3b")
 
 
102
  if title_tag and link_tag:
103
- web_results.append({
 
104
  "title": safe_text(title_tag),
105
  "link": abs_url(safe_attr(link_tag, "href")),
106
  "displayed_url": safe_text(cite_tag),
107
  "snippet": safe_text(snippet_tag)
108
- })
 
 
 
 
 
 
 
 
109
 
110
  image_results = []
111
  for img_item in soup.select(".eA0Zlc"):
112
  img_tag = img_item.select_one("img")
113
  link_tag = img_item.select_one("a")
 
114
  src = safe_attr(img_tag, "data-src") or safe_attr(img_tag, "src")
115
  thumb = clean_thumb(src)
116
  if thumb:
117
- image_results.append({"thumbnail": thumb, "alt": safe_attr(img_tag, "alt"), "link": abs_url(safe_attr(link_tag, "href"))})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  data = {}
120
  if web_results:
121
  data["web_results"] = web_results
122
  if image_results:
123
  data["image_results"] = image_results
124
-
125
- thumbnails = set()
126
- for img in soup.select("img[data-src], img[src]"):
127
- src = safe_attr(img, "data-src") or safe_attr(img, "src")
128
- thumb = clean_thumb(src)
129
- if thumb:
130
- thumbnails.add(thumb)
131
- if thumbnails:
132
- data["all_thumbnail_urls"] = sorted(thumbnails)
 
133
 
134
  return data
135
 
136
- async def extract_data(html: str) -> Dict[str, Any]:
137
- loop = asyncio.get_running_loop()
138
- return await loop.run_in_executor(PARSE_POOL, extract_data_sync, html)
139
 
140
- # ---------------------
141
- # FastAPI + Playwright globals
142
- # ---------------------
143
- app = FastAPI(title="fast_playwright_fetcher_xvfb", version="0.2")
144
- PLAY: Optional[Playwright] = None
145
- BROWSER: Optional[Browser] = None
 
146
 
147
- # global concurrency semaphore
148
- CONCURRENCY_SEMAPHORE = asyncio.Semaphore(POOL_CONCURRENCY)
 
 
 
 
 
 
 
 
 
 
149
 
150
- # simple in-memory async TTL cache
151
  class SimpleTTLCache:
152
  def __init__(self, ttl_seconds: int = 20):
153
  self.ttl = ttl_seconds
154
- self.store: Dict[str, Tuple[float, Any]] = {}
155
- self._lock = asyncio.Lock()
156
 
157
- async def get(self, key: str):
158
- async with self._lock:
159
- item = self.store.get(key)
160
  if not item:
161
  return None
162
  ts, value = item
163
  if time.time() - ts > self.ttl:
164
- del self.store[key]
165
  return None
166
  return value
167
 
168
- async def set(self, key: str, value: Any):
169
- async with self._lock:
170
- self.store[key] = (time.time(), value)
171
-
172
- CACHE = SimpleTTLCache(ttl_seconds=int(os.environ.get("CACHE_TTL", "25")))
173
-
174
- # ---------------------
175
- # Xvfb helper
176
- # ---------------------
177
- def start_xvfb_if_needed():
178
- """Start Xvfb if headful mode and no DISPLAY present."""
179
- global XVFB
180
- if PLAYWRIGHT_HEADLESS:
181
- logger.info("PLAYWRIGHT_HEADLESS=true — not starting Xvfb.")
182
- return
183
-
184
- # If user requested headful but DISPLAY is set, assume display exists
185
- if os.environ.get("DISPLAY"):
186
- logger.info("DISPLAY is already set (%s) — not starting Xvfb.", os.environ.get("DISPLAY"))
187
- return
188
-
189
- if XvfbDisplay is None:
190
- # pyvirtualdisplay not installed
191
- logger.error("pyvirtualdisplay not installed. Please `pip install pyvirtualdisplay` and ensure Xvfb is available.")
192
- raise RuntimeError("pyvirtualdisplay missing and HEADLESS=false requires Xvfb on headless servers.")
193
-
194
- try:
195
- # Default size — adjust via env if needed
196
- width = int(os.environ.get("XVFB_WIDTH", "1366"))
197
- height = int(os.environ.get("XVFB_HEIGHT", "768"))
198
- logger.info("Starting Xvfb (width=%d height=%d)...", width, height)
199
- XVFB = XvfbDisplay(visible=0, size=(width, height))
200
- XVFB.start()
201
- logger.info("Xvfb started. DISPLAY=%s", os.environ.get("DISPLAY"))
202
- except Exception as e:
203
- logger.exception("Failed to start Xvfb: %s", e)
204
- raise
205
-
206
- def stop_xvfb_if_started():
207
- global XVFB
208
- if XVFB:
209
- try:
210
- XVFB.stop()
211
- logger.info("Xvfb stopped.")
212
- except Exception:
213
- pass
214
- XVFB = None
215
-
216
- # ---------------------
217
- # Playwright helpers
218
- # ---------------------
219
- async def start_playwright():
220
- global PLAY, BROWSER
221
- if PLAY is not None:
222
- return
223
- PLAY = await async_playwright().start()
224
- # when headful we still want a minimal set of args
225
- browser_args = ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"]
226
- BROWSER = await PLAY.chromium.launch(headless=PLAYWRIGHT_HEADLESS, args=browser_args)
227
- logger.info("Playwright browser launched (headless=%s).", PLAYWRIGHT_HEADLESS)
228
-
229
- async def stop_playwright():
230
- global PLAY, BROWSER
231
- if BROWSER:
232
- try:
233
- await BROWSER.close()
234
- except Exception:
235
- pass
236
- BROWSER = None
237
- if PLAY:
238
- try:
239
- await PLAY.stop()
240
- except Exception:
241
- pass
242
- PLAY = None
243
- logger.info("Playwright stopped.")
244
 
245
- def _should_block_request(url: str) -> bool:
246
- if not url:
247
- return False
248
- u = url.lower()
249
- for pat in BLOCK_PATTERNS:
250
- if pat in u:
251
- return True
252
- return False
253
-
254
- async def _fetch_with_context(url: str, wait_until: str = "domcontentloaded", timeout_ms: int = REQUEST_TIMEOUT_MS, user_agent: Optional[str] = None) -> str:
255
- """
256
- Create a context + page, block unwanted requests, navigate, return HTML.
257
- """
258
- if BROWSER is None:
259
- raise RuntimeError("Browser not started")
260
-
261
- context: BrowserContext = await BROWSER.new_context(user_agent=user_agent or "Mozilla/5.0 (Playwright)", viewport={"width": 1366, "height": 768})
262
- page: Page = await context.new_page()
263
-
264
- async def route_handler(route, request: PWRequest):
265
- try:
266
- if _should_block_request(request.url):
267
- await route.abort()
268
- else:
269
- await route.continue_()
270
- except Exception:
271
- try:
272
- await route.continue_()
273
- except Exception:
274
- pass
275
 
276
- try:
277
- await page.route("**/*", route_handler)
278
- except Exception:
279
- # routing might fail in some environments; continue
280
- pass
281
 
282
- max_attempts = 3
283
- backoff_base = 0.2
284
- last_exc = None
285
- for attempt in range(1, max_attempts + 1):
286
- try:
287
- await page.goto(url, wait_until=wait_until, timeout=timeout_ms)
288
- content = await page.content()
289
- # close context & page gracefully
290
- try:
291
- await page.close()
292
- except Exception:
293
- pass
294
- try:
295
- await context.close()
296
- except Exception:
297
- pass
298
- return content
299
- except PWError as e:
300
- last_exc = e
301
- logger.warning("Playwright navigation error (attempt %d/%d) for %s : %s", attempt, max_attempts, url, str(e))
302
- await asyncio.sleep(backoff_base * attempt)
303
- except Exception as e:
304
- last_exc = e
305
- logger.exception("Unexpected navigation error (attempt %d/%d) for %s", attempt, max_attempts, url)
306
- await asyncio.sleep(backoff_base * attempt)
307
-
308
- # ensure cleanup
309
- try:
310
- await page.close()
311
- except Exception:
312
- pass
313
- try:
314
- await context.close()
315
- except Exception:
316
- pass
317
- raise RuntimeError(f"Failed to fetch {url} after {max_attempts} attempts: {last_exc}")
318
-
319
- # ---------------------
320
- # Startup / Shutdown
321
- # ---------------------
322
  @app.on_event("startup")
323
- async def on_startup():
324
- # If running headful (PLAYWRIGHT_HEADLESS == False), ensure an X display is present
325
- if not PLAYWRIGHT_HEADLESS:
326
- start_xvfb_if_needed()
327
-
328
- # start playwright browser
329
- await start_playwright()
330
-
331
- # small pre-warm
332
- try:
333
- async with CONCURRENCY_SEMAPHORE:
334
- await _fetch_with_context("about:blank")
335
- except Exception:
336
- pass
337
-
338
- logger.info("Startup complete: headless=%s concurrency=%d parser_workers=%d", PLAYWRIGHT_HEADLESS, POOL_CONCURRENCY, PARSER_WORKERS)
339
 
340
  @app.on_event("shutdown")
341
- async def on_shutdown():
342
- await stop_playwright()
343
- stop_xvfb_if_started()
344
- PARSE_POOL.shutdown(wait=False)
345
- logger.info("Shutdown complete.")
346
-
347
- # ---------------------
348
- # Fetch wrapper that uses concurrency semaphore and parser threadpool
349
- # ---------------------
350
- async def fetch_and_extract(url: str, wait_until: str = PAGE_LOAD_WAIT, timeout_ms: int = REQUEST_TIMEOUT_MS, user_agent: Optional[str] = None) -> Dict[str, Any]:
351
- async with CONCURRENCY_SEMAPHORE:
352
- html = await _fetch_with_context(url, wait_until=wait_until, timeout_ms=timeout_ms, user_agent=user_agent)
353
- data = await extract_data(html)
354
- return {"url": url, "data": data}
355
-
356
- # ---------------------
357
- # Request models and endpoints
358
- # ---------------------
359
- class SearchRequest(BaseModel):
360
- query: Optional[str] = None
361
- url: Optional[str] = None
362
 
363
  @app.get("/health")
364
  async def health():
365
  return {"status": "ok"}
366
 
367
- @app.get("/fetch")
368
- async def fetch(url: str = Query(..., min_length=5), fast: Optional[bool] = Query(True)):
369
- wait_until = "domcontentloaded" if fast else "load"
370
- try:
371
- result = await fetch_and_extract(url, wait_until=wait_until)
372
- return JSONResponse(content=result)
373
- except Exception as e:
374
- logger.exception("Fetch error for %s", url)
375
- raise HTTPException(status_code=500, detail=str(e))
376
-
377
  @app.get("/search")
378
- async def search(query: str = Query(..., min_length=1), fast: Optional[bool] = Query(True)):
379
  q = query.strip()
380
  if not q:
381
- raise HTTPException(status_code=400, detail="query required")
 
382
  url = f"https://www.google.com/search?q={quote_plus(q)}"
383
- cache_key = f"search:{q}:{fast}"
384
- cached = await CACHE.get(cache_key)
 
385
  if cached:
386
  return JSONResponse(content={"cached": True, **cached})
387
- try:
388
- res = await fetch_and_extract(url, wait_until=("domcontentloaded" if fast else "load"))
389
- await CACHE.set(cache_key, res)
390
- return JSONResponse(content={"cached": False, **res})
391
- except Exception as e:
392
- logger.exception("Search error for %s", q)
393
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
394
 
395
  @app.post("/search")
396
  async def post_search(body: SearchRequest = Body(...)):
397
  if not (body.query or body.url):
398
  raise HTTPException(status_code=400, detail="Either query or url must be provided")
399
- target = body.url if body.url else f"https://www.google.com/search?q={quote_plus(body.query)}"
400
- try:
401
- res = await fetch_and_extract(target, wait_until=PAGE_LOAD_WAIT)
402
- return JSONResponse(content=res)
403
- except Exception as e:
404
- logger.exception("Post search error for %s", target)
405
- raise HTTPException(status_code=500, detail=str(e))
406
-
407
- @app.get("/search_pages")
408
- async def search_pages(
409
- query: str = Query(..., min_length=1),
410
- pages: int = Query(3, ge=1, le=50),
411
- concurrency: Optional[int] = Query(None, ge=1),
412
- ordered: Optional[bool] = Query(False),
413
- fast: Optional[bool] = Query(True),
414
- ):
415
- q = query.strip()
416
- if not q:
417
- raise HTTPException(status_code=400, detail="query required")
418
- pages = min(max(1, pages), 50)
419
- items: List[Tuple[int, str]] = []
420
- for i in range(pages):
421
- start = i * 10
422
- items.append((i + 1, f"https://www.google.com/search?q={quote_plus(q)}&start={start}"))
423
-
424
- loop = asyncio.get_running_loop()
425
- tasks = [loop.create_task(_search_page_task(pno, url, fast)) for pno, url in items]
426
 
427
- async def streamer():
428
- try:
429
- if ordered:
430
- for t in tasks:
431
- res = await t
432
- yield (json.dumps(res, ensure_ascii=False) + "\n").encode("utf-8")
433
- else:
434
- for fut in asyncio.as_completed(tasks):
435
- res = await fut
436
- yield (json.dumps(res, ensure_ascii=False) + "\n").encode("utf-8")
437
- finally:
438
- for t in tasks:
439
- if not t.done():
440
- try:
441
- t.cancel()
442
- except Exception:
443
- pass
444
 
445
- return StreamingResponse(streamer(), media_type="application/x-ndjson")
446
-
447
- async def _search_page_task(page_no: int, url: str, fast: bool):
448
- try:
449
- wait_until = "domcontentloaded" if fast else "load"
450
- result = await fetch_and_extract(url, wait_until=wait_until)
451
- return {"page": page_no, "url": url, "ok": True, "data": result.get("data")}
452
- except Exception as e:
453
- logger.exception("Error fetching page %d (%s): %s", page_no, url, e)
454
- return {"page": page_no, "url": url, "ok": False, "error": str(e)}
455
-
456
- @app.get("/search_pages_aggregate")
457
- async def search_pages_aggregate(
458
- query: str = Query(..., min_length=1),
459
- pages: int = Query(3, ge=1, le=50),
460
- concurrency: Optional[int] = Query(None, ge=1),
461
- fast: Optional[bool] = Query(True),
462
- ):
463
- q = query.strip()
464
- if not q:
465
- raise HTTPException(status_code=400, detail="query required")
466
- pages = min(max(1, pages), 50)
467
- items = []
468
- for i in range(pages):
469
- start = i * 10
470
- items.append((i + 1, f"https://www.google.com/search?q={quote_plus(q)}&start={start}"))
471
- tasks = [asyncio.create_task(_search_page_task(pno, url, fast)) for pno, url in items]
472
- results = await asyncio.gather(*tasks, return_exceptions=False)
473
- return JSONResponse(content={"pages": results})
474
-
475
- # ---------------------
476
- # Helpful notes
477
- # ---------------------
478
- # - Make sure you installed system package Xvfb in the environment (Hugging Face Spaces usually supports it,
479
- # otherwise install via apt: `apt-get update && apt-get install -y xvfb` when building the image).
480
- # - Install Python deps: pip install fastapi uvicorn playwright beautifulsoup4 pydantic pyvirtualdisplay
481
- # - Install Playwright browser binaries: python -m playwright install chromium
482
- # - Run the app: uvicorn app_playwright_xvfb:app --host 0.0.0.0 --port 8000 --workers 1
483
- #
484
- # On Hugging Face Spaces:
485
- # - Add required apt packages (xvfb) via your space's startup script or Dockerfile if necessary.
486
- # - Set environment variable PLAYWRIGHT_HEADLESS=false (optional: it's already defaulted to false here).
487
- # - Use a single uvicorn worker (workers=1). You can scale horizontally by running multiple space instances.
488
- #
489
- # ---------------------
 
1
+ # app.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from __future__ import annotations
3
  import os
 
 
 
4
  import time
5
+ import logging
6
+ import threading
7
+ import asyncio
8
+ from typing import Optional, Dict, Any, Tuple
9
+ from concurrent.futures import ThreadPoolExecutor
10
  from urllib.parse import quote_plus, urljoin
 
11
 
12
+ from fastapi import FastAPI, HTTPException, Query, Body
 
13
  from pydantic import BaseModel
14
+ from starlette.responses import JSONResponse
15
 
16
+ from selenium import webdriver
17
+ from selenium.webdriver.chrome.options import Options
18
+ from selenium.webdriver.chrome.service import Service
19
+ from selenium.common.exceptions import WebDriverException, SessionNotCreatedException
20
+ from webdriver_manager.chrome import ChromeDriverManager
21
  from bs4 import BeautifulSoup
22
+ from selenium.webdriver.common.by import By
23
+ from selenium.webdriver.support.ui import WebDriverWait
24
+ from selenium.webdriver.support import expected_conditions as EC
25
+ from selenium.common.exceptions import TimeoutException
26
+
27
+ # virtual display
28
+ from pyvirtualdisplay import Display
29
+
30
+ # Logging
31
+ logging.basicConfig(level=logging.INFO)
32
+ logger = logging.getLogger("fast_fetcher")
33
+
34
+ # ---------------- BrowserManager ----------------
35
# ---------------- BrowserManager ----------------
class BrowserManager:
    """Owns a single shared Chrome WebDriver plus an optional Xvfb display.

    The driver is started eagerly in ``__init__`` (with retries) and guarded by
    a lock so ``fetch_html`` can be called from multiple threads; pages are
    fetched sequentially through the one browser instance.
    """

    def __init__(
        self,
        headless: bool = True,
        user_agent: Optional[str] = None,
        window_size: str = "1366,768",
        disable_images: bool = True,
        block_resource_urls: Optional[list[str]] = None,
    ):
        self.headless = headless
        self.user_agent = user_agent
        # "width,height" string, parsed where needed.
        self.window_size = window_size
        self.disable_images = disable_images
        # URL patterns blocked via CDP (best-effort) to skip ad/analytics traffic.
        self.block_resource_urls = block_resource_urls or [
            "*.doubleclick.net/*",
            "*.google-analytics.com/*",
            "*.googlesyndication.com/*",
            "*.adservice.google.com/*",
        ]
        # Serializes all access to the single driver instance.
        self._driver_lock = threading.Lock()
        self._driver: Optional[webdriver.Chrome] = None
        # Xvfb display, only created when headless=False and no DISPLAY is set.
        self._display: Optional[Display] = None
        self._start_driver_with_retries()

    def _build_options(self) -> Options:
        """Build container-friendly Chrome options from the instance config."""
        opts = Options()
        # If CHROME_BIN is present, point to it
        chrome_bin = os.environ.get("CHROME_BIN", "/usr/bin/google-chrome-stable")
        if os.path.exists(chrome_bin):
            opts.binary_location = chrome_bin
            logger.debug("Using chrome binary: %s", chrome_bin)
        else:
            logger.warning("Chrome binary not found at %s (will rely on system/browser manager).", chrome_bin)

        if self.headless:
            # Both spellings are passed: modern Chrome honours --headless=new,
            # older builds only understand the bare --headless flag.
            # NOTE(review): presumably intentional as a compatibility fallback —
            # confirm against the target Chrome version.
            opts.add_argument("--headless=new")
            opts.add_argument("--headless")

        # container-friendly flags (and stable fallback)
        opts.add_argument("--no-sandbox")
        opts.add_argument("--disable-setuid-sandbox")
        opts.add_argument("--disable-dev-shm-usage")
        opts.add_argument("--disable-gpu")
        opts.add_argument("--disable-extensions")
        opts.add_argument("--disable-blink-features=AutomationControlled")
        opts.add_argument("--disable-software-rasterizer")
        opts.add_argument(f"--window-size={self.window_size}")
        opts.add_argument("--remote-debugging-port=0")

        if self.user_agent:
            opts.add_argument(f"--user-agent={self.user_agent}")

        if self.disable_images:
            # Skip images/stylesheets/fonts to speed up page loads.
            prefs = {
                "profile.managed_default_content_settings.images": 2,
                "profile.managed_default_content_settings.stylesheets": 2,
                "profile.managed_default_content_settings.fonts": 2,
            }
            opts.add_experimental_option("prefs", prefs)

        opts.add_experimental_option("excludeSwitches", ["enable-logging"])
        opts.add_experimental_option("useAutomationExtension", False)
        return opts

    def _start_driver_with_retries(self, attempts: int = 3, delay_seconds: float = 1.0):
        """Start the driver, retrying a few times before giving up.

        Raises:
            RuntimeError: if every attempt fails; chains the last exception.
        """
        last_exc = None
        for attempt in range(1, attempts + 1):
            try:
                logger.info("Starting Chrome driver (attempt %d/%d)...", attempt, attempts)
                self._start_driver()
                logger.info("Chrome driver started successfully.")
                return
            except Exception as exc:
                logger.exception("Failed to start driver on attempt %d: %s", attempt, exc)
                last_exc = exc
                time.sleep(delay_seconds)
        raise RuntimeError(f"Unable to start Chrome driver after {attempts} attempts: {last_exc}") from last_exc

    def _start_xvfb_if_needed(self):
        """Start an Xvfb virtual display when headful but no DISPLAY exists."""
        # If headless=False AND no DISPLAY, start Xvfb via pyvirtualdisplay
        if not self.headless and os.environ.get("DISPLAY", "") == "":
            try:
                logger.info("No DISPLAY found and headless=False — starting virtual X display (Xvfb).")
                # Parse "width,height" once instead of splitting the string twice.
                width, height = (int(part) for part in self.window_size.split(","))
                self._display = Display(visible=0, size=(width, height))
                self._display.start()
                logger.info("Virtual X display started (DISPLAY=%s).", os.environ.get("DISPLAY"))
            except Exception as e:
                logger.exception("Failed to start virtual display: %s", e)
                raise

    def _stop_xvfb_if_started(self):
        """Stop the Xvfb display if this instance started one (best-effort)."""
        if self._display:
            try:
                self._display.stop()
                logger.info("Virtual X display stopped.")
            except Exception:
                pass
            self._display = None

    def _quit_driver_only(self):
        """Quit and clear the driver without touching the Xvfb display."""
        if self._driver is not None:
            try:
                self._driver.quit()
            except Exception:
                pass
            self._driver = None

    def _start_driver(self):
        """Launch Chrome, trying Selenium Manager, webdriver-manager, then a system driver.

        Raises:
            RuntimeError: when all three launch strategies fail.
        """
        # start virtual display if required BEFORE launching Chrome
        self._start_xvfb_if_needed()

        opts = self._build_options()

        # 1) Try Selenium Manager (webdriver.Chrome(options=opts)). Selenium >=4.14 may use driver manager itself.
        primary_exc = None
        fallback_exc = None
        try:
            logger.debug("Attempting to start Chrome via Selenium Manager (webdriver.Chrome(options=opts))")
            self._driver = webdriver.Chrome(options=opts)
            # quick smoke test: ensure browser is responsive (may throw)
            try:
                self._driver.execute_script("return navigator.userAgent")
            except Exception as e:
                # browser started but died quickly
                raise RuntimeError("Browser started by Selenium Manager but crashed immediately.") from e

            self._post_start_setup()
            return
        except Exception as e_primary:
            primary_exc = e_primary
            logger.warning("Selenium Manager attempt failed: %s", e_primary)
            # BUGFIX: if the browser launched but failed the smoke test, quit it
            # before the fallback overwrites self._driver — otherwise the broken
            # Chrome process is orphaned and leaks.
            self._quit_driver_only()

        # 2) Fallback: use webdriver-manager to download driver and start with the explicit Service
        try:
            driver_path = ChromeDriverManager().install()
            logger.info("webdriver-manager installed chromedriver: %s", driver_path)
            try:
                os.chmod(driver_path, 0o755)
            except Exception:
                logger.debug("chmod on chromedriver failed or unnecessary.")

            service = Service(driver_path)
            self._driver = webdriver.Chrome(service=service, options=opts)
            self._post_start_setup()
            return
        except Exception as e_fallback:
            fallback_exc = e_fallback
            logger.exception("webdriver-manager fallback failed: %s", e_fallback)
            self._quit_driver_only()

        # 3) Final fallback: attempt system /usr/bin/chromedriver if available
        try:
            sys_path = "/usr/bin/chromedriver"
            if os.path.exists(sys_path):
                logger.info("Trying system chromedriver at %s", sys_path)
                try:
                    os.chmod(sys_path, 0o755)
                except Exception:
                    pass
                service = Service(sys_path)
                self._driver = webdriver.Chrome(service=service, options=opts)
                self._post_start_setup()
                return
        except Exception as e_sys:
            logger.exception("System chromedriver attempt failed: %s", e_sys)
            self._quit_driver_only()

        # If all failed, stop virtual display (if started) and raise a helpful error
        self._stop_xvfb_if_started()
        # Include both primary and fallback messages in the raised exception
        raise RuntimeError(f"Failed to start Chrome driver. primary_error={primary_exc}, fallback_error={fallback_exc}")

    def _post_start_setup(self):
        """Best-effort post-launch tuning: page-load timeout and CDP URL blocking."""
        try:
            self._driver.set_page_load_timeout(60)
            # best-effort CDP network blocking
            try:
                self._driver.execute_cdp_cmd("Network.enable", {})
                if self.block_resource_urls:
                    self._driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": self.block_resource_urls})
            except Exception:
                pass
        except Exception:
            pass

    def fetch_html(
        self,
        url: str,
        wait_seconds: Optional[float] = 10.0,
        wait_for_selector: Optional[str] = None,
    ) -> str:
        """Navigate to ``url`` and return the page source.

        Args:
            url: Target URL.
            wait_seconds: Max wait for the selector / readyState (falsy → no wait).
            wait_for_selector: CSS selector to wait for; when absent, waits
                briefly for ``document.readyState == "complete"`` instead.

        Raises:
            RuntimeError: on a WebDriver failure; the driver is restarted first.
        """
        if self._driver is None:
            self._start_driver_with_retries()

        with self._driver_lock:
            driver = self._driver
            try:
                driver.get(url)

                if wait_for_selector and wait_seconds:
                    try:
                        WebDriverWait(driver, wait_seconds).until(
                            EC.presence_of_element_located((By.CSS_SELECTOR, wait_for_selector))
                        )
                    except TimeoutException:
                        # Selector never appeared — return whatever loaded.
                        pass
                else:
                    if wait_seconds:
                        try:
                            WebDriverWait(driver, min(wait_seconds, 3)).until(
                                lambda d: d.execute_script("return document.readyState") == "complete"
                            )
                        except Exception:
                            time.sleep(0.5)

                return driver.page_source
            except WebDriverException as e:
                logger.exception("WebDriver exception during fetch: %s", e)
                # restart driver and raise
                try:
                    self._safe_quit_driver()
                except Exception:
                    pass
                self._start_driver_with_retries()
                raise RuntimeError(f"WebDriver error during fetch: {e}")

    def _safe_quit_driver(self):
        """Quit the driver and stop any Xvfb display this instance started."""
        self._quit_driver_only()
        # stop display if we started one
        self._stop_xvfb_if_started()

    def close(self):
        """Release the browser and display resources."""
        self._safe_quit_driver()
263
 
264
+
265
+ # ---------------- EXTRACT_DATA (same as your earlier implementation) ----------------
266
+ def EXTRACT_DATA(html: str) -> Dict[str, Any]:
267
+ soup = BeautifulSoup(html, "html.parser")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  BASE_URL = "https://www.google.com"
269
 
270
  def safe_text(el):
 
274
  return el.get(attr) if el and el.has_attr(attr) else ""
275
 
276
  def abs_url(url):
277
+ return urljoin(BASE_URL, url) if url else ""
 
 
 
278
 
279
  def clean_thumb(src):
280
  if src and not src.startswith("data:"):
281
  return abs_url(src)
282
  return None
283
 
284
+ def is_ad_element(element):
285
+ for parent in element.parents:
286
+ if parent.get("id") in ["tads", "tadsb"] or "ads-ad" in parent.get("class", []):
287
+ return True
288
+ return False
289
+
290
  web_results = []
291
  for result in soup.select(".tF2Cxc"):
292
+ if is_ad_element(result):
293
+ continue
294
  title_tag = result.select_one("h3")
295
  link_tag = result.select_one("a")
296
  cite_tag = result.select_one("cite")
297
  snippet_tag = result.select_one(".VwiC3b")
298
+ read_more_tag = result.select_one(".vzmbzf")
299
+
300
  if title_tag and link_tag:
301
+ entry = {
302
+ "no": len(web_results) + 1,
303
  "title": safe_text(title_tag),
304
  "link": abs_url(safe_attr(link_tag, "href")),
305
  "displayed_url": safe_text(cite_tag),
306
  "snippet": safe_text(snippet_tag)
307
+ }
308
+ extra = []
309
+ if read_more_tag:
310
+ read_more_url = abs_url(safe_attr(read_more_tag, "href"))
311
+ if read_more_url:
312
+ extra.append({"read_more": read_more_url})
313
+ if extra:
314
+ entry["extra"] = extra
315
+ web_results.append(entry)
316
 
317
  image_results = []
318
  for img_item in soup.select(".eA0Zlc"):
319
  img_tag = img_item.select_one("img")
320
  link_tag = img_item.select_one("a")
321
+ source_tag = img_item.select_one(".s0fJje span")
322
  src = safe_attr(img_tag, "data-src") or safe_attr(img_tag, "src")
323
  thumb = clean_thumb(src)
324
  if thumb:
325
+ image_results.append({
326
+ "thumbnail": thumb,
327
+ "alt": safe_attr(img_tag, "alt"),
328
+ "source": safe_text(source_tag),
329
+ "link": abs_url(safe_attr(link_tag, "href"))
330
+ })
331
+
332
+ video_results = []
333
+ for video in soup.select(".KYaZsb"):
334
+ title_tag = video.select_one(".tNxQIb.ynAwRc")
335
+ link_tag = video.select_one("a.rIRoqf")
336
+ thumb_img = video.select_one(".AZJdrc img")
337
+ duration_tag = video.select_one(".c8rnLc")
338
+ channel_tag = video.select_one(".Sg4azc span:first-child")
339
+ date_tag = video.select_one(".rbYSKb span")
340
+ desc_tag = video.select_one(".wNifxf .p4wth")
341
+ thumb_src = safe_attr(thumb_img, "data-src") or safe_attr(thumb_img, "src")
342
+ thumb = clean_thumb(thumb_src)
343
+ if title_tag and link_tag:
344
+ video_results.append({
345
+ "title": safe_text(title_tag),
346
+ "link": abs_url(safe_attr(link_tag, "href")),
347
+ "thumbnail": thumb,
348
+ "duration": safe_text(duration_tag),
349
+ "channel": safe_text(channel_tag),
350
+ "date": safe_text(date_tag),
351
+ "description_snippet": safe_text(desc_tag)
352
+ })
353
+
354
+ news_results = []
355
+ for news in soup.select(".m7jPZ"):
356
+ title_tag = news.select_one(".n0jPhd")
357
+ link_tag = news.select_one("a")
358
+ source_tag = news.select_one(".MgUUmf span")
359
+ time_tag = news.select_one(".rbYSKb span")
360
+ thumb_img = news.select_one(".uhHOwf img")
361
+ thumb_src = safe_attr(thumb_img, "data-src") or safe_attr(thumb_img, "src")
362
+ thumb = clean_thumb(thumb_src)
363
+ if title_tag and link_tag:
364
+ news_results.append({
365
+ "title": safe_text(title_tag),
366
+ "link": abs_url(safe_attr(link_tag, "href")),
367
+ "source": safe_text(source_tag),
368
+ "time": safe_text(time_tag),
369
+ "thumbnail": thumb
370
+ })
371
+
372
+ knowledge_panel = {}
373
+ rhs = soup.find(id="rhs")
374
+ if rhs:
375
+ title_tag = rhs.select_one(".PZPZlf.ssJ7i")
376
+ subtitle_tag = rhs.select_one(".iAIpCb span")
377
+ if title_tag:
378
+ knowledge_panel["title"] = safe_text(title_tag)
379
+ if subtitle_tag:
380
+ knowledge_panel["subtitle"] = safe_text(subtitle_tag)
381
+
382
+ desc_tag = rhs.select_one(".kno-rdesc span")
383
+ if desc_tag:
384
+ knowledge_panel["description"] = safe_text(desc_tag)
385
+
386
+ facts = {}
387
+ for fact in rhs.select(".zloOqf"):
388
+ label_tag = fact.select_one(".w8qArf")
389
+ value_tag = fact.select_one(".LrzXr")
390
+ if label_tag and value_tag:
391
+ label = safe_text(label_tag).replace(":", "").strip()
392
+ links = value_tag.find_all("a")
393
+ if links and len(links) > 1:
394
+ names = [safe_text(a) for a in links if safe_text(a)]
395
+ if names:
396
+ facts[label] = names
397
+ else:
398
+ text = safe_text(value_tag)
399
+ if text:
400
+ facts[label] = text
401
+ if facts:
402
+ knowledge_panel["facts"] = facts
403
+
404
+ profiles = []
405
+ for profile in rhs.select(".dRrfkf a"):
406
+ name_tag = profile.select_one(".CtCigf")
407
+ link = safe_attr(profile, "href")
408
+ if name_tag and link:
409
+ profiles.append({
410
+ "platform": safe_text(name_tag),
411
+ "link": abs_url(link)
412
+ })
413
+ if profiles:
414
+ knowledge_panel["profiles"] = profiles
415
+
416
+ if not knowledge_panel:
417
+ knowledge_panel = None
418
+
419
+ ai_overview = None
420
+ ai_container = soup.select_one(".p2M1Qe .f5cPye")
421
+ if ai_container:
422
+ text = safe_text(ai_container)
423
+ if text:
424
+ ai_overview = text
425
+
426
+ thumbnails = set()
427
+ for img in soup.select("img[data-src], img[src]"):
428
+ src = safe_attr(img, "data-src") or safe_attr(img, "src")
429
+ clean = clean_thumb(src)
430
+ if clean:
431
+ thumbnails.add(clean)
432
+
433
+ all_thumbnails = sorted(thumbnails) if thumbnails else None
434
 
435
  data = {}
436
  if web_results:
437
  data["web_results"] = web_results
438
  if image_results:
439
  data["image_results"] = image_results
440
+ if video_results:
441
+ data["video_results"] = video_results
442
+ if news_results:
443
+ data["news_results"] = news_results
444
+ if knowledge_panel:
445
+ data["knowledge_panel"] = knowledge_panel
446
+ if ai_overview:
447
+ data["ai_overview"] = ai_overview
448
+ if all_thumbnails:
449
+ data["all_thumbnail_urls"] = all_thumbnails
450
 
451
  return data
452
 
 
 
 
453
 
454
# ---------------- BrowserPool and API ----------------
class BrowserPool:
    """Round-robin pool of BrowserManager instances shared across requests."""

    def __init__(self, pool_size: int = 1, headless: bool = True):
        # Never allow an empty pool; a size below 1 is clamped to 1.
        self.pool_size = max(1, pool_size)
        self.managers = [BrowserManager(headless=headless) for _ in range(self.pool_size)]
        self._rr_index = 0
        self._rr_lock = threading.Lock()

    def pick_manager(self) -> BrowserManager:
        """Return the next manager in round-robin order (thread-safe)."""
        with self._rr_lock:
            chosen = self.managers[self._rr_index]
            self._rr_index = (self._rr_index + 1) % self.pool_size
            return chosen

    def close_all(self):
        """Best-effort close of every pooled browser; individual failures are ignored."""
        for manager in self.managers:
            try:
                manager.close()
            except Exception:
                pass
 
 
475
class SimpleTTLCache:
    """Thread-safe in-memory cache whose entries expire after a fixed TTL.

    Uses a monotonic clock for expiry arithmetic so cached entries are not
    prematurely dropped (or kept alive) by wall-clock adjustments such as
    NTP corrections — ``time.time()`` is not guaranteed to move forward.
    """

    def __init__(self, ttl_seconds: int = 20):
        # Time-to-live for each entry, in seconds.
        self.ttl = ttl_seconds
        # key -> (insertion timestamp from time.monotonic(), value)
        self._cache: Dict[str, Tuple[float, Any]] = {}
        self._lock = threading.Lock()

    def get(self, key: str):
        """Return the cached value for *key*, or None if absent or expired."""
        with self._lock:
            item = self._cache.get(key)
            if not item:
                return None
            ts, value = item
            if time.monotonic() - ts > self.ttl:
                # Evict lazily on read so expired entries do not accumulate.
                del self._cache[key]
                return None
            return value

    def set(self, key: str, value: Any):
        """Store *value* under *key*, resetting its TTL."""
        with self._lock:
            self._cache[key] = (time.monotonic(), value)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
 
496
class SearchRequest(BaseModel):
    """Request body for POST /search: supply either a free-text query or a direct URL."""
    # Free-text search query; rendered into a Google search URL when no url is given.
    query: Optional[str] = None
    # Direct target URL; takes precedence over query in the POST handler.
    url: Optional[str] = None
    # Optional CSS selector the fetcher waits for before extracting results.
    wait_for_selector: Optional[str] = None
    # Requested browser mode; NOTE(review): appears unused by the visible handlers — confirm.
    headless: Optional[bool] = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
 
502
# FastAPI application and process-wide singletons.
app = FastAPI(title="fast_fetcher_api", version="0.1")
# Shared browser pool and worker executor; both are created in startup_event.
POOL: Optional[BrowserPool] = None
EXECUTOR: Optional[ThreadPoolExecutor] = None
# Short-lived response cache for search/fetch results (25 s TTL).
CACHE = SimpleTTLCache(ttl_seconds=25)
 
506
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
@app.on_event("startup")
async def startup_event():
    """Create the shared browser pool and worker executor at server start."""
    global POOL, EXECUTOR
    pool_size = 1
    # Headless is intentionally False here; BrowserManager starts an Xvfb
    # display automatically when no DISPLAY is available.
    POOL = BrowserPool(pool_size=pool_size, headless=False)
    EXECUTOR = ThreadPoolExecutor(max_workers=2)
    app.state.executor = EXECUTOR
    app.state.pool = POOL
    # Fix: log the actual configured size instead of a hard-coded literal,
    # so the message stays correct if pool_size is ever changed.
    logger.info("Startup: browser pool created (size=%d).", POOL.pool_size)
 
 
 
 
 
 
 
 
516
 
517
@app.on_event("shutdown")
async def shutdown_event():
    """Release the browser pool and thread-pool executor on server shutdown."""
    global POOL, EXECUTOR
    pool, executor = POOL, EXECUTOR
    if pool:
        pool.close_all()
    if executor:
        executor.shutdown(wait=True)
    logger.info("Shutdown: browsers closed and executor stopped.")
525
+
526
def _blocking_fetch_and_extract(manager: BrowserManager, url: str, wait_for_selector: Optional[str], wait_seconds: Optional[float]):
    """Synchronously fetch *url* through *manager* and extract structured data.

    Runs in a worker thread (blocking Playwright calls); returns a dict with
    the url, elapsed wall-clock duration in seconds, and the extracted data.
    """
    started_at = time.time()
    page_html = manager.fetch_html(url, wait_seconds=wait_seconds, wait_for_selector=wait_for_selector)
    parsed = EXTRACT_DATA(page_html)
    return {"url": url, "duration": time.time() - started_at, "data": parsed}
 
 
 
 
 
 
 
532
 
533
@app.get("/health")
async def health():
    """Liveness probe: always reports the service as up."""
    payload = {"status": "ok"}
    return payload
536
 
 
 
 
 
 
 
 
 
 
 
537
@app.get("/search")
async def search(query: str = Query(..., min_length=1), wait_for_selector: Optional[str] = None):
    """Run a Google search for *query* and return extracted structured results.

    Results are cached briefly (see CACHE TTL), keyed on query + selector.
    Raises 400 when the query is empty after stripping whitespace.
    """
    q = query.strip()
    if not q:
        # min_length=1 still admits whitespace-only input.
        raise HTTPException(status_code=400, detail="query parameter required")

    url = f"https://www.google.com/search?q={quote_plus(q)}"
    cache_key = f"search:{q}:{wait_for_selector}"

    cached = CACHE.get(cache_key)
    if cached:
        return JSONResponse(content={"cached": True, **cached})

    manager = app.state.pool.pick_manager()
    # Fix: get_running_loop() is the correct API inside a coroutine;
    # get_event_loop() is deprecated here and can misbehave under uvicorn.
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(
        app.state.executor, _blocking_fetch_and_extract, manager, url, wait_for_selector, 5.0
    )
    CACHE.set(cache_key, result)
    return JSONResponse(content={"cached": False, **result})
556
+
557
@app.get("/fetch")
async def fetch(url: str = Query(..., min_length=5), wait_for_selector: Optional[str] = None):
    """Fetch an arbitrary *url* with a pooled browser and extract structured data."""
    manager = app.state.pool.pick_manager()
    # Fix: get_running_loop() replaces the deprecated get_event_loop()
    # inside a running coroutine.
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(
        app.state.executor, _blocking_fetch_and_extract, manager, url, wait_for_selector, 6.0
    )
    return JSONResponse(content=result)
564
 
565
@app.post("/search")
async def post_search(body: SearchRequest = Body(...)):
    """POST variant of /search: accepts either a direct URL or a query string.

    A direct URL takes precedence over a query. Results are cached briefly,
    keyed on the target URL + selector. Raises 400 when neither is supplied.
    """
    if not (body.query or body.url):
        raise HTTPException(status_code=400, detail="Either query or url must be provided")
    target = body.url if body.url else f"https://www.google.com/search?q={quote_plus(body.query)}"

    cache_key = f"search_post:{target}:{body.wait_for_selector}"
    cached = CACHE.get(cache_key)
    if cached:
        return JSONResponse(content={"cached": True, **cached})

    manager = app.state.pool.pick_manager()
    # Fix: get_running_loop() replaces the deprecated get_event_loop()
    # inside a running coroutine.
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(
        app.state.executor, _blocking_fetch_and_extract, manager, target, body.wait_for_selector, 6.0
    )
    CACHE.set(cache_key, result)
    return JSONResponse(content={"cached": False, **result})