Update app.py
Browse files
app.py
CHANGED
|
@@ -1,270 +1,82 @@
|
|
| 1 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from __future__ import annotations
|
| 3 |
import os
|
| 4 |
-
import time
|
| 5 |
-
import logging
|
| 6 |
-
import threading
|
| 7 |
import asyncio
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
| 10 |
from urllib.parse import quote_plus, urljoin
|
| 11 |
|
| 12 |
-
from fastapi import FastAPI,
|
|
|
|
| 13 |
from pydantic import BaseModel
|
| 14 |
-
from starlette.responses import JSONResponse
|
| 15 |
|
| 16 |
-
from selenium import webdriver
|
| 17 |
-
from selenium.webdriver.chrome.options import Options
|
| 18 |
-
from selenium.webdriver.chrome.service import Service
|
| 19 |
-
from selenium.common.exceptions import WebDriverException, SessionNotCreatedException
|
| 20 |
-
from webdriver_manager.chrome import ChromeDriverManager
|
| 21 |
from bs4 import BeautifulSoup
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
from
|
| 26 |
-
|
| 27 |
-
#
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
logging.basicConfig(level=
|
| 32 |
-
logger = logging.getLogger("
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
def _build_options(self) -> Options:
|
| 60 |
-
opts = Options()
|
| 61 |
-
# If CHROME_BIN is present, point to it
|
| 62 |
-
chrome_bin = os.environ.get("CHROME_BIN", "/usr/bin/google-chrome-stable")
|
| 63 |
-
if os.path.exists(chrome_bin):
|
| 64 |
-
opts.binary_location = chrome_bin
|
| 65 |
-
logger.debug("Using chrome binary: %s", chrome_bin)
|
| 66 |
-
else:
|
| 67 |
-
logger.warning("Chrome binary not found at %s (will rely on system/browser manager).", chrome_bin)
|
| 68 |
-
|
| 69 |
-
if self.headless:
|
| 70 |
-
opts.add_argument("--headless=new")
|
| 71 |
-
opts.add_argument("--headless")
|
| 72 |
-
|
| 73 |
-
# container-friendly flags (and stable fallback)
|
| 74 |
-
opts.add_argument("--no-sandbox")
|
| 75 |
-
opts.add_argument("--disable-setuid-sandbox")
|
| 76 |
-
opts.add_argument("--disable-dev-shm-usage")
|
| 77 |
-
opts.add_argument("--disable-gpu")
|
| 78 |
-
opts.add_argument("--disable-extensions")
|
| 79 |
-
opts.add_argument("--disable-blink-features=AutomationControlled")
|
| 80 |
-
opts.add_argument("--disable-software-rasterizer")
|
| 81 |
-
opts.add_argument(f"--window-size={self.window_size}")
|
| 82 |
-
opts.add_argument("--remote-debugging-port=0")
|
| 83 |
-
|
| 84 |
-
if self.user_agent:
|
| 85 |
-
opts.add_argument(f"--user-agent={self.user_agent}")
|
| 86 |
-
|
| 87 |
-
if self.disable_images:
|
| 88 |
-
prefs = {
|
| 89 |
-
"profile.managed_default_content_settings.images": 2,
|
| 90 |
-
"profile.managed_default_content_settings.stylesheets": 2,
|
| 91 |
-
"profile.managed_default_content_settings.fonts": 2,
|
| 92 |
-
}
|
| 93 |
-
opts.add_experimental_option("prefs", prefs)
|
| 94 |
-
|
| 95 |
-
opts.add_experimental_option("excludeSwitches", ["enable-logging"])
|
| 96 |
-
opts.add_experimental_option("useAutomationExtension", False)
|
| 97 |
-
return opts
|
| 98 |
-
|
| 99 |
-
def _start_driver_with_retries(self, attempts: int = 3, delay_seconds: float = 1.0):
|
| 100 |
-
last_exc = None
|
| 101 |
-
for attempt in range(1, attempts + 1):
|
| 102 |
-
try:
|
| 103 |
-
logger.info("Starting Chrome driver (attempt %d/%d)...", attempt, attempts)
|
| 104 |
-
self._start_driver()
|
| 105 |
-
logger.info("Chrome driver started successfully.")
|
| 106 |
-
return
|
| 107 |
-
except Exception as exc:
|
| 108 |
-
logger.exception("Failed to start driver on attempt %d: %s", attempt, exc)
|
| 109 |
-
last_exc = exc
|
| 110 |
-
time.sleep(delay_seconds)
|
| 111 |
-
raise RuntimeError(f"Unable to start Chrome driver after {attempts} attempts: {last_exc}") from last_exc
|
| 112 |
-
|
| 113 |
-
def _start_xvfb_if_needed(self):
|
| 114 |
-
# If headless=False AND no DISPLAY, start Xvfb via pyvirtualdisplay
|
| 115 |
-
if not self.headless and os.environ.get("DISPLAY", "") == "":
|
| 116 |
-
try:
|
| 117 |
-
logger.info("No DISPLAY found and headless=False — starting virtual X display (Xvfb).")
|
| 118 |
-
self._display = Display(visible=0, size=(int(self.window_size.split(",")[0]), int(self.window_size.split(",")[1])))
|
| 119 |
-
self._display.start()
|
| 120 |
-
logger.info("Virtual X display started (DISPLAY=%s).", os.environ.get("DISPLAY"))
|
| 121 |
-
except Exception as e:
|
| 122 |
-
logger.exception("Failed to start virtual display: %s", e)
|
| 123 |
-
raise
|
| 124 |
-
|
| 125 |
-
def _stop_xvfb_if_started(self):
|
| 126 |
-
if self._display:
|
| 127 |
-
try:
|
| 128 |
-
self._display.stop()
|
| 129 |
-
logger.info("Virtual X display stopped.")
|
| 130 |
-
except Exception:
|
| 131 |
-
pass
|
| 132 |
-
self._display = None
|
| 133 |
-
|
| 134 |
-
def _start_driver(self):
|
| 135 |
-
# start virtual display if required BEFORE launching Chrome
|
| 136 |
-
self._start_xvfb_if_needed()
|
| 137 |
-
|
| 138 |
-
opts = self._build_options()
|
| 139 |
-
|
| 140 |
-
# 1) Try Selenium Manager (webdriver.Chrome(options=opts)). Selenium >=4.14 may use driver manager itself.
|
| 141 |
-
primary_exc = None
|
| 142 |
-
fallback_exc = None
|
| 143 |
-
try:
|
| 144 |
-
logger.debug("Attempting to start Chrome via Selenium Manager (webdriver.Chrome(options=opts))")
|
| 145 |
-
self._driver = webdriver.Chrome(options=opts)
|
| 146 |
-
# quick smoke test: ensure browser is responsive (may throw)
|
| 147 |
-
try:
|
| 148 |
-
self._driver.execute_script("return navigator.userAgent")
|
| 149 |
-
except Exception as e:
|
| 150 |
-
# browser started but died quickly
|
| 151 |
-
raise RuntimeError("Browser started by Selenium Manager but crashed immediately.") from e
|
| 152 |
-
|
| 153 |
-
self._post_start_setup()
|
| 154 |
-
return
|
| 155 |
-
except Exception as e_primary:
|
| 156 |
-
primary_exc = e_primary
|
| 157 |
-
logger.warning("Selenium Manager attempt failed: %s", e_primary)
|
| 158 |
-
|
| 159 |
-
# 2) Fallback: use webdriver-manager to download driver and start with the explicit Service
|
| 160 |
-
try:
|
| 161 |
-
driver_path = ChromeDriverManager().install()
|
| 162 |
-
logger.info("webdriver-manager installed chromedriver: %s", driver_path)
|
| 163 |
-
try:
|
| 164 |
-
os.chmod(driver_path, 0o755)
|
| 165 |
-
except Exception:
|
| 166 |
-
logger.debug("chmod on chromedriver failed or unnecessary.")
|
| 167 |
-
|
| 168 |
-
service = Service(driver_path)
|
| 169 |
-
self._driver = webdriver.Chrome(service=service, options=opts)
|
| 170 |
-
self._post_start_setup()
|
| 171 |
-
return
|
| 172 |
-
except Exception as e_fallback:
|
| 173 |
-
fallback_exc = e_fallback
|
| 174 |
-
logger.exception("webdriver-manager fallback failed: %s", e_fallback)
|
| 175 |
-
|
| 176 |
-
# 3) Final fallback: attempt system /usr/bin/chromedriver if available
|
| 177 |
-
try:
|
| 178 |
-
sys_path = "/usr/bin/chromedriver"
|
| 179 |
-
if os.path.exists(sys_path):
|
| 180 |
-
logger.info("Trying system chromedriver at %s", sys_path)
|
| 181 |
-
try:
|
| 182 |
-
os.chmod(sys_path, 0o755)
|
| 183 |
-
except Exception:
|
| 184 |
-
pass
|
| 185 |
-
service = Service(sys_path)
|
| 186 |
-
self._driver = webdriver.Chrome(service=service, options=opts)
|
| 187 |
-
self._post_start_setup()
|
| 188 |
-
return
|
| 189 |
-
except Exception as e_sys:
|
| 190 |
-
logger.exception("System chromedriver attempt failed: %s", e_sys)
|
| 191 |
-
|
| 192 |
-
# If all failed, stop virtual display (if started) and raise a helpful error
|
| 193 |
-
self._stop_xvfb_if_started()
|
| 194 |
-
# Include both primary and fallback messages in the raised exception
|
| 195 |
-
raise RuntimeError(f"Failed to start Chrome driver. primary_error={primary_exc}, fallback_error={fallback_exc}")
|
| 196 |
-
|
| 197 |
-
def _post_start_setup(self):
|
| 198 |
-
try:
|
| 199 |
-
self._driver.set_page_load_timeout(60)
|
| 200 |
-
# best-effort CDP network blocking
|
| 201 |
-
try:
|
| 202 |
-
self._driver.execute_cdp_cmd("Network.enable", {})
|
| 203 |
-
if self.block_resource_urls:
|
| 204 |
-
self._driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": self.block_resource_urls})
|
| 205 |
-
except Exception:
|
| 206 |
-
pass
|
| 207 |
-
except Exception:
|
| 208 |
-
pass
|
| 209 |
-
|
| 210 |
-
def fetch_html(
|
| 211 |
-
self,
|
| 212 |
-
url: str,
|
| 213 |
-
wait_seconds: Optional[float] = 10.0,
|
| 214 |
-
wait_for_selector: Optional[str] = None,
|
| 215 |
-
) -> str:
|
| 216 |
-
if self._driver is None:
|
| 217 |
-
self._start_driver_with_retries()
|
| 218 |
-
|
| 219 |
-
with self._driver_lock:
|
| 220 |
-
driver = self._driver
|
| 221 |
-
try:
|
| 222 |
-
driver.get(url)
|
| 223 |
-
|
| 224 |
-
if wait_for_selector and wait_seconds:
|
| 225 |
-
try:
|
| 226 |
-
WebDriverWait(driver, wait_seconds).until(
|
| 227 |
-
EC.presence_of_element_located((By.CSS_SELECTOR, wait_for_selector))
|
| 228 |
-
)
|
| 229 |
-
except TimeoutException:
|
| 230 |
-
pass
|
| 231 |
-
else:
|
| 232 |
-
if wait_seconds:
|
| 233 |
-
try:
|
| 234 |
-
WebDriverWait(driver, min(wait_seconds, 3)).until(
|
| 235 |
-
lambda d: d.execute_script("return document.readyState") == "complete"
|
| 236 |
-
)
|
| 237 |
-
except Exception:
|
| 238 |
-
time.sleep(0.5)
|
| 239 |
-
|
| 240 |
-
return driver.page_source
|
| 241 |
-
except WebDriverException as e:
|
| 242 |
-
logger.exception("WebDriver exception during fetch: %s", e)
|
| 243 |
-
# restart driver and raise
|
| 244 |
-
try:
|
| 245 |
-
self._safe_quit_driver()
|
| 246 |
-
except Exception:
|
| 247 |
-
pass
|
| 248 |
-
self._start_driver_with_retries()
|
| 249 |
-
raise RuntimeError(f"WebDriver error during fetch: {e}")
|
| 250 |
-
|
| 251 |
-
def _safe_quit_driver(self):
|
| 252 |
-
if self._driver:
|
| 253 |
-
try:
|
| 254 |
-
self._driver.quit()
|
| 255 |
-
except Exception:
|
| 256 |
-
pass
|
| 257 |
-
self._driver = None
|
| 258 |
-
# stop display if we started one
|
| 259 |
-
self._stop_xvfb_if_started()
|
| 260 |
-
|
| 261 |
-
def close(self):
|
| 262 |
-
self._safe_quit_driver()
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
# ---------------- EXTRACT_DATA (same as your earlier implementation) ----------------
|
| 266 |
-
def EXTRACT_DATA(html: str) -> Dict[str, Any]:
|
| 267 |
-
soup = BeautifulSoup(html, "html.parser")
|
| 268 |
BASE_URL = "https://www.google.com"
|
| 269 |
|
| 270 |
def safe_text(el):
|
|
@@ -274,311 +86,379 @@ def EXTRACT_DATA(html: str) -> Dict[str, Any]:
|
|
| 274 |
return el.get(attr) if el and el.has_attr(attr) else ""
|
| 275 |
|
| 276 |
def abs_url(url):
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
| 278 |
|
| 279 |
def clean_thumb(src):
|
| 280 |
if src and not src.startswith("data:"):
|
| 281 |
return abs_url(src)
|
| 282 |
return None
|
| 283 |
|
| 284 |
-
|
| 285 |
-
for parent in element.parents:
|
| 286 |
-
if parent.get("id") in ["tads", "tadsb"] or "ads-ad" in parent.get("class", []):
|
| 287 |
-
return True
|
| 288 |
-
return False
|
| 289 |
-
|
| 290 |
web_results = []
|
| 291 |
for result in soup.select(".tF2Cxc"):
|
| 292 |
-
if is_ad_element(result):
|
| 293 |
-
continue
|
| 294 |
title_tag = result.select_one("h3")
|
| 295 |
link_tag = result.select_one("a")
|
| 296 |
cite_tag = result.select_one("cite")
|
| 297 |
snippet_tag = result.select_one(".VwiC3b")
|
| 298 |
-
read_more_tag = result.select_one(".vzmbzf")
|
| 299 |
-
|
| 300 |
if title_tag and link_tag:
|
| 301 |
-
|
| 302 |
-
"no": len(web_results) + 1,
|
| 303 |
"title": safe_text(title_tag),
|
| 304 |
"link": abs_url(safe_attr(link_tag, "href")),
|
| 305 |
"displayed_url": safe_text(cite_tag),
|
| 306 |
"snippet": safe_text(snippet_tag)
|
| 307 |
-
}
|
| 308 |
-
extra = []
|
| 309 |
-
if read_more_tag:
|
| 310 |
-
read_more_url = abs_url(safe_attr(read_more_tag, "href"))
|
| 311 |
-
if read_more_url:
|
| 312 |
-
extra.append({"read_more": read_more_url})
|
| 313 |
-
if extra:
|
| 314 |
-
entry["extra"] = extra
|
| 315 |
-
web_results.append(entry)
|
| 316 |
|
|
|
|
| 317 |
image_results = []
|
| 318 |
for img_item in soup.select(".eA0Zlc"):
|
| 319 |
img_tag = img_item.select_one("img")
|
| 320 |
link_tag = img_item.select_one("a")
|
| 321 |
-
source_tag = img_item.select_one(".s0fJje span")
|
| 322 |
src = safe_attr(img_tag, "data-src") or safe_attr(img_tag, "src")
|
| 323 |
thumb = clean_thumb(src)
|
| 324 |
if thumb:
|
| 325 |
-
image_results.append({
|
| 326 |
-
"thumbnail": thumb,
|
| 327 |
-
"alt": safe_attr(img_tag, "alt"),
|
| 328 |
-
"source": safe_text(source_tag),
|
| 329 |
-
"link": abs_url(safe_attr(link_tag, "href"))
|
| 330 |
-
})
|
| 331 |
-
|
| 332 |
-
video_results = []
|
| 333 |
-
for video in soup.select(".KYaZsb"):
|
| 334 |
-
title_tag = video.select_one(".tNxQIb.ynAwRc")
|
| 335 |
-
link_tag = video.select_one("a.rIRoqf")
|
| 336 |
-
thumb_img = video.select_one(".AZJdrc img")
|
| 337 |
-
duration_tag = video.select_one(".c8rnLc")
|
| 338 |
-
channel_tag = video.select_one(".Sg4azc span:first-child")
|
| 339 |
-
date_tag = video.select_one(".rbYSKb span")
|
| 340 |
-
desc_tag = video.select_one(".wNifxf .p4wth")
|
| 341 |
-
thumb_src = safe_attr(thumb_img, "data-src") or safe_attr(thumb_img, "src")
|
| 342 |
-
thumb = clean_thumb(thumb_src)
|
| 343 |
-
if title_tag and link_tag:
|
| 344 |
-
video_results.append({
|
| 345 |
-
"title": safe_text(title_tag),
|
| 346 |
-
"link": abs_url(safe_attr(link_tag, "href")),
|
| 347 |
-
"thumbnail": thumb,
|
| 348 |
-
"duration": safe_text(duration_tag),
|
| 349 |
-
"channel": safe_text(channel_tag),
|
| 350 |
-
"date": safe_text(date_tag),
|
| 351 |
-
"description_snippet": safe_text(desc_tag)
|
| 352 |
-
})
|
| 353 |
-
|
| 354 |
-
news_results = []
|
| 355 |
-
for news in soup.select(".m7jPZ"):
|
| 356 |
-
title_tag = news.select_one(".n0jPhd")
|
| 357 |
-
link_tag = news.select_one("a")
|
| 358 |
-
source_tag = news.select_one(".MgUUmf span")
|
| 359 |
-
time_tag = news.select_one(".rbYSKb span")
|
| 360 |
-
thumb_img = news.select_one(".uhHOwf img")
|
| 361 |
-
thumb_src = safe_attr(thumb_img, "data-src") or safe_attr(thumb_img, "src")
|
| 362 |
-
thumb = clean_thumb(thumb_src)
|
| 363 |
-
if title_tag and link_tag:
|
| 364 |
-
news_results.append({
|
| 365 |
-
"title": safe_text(title_tag),
|
| 366 |
-
"link": abs_url(safe_attr(link_tag, "href")),
|
| 367 |
-
"source": safe_text(source_tag),
|
| 368 |
-
"time": safe_text(time_tag),
|
| 369 |
-
"thumbnail": thumb
|
| 370 |
-
})
|
| 371 |
-
|
| 372 |
-
knowledge_panel = {}
|
| 373 |
-
rhs = soup.find(id="rhs")
|
| 374 |
-
if rhs:
|
| 375 |
-
title_tag = rhs.select_one(".PZPZlf.ssJ7i")
|
| 376 |
-
subtitle_tag = rhs.select_one(".iAIpCb span")
|
| 377 |
-
if title_tag:
|
| 378 |
-
knowledge_panel["title"] = safe_text(title_tag)
|
| 379 |
-
if subtitle_tag:
|
| 380 |
-
knowledge_panel["subtitle"] = safe_text(subtitle_tag)
|
| 381 |
-
|
| 382 |
-
desc_tag = rhs.select_one(".kno-rdesc span")
|
| 383 |
-
if desc_tag:
|
| 384 |
-
knowledge_panel["description"] = safe_text(desc_tag)
|
| 385 |
-
|
| 386 |
-
facts = {}
|
| 387 |
-
for fact in rhs.select(".zloOqf"):
|
| 388 |
-
label_tag = fact.select_one(".w8qArf")
|
| 389 |
-
value_tag = fact.select_one(".LrzXr")
|
| 390 |
-
if label_tag and value_tag:
|
| 391 |
-
label = safe_text(label_tag).replace(":", "").strip()
|
| 392 |
-
links = value_tag.find_all("a")
|
| 393 |
-
if links and len(links) > 1:
|
| 394 |
-
names = [safe_text(a) for a in links if safe_text(a)]
|
| 395 |
-
if names:
|
| 396 |
-
facts[label] = names
|
| 397 |
-
else:
|
| 398 |
-
text = safe_text(value_tag)
|
| 399 |
-
if text:
|
| 400 |
-
facts[label] = text
|
| 401 |
-
if facts:
|
| 402 |
-
knowledge_panel["facts"] = facts
|
| 403 |
-
|
| 404 |
-
profiles = []
|
| 405 |
-
for profile in rhs.select(".dRrfkf a"):
|
| 406 |
-
name_tag = profile.select_one(".CtCigf")
|
| 407 |
-
link = safe_attr(profile, "href")
|
| 408 |
-
if name_tag and link:
|
| 409 |
-
profiles.append({
|
| 410 |
-
"platform": safe_text(name_tag),
|
| 411 |
-
"link": abs_url(link)
|
| 412 |
-
})
|
| 413 |
-
if profiles:
|
| 414 |
-
knowledge_panel["profiles"] = profiles
|
| 415 |
-
|
| 416 |
-
if not knowledge_panel:
|
| 417 |
-
knowledge_panel = None
|
| 418 |
-
|
| 419 |
-
ai_overview = None
|
| 420 |
-
ai_container = soup.select_one(".p2M1Qe .f5cPye")
|
| 421 |
-
if ai_container:
|
| 422 |
-
text = safe_text(ai_container)
|
| 423 |
-
if text:
|
| 424 |
-
ai_overview = text
|
| 425 |
-
|
| 426 |
-
thumbnails = set()
|
| 427 |
-
for img in soup.select("img[data-src], img[src]"):
|
| 428 |
-
src = safe_attr(img, "data-src") or safe_attr(img, "src")
|
| 429 |
-
clean = clean_thumb(src)
|
| 430 |
-
if clean:
|
| 431 |
-
thumbnails.add(clean)
|
| 432 |
-
|
| 433 |
-
all_thumbnails = sorted(thumbnails) if thumbnails else None
|
| 434 |
|
|
|
|
| 435 |
data = {}
|
| 436 |
if web_results:
|
| 437 |
data["web_results"] = web_results
|
| 438 |
if image_results:
|
| 439 |
data["image_results"] = image_results
|
| 440 |
-
if video_results:
|
| 441 |
-
data["video_results"] = video_results
|
| 442 |
-
if news_results:
|
| 443 |
-
data["news_results"] = news_results
|
| 444 |
-
if knowledge_panel:
|
| 445 |
-
data["knowledge_panel"] = knowledge_panel
|
| 446 |
-
if ai_overview:
|
| 447 |
-
data["ai_overview"] = ai_overview
|
| 448 |
-
if all_thumbnails:
|
| 449 |
-
data["all_thumbnail_urls"] = all_thumbnails
|
| 450 |
|
| 451 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
|
|
|
|
| 453 |
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
self.pool_size = max(1, pool_size)
|
| 458 |
-
self.managers = [BrowserManager(headless=headless) for _ in range(self.pool_size)]
|
| 459 |
-
self._rr_index = 0
|
| 460 |
-
self._rr_lock = threading.Lock()
|
| 461 |
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
|
|
|
| 467 |
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
try:
|
| 471 |
-
m.close()
|
| 472 |
-
except Exception:
|
| 473 |
-
pass
|
| 474 |
|
|
|
|
| 475 |
class SimpleTTLCache:
|
| 476 |
def __init__(self, ttl_seconds: int = 20):
|
| 477 |
self.ttl = ttl_seconds
|
| 478 |
-
self.
|
| 479 |
-
self._lock =
|
| 480 |
|
| 481 |
-
def get(self, key: str):
|
| 482 |
-
with self._lock:
|
| 483 |
-
item = self.
|
| 484 |
if not item:
|
| 485 |
return None
|
| 486 |
ts, value = item
|
| 487 |
if time.time() - ts > self.ttl:
|
| 488 |
-
del self.
|
| 489 |
return None
|
| 490 |
return value
|
| 491 |
|
| 492 |
-
def set(self, key: str, value: Any):
|
| 493 |
-
with self._lock:
|
| 494 |
-
self.
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
@app.on_event("startup")
|
| 508 |
-
async def
|
| 509 |
-
|
| 510 |
-
#
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
|
|
|
| 516 |
|
| 517 |
@app.on_event("shutdown")
|
| 518 |
-
async def
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
|
|
|
|
|
|
|
|
|
|
| 533 |
@app.get("/health")
|
| 534 |
async def health():
|
| 535 |
return {"status": "ok"}
|
| 536 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
@app.get("/search")
|
| 538 |
-
async def search(query: str = Query(..., min_length=1),
|
| 539 |
q = query.strip()
|
| 540 |
if not q:
|
| 541 |
-
raise HTTPException(status_code=400, detail="query
|
| 542 |
|
| 543 |
url = f"https://www.google.com/search?q={quote_plus(q)}"
|
| 544 |
-
cache_key = f"search:{q}:{
|
| 545 |
-
|
| 546 |
-
cached = CACHE.get(cache_key)
|
| 547 |
if cached:
|
| 548 |
return JSONResponse(content={"cached": True, **cached})
|
| 549 |
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
@app.get("/fetch")
|
| 558 |
-
async def fetch(url: str = Query(..., min_length=5), wait_for_selector: Optional[str] = None):
|
| 559 |
-
manager = app.state.pool.pick_manager()
|
| 560 |
-
loop = asyncio.get_event_loop()
|
| 561 |
-
fut = loop.run_in_executor(app.state.executor, _blocking_fetch_and_extract, manager, url, wait_for_selector, 6.0)
|
| 562 |
-
result = await fut
|
| 563 |
-
return JSONResponse(content=result)
|
| 564 |
|
| 565 |
@app.post("/search")
|
| 566 |
async def post_search(body: SearchRequest = Body(...)):
|
| 567 |
if not (body.query or body.url):
|
| 568 |
raise HTTPException(status_code=400, detail="Either query or url must be provided")
|
| 569 |
-
if body.url:
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app_playwright.py
|
| 2 |
+
"""
|
| 3 |
+
Fast concurrent search fetcher using Playwright + FastAPI.
|
| 4 |
+
|
| 5 |
+
Requirements:
|
| 6 |
+
pip install fastapi uvicorn playwright beautifulsoup4 pydantic
|
| 7 |
+
python -m playwright install chromium
|
| 8 |
+
|
| 9 |
+
Why Playwright:
|
| 10 |
+
- Single browser process, many lightweight contexts
|
| 11 |
+
- Fast request interception (block ads/fonts)
|
| 12 |
+
- Async API for high concurrency
|
| 13 |
+
|
| 14 |
+
Endpoints:
|
| 15 |
+
- GET /health
|
| 16 |
+
- GET /search?query=...
|
| 17 |
+
- GET /fetch?url=...&fast=true
|
| 18 |
+
- POST /search (body: { "query": "..."} or { "url": "..." })
|
| 19 |
+
- GET /search_pages?query=...&pages=5&concurrency=8&fast=true&ordered=false (streams NDJSON)
|
| 20 |
+
- GET /search_pages_aggregate?query=... (returns aggregated JSON)
|
| 21 |
+
Tuning via environment variables:
|
| 22 |
+
- PLAYWRIGHT_HEADLESS (true/false, default true)
|
| 23 |
+
- POOL_CONCURRENCY (default 8) -> concurrent page navigations
|
| 24 |
+
- PARSER_WORKERS (default 4) -> threadpool for BeautifulSoup parsing
|
| 25 |
+
- REQUEST_TIMEOUT_MS (default 20000) -> navigation timeout per page
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
from __future__ import annotations
|
| 29 |
import os
|
|
|
|
|
|
|
|
|
|
| 30 |
import asyncio
|
| 31 |
+
import logging
|
| 32 |
+
import json
|
| 33 |
+
import time
|
| 34 |
+
import math
|
| 35 |
+
from typing import Optional, Dict, Any, List, Tuple
|
| 36 |
from urllib.parse import quote_plus, urljoin
|
| 37 |
|
| 38 |
+
from fastapi import FastAPI, Query, Body, HTTPException
|
| 39 |
+
from fastapi.responses import JSONResponse, StreamingResponse
|
| 40 |
from pydantic import BaseModel
|
|
|
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
from bs4 import BeautifulSoup
|
| 43 |
+
import concurrent.futures
|
| 44 |
+
|
| 45 |
+
# Playwright async
|
| 46 |
+
from playwright.async_api import async_playwright, Playwright, Browser, BrowserContext, Page, Request as PWRequest, Error as PWError
|
| 47 |
+
|
| 48 |
+
# ---------------------
|
| 49 |
+
# Config / Logging
|
| 50 |
+
# ---------------------
|
| 51 |
+
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO").upper()
|
| 52 |
+
logging.basicConfig(level=LOG_LEVEL, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
| 53 |
+
logger = logging.getLogger("fast_playwright_fetcher")
|
| 54 |
+
|
| 55 |
+
PLAYWRIGHT_HEADLESS = os.environ.get("PLAYWRIGHT_HEADLESS", "true").lower() in ("1", "true", "yes")
|
| 56 |
+
POOL_CONCURRENCY = int(os.environ.get("POOL_CONCURRENCY", "8")) # parallel navigations
|
| 57 |
+
PARSER_WORKERS = int(os.environ.get("PARSER_WORKERS", "4")) # threadpool size for BS parsing
|
| 58 |
+
REQUEST_TIMEOUT_MS = int(os.environ.get("REQUEST_TIMEOUT_MS", "20000")) # navigation timeout in ms
|
| 59 |
+
PAGE_LOAD_WAIT = os.environ.get("PAGE_LOAD_WAIT", "domcontentloaded") # "domcontentloaded" or "load"
|
| 60 |
+
|
| 61 |
+
# set of resource URL substrings to block (ads, analytics, fonts, etc.)
|
| 62 |
+
BLOCK_PATTERNS = [
|
| 63 |
+
"doubleclick.net", "google-analytics.com", "googlesyndication.com",
|
| 64 |
+
"adservice.google.com", "googletagmanager.com", "facebook.com",
|
| 65 |
+
"fonts.googleapis.com", "gstatic.com", "analytics.twitter.com",
|
| 66 |
+
".woff", ".woff2", ".ttf", ".otf", "font.gstatic.com",
|
| 67 |
+
]
|
| 68 |
+
|
| 69 |
+
# ---------------------
|
| 70 |
+
# Parsing helper (threadpool)
|
| 71 |
+
# ---------------------
|
| 72 |
+
PARSE_POOL = concurrent.futures.ThreadPoolExecutor(max_workers=PARSER_WORKERS)
|
| 73 |
+
|
| 74 |
+
def extract_data_sync(html: str) -> Dict[str, Any]:
|
| 75 |
+
"""
|
| 76 |
+
Synchronous BeautifulSoup parsing (will be run inside PARSE_POOL).
|
| 77 |
+
Re-uses your previous EXTRACT_DATA logic (simplified/robust).
|
| 78 |
+
"""
|
| 79 |
+
soup = BeautifulSoup(html or "", "html.parser")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
BASE_URL = "https://www.google.com"
|
| 81 |
|
| 82 |
def safe_text(el):
|
|
|
|
| 86 |
return el.get(attr) if el and el.has_attr(attr) else ""
|
| 87 |
|
| 88 |
def abs_url(url):
|
| 89 |
+
try:
|
| 90 |
+
return urljoin(BASE_URL, url) if url else ""
|
| 91 |
+
except Exception:
|
| 92 |
+
return url or ""
|
| 93 |
|
| 94 |
def clean_thumb(src):
|
| 95 |
if src and not src.startswith("data:"):
|
| 96 |
return abs_url(src)
|
| 97 |
return None
|
| 98 |
|
| 99 |
+
# web results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
web_results = []
|
| 101 |
for result in soup.select(".tF2Cxc"):
|
|
|
|
|
|
|
| 102 |
title_tag = result.select_one("h3")
|
| 103 |
link_tag = result.select_one("a")
|
| 104 |
cite_tag = result.select_one("cite")
|
| 105 |
snippet_tag = result.select_one(".VwiC3b")
|
|
|
|
|
|
|
| 106 |
if title_tag and link_tag:
|
| 107 |
+
web_results.append({
|
|
|
|
| 108 |
"title": safe_text(title_tag),
|
| 109 |
"link": abs_url(safe_attr(link_tag, "href")),
|
| 110 |
"displayed_url": safe_text(cite_tag),
|
| 111 |
"snippet": safe_text(snippet_tag)
|
| 112 |
+
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
+
# images
|
| 115 |
image_results = []
|
| 116 |
for img_item in soup.select(".eA0Zlc"):
|
| 117 |
img_tag = img_item.select_one("img")
|
| 118 |
link_tag = img_item.select_one("a")
|
|
|
|
| 119 |
src = safe_attr(img_tag, "data-src") or safe_attr(img_tag, "src")
|
| 120 |
thumb = clean_thumb(src)
|
| 121 |
if thumb:
|
| 122 |
+
image_results.append({"thumbnail": thumb, "alt": safe_attr(img_tag, "alt"), "link": abs_url(safe_attr(link_tag, "href"))})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
+
# videos/news/knowledge panels omitted for brevity but can be added similarly
|
| 125 |
data = {}
|
| 126 |
if web_results:
|
| 127 |
data["web_results"] = web_results
|
| 128 |
if image_results:
|
| 129 |
data["image_results"] = image_results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
+
# collect thumbnails
|
| 132 |
+
thumbnails = set()
|
| 133 |
+
for img in soup.select("img[data-src], img[src]"):
|
| 134 |
+
src = safe_attr(img, "data-src") or safe_attr(img, "src")
|
| 135 |
+
thumb = clean_thumb(src)
|
| 136 |
+
if thumb:
|
| 137 |
+
thumbnails.add(thumb)
|
| 138 |
+
if thumbnails:
|
| 139 |
+
data["all_thumbnail_urls"] = sorted(thumbnails)
|
| 140 |
|
| 141 |
+
return data
|
| 142 |
|
| 143 |
+
async def extract_data(html: str) -> Dict[str, Any]:
|
| 144 |
+
loop = asyncio.get_running_loop()
|
| 145 |
+
return await loop.run_in_executor(PARSE_POOL, extract_data_sync, html)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
+
# ---------------------
|
| 148 |
+
# FastAPI + Playwright globals
|
| 149 |
+
# ---------------------
|
| 150 |
+
app = FastAPI(title="fast_playwright_fetcher", version="0.1")
|
| 151 |
+
PLAY: Optional[Playwright] = None
|
| 152 |
+
BROWSER: Optional[Browser] = None
|
| 153 |
|
| 154 |
+
# Semaphore to limit concurrency across endpoints
|
| 155 |
+
CONCURRENCY_SEMAPHORE = asyncio.Semaphore(POOL_CONCURRENCY)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
+
# Simple TTL cache (in-memory)
|
| 158 |
class SimpleTTLCache:
|
| 159 |
def __init__(self, ttl_seconds: int = 20):
|
| 160 |
self.ttl = ttl_seconds
|
| 161 |
+
self.store: Dict[str, Tuple[float, Any]] = {}
|
| 162 |
+
self._lock = asyncio.Lock()
|
| 163 |
|
| 164 |
+
async def get(self, key: str):
|
| 165 |
+
async with self._lock:
|
| 166 |
+
item = self.store.get(key)
|
| 167 |
if not item:
|
| 168 |
return None
|
| 169 |
ts, value = item
|
| 170 |
if time.time() - ts > self.ttl:
|
| 171 |
+
del self.store[key]
|
| 172 |
return None
|
| 173 |
return value
|
| 174 |
|
| 175 |
+
async def set(self, key: str, value: Any):
|
| 176 |
+
async with self._lock:
|
| 177 |
+
self.store[key] = (time.time(), value)
|
| 178 |
+
|
| 179 |
+
CACHE = SimpleTTLCache(ttl_seconds=int(os.environ.get("CACHE_TTL", "25")))
|
| 180 |
+
|
| 181 |
+
# ---------------------
|
| 182 |
+
# Playwright helpers
|
| 183 |
+
# ---------------------
|
| 184 |
+
async def start_playwright():
|
| 185 |
+
global PLAY, BROWSER
|
| 186 |
+
if PLAY is not None:
|
| 187 |
+
return
|
| 188 |
+
PLAY = await async_playwright().start()
|
| 189 |
+
BROWSER = await PLAY.chromium.launch(headless=PLAYWRIGHT_HEADLESS,
|
| 190 |
+
args=["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"])
|
| 191 |
+
logger.info("Playwright started (headless=%s)", PLAYWRIGHT_HEADLESS)
|
| 192 |
+
|
| 193 |
+
async def stop_playwright():
    """Close the shared browser and shut down the Playwright driver.

    Best-effort teardown: close/stop failures are swallowed so shutdown
    always completes, and the globals are reset to None either way.
    """
    global PLAY, BROWSER
    if BROWSER is not None:
        try:
            await BROWSER.close()
        except Exception:
            pass
        BROWSER = None
    if PLAY is not None:
        try:
            await PLAY.stop()
        except Exception:
            pass
        PLAY = None
    logger.info("Playwright stopped.")
|
| 208 |
|
| 209 |
+
def _should_block_request(url: str) -> bool:
    """Return True when *url* contains any substring from BLOCK_PATTERNS."""
    if not url:
        return False
    lowered = url.lower()
    return any(pattern in lowered for pattern in BLOCK_PATTERNS)
|
| 217 |
+
|
| 218 |
+
async def _fetch_with_context(url: str, wait_until: str = "domcontentloaded", timeout_ms: int = REQUEST_TIMEOUT_MS, user_agent: Optional[str] = None) -> str:
    """
    Create an isolated browser context, block unwanted requests, navigate, get HTML.

    This is efficient: creating small contexts is cheap compared to full browser
    processes.

    Bug fix: the original returned the page HTML without closing the page or
    context, leaking one browser context per *successful* fetch (cleanup only
    ran on the all-attempts-failed path). Cleanup now runs in a ``finally``
    block on every exit path.

    Raises:
        RuntimeError: if the browser is not started, or after all navigation
            attempts fail (chains the last underlying error message).
    """
    if BROWSER is None:
        raise RuntimeError("Browser not started")

    # create context with minimal resources
    context: BrowserContext = await BROWSER.new_context(
        user_agent=user_agent or "Mozilla/5.0 (Playwright)",
        viewport={"width": 1366, "height": 768},
    )
    page: Page = await context.new_page()

    try:
        # intercept requests to block common heavy resources
        async def route_handler(route, request: PWRequest):
            try:
                if _should_block_request(request.url):
                    await route.abort()
                else:
                    await route.continue_()
            except Exception:
                # if anything fails, let the request through
                try:
                    await route.continue_()
                except Exception:
                    pass

        try:
            await page.route("**/*", route_handler)
        except Exception:
            # route might fail if the browser doesn't support it; ignore
            pass

        # navigate with retries/backoff
        max_attempts = 3
        backoff_base = 0.2
        last_exc = None
        for attempt in range(1, max_attempts + 1):
            try:
                # Playwright's goto uses wait_until values like "load" or "domcontentloaded"
                await page.goto(url, wait_until=wait_until, timeout=timeout_ms)
                return await page.content()
            except PWError as e:
                last_exc = e
                logger.warning("Playwright navigation error (attempt %d/%d) for %s : %s", attempt, max_attempts, url, str(e))
                # small linear backoff before the next attempt
                await asyncio.sleep(backoff_base * attempt)
            except Exception as e:
                last_exc = e
                logger.exception("Unexpected navigation error (attempt %d/%d) for %s", attempt, max_attempts, url)
                await asyncio.sleep(backoff_base * attempt)

        raise RuntimeError(f"Failed to fetch {url} after {max_attempts} attempts: {last_exc}")
    finally:
        # cleanup runs on success, failure, and cancellation alike
        try:
            await page.close()
        except Exception:
            pass
        try:
            await context.close()
        except Exception:
            pass
|
| 281 |
+
|
| 282 |
+
# ---------------------
# App startup/shutdown
# ---------------------
@app.on_event("startup")
async def on_startup():
    """Boot Playwright, then pre-warm one context so the first request is fast."""
    await start_playwright()
    try:
        async with CONCURRENCY_SEMAPHORE:
            await _fetch_with_context("about:blank")
    except Exception:
        # Pre-warming is best-effort; a failure here must not block startup.
        pass
    logger.info("App startup complete. concurrency=%d parser_workers=%d", POOL_CONCURRENCY, PARSER_WORKERS)
|
| 295 |
|
| 296 |
@app.on_event("shutdown")
async def on_shutdown():
    """Release browser resources, then stop the HTML-parsing thread pool."""
    await stop_playwright()
    # wait=False: don't block process exit on parse jobs still in flight.
    PARSE_POOL.shutdown(wait=False)
|
| 300 |
+
|
| 301 |
+
# ---------------------
# Blocking helper wrapper
# ---------------------
async def fetch_and_extract(url: str, wait_until: str = PAGE_LOAD_WAIT, timeout_ms: int = REQUEST_TIMEOUT_MS, user_agent: Optional[str] = None) -> Dict[str, Any]:
    """Fetch *url* under the global concurrency limit and parse its HTML.

    The Playwright fetch runs on the event loop; BeautifulSoup parsing is
    offloaded to the thread pool via extract_data().
    """
    async with CONCURRENCY_SEMAPHORE:
        page_html = await _fetch_with_context(
            url, wait_until=wait_until, timeout_ms=timeout_ms, user_agent=user_agent
        )
        parsed = await extract_data(page_html)
        return {"url": url, "data": parsed}
|
| 312 |
+
|
| 313 |
+
# ---------------------
# Request models
# ---------------------
class SearchRequest(BaseModel):
    # Body for POST /search: supply `query` (search terms) or `url` (direct
    # target). The endpoint rejects bodies with neither; when both are given,
    # `url` takes precedence.
    query: Optional[str] = None
    url: Optional[str] = None
|
| 319 |
|
| 320 |
+
# ---------------------
# Endpoints
# ---------------------
@app.get("/health")
async def health():
    # Liveness probe: constant payload, no dependencies touched.
    return {"status": "ok"}
|
| 326 |
|
| 327 |
+
@app.get("/fetch")
async def fetch(url: str = Query(..., min_length=5), fast: Optional[bool] = Query(True)):
    """
    Fetch a URL and extract data. Use fast=true to wait only for domcontentloaded (faster).
    """
    # Any failure (navigation, parsing) is surfaced as an HTTP 500.
    try:
        result = await fetch_and_extract(
            url, wait_until=("domcontentloaded" if fast else "load")
        )
    except Exception as e:
        logger.exception("Fetch error for %s", url)
        raise HTTPException(status_code=500, detail=str(e))
    return JSONResponse(content=result)
|
| 339 |
+
|
| 340 |
@app.get("/search")
async def search(query: str = Query(..., min_length=1), fast: Optional[bool] = Query(True)):
    # Google-search the query, serving repeats from the in-memory TTL cache.
    q = query.strip()
    if not q:
        raise HTTPException(status_code=400, detail="query required")

    cache_key = f"search:{q}:{fast}"
    hit = await CACHE.get(cache_key)
    if hit:
        return JSONResponse(content={"cached": True, **hit})

    target = f"https://www.google.com/search?q={quote_plus(q)}"
    wait_until = "domcontentloaded" if fast else "load"
    try:
        fresh = await fetch_and_extract(target, wait_until=wait_until)
        await CACHE.set(cache_key, fresh)
    except Exception as e:
        logger.exception("Search error for %s", q)
        raise HTTPException(status_code=500, detail=str(e))
    return JSONResponse(content={"cached": False, **fresh})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
|
| 360 |
@app.post("/search")
async def post_search(body: SearchRequest = Body(...)):
    """POST variant of /search: accepts either a raw query or a direct URL."""
    if not (body.query or body.url):
        raise HTTPException(status_code=400, detail="Either query or url must be provided")
    # A direct URL wins; otherwise build a Google search URL from the query.
    if body.url:
        target = body.url
    else:
        target = f"https://www.google.com/search?q={quote_plus(body.query)}"
    try:
        res = await fetch_and_extract(target, wait_until=PAGE_LOAD_WAIT)
    except Exception as e:
        logger.exception("Post search error for %s", target)
        raise HTTPException(status_code=500, detail=str(e))
    return JSONResponse(content=res)
|
| 371 |
+
|
| 372 |
+
# ---------------------
# Streaming / multi-page endpoints
# ---------------------
@app.get("/search_pages")
async def search_pages(
    query: str = Query(..., min_length=1),
    pages: int = Query(3, ge=1, le=50),
    concurrency: Optional[int] = Query(None, ge=1),
    ordered: Optional[bool] = Query(False),
    fast: Optional[bool] = Query(True),
):
    """
    Streams NDJSON results per Google page. pages=N means start=0,10,20...
    Use concurrency to override default concurrency (POOL_CONCURRENCY).
    """
    q = query.strip()
    if not q:
        raise HTTPException(status_code=400, detail="query required")

    pages = min(max(1, pages), 50)
    items: List[Tuple[int, str]] = []
    for i in range(pages):
        start = i * 10
        items.append((i + 1, f"https://www.google.com/search?q={quote_plus(q)}&start={start}"))

    # Bug fix: `concurrency` used to be computed into a local variable and then
    # ignored. A local semaphore now actually gates task starts; the global
    # CONCURRENCY_SEMAPHORE still caps browser work across all endpoints.
    local_sem = asyncio.Semaphore(concurrency or POOL_CONCURRENCY)

    async def bounded_task(page_no: int, url: str):
        # One results-page fetch, admitted by the per-request limit.
        async with local_sem:
            return await _search_page_task(page_no, url, fast)

    loop = asyncio.get_running_loop()
    task_list = [loop.create_task(bounded_task(page_no, url)) for page_no, url in items]

    async def streamer():
        # Yields one NDJSON line per page result.
        try:
            if ordered:
                # yield in page order
                for t in task_list:
                    res = await t
                    yield (json.dumps(res, ensure_ascii=False) + "\n").encode("utf-8")
            else:
                # yield fastest-first
                for fut in asyncio.as_completed(task_list):
                    res = await fut
                    yield (json.dumps(res, ensure_ascii=False) + "\n").encode("utf-8")
        finally:
            # A client disconnect cancels the generator; drop unfinished work.
            for t in task_list:
                if not t.done():
                    t.cancel()

    return StreamingResponse(streamer(), media_type="application/x-ndjson")
|
| 424 |
+
|
| 425 |
+
async def _search_page_task(page_no: int, url: str, fast: bool):
    """Fetch and parse one results page; never raises.

    Returns an ``{"ok": True, ...}`` row with the extracted data on success,
    or an ``{"ok": False, "error": ...}`` row on failure, so callers can
    stream/aggregate results without per-task exception handling.
    """
    started = time.monotonic()
    try:
        wait_until = "domcontentloaded" if fast else "load"
        result = await fetch_and_extract(url, wait_until=wait_until)
        # Bug fix: "duration" was hard-coded to None; report real elapsed seconds.
        return {"page": page_no, "url": url, "ok": True,
                "duration": round(time.monotonic() - started, 3),
                "data": result.get("data")}
    except Exception as e:
        logger.exception("Error fetching page %d (%s): %s", page_no, url, e)
        return {"page": page_no, "url": url, "ok": False, "error": str(e)}
|
| 433 |
+
|
| 434 |
+
@app.get("/search_pages_aggregate")
async def search_pages_aggregate(
    query: str = Query(..., min_length=1),
    pages: int = Query(3, ge=1, le=50),
    concurrency: Optional[int] = Query(None, ge=1),
    fast: Optional[bool] = Query(True),
):
    """Fetch N Google result pages concurrently and return them in one JSON body."""
    q = query.strip()
    if not q:
        raise HTTPException(status_code=400, detail="query required")

    pages = min(max(1, pages), 50)
    items = []
    for i in range(pages):
        start = i * 10
        items.append((i + 1, f"https://www.google.com/search?q={quote_plus(q)}&start={start}"))

    # Bug fix: the `concurrency` query parameter was accepted but never used.
    # Gate task starts with a local semaphore (the global CONCURRENCY_SEMAPHORE
    # inside fetch_and_extract still applies as the process-wide cap).
    local_sem = asyncio.Semaphore(concurrency or POOL_CONCURRENCY)

    async def bounded(page_no: int, url: str):
        # One results-page fetch, admitted by the per-request limit.
        async with local_sem:
            return await _search_page_task(page_no, url, fast)

    tasks = [asyncio.create_task(bounded(pno, url)) for pno, url in items]
    # _search_page_task never raises, so gather can't blow up on a bad page.
    results = await asyncio.gather(*tasks, return_exceptions=False)
    return JSONResponse(content={"pages": results})
|
| 454 |
+
|
| 455 |
+
# ---------------------
|
| 456 |
+
# Run notes:
|
| 457 |
+
# ---------------------
|
| 458 |
+
# start server:
|
| 459 |
+
# uvicorn app:app --host 0.0.0.0 --port 8000 --workers 1
|
| 460 |
+
#
|
| 461 |
+
# Important: use a single uvicorn worker (playwright and the asyncio loop should be in one process).
|
| 462 |
+
# You can scale horizontally by running multiple instances behind a load balancer if needed.
|
| 463 |
+
#
|
| 464 |
+
# ---------------------
|