AdarshJi committed on
Commit
88b5aac
·
verified ·
1 Parent(s): a67cd68

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -41
app.py CHANGED
@@ -2,6 +2,7 @@
2
  from __future__ import annotations
3
  import os
4
  import time
 
5
  import threading
6
  import asyncio
7
  from typing import Optional, Dict, Any, Tuple
@@ -13,20 +14,22 @@ from pydantic import BaseModel
13
  from starlette.responses import JSONResponse
14
 
15
  from selenium import webdriver
16
- from selenium.webdriver.chrome.service import Service
17
  from selenium.webdriver.chrome.options import Options
 
 
 
 
18
  from selenium.webdriver.common.by import By
19
  from selenium.webdriver.support.ui import WebDriverWait
20
  from selenium.webdriver.support import expected_conditions as EC
21
- from selenium.common.exceptions import TimeoutException, WebDriverException
22
- from webdriver_manager.chrome import ChromeDriverManager
23
- from bs4 import BeautifulSoup
 
 
24
 
25
  # ---------------- BrowserManager ----------------
26
  class BrowserManager:
27
- """
28
- Manages a single Chrome webdriver instance.
29
- """
30
  def __init__(
31
  self,
32
  headless: bool = True,
@@ -34,7 +37,6 @@ class BrowserManager:
34
  window_size: str = "1366,768",
35
  disable_images: bool = True,
36
  block_resource_urls: Optional[list[str]] = None,
37
- page_load_strategy: str = "eager",
38
  ):
39
  self.headless = headless
40
  self.user_agent = user_agent
@@ -46,34 +48,37 @@ class BrowserManager:
46
  "*.googlesyndication.com/*",
47
  "*.adservice.google.com/*",
48
  ]
49
- self.page_load_strategy = page_load_strategy
50
  self._driver_lock = threading.Lock()
51
  self._driver: Optional[webdriver.Chrome] = None
52
- self._start_driver()
53
 
54
- def _start_driver(self):
55
  opts = Options()
56
 
57
- # If CHROME_BIN env var is set (we set it in Dockerfile), point to it
58
  chrome_bin = os.environ.get("CHROME_BIN", "/usr/bin/google-chrome-stable")
59
  if os.path.exists(chrome_bin):
60
  opts.binary_location = chrome_bin
 
 
 
61
 
62
- # Headless flags
63
  if self.headless:
64
- # keep both forms for compatibility
65
  opts.add_argument("--headless=new")
66
  opts.add_argument("--headless")
67
 
68
- # Docker/container friendly flags
69
  opts.add_argument("--no-sandbox")
70
  opts.add_argument("--disable-setuid-sandbox")
71
  opts.add_argument("--disable-dev-shm-usage")
72
  opts.add_argument("--disable-gpu")
73
  opts.add_argument("--disable-extensions")
74
  opts.add_argument("--disable-blink-features=AutomationControlled")
75
- opts.add_argument("--single-process")
 
76
  opts.add_argument(f"--window-size={self.window_size}")
 
77
 
78
  if self.user_agent:
79
  opts.add_argument(f"--user-agent={self.user_agent}")
@@ -88,29 +93,86 @@ class BrowserManager:
88
 
89
  opts.add_experimental_option("excludeSwitches", ["enable-logging"])
90
  opts.add_experimental_option("useAutomationExtension", False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
- # Attempt to download a matching chromedriver using webdriver_manager
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  try:
94
  driver_path = ChromeDriverManager().install()
 
 
 
 
 
 
95
  service = Service(driver_path)
96
  self._driver = webdriver.Chrome(service=service, options=opts)
97
- except Exception as e:
98
- # If webdriver_manager failed or binary mismatch, try a minimal fallback
 
 
 
99
  try:
100
- # attempt to use chromedriver in PATH if available
101
- service = Service("/usr/bin/chromedriver")
102
- self._driver = webdriver.Chrome(service=service, options=opts)
103
- except Exception as ee:
104
- raise RuntimeError(f"Failed to start Chrome driver: {e}; fallback: {ee}")
 
 
 
 
 
 
 
 
 
 
105
 
106
- # Optional quick setup: allow CDP commands to block resources (best-effort)
 
107
  try:
108
  self._driver.set_page_load_timeout(60)
109
  self._driver.execute_cdp_cmd("Network.enable", {})
110
  if self.block_resource_urls:
111
- self._driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": self.block_resource_urls})
 
 
 
 
112
  except Exception:
113
- # ignore CDP or timeout errors
114
  pass
115
 
116
  def fetch_html(
@@ -118,10 +180,9 @@ class BrowserManager:
118
  url: str,
119
  wait_seconds: Optional[float] = 10.0,
120
  wait_for_selector: Optional[str] = None,
121
- force_reload: bool = True,
122
  ) -> str:
123
  if self._driver is None:
124
- self._start_driver()
125
 
126
  with self._driver_lock:
127
  driver = self._driver
@@ -146,13 +207,13 @@ class BrowserManager:
146
 
147
  return driver.page_source
148
  except WebDriverException as e:
149
- # try to recover by restarting driver
 
150
  try:
151
  self._safe_quit_driver()
152
  except Exception:
153
  pass
154
- # restart and raise to caller
155
- self._start_driver()
156
  raise RuntimeError(f"WebDriver error during fetch: {e}")
157
 
158
  def _safe_quit_driver(self):
@@ -167,7 +228,7 @@ class BrowserManager:
167
  self._safe_quit_driver()
168
 
169
 
170
- # ---------------- EXTRACT_DATA (kept same as in your original) ----------------
171
  def EXTRACT_DATA(html: str) -> Dict[str, Any]:
172
  soup = BeautifulSoup(html, "html.parser")
173
  BASE_URL = "https://www.google.com"
@@ -355,7 +416,8 @@ def EXTRACT_DATA(html: str) -> Dict[str, Any]:
355
 
356
  return data
357
 
358
- # ---------------- BrowserPool and cache ----------------
 
359
  class BrowserPool:
360
  def __init__(self, pool_size: int = 1, headless: bool = True):
361
  self.pool_size = max(1, pool_size)
@@ -397,17 +459,13 @@ class SimpleTTLCache:
397
  with self._lock:
398
  self._cache[key] = (time.time(), value)
399
 
400
- # API models
401
  class SearchRequest(BaseModel):
402
  query: Optional[str] = None
403
  url: Optional[str] = None
404
  wait_for_selector: Optional[str] = None
405
  headless: Optional[bool] = True
406
 
407
- # FastAPI app
408
  app = FastAPI(title="fast_fetcher_api", version="0.1")
409
-
410
- # Global shared objects (initialized on startup)
411
  POOL: Optional[BrowserPool] = None
412
  EXECUTOR: Optional[ThreadPoolExecutor] = None
413
  CACHE = SimpleTTLCache(ttl_seconds=25)
@@ -415,12 +473,12 @@ CACHE = SimpleTTLCache(ttl_seconds=25)
415
  @app.on_event("startup")
416
  async def startup_event():
417
  global POOL, EXECUTOR
418
- # set headless=True for Docker. Tune pool_size as needed.
419
  POOL = BrowserPool(pool_size=1, headless=True)
420
  EXECUTOR = ThreadPoolExecutor(max_workers=2)
421
  app.state.executor = EXECUTOR
422
  app.state.pool = POOL
423
- print("Startup: browser pool created (size=1).")
424
 
425
  @app.on_event("shutdown")
426
  async def shutdown_event():
@@ -429,7 +487,7 @@ async def shutdown_event():
429
  POOL.close_all()
430
  if EXECUTOR:
431
  EXECUTOR.shutdown(wait=True)
432
- print("Shutdown: browsers closed and executor stopped.")
433
 
434
  def _blocking_fetch_and_extract(manager: BrowserManager, url: str, wait_for_selector: Optional[str], wait_seconds: Optional[float]):
435
  start = time.time()
 
2
  from __future__ import annotations
3
  import os
4
  import time
5
+ import logging
6
  import threading
7
  import asyncio
8
  from typing import Optional, Dict, Any, Tuple
 
14
  from starlette.responses import JSONResponse
15
 
16
  from selenium import webdriver
 
17
  from selenium.webdriver.chrome.options import Options
18
+ from selenium.webdriver.chrome.service import Service
19
+ from selenium.common.exceptions import WebDriverException
20
+ from webdriver_manager.chrome import ChromeDriverManager
21
+ from bs4 import BeautifulSoup
22
  from selenium.webdriver.common.by import By
23
  from selenium.webdriver.support.ui import WebDriverWait
24
  from selenium.webdriver.support import expected_conditions as EC
25
+ from selenium.common.exceptions import TimeoutException
26
+
27
+ # Configure logging
28
+ logging.basicConfig(level=logging.INFO)
29
+ logger = logging.getLogger("fast_fetcher")
30
 
31
  # ---------------- BrowserManager ----------------
32
  class BrowserManager:
 
 
 
33
  def __init__(
34
  self,
35
  headless: bool = True,
 
37
  window_size: str = "1366,768",
38
  disable_images: bool = True,
39
  block_resource_urls: Optional[list[str]] = None,
 
40
  ):
41
  self.headless = headless
42
  self.user_agent = user_agent
 
48
  "*.googlesyndication.com/*",
49
  "*.adservice.google.com/*",
50
  ]
 
51
  self._driver_lock = threading.Lock()
52
  self._driver: Optional[webdriver.Chrome] = None
53
+ self._start_driver_with_retries()
54
 
55
+ def _build_options(self) -> Options:
56
  opts = Options()
57
 
58
+ # set binary location from env, fallback to default
59
  chrome_bin = os.environ.get("CHROME_BIN", "/usr/bin/google-chrome-stable")
60
  if os.path.exists(chrome_bin):
61
  opts.binary_location = chrome_bin
62
+ logger.debug("Using chrome binary: %s", chrome_bin)
63
+ else:
64
+ logger.warning("Chrome binary not found at %s; Selenium Manager may still locate a browser.", chrome_bin)
65
 
 
66
  if self.headless:
67
+ # keep compatibility for versions: add both
68
  opts.add_argument("--headless=new")
69
  opts.add_argument("--headless")
70
 
71
+ # container-friendly and stability flags
72
  opts.add_argument("--no-sandbox")
73
  opts.add_argument("--disable-setuid-sandbox")
74
  opts.add_argument("--disable-dev-shm-usage")
75
  opts.add_argument("--disable-gpu")
76
  opts.add_argument("--disable-extensions")
77
  opts.add_argument("--disable-blink-features=AutomationControlled")
78
+ opts.add_argument("--disable-software-rasterizer")
79
+ opts.add_argument("--disable-accelerated-2d-canvas")
80
  opts.add_argument(f"--window-size={self.window_size}")
81
+ opts.add_argument("--remote-debugging-port=0")
82
 
83
  if self.user_agent:
84
  opts.add_argument(f"--user-agent={self.user_agent}")
 
93
 
94
  opts.add_experimental_option("excludeSwitches", ["enable-logging"])
95
  opts.add_experimental_option("useAutomationExtension", False)
96
+ return opts
97
+
98
def _start_driver_with_retries(self, attempts: int = 3, delay_seconds: float = 1.0):
    """Start the Chrome driver, retrying when a start attempt fails.

    Calls ``self._start_driver()`` up to ``attempts`` times and pauses
    ``delay_seconds`` between consecutive attempts.

    Args:
        attempts: Maximum number of start attempts. Values below 1 are
            treated as 1 so that at least one attempt is always made.
        delay_seconds: Pause between a failed attempt and the next one.

    Raises:
        RuntimeError: If every attempt fails; chained to the last error.
    """
    # Guard against attempts <= 0, which would otherwise skip the loop and
    # report "after 0 attempts: None" without ever trying to start.
    attempts = max(1, attempts)
    last_exc = None
    for attempt in range(1, attempts + 1):
        try:
            logger.info("Starting Chrome driver (attempt %d/%d)...", attempt, attempts)
            self._start_driver()
        except Exception as exc:
            logger.exception("Failed to start driver on attempt %d: %s", attempt, exc)
            last_exc = exc
            # Only wait when another attempt follows; sleeping after the
            # final failure just delays the error reaching the caller.
            if attempt < attempts:
                time.sleep(delay_seconds)
        else:
            logger.info("Chrome driver started successfully.")
            return
    raise RuntimeError(
        f"Unable to start Chrome driver after {attempts} attempts: {last_exc}"
    ) from last_exc
111
+
112
def _start_driver(self):
    """Launch Chrome and store the session in ``self._driver``.

    Three strategies are tried in order, stopping at the first success:

    1. ``webdriver.Chrome(options=...)`` with no explicit service, followed
       by a smoke test that runs a trivial script in the browser.
    2. A chromedriver obtained from ``ChromeDriverManager().install()``.
    3. The system chromedriver at ``/usr/bin/chromedriver``, if it exists.

    After a successful launch ``self._post_start_setup()`` is called.

    Raises:
        RuntimeError: If no strategy produced a working driver.
    """
    opts = self._build_options()

    # Python unbinds the ``except ... as name`` target when the handler
    # exits, so each failure is copied to an ordinary local to keep it
    # available for the combined error raised at the end.
    primary_error = None
    fallback_error = None
    system_error = None

    # First: try letting Selenium locate the driver on its own.
    try:
        logger.debug("Trying webdriver.Chrome(options=opts) to allow Selenium Manager to find driver.")
        driver = webdriver.Chrome(options=opts)
        try:
            # Smoke test: this raises if the browser died right after launch.
            driver.execute_script("return navigator.userAgent")
        except Exception as smoke_exc:
            # Quit the half-started session so it is not leaked, and never
            # publish it on self._driver.
            try:
                driver.quit()
            except Exception:
                logger.debug("Ignoring error while quitting crashed browser.", exc_info=True)
            raise RuntimeError(
                "Selenium Manager started a session but the browser crashed immediately."
            ) from smoke_exc
    except Exception as exc:
        primary_error = exc
        logger.warning("Selenium Manager attempt failed: %s", exc)
    else:
        self._driver = driver
        self._post_start_setup()
        return

    # Fallback: use webdriver-manager to download a chromedriver.
    try:
        driver_path = ChromeDriverManager().install()
        logger.info("webdriver-manager installed chromedriver: %s", driver_path)
        try:
            # Ensure the executable bit is set; best-effort only.
            os.chmod(driver_path, 0o755)
        except OSError as ex_perm:
            logger.warning("Could not chmod chromedriver: %s", ex_perm)
        self._driver = webdriver.Chrome(service=Service(driver_path), options=opts)
    except Exception as exc:
        fallback_error = exc
        logger.exception("webdriver-manager fallback failed: %s", exc)
    else:
        self._post_start_setup()
        return

    # Last resort: a chromedriver already installed on the system.
    system_path = "/usr/bin/chromedriver"
    if os.path.exists(system_path):
        logger.info("Trying system chromedriver at %s", system_path)
        try:
            try:
                # The process may not be allowed to chmod a system file;
                # that must not prevent trying a driver that may already
                # be executable.
                os.chmod(system_path, 0o755)
            except OSError as ex_perm:
                logger.warning("Could not chmod chromedriver: %s", ex_perm)
            self._driver = webdriver.Chrome(service=Service(system_path), options=opts)
        except Exception as exc:
            system_error = exc
            logger.exception("System chromedriver attempt failed: %s", exc)
        else:
            self._post_start_setup()
            return
    else:
        logger.debug("No system chromedriver at %s", system_path)

    # Chain the most recent failure so its traceback is preserved.
    cause = system_error if system_error is not None else fallback_error
    raise RuntimeError(
        f"Failed to start Chrome driver. primary_error={primary_error}, fallback_error={fallback_error}"
    ) from cause
163
 
164
def _post_start_setup(self):
    """Apply best-effort configuration to the freshly started driver.

    Sets a 60-second page-load timeout, enables the CDP ``Network`` domain
    and, when ``self.block_resource_urls`` is non-empty, asks Chrome to
    block those URL patterns. Any failure is logged at debug level and
    never propagates to the caller.
    """
    driver = self._driver
    try:
        driver.set_page_load_timeout(60)
        driver.execute_cdp_cmd("Network.enable", {})
    except Exception:
        # Still best-effort, but leave a trace instead of failing silently.
        logger.debug("Post-start driver setup failed; continuing without it.", exc_info=True)
        return

    if not self.block_resource_urls:
        return

    try:
        driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": self.block_resource_urls})
    except Exception:
        # Some Chrome builds may not support this command; keep going.
        logger.debug("Network.setBlockedURLs failed; resources will not be blocked.", exc_info=True)
177
 
178
  def fetch_html(
 
180
  url: str,
181
  wait_seconds: Optional[float] = 10.0,
182
  wait_for_selector: Optional[str] = None,
 
183
  ) -> str:
184
  if self._driver is None:
185
+ self._start_driver_with_retries()
186
 
187
  with self._driver_lock:
188
  driver = self._driver
 
207
 
208
  return driver.page_source
209
  except WebDriverException as e:
210
+ logger.exception("WebDriver exception during fetch: %s", e)
211
+ # restart driver and raise
212
  try:
213
  self._safe_quit_driver()
214
  except Exception:
215
  pass
216
+ self._start_driver_with_retries()
 
217
  raise RuntimeError(f"WebDriver error during fetch: {e}")
218
 
219
  def _safe_quit_driver(self):
 
228
  self._safe_quit_driver()
229
 
230
 
231
+ # ---------------- EXTRACT_DATA ----------------
232
  def EXTRACT_DATA(html: str) -> Dict[str, Any]:
233
  soup = BeautifulSoup(html, "html.parser")
234
  BASE_URL = "https://www.google.com"
 
416
 
417
  return data
418
 
419
+
420
+ # ---------------- BrowserPool and API ----------------
421
  class BrowserPool:
422
  def __init__(self, pool_size: int = 1, headless: bool = True):
423
  self.pool_size = max(1, pool_size)
 
459
  with self._lock:
460
  self._cache[key] = (time.time(), value)
461
 
 
462
  class SearchRequest(BaseModel):
463
  query: Optional[str] = None
464
  url: Optional[str] = None
465
  wait_for_selector: Optional[str] = None
466
  headless: Optional[bool] = True
467
 
 
468
  app = FastAPI(title="fast_fetcher_api", version="0.1")
 
 
469
  POOL: Optional[BrowserPool] = None
470
  EXECUTOR: Optional[ThreadPoolExecutor] = None
471
  CACHE = SimpleTTLCache(ttl_seconds=25)
 
473
  @app.on_event("startup")
474
  async def startup_event():
475
  global POOL, EXECUTOR
476
+ # in Docker use headless True
477
  POOL = BrowserPool(pool_size=1, headless=True)
478
  EXECUTOR = ThreadPoolExecutor(max_workers=2)
479
  app.state.executor = EXECUTOR
480
  app.state.pool = POOL
481
+ logger.info("Startup: browser pool created (size=%d).", 1)
482
 
483
  @app.on_event("shutdown")
484
  async def shutdown_event():
 
487
  POOL.close_all()
488
  if EXECUTOR:
489
  EXECUTOR.shutdown(wait=True)
490
+ logger.info("Shutdown: browsers closed and executor stopped.")
491
 
492
  def _blocking_fetch_and_extract(manager: BrowserManager, url: str, wait_for_selector: Optional[str], wait_seconds: Optional[float]):
493
  start = time.time()