AdarshJi committed on
Commit
d11ef09
·
verified ·
1 Parent(s): fc2f62d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -48
app.py CHANGED
@@ -16,7 +16,7 @@ from starlette.responses import JSONResponse
16
  from selenium import webdriver
17
  from selenium.webdriver.chrome.options import Options
18
  from selenium.webdriver.chrome.service import Service
19
- from selenium.common.exceptions import WebDriverException
20
  from webdriver_manager.chrome import ChromeDriverManager
21
  from bs4 import BeautifulSoup
22
  from selenium.webdriver.common.by import By
@@ -24,7 +24,10 @@ from selenium.webdriver.support.ui import WebDriverWait
24
  from selenium.webdriver.support import expected_conditions as EC
25
  from selenium.common.exceptions import TimeoutException
26
 
27
- # Configure logging
 
 
 
28
  logging.basicConfig(level=logging.INFO)
29
  logger = logging.getLogger("fast_fetcher")
30
 
@@ -50,25 +53,24 @@ class BrowserManager:
50
  ]
51
  self._driver_lock = threading.Lock()
52
  self._driver: Optional[webdriver.Chrome] = None
 
53
  self._start_driver_with_retries()
54
 
55
  def _build_options(self) -> Options:
56
  opts = Options()
57
-
58
- # set binary location from env, fallback to default
59
  chrome_bin = os.environ.get("CHROME_BIN", "/usr/bin/google-chrome-stable")
60
  if os.path.exists(chrome_bin):
61
  opts.binary_location = chrome_bin
62
  logger.debug("Using chrome binary: %s", chrome_bin)
63
  else:
64
- logger.warning("Chrome binary not found at %s; Selenium Manager may still locate a browser.", chrome_bin)
65
 
66
  if self.headless:
67
- # keep compatibility for versions: add both
68
  opts.add_argument("--headless=new")
69
  opts.add_argument("--headless")
70
 
71
- # container-friendly and stability flags
72
  opts.add_argument("--no-sandbox")
73
  opts.add_argument("--disable-setuid-sandbox")
74
  opts.add_argument("--disable-dev-shm-usage")
@@ -76,7 +78,6 @@ class BrowserManager:
76
  opts.add_argument("--disable-extensions")
77
  opts.add_argument("--disable-blink-features=AutomationControlled")
78
  opts.add_argument("--disable-software-rasterizer")
79
- opts.add_argument("--disable-accelerated-2d-canvas")
80
  opts.add_argument(f"--window-size={self.window_size}")
81
  opts.add_argument("--remote-debugging-port=0")
82
 
@@ -103,75 +104,106 @@ class BrowserManager:
103
  self._start_driver()
104
  logger.info("Chrome driver started successfully.")
105
  return
106
- except Exception as e:
107
- logger.exception("Failed to start driver on attempt %d: %s", attempt, e)
108
- last_exc = e
109
  time.sleep(delay_seconds)
110
  raise RuntimeError(f"Unable to start Chrome driver after {attempts} attempts: {last_exc}") from last_exc
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  def _start_driver(self):
 
 
 
113
  opts = self._build_options()
114
 
115
- # First: try letting Selenium Manager handle driver (selenium >=4.14)
 
 
116
  try:
117
- logger.debug("Trying webdriver.Chrome(options=opts) to allow Selenium Manager to find driver.")
118
  self._driver = webdriver.Chrome(options=opts)
119
- # quick smoke test: get version via execute_script -> may throw if browser crashed
120
  try:
121
  self._driver.execute_script("return navigator.userAgent")
122
- except Exception:
123
- # driver started but browser died; fallthrough to fallback
124
- raise RuntimeError("Selenium Manager started a session but the browser crashed immediately.")
125
- # if success, configure network blocking best-effort
126
  self._post_start_setup()
127
  return
128
  except Exception as e_primary:
 
129
  logger.warning("Selenium Manager attempt failed: %s", e_primary)
130
 
131
- # Fallback: use webdriver-manager to download compatible chromedriver, ensure executable, and use it
132
  try:
133
  driver_path = ChromeDriverManager().install()
134
  logger.info("webdriver-manager installed chromedriver: %s", driver_path)
135
- # ensure executable bit set
136
  try:
137
  os.chmod(driver_path, 0o755)
138
- except Exception as ex_perm:
139
- logger.warning("Could not chmod chromedriver: %s", ex_perm)
 
140
  service = Service(driver_path)
141
  self._driver = webdriver.Chrome(service=service, options=opts)
142
  self._post_start_setup()
143
  return
144
  except Exception as e_fallback:
 
145
  logger.exception("webdriver-manager fallback failed: %s", e_fallback)
146
- # try one more variation: if /usr/bin/chromedriver exists, use it
147
- try:
148
- system_path = "/usr/bin/chromedriver"
149
- if os.path.exists(system_path):
150
- logger.info("Trying system chromedriver at %s", system_path)
151
- os.chmod(system_path, 0o755)
152
- service = Service(system_path)
153
- self._driver = webdriver.Chrome(service=service, options=opts)
154
- self._post_start_setup()
155
- return
156
- else:
157
- logger.debug("No system chromedriver at %s", system_path)
158
- except Exception as e_sys:
159
- logger.exception("System chromedriver attempt failed: %s", e_sys)
160
 
161
- # raise combined error
162
- raise RuntimeError(f"Failed to start Chrome driver. primary_error={e_primary}, fallback_error={e_fallback}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
  def _post_start_setup(self):
165
- # set reasonable timeout and (best-effort) block some urls using CDP
166
  try:
167
  self._driver.set_page_load_timeout(60)
168
- self._driver.execute_cdp_cmd("Network.enable", {})
169
- if self.block_resource_urls:
170
- try:
 
171
  self._driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": self.block_resource_urls})
172
- except Exception:
173
- # some Chrome builds may not support this; ignore
174
- pass
175
  except Exception:
176
  pass
177
 
@@ -223,12 +255,14 @@ class BrowserManager:
223
  except Exception:
224
  pass
225
  self._driver = None
 
 
226
 
227
  def close(self):
228
  self._safe_quit_driver()
229
 
230
 
231
- # ---------------- EXTRACT_DATA ----------------
232
  def EXTRACT_DATA(html: str) -> Dict[str, Any]:
233
  soup = BeautifulSoup(html, "html.parser")
234
  BASE_URL = "https://www.google.com"
@@ -473,8 +507,8 @@ CACHE = SimpleTTLCache(ttl_seconds=25)
473
  @app.on_event("startup")
474
  async def startup_event():
475
  global POOL, EXECUTOR
476
- # in Docker use headless True
477
- POOL = BrowserPool(pool_size=1, headless=True)
478
  EXECUTOR = ThreadPoolExecutor(max_workers=2)
479
  app.state.executor = EXECUTOR
480
  app.state.pool = POOL
 
16
  from selenium import webdriver
17
  from selenium.webdriver.chrome.options import Options
18
  from selenium.webdriver.chrome.service import Service
19
+ from selenium.common.exceptions import WebDriverException, SessionNotCreatedException
20
  from webdriver_manager.chrome import ChromeDriverManager
21
  from bs4 import BeautifulSoup
22
  from selenium.webdriver.common.by import By
 
24
  from selenium.webdriver.support import expected_conditions as EC
25
  from selenium.common.exceptions import TimeoutException
26
 
27
+ # virtual display
28
+ from pyvirtualdisplay import Display
29
+
30
+ # Logging
31
  logging.basicConfig(level=logging.INFO)
32
  logger = logging.getLogger("fast_fetcher")
33
 
 
53
  ]
54
  self._driver_lock = threading.Lock()
55
  self._driver: Optional[webdriver.Chrome] = None
56
+ self._display: Optional[Display] = None
57
  self._start_driver_with_retries()
58
 
59
  def _build_options(self) -> Options:
60
  opts = Options()
61
+ # If CHROME_BIN is present, point to it
 
62
  chrome_bin = os.environ.get("CHROME_BIN", "/usr/bin/google-chrome-stable")
63
  if os.path.exists(chrome_bin):
64
  opts.binary_location = chrome_bin
65
  logger.debug("Using chrome binary: %s", chrome_bin)
66
  else:
67
+ logger.warning("Chrome binary not found at %s (will rely on system/browser manager).", chrome_bin)
68
 
69
  if self.headless:
 
70
  opts.add_argument("--headless=new")
71
  opts.add_argument("--headless")
72
 
73
+ # container-friendly flags (and stable fallback)
74
  opts.add_argument("--no-sandbox")
75
  opts.add_argument("--disable-setuid-sandbox")
76
  opts.add_argument("--disable-dev-shm-usage")
 
78
  opts.add_argument("--disable-extensions")
79
  opts.add_argument("--disable-blink-features=AutomationControlled")
80
  opts.add_argument("--disable-software-rasterizer")
 
81
  opts.add_argument(f"--window-size={self.window_size}")
82
  opts.add_argument("--remote-debugging-port=0")
83
 
 
104
  self._start_driver()
105
  logger.info("Chrome driver started successfully.")
106
  return
107
+ except Exception as exc:
108
+ logger.exception("Failed to start driver on attempt %d: %s", attempt, exc)
109
+ last_exc = exc
110
  time.sleep(delay_seconds)
111
  raise RuntimeError(f"Unable to start Chrome driver after {attempts} attempts: {last_exc}") from last_exc
112
 
113
def _start_xvfb_if_needed(self):
    """Start an Xvfb virtual display when Chrome must run headed without a DISPLAY.

    Does nothing when ``self.headless`` is true or the ``DISPLAY``
    environment variable is already non-empty. Otherwise a
    ``pyvirtualdisplay.Display`` sized from ``self.window_size`` is started
    and, only once it has started successfully, stored on ``self._display``.

    Raises:
        Exception: whatever parsing ``self.window_size`` or starting the
            display raised; it is logged and re-raised unchanged.
    """
    if self.headless or os.environ.get("DISPLAY", "") != "":
        return

    logger.info("No DISPLAY found and headless=False — starting virtual X display (Xvfb).")
    display = None
    try:
        # window_size is split on "," into width and height; parse it once.
        parts = self.window_size.split(",")
        width, height = int(parts[0]), int(parts[1])
        display = Display(visible=0, size=(width, height))
        display.start()
    except Exception as e:
        logger.exception("Failed to start virtual display: %s", e)
        # Do not keep a half-started display around: try to stop it and
        # leave self._display untouched so later cleanup is not misled.
        if display is not None:
            try:
                display.stop()
            except Exception as stop_exc:
                logger.debug("Ignoring error while stopping failed virtual display: %s", stop_exc)
        raise

    self._display = display
    logger.info("Virtual X display started (DISPLAY=%s).", os.environ.get("DISPLAY"))
124
+
125
def _stop_xvfb_if_started(self):
    """Stop the virtual display held in ``self._display``, if there is one.

    Best-effort: a failure while stopping is logged and otherwise ignored,
    and ``self._display`` is reset to ``None`` whether or not the stop
    succeeded.
    """
    display = self._display
    if not display:
        return
    try:
        display.stop()
        logger.info("Virtual X display stopped.")
    except Exception as exc:
        # Shutdown must not fail because of the display; record why instead
        # of discarding the error silently.
        logger.warning("Failed to stop virtual X display cleanly: %s", exc)
    finally:
        self._display = None
133
+
134
  def _start_driver(self):
135
+ # start virtual display if required BEFORE launching Chrome
136
+ self._start_xvfb_if_needed()
137
+
138
  opts = self._build_options()
139
 
140
+ # 1) Try Selenium Manager (webdriver.Chrome(options=opts)). Selenium >=4.14 may use driver manager itself.
141
+ primary_exc = None
142
+ fallback_exc = None
143
  try:
144
+ logger.debug("Attempting to start Chrome via Selenium Manager (webdriver.Chrome(options=opts))")
145
  self._driver = webdriver.Chrome(options=opts)
146
+ # quick smoke test: ensure browser is responsive (may throw)
147
  try:
148
  self._driver.execute_script("return navigator.userAgent")
149
+ except Exception as e:
150
+ # browser started but died quickly
151
+ raise RuntimeError("Browser started by Selenium Manager but crashed immediately.") from e
152
+
153
  self._post_start_setup()
154
  return
155
  except Exception as e_primary:
156
+ primary_exc = e_primary
157
  logger.warning("Selenium Manager attempt failed: %s", e_primary)
158
 
159
+ # 2) Fallback: use webdriver-manager to download driver and start with the explicit Service
160
  try:
161
  driver_path = ChromeDriverManager().install()
162
  logger.info("webdriver-manager installed chromedriver: %s", driver_path)
 
163
  try:
164
  os.chmod(driver_path, 0o755)
165
+ except Exception:
166
+ logger.debug("chmod on chromedriver failed or unnecessary.")
167
+
168
  service = Service(driver_path)
169
  self._driver = webdriver.Chrome(service=service, options=opts)
170
  self._post_start_setup()
171
  return
172
  except Exception as e_fallback:
173
+ fallback_exc = e_fallback
174
  logger.exception("webdriver-manager fallback failed: %s", e_fallback)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
+ # 3) Final fallback: attempt system /usr/bin/chromedriver if available
177
+ try:
178
+ sys_path = "/usr/bin/chromedriver"
179
+ if os.path.exists(sys_path):
180
+ logger.info("Trying system chromedriver at %s", sys_path)
181
+ try:
182
+ os.chmod(sys_path, 0o755)
183
+ except Exception:
184
+ pass
185
+ service = Service(sys_path)
186
+ self._driver = webdriver.Chrome(service=service, options=opts)
187
+ self._post_start_setup()
188
+ return
189
+ except Exception as e_sys:
190
+ logger.exception("System chromedriver attempt failed: %s", e_sys)
191
+
192
+ # If all failed, stop virtual display (if started) and raise a helpful error
193
+ self._stop_xvfb_if_started()
194
+ # Include both primary and fallback messages in the raised exception
195
+ raise RuntimeError(f"Failed to start Chrome driver. primary_error={primary_exc}, fallback_error={fallback_exc}")
196
 
197
  def _post_start_setup(self):
 
198
  try:
199
  self._driver.set_page_load_timeout(60)
200
+ # best-effort CDP network blocking
201
+ try:
202
+ self._driver.execute_cdp_cmd("Network.enable", {})
203
+ if self.block_resource_urls:
204
  self._driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": self.block_resource_urls})
205
+ except Exception:
206
+ pass
 
207
  except Exception:
208
  pass
209
 
 
255
  except Exception:
256
  pass
257
  self._driver = None
258
+ # stop display if we started one
259
+ self._stop_xvfb_if_started()
260
 
261
  def close(self):
262
  self._safe_quit_driver()
263
 
264
 
265
+ # ---------------- EXTRACT_DATA (same as your earlier implementation) ----------------
266
  def EXTRACT_DATA(html: str) -> Dict[str, Any]:
267
  soup = BeautifulSoup(html, "html.parser")
268
  BASE_URL = "https://www.google.com"
 
507
  @app.on_event("startup")
508
  async def startup_event():
509
  global POOL, EXECUTOR
510
+ # Run Chrome with headless=False; BrowserManager starts an Xvfb virtual display itself when DISPLAY is unset.
511
+ POOL = BrowserPool(pool_size=1, headless=False)
512
  EXECUTOR = ThreadPoolExecutor(max_workers=2)
513
  app.state.executor = EXECUTOR
514
  app.state.pool = POOL