Update app.py
Browse files
app.py
CHANGED
|
@@ -16,7 +16,7 @@ from starlette.responses import JSONResponse
|
|
| 16 |
from selenium import webdriver
|
| 17 |
from selenium.webdriver.chrome.options import Options
|
| 18 |
from selenium.webdriver.chrome.service import Service
|
| 19 |
-
from selenium.common.exceptions import WebDriverException
|
| 20 |
from webdriver_manager.chrome import ChromeDriverManager
|
| 21 |
from bs4 import BeautifulSoup
|
| 22 |
from selenium.webdriver.common.by import By
|
|
@@ -24,7 +24,10 @@ from selenium.webdriver.support.ui import WebDriverWait
|
|
| 24 |
from selenium.webdriver.support import expected_conditions as EC
|
| 25 |
from selenium.common.exceptions import TimeoutException
|
| 26 |
|
| 27 |
-
#
|
|
|
|
|
|
|
|
|
|
| 28 |
logging.basicConfig(level=logging.INFO)
|
| 29 |
logger = logging.getLogger("fast_fetcher")
|
| 30 |
|
|
@@ -50,25 +53,24 @@ class BrowserManager:
|
|
| 50 |
]
|
| 51 |
self._driver_lock = threading.Lock()
|
| 52 |
self._driver: Optional[webdriver.Chrome] = None
|
|
|
|
| 53 |
self._start_driver_with_retries()
|
| 54 |
|
| 55 |
def _build_options(self) -> Options:
|
| 56 |
opts = Options()
|
| 57 |
-
|
| 58 |
-
# set binary location from env, fallback to default
|
| 59 |
chrome_bin = os.environ.get("CHROME_BIN", "/usr/bin/google-chrome-stable")
|
| 60 |
if os.path.exists(chrome_bin):
|
| 61 |
opts.binary_location = chrome_bin
|
| 62 |
logger.debug("Using chrome binary: %s", chrome_bin)
|
| 63 |
else:
|
| 64 |
-
logger.warning("Chrome binary not found at %s
|
| 65 |
|
| 66 |
if self.headless:
|
| 67 |
-
# keep compatibility for versions: add both
|
| 68 |
opts.add_argument("--headless=new")
|
| 69 |
opts.add_argument("--headless")
|
| 70 |
|
| 71 |
-
# container-friendly and
|
| 72 |
opts.add_argument("--no-sandbox")
|
| 73 |
opts.add_argument("--disable-setuid-sandbox")
|
| 74 |
opts.add_argument("--disable-dev-shm-usage")
|
|
@@ -76,7 +78,6 @@ class BrowserManager:
|
|
| 76 |
opts.add_argument("--disable-extensions")
|
| 77 |
opts.add_argument("--disable-blink-features=AutomationControlled")
|
| 78 |
opts.add_argument("--disable-software-rasterizer")
|
| 79 |
-
opts.add_argument("--disable-accelerated-2d-canvas")
|
| 80 |
opts.add_argument(f"--window-size={self.window_size}")
|
| 81 |
opts.add_argument("--remote-debugging-port=0")
|
| 82 |
|
|
@@ -103,75 +104,106 @@ class BrowserManager:
|
|
| 103 |
self._start_driver()
|
| 104 |
logger.info("Chrome driver started successfully.")
|
| 105 |
return
|
| 106 |
-
except Exception as
|
| 107 |
-
logger.exception("Failed to start driver on attempt %d: %s", attempt,
|
| 108 |
-
last_exc =
|
| 109 |
time.sleep(delay_seconds)
|
| 110 |
raise RuntimeError(f"Unable to start Chrome driver after {attempts} attempts: {last_exc}") from last_exc
|
| 111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
def _start_driver(self):
|
|
|
|
|
|
|
|
|
|
| 113 |
opts = self._build_options()
|
| 114 |
|
| 115 |
-
#
|
|
|
|
|
|
|
| 116 |
try:
|
| 117 |
-
logger.debug("
|
| 118 |
self._driver = webdriver.Chrome(options=opts)
|
| 119 |
-
# quick smoke test:
|
| 120 |
try:
|
| 121 |
self._driver.execute_script("return navigator.userAgent")
|
| 122 |
-
except Exception:
|
| 123 |
-
#
|
| 124 |
-
raise RuntimeError("
|
| 125 |
-
|
| 126 |
self._post_start_setup()
|
| 127 |
return
|
| 128 |
except Exception as e_primary:
|
|
|
|
| 129 |
logger.warning("Selenium Manager attempt failed: %s", e_primary)
|
| 130 |
|
| 131 |
-
# Fallback: use webdriver-manager to download
|
| 132 |
try:
|
| 133 |
driver_path = ChromeDriverManager().install()
|
| 134 |
logger.info("webdriver-manager installed chromedriver: %s", driver_path)
|
| 135 |
-
# ensure executable bit set
|
| 136 |
try:
|
| 137 |
os.chmod(driver_path, 0o755)
|
| 138 |
-
except Exception
|
| 139 |
-
logger.
|
|
|
|
| 140 |
service = Service(driver_path)
|
| 141 |
self._driver = webdriver.Chrome(service=service, options=opts)
|
| 142 |
self._post_start_setup()
|
| 143 |
return
|
| 144 |
except Exception as e_fallback:
|
|
|
|
| 145 |
logger.exception("webdriver-manager fallback failed: %s", e_fallback)
|
| 146 |
-
# try one more variation: if /usr/bin/chromedriver exists, use it
|
| 147 |
-
try:
|
| 148 |
-
system_path = "/usr/bin/chromedriver"
|
| 149 |
-
if os.path.exists(system_path):
|
| 150 |
-
logger.info("Trying system chromedriver at %s", system_path)
|
| 151 |
-
os.chmod(system_path, 0o755)
|
| 152 |
-
service = Service(system_path)
|
| 153 |
-
self._driver = webdriver.Chrome(service=service, options=opts)
|
| 154 |
-
self._post_start_setup()
|
| 155 |
-
return
|
| 156 |
-
else:
|
| 157 |
-
logger.debug("No system chromedriver at %s", system_path)
|
| 158 |
-
except Exception as e_sys:
|
| 159 |
-
logger.exception("System chromedriver attempt failed: %s", e_sys)
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
|
| 164 |
def _post_start_setup(self):
|
| 165 |
-
# set reasonable timeout and (best-effort) block some urls using CDP
|
| 166 |
try:
|
| 167 |
self._driver.set_page_load_timeout(60)
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
| 171 |
self._driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": self.block_resource_urls})
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
pass
|
| 175 |
except Exception:
|
| 176 |
pass
|
| 177 |
|
|
@@ -223,12 +255,14 @@ class BrowserManager:
|
|
| 223 |
except Exception:
|
| 224 |
pass
|
| 225 |
self._driver = None
|
|
|
|
|
|
|
| 226 |
|
| 227 |
def close(self):
|
| 228 |
self._safe_quit_driver()
|
| 229 |
|
| 230 |
|
| 231 |
-
# ---------------- EXTRACT_DATA ----------------
|
| 232 |
def EXTRACT_DATA(html: str) -> Dict[str, Any]:
|
| 233 |
soup = BeautifulSoup(html, "html.parser")
|
| 234 |
BASE_URL = "https://www.google.com"
|
|
@@ -473,8 +507,8 @@ CACHE = SimpleTTLCache(ttl_seconds=25)
|
|
| 473 |
@app.on_event("startup")
|
| 474 |
async def startup_event():
|
| 475 |
global POOL, EXECUTOR
|
| 476 |
-
#
|
| 477 |
-
POOL = BrowserPool(pool_size=1, headless=
|
| 478 |
EXECUTOR = ThreadPoolExecutor(max_workers=2)
|
| 479 |
app.state.executor = EXECUTOR
|
| 480 |
app.state.pool = POOL
|
|
|
|
| 16 |
from selenium import webdriver
|
| 17 |
from selenium.webdriver.chrome.options import Options
|
| 18 |
from selenium.webdriver.chrome.service import Service
|
| 19 |
+
from selenium.common.exceptions import WebDriverException, SessionNotCreatedException
|
| 20 |
from webdriver_manager.chrome import ChromeDriverManager
|
| 21 |
from bs4 import BeautifulSoup
|
| 22 |
from selenium.webdriver.common.by import By
|
|
|
|
| 24 |
from selenium.webdriver.support import expected_conditions as EC
|
| 25 |
from selenium.common.exceptions import TimeoutException
|
| 26 |
|
| 27 |
+
# virtual display
|
| 28 |
+
from pyvirtualdisplay import Display
|
| 29 |
+
|
| 30 |
+
# Logging
|
| 31 |
logging.basicConfig(level=logging.INFO)
|
| 32 |
logger = logging.getLogger("fast_fetcher")
|
| 33 |
|
|
|
|
| 53 |
]
|
| 54 |
self._driver_lock = threading.Lock()
|
| 55 |
self._driver: Optional[webdriver.Chrome] = None
|
| 56 |
+
self._display: Optional[Display] = None
|
| 57 |
self._start_driver_with_retries()
|
| 58 |
|
| 59 |
def _build_options(self) -> Options:
|
| 60 |
opts = Options()
|
| 61 |
+
# If CHROME_BIN is present, point to it
|
|
|
|
| 62 |
chrome_bin = os.environ.get("CHROME_BIN", "/usr/bin/google-chrome-stable")
|
| 63 |
if os.path.exists(chrome_bin):
|
| 64 |
opts.binary_location = chrome_bin
|
| 65 |
logger.debug("Using chrome binary: %s", chrome_bin)
|
| 66 |
else:
|
| 67 |
+
logger.warning("Chrome binary not found at %s (will rely on system/browser manager).", chrome_bin)
|
| 68 |
|
| 69 |
if self.headless:
|
|
|
|
| 70 |
opts.add_argument("--headless=new")
|
| 71 |
opts.add_argument("--headless")
|
| 72 |
|
| 73 |
+
# container-friendly flags (and stable fallback)
|
| 74 |
opts.add_argument("--no-sandbox")
|
| 75 |
opts.add_argument("--disable-setuid-sandbox")
|
| 76 |
opts.add_argument("--disable-dev-shm-usage")
|
|
|
|
| 78 |
opts.add_argument("--disable-extensions")
|
| 79 |
opts.add_argument("--disable-blink-features=AutomationControlled")
|
| 80 |
opts.add_argument("--disable-software-rasterizer")
|
|
|
|
| 81 |
opts.add_argument(f"--window-size={self.window_size}")
|
| 82 |
opts.add_argument("--remote-debugging-port=0")
|
| 83 |
|
|
|
|
| 104 |
self._start_driver()
|
| 105 |
logger.info("Chrome driver started successfully.")
|
| 106 |
return
|
| 107 |
+
except Exception as exc:
|
| 108 |
+
logger.exception("Failed to start driver on attempt %d: %s", attempt, exc)
|
| 109 |
+
last_exc = exc
|
| 110 |
time.sleep(delay_seconds)
|
| 111 |
raise RuntimeError(f"Unable to start Chrome driver after {attempts} attempts: {last_exc}") from last_exc
|
| 112 |
|
| 113 |
+
def _start_xvfb_if_needed(self):
    """Start a virtual X display when running non-headless without a DISPLAY.

    Does nothing when ``self.headless`` is truthy or when the ``DISPLAY``
    environment variable is already set to a non-empty value. Otherwise a
    ``Display`` sized from ``self.window_size`` ("WIDTH,HEIGHT") is created
    and started, and stored on ``self._display`` only once it has started.

    Raises:
        Exception: any error raised while parsing ``self.window_size`` or
            creating/starting the display is logged and re-raised.
    """
    if self.headless or os.environ.get("DISPLAY", "") != "":
        return

    try:
        logger.info("No DISPLAY found and headless=False — starting virtual X display (Xvfb).")
        # Split the "WIDTH,HEIGHT" string once instead of once per dimension.
        parts = self.window_size.split(",")
        width, height = int(parts[0]), int(parts[1])
        display = Display(visible=0, size=(width, height))
        try:
            display.start()
        except Exception:
            # Best-effort cleanup so a display that failed part-way through
            # start-up is not left behind; the original error is what matters.
            try:
                display.stop()
            except Exception:
                logger.debug("Ignoring error while stopping a display that failed to start.", exc_info=True)
            raise
        # Publish the display only after it started successfully, so
        # self._display never refers to a display that is not running.
        self._display = display
        logger.info("Virtual X display started (DISPLAY=%s).", os.environ.get("DISPLAY"))
    except Exception as e:
        logger.exception("Failed to start virtual display: %s", e)
        raise
|
| 124 |
+
|
| 125 |
+
def _stop_xvfb_if_started(self):
    """Stop the virtual display held in ``self._display``, if there is one.

    Best-effort: an error raised by ``stop()`` is logged at debug level and
    not propagated. ``self._display`` is reset to ``None`` whenever a display
    was present, whether or not stopping it succeeded.
    """
    display = self._display
    if not display:
        return

    # Drop the reference first so the attribute is cleared even if stop() fails.
    self._display = None
    try:
        display.stop()
        logger.info("Virtual X display stopped.")
    except Exception:
        # Previously swallowed silently; keep it non-fatal but leave a trace.
        logger.debug("Failed to stop virtual X display; ignoring.", exc_info=True)
|
| 133 |
+
|
| 134 |
def _start_driver(self):
    """Start Chrome, trying three driver sources in order.

    1. ``webdriver.Chrome(options=opts)`` (driver resolved by Selenium itself),
       followed by a quick ``execute_script`` smoke test.
    2. A chromedriver installed by ``ChromeDriverManager().install()``.
    3. A system chromedriver at ``/usr/bin/chromedriver``, if that path exists.

    A virtual display is started first via ``_start_xvfb_if_needed``. On the
    first success ``_post_start_setup`` is called and the method returns.

    Raises:
        RuntimeError: when every attempt failed. The virtual display (if one
            was started) is stopped first; the message carries the error of
            each attempt and the exception is chained to the most recent one.
    """
    # Start the virtual display (if required) BEFORE launching Chrome.
    self._start_xvfb_if_needed()

    opts = self._build_options()

    primary_exc = None
    fallback_exc = None
    system_exc = None

    # 1) Let Selenium resolve the driver on its own.
    started = None
    try:
        logger.debug("Attempting to start Chrome via Selenium Manager (webdriver.Chrome(options=opts))")
        started = webdriver.Chrome(options=opts)
        self._driver = started
        # Quick smoke test: make sure the browser actually answers.
        try:
            self._driver.execute_script("return navigator.userAgent")
        except Exception as e:
            raise RuntimeError("Browser started by Selenium Manager but crashed immediately.") from e

        self._post_start_setup()
        return
    except Exception as e_primary:
        primary_exc = e_primary
        logger.warning("Selenium Manager attempt failed: %s", e_primary)
        # A driver that was created but failed the smoke test would otherwise
        # be orphaned when the fallback assigns a new one to self._driver.
        self._discard_failed_driver(started)

    # 2) Fallback: download a driver with webdriver-manager and use an explicit Service.
    try:
        driver_path = ChromeDriverManager().install()
        logger.info("webdriver-manager installed chromedriver: %s", driver_path)
        try:
            os.chmod(driver_path, 0o755)
        except Exception:
            logger.debug("chmod on chromedriver failed or unnecessary.")

        service = Service(driver_path)
        self._driver = webdriver.Chrome(service=service, options=opts)
        self._post_start_setup()
        return
    except Exception as e_fallback:
        fallback_exc = e_fallback
        logger.exception("webdriver-manager fallback failed: %s", e_fallback)

    # 3) Final fallback: a system chromedriver, if one is present.
    try:
        sys_path = "/usr/bin/chromedriver"
        if os.path.exists(sys_path):
            logger.info("Trying system chromedriver at %s", sys_path)
            try:
                os.chmod(sys_path, 0o755)
            except Exception:
                # The file may already be executable or not ours to chmod.
                pass
            service = Service(sys_path)
            self._driver = webdriver.Chrome(service=service, options=opts)
            self._post_start_setup()
            return
    except Exception as e_sys:
        system_exc = e_sys
        logger.exception("System chromedriver attempt failed: %s", e_sys)

    # Everything failed: release the virtual display and report every error.
    self._stop_xvfb_if_started()
    raise RuntimeError(
        f"Failed to start Chrome driver. primary_error={primary_exc}, "
        f"fallback_error={fallback_exc}, system_error={system_exc}"
    ) from (system_exc or fallback_exc or primary_exc)

def _discard_failed_driver(self, driver):
    """Quit a driver created by a failed start attempt and forget it (best-effort)."""
    if driver is None:
        return
    if self._driver is driver:
        self._driver = None
    try:
        driver.quit()
    except Exception:
        logger.debug("Ignoring error while quitting a driver that failed to start.", exc_info=True)
|
| 196 |
|
| 197 |
def _post_start_setup(self):
|
|
|
|
| 198 |
try:
|
| 199 |
self._driver.set_page_load_timeout(60)
|
| 200 |
+
# best-effort CDP network blocking
|
| 201 |
+
try:
|
| 202 |
+
self._driver.execute_cdp_cmd("Network.enable", {})
|
| 203 |
+
if self.block_resource_urls:
|
| 204 |
self._driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": self.block_resource_urls})
|
| 205 |
+
except Exception:
|
| 206 |
+
pass
|
|
|
|
| 207 |
except Exception:
|
| 208 |
pass
|
| 209 |
|
|
|
|
| 255 |
except Exception:
|
| 256 |
pass
|
| 257 |
self._driver = None
|
| 258 |
+
# stop display if we started one
|
| 259 |
+
self._stop_xvfb_if_started()
|
| 260 |
|
| 261 |
def close(self):
    """Shut this manager down by delegating to ``_safe_quit_driver``."""
    self._safe_quit_driver()
|
| 263 |
|
| 264 |
|
| 265 |
+
# ---------------- EXTRACT_DATA ----------------
|
| 266 |
def EXTRACT_DATA(html: str) -> Dict[str, Any]:
|
| 267 |
soup = BeautifulSoup(html, "html.parser")
|
| 268 |
BASE_URL = "https://www.google.com"
|
|
|
|
| 507 |
@app.on_event("startup")
|
| 508 |
async def startup_event():
|
| 509 |
global POOL, EXECUTOR
|
| 510 |
+
# Run Chrome non-headless; BrowserManager starts an Xvfb virtual display automatically when no DISPLAY is set.
|
| 511 |
+
POOL = BrowserPool(pool_size=1, headless=False)
|
| 512 |
EXECUTOR = ThreadPoolExecutor(max_workers=2)
|
| 513 |
app.state.executor = EXECUTOR
|
| 514 |
app.state.pool = POOL
|