AdarshJi committed on
Commit
77a846b
·
verified ·
1 Parent(s): 562b2e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -16
app.py CHANGED
@@ -17,7 +17,7 @@ from starlette.responses import JSONResponse
17
  # --- bring in your BrowserManager and EXTRACT_DATA functions (adapted) ---
18
  # For brevity we import them inline. If you already have them in a module,
19
  # replace with: from your_module import BrowserManager, EXTRACT_DATA
20
-
21
  import threading
22
  from selenium import webdriver
23
  from selenium.webdriver.chrome.service import Service
@@ -60,15 +60,26 @@ class BrowserManager:
60
 
61
  def _start_driver(self):
62
  opts = Options()
 
 
 
 
 
 
 
63
  if self.headless:
 
64
  opts.add_argument("--headless=new")
65
  opts.add_argument("--headless")
66
- opts.add_argument(f"--window-size={self.window_size}")
67
  opts.add_argument("--no-sandbox")
 
68
  opts.add_argument("--disable-dev-shm-usage")
69
  opts.add_argument("--disable-gpu")
70
  opts.add_argument("--disable-extensions")
71
  opts.add_argument("--disable-blink-features=AutomationControlled")
 
 
72
  if self.user_agent:
73
  opts.add_argument(f"--user-agent={self.user_agent}")
74
 
@@ -81,22 +92,22 @@ class BrowserManager:
81
  opts.add_experimental_option("prefs", prefs)
82
 
83
  opts.add_experimental_option("excludeSwitches", ["enable-logging"])
 
 
84
 
85
- # Some systems may not accept the desired_capabilities arg in this signature,
86
- # so we set pageLoadStrategy later via browser options / capabilities if needed.
87
- service = Service(ChromeDriverManager().install())
88
- self._driver = webdriver.Chrome(service=service, options=opts)
89
-
90
- # quick defaults
91
  try:
92
- self._driver.set_page_load_timeout(60)
93
- # Enable CDP to block some resource URLs (best-effort)
94
- self._driver.execute_cdp_cmd("Network.enable", {})
95
- if self.block_resource_urls:
96
- self._driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": self.block_resource_urls})
97
- except Exception:
98
- # ignore CDP failures gracefully
99
- pass
 
 
100
 
101
  def fetch_html(
102
  self,
 
17
  # --- bring in your BrowserManager and EXTRACT_DATA functions (adapted) ---
18
  # For brevity we import them inline. If you already have them in a module,
19
  # replace with: from your_module import BrowserManager, EXTRACT_DATA
20
+ import os
21
  import threading
22
  from selenium import webdriver
23
  from selenium.webdriver.chrome.service import Service
 
60
 
61
  def _start_driver(self):
62
  opts = Options()
63
+
64
+ # set binary location from env if provided (in Dockerfile we set CHROME_BIN)
65
+ chrome_bin = os.environ.get("CHROME_BIN", "/usr/bin/google-chrome-stable")
66
+ if os.path.exists(chrome_bin):
67
+ opts.binary_location = chrome_bin
68
+
69
+ # headless flags
70
  if self.headless:
71
+ # new headless mode for newer chrome versions; keep both to be safe
72
  opts.add_argument("--headless=new")
73
  opts.add_argument("--headless")
74
+ # essential flags for running chrome in containers
75
  opts.add_argument("--no-sandbox")
76
+ opts.add_argument("--disable-setuid-sandbox")
77
  opts.add_argument("--disable-dev-shm-usage")
78
  opts.add_argument("--disable-gpu")
79
  opts.add_argument("--disable-extensions")
80
  opts.add_argument("--disable-blink-features=AutomationControlled")
81
+ opts.add_argument("--single-process")
82
+ opts.add_argument(f"--window-size={self.window_size}")
83
  if self.user_agent:
84
  opts.add_argument(f"--user-agent={self.user_agent}")
85
 
 
92
  opts.add_experimental_option("prefs", prefs)
93
 
94
  opts.add_experimental_option("excludeSwitches", ["enable-logging"])
95
+ # optional: avoid "Chrome is being controlled by automated test software" banner
96
+ opts.add_experimental_option("useAutomationExtension", False)
97
 
98
+ # create service: webdriver_manager will download a chromedriver binary
99
+ # that matches the detected Chrome version (usually)
 
 
 
 
100
  try:
101
+ driver_path = ChromeDriverManager().install()
102
+ service = Service(driver_path)
103
+ self._driver = webdriver.Chrome(service=service, options=opts)
104
+ except Exception as e:
105
+ # fallback: try system chromedriver if present
106
+ try:
107
+ service = Service("/usr/bin/chromedriver")
108
+ self._driver = webdriver.Chrome(service=service, options=opts)
109
+ except Exception as ee:
110
+ raise RuntimeError(f"Failed to start Chrome driver: {e}; fallback: {ee}")
111
 
112
  def fetch_html(
113
  self,