Spaces:
Paused
Paused
File size: 5,657 Bytes
046723b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 | import os
import time
from loguru import logger
from changedetectionio.content_fetchers.base import Fetcher
class fetcher(Fetcher):
if os.getenv("WEBDRIVER_URL"):
fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
else:
fetcher_description = "WebDriver Chrome/Javascript"
proxy = None
proxy_url = None
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
super().__init__()
from urllib.parse import urlparse
from selenium.webdriver.common.proxy import Proxy
# .strip('"') is going to save someone a lot of time when they accidently wrap the env value
if not custom_browser_connection_url:
self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
else:
self.browser_connection_is_custom = True
self.browser_connection_url = custom_browser_connection_url
##### PROXY SETUP #####
proxy_sources = [
self.system_http_proxy,
self.system_https_proxy,
os.getenv('webdriver_proxySocks'),
os.getenv('webdriver_socksProxy'),
os.getenv('webdriver_proxyHttp'),
os.getenv('webdriver_httpProxy'),
os.getenv('webdriver_proxyHttps'),
os.getenv('webdriver_httpsProxy'),
os.getenv('webdriver_sslProxy'),
proxy_override, # last one should override
]
# The built in selenium proxy handling is super unreliable!!! so we just grab which ever proxy setting we can find and throw it in --proxy-server=
for k in filter(None, proxy_sources):
if not k:
continue
self.proxy_url = k.strip()
async def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False,
empty_pages_are_a_change=False):
import asyncio
# Wrap the entire selenium operation in a thread executor
def _run_sync():
from selenium.webdriver.chrome.options import Options as ChromeOptions
# request_body, request_method unused for now, until some magic in the future happens.
options = ChromeOptions()
# Load Chrome options from env
CHROME_OPTIONS = [
line.strip()
for line in os.getenv("CHROME_OPTIONS", "").strip().splitlines()
if line.strip()
]
for opt in CHROME_OPTIONS:
options.add_argument(opt)
# 1. proxy_config /Proxy(proxy_config) selenium object is REALLY unreliable
# 2. selenium-wire cant be used because the websocket version conflicts with pypeteer-ng
# 3. selenium only allows ONE runner at a time by default!
# 4. driver must use quit() or it will continue to block/hold the selenium process!!
if self.proxy_url:
options.add_argument(f'--proxy-server={self.proxy_url}')
from selenium.webdriver.remote.remote_connection import RemoteConnection
from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
driver = None
try:
# Create the RemoteConnection and set timeout (e.g., 30 seconds)
remote_connection = RemoteConnection(
self.browser_connection_url,
)
remote_connection.set_timeout(30) # seconds
# Now create the driver with the RemoteConnection
driver = RemoteWebDriver(
command_executor=remote_connection,
options=options
)
driver.set_page_load_timeout(int(os.getenv("WEBDRIVER_PAGELOAD_TIMEOUT", 45)))
except Exception as e:
if driver:
driver.quit()
raise e
try:
driver.get(url)
if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
driver.set_window_size(1280, 1024)
driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
driver.execute_script(self.webdriver_js_execute_code)
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
# @todo - how to check this? is it possible?
self.status_code = 200
# @todo somehow we should try to get this working for WebDriver
# raise EmptyReply(url=url, status_code=r.status_code)
# @todo - dom wait loaded?
import time
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
self.content = driver.page_source
self.headers = {}
self.screenshot = driver.get_screenshot_as_png()
except Exception as e:
driver.quit()
raise e
driver.quit()
# Run the selenium operations in a thread pool to avoid blocking the event loop
loop = asyncio.get_event_loop()
await loop.run_in_executor(None, _run_sync)
|