SACC / webdriver_utils.py
cacodex's picture
Add proxy toggle and DDDDOCR captcha solver
abd380b verified
Raw
History Blame Contribute Delete
23.8 kB
from __future__ import annotations
import select
import shutil
import socket
import threading
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import unquote, urlsplit
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support.wait import WebDriverWait
# User-agent string handed to Chrome through the ``--user-agent`` switch when
# the browser is configured; it identifies the session as desktop Chrome 124
# on 64-bit Linux.
DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)
def _get_login_2fa_bypass_script() -> str:
    """Return the JavaScript injected to rewrite two-factor flags in API responses.

    The returned source is a self-invoking function that:

    * declares two rules, keyed by URL substring:
      ``/api/bff/v1.2/2factor/select`` (flips ``data.formDto.userTwoFactory``
      from ``true`` to ``false``) and
      ``/api/bff/v1.2/commons/user_setting_info`` (flips ``data.user2factor``
      from ``true`` to ``false``);
    * wraps ``window.fetch`` so that a matching response whose JSON was
      modified is replaced by a new ``Response`` carrying the rewritten JSON
      with the original status, status text and headers; non-matching or
      non-JSON responses are returned untouched;
    * wraps ``XMLHttpRequest.prototype.open`` to remember the request URL and
      redefines the ``responseText`` / ``response`` getters so that completed
      (``readyState === 4``) HTTP 200 responses for matching URLs yield the
      rewritten JSON.

    NOTE(review): the script only changes what scripts running in the page
    read; whether the server still enforces a second factor cannot be
    determined from this file.

    Returns:
        The JavaScript source as a string. The text is passed verbatim to the
        browser, so it must not be reformatted.
    """
    return """(() => {
const RULES = [
{
url: "/api/bff/v1.2/2factor/select",
modify(json) {
if (json?.data?.formDto?.userTwoFactory !== true) return false;
json.data.formDto.userTwoFactory = false;
return true;
}
},
{
url: "/api/bff/v1.2/commons/user_setting_info",
modify(json) {
if (json?.data?.user2factor !== true) return false;
json.data.user2factor = false;
return true;
}
}
];
function findRule(url) {
return RULES.find((rule) => String(url || '').includes(rule.url));
}
function applyModify(json, url) {
const rule = findRule(url);
return rule ? rule.modify(json) : false;
}
const origFetch = window.fetch;
window.fetch = function(input, init) {
const url = typeof input === 'string'
? input
: (input instanceof URL ? input.href : (input && input.url ? input.url : ''));
return origFetch.call(window, input, init).then(function(response) {
if (!findRule(url)) return response;
return response.clone().json().then(function(json) {
if (!applyModify(json, url)) return response;
return new Response(JSON.stringify(json), {
status: response.status,
statusText: response.statusText,
headers: response.headers
});
}).catch(function() {
return response;
});
});
};
const origOpen = XMLHttpRequest.prototype.open;
XMLHttpRequest.prototype.open = function(method, url, async, username, password) {
this.__saccLogin2faUrl = typeof url === 'string' ? url : (url && url.href ? url.href : '');
return origOpen.call(this, method, url, async !== false, username, password);
};
const textDescriptor = Object.getOwnPropertyDescriptor(XMLHttpRequest.prototype, 'responseText');
if (textDescriptor?.get) {
const origTextGetter = textDescriptor.get;
Object.defineProperty(XMLHttpRequest.prototype, 'responseText', {
get: function() {
const value = origTextGetter.call(this);
if (this.readyState !== 4 || this.status !== 200) return value;
const url = this.__saccLogin2faUrl || '';
if (!url || !findRule(url)) return value;
try {
const json = JSON.parse(value);
if (applyModify(json, url)) return JSON.stringify(json);
} catch (error) {}
return value;
},
configurable: true,
enumerable: true
});
}
const respDescriptor = Object.getOwnPropertyDescriptor(XMLHttpRequest.prototype, 'response');
if (respDescriptor?.get) {
const origRespGetter = respDescriptor.get;
Object.defineProperty(XMLHttpRequest.prototype, 'response', {
get: function() {
const value = origRespGetter.call(this);
if (this.readyState !== 4 || this.status !== 200) return value;
const url = this.__saccLogin2faUrl || '';
if (!url || !findRule(url)) return value;
try {
const obj = typeof value === 'string' ? JSON.parse(value) : value;
if (applyModify(obj, url)) {
return this.responseType === 'json' ? obj : JSON.stringify(obj);
}
} catch (error) {}
return value;
},
configurable: true,
enumerable: true
});
}
})();"""
def _get_password_popup_blocker_script() -> str:
    """Return the JavaScript injected to skip forced password-change prompts.

    The returned source is a self-invoking function with two independent
    parts:

    * Redirect guard (only installed when the page URL contains
      ``id.scu.edu.cn``): whenever the current URL contains both
      ``modifyPassword`` and ``needModifyPasswordOfPwdExpire`` the page is
      replaced with the hard-coded SSO entry URL. The check runs once
      immediately and again after ``history.pushState`` /
      ``history.replaceState`` (50 ms later), on ``popstate`` / ``hashchange``
      and on any DOM child-list mutation. All hooks are removed once a
      redirect is triggered or once the URL also contains
      ``zhjw.scu.edu.cn``.
    * Modal remover (installed on every page): deletes each ``.modal-dialog``
      (preferring its enclosing ``.modal``) whose text matches the
      change-password keywords in the script or that contains ``#oldPass``,
      ``#newPass1`` or ``#newPass2``, and re-runs on every DOM child-list
      mutation.

    NOTE(review): every ``.modal-backdrop`` node is removed on each pass,
    whether or not a password modal was found — confirm that no other modal
    on the target pages relies on its backdrop.

    Returns:
        The JavaScript source as a string. The text is passed verbatim to the
        browser, so it must not be reformatted.
    """
    return """(() => {
const REDIRECT_URL = 'https://id.scu.edu.cn/enduser/sp/sso/scdxplugin_jwt23?enterpriseId=scdx&target_url=index';
let stopped = false;
let observer = null;
let origPushState = null;
let origReplaceState = null;
function removeAllListeners() {
stopped = true;
try { if (origPushState) history.pushState = origPushState; } catch (error) {}
try { if (origReplaceState) history.replaceState = origReplaceState; } catch (error) {}
try { window.removeEventListener('popstate', checkAndRedirect); } catch (error) {}
try { window.removeEventListener('hashchange', checkAndRedirect); } catch (error) {}
try { if (observer) observer.disconnect(); } catch (error) {}
}
function redirectToSso() {
removeAllListeners();
try { window.location.replace(REDIRECT_URL); } catch (error) { window.location.href = REDIRECT_URL; }
}
function checkAndRedirect() {
if (stopped) return;
try {
const href = location.href || '';
if (!href.includes('id.scu.edu.cn')) return;
if (href.includes('zhjw.scu.edu.cn')) {
removeAllListeners();
return;
}
if (/modifyPassword/.test(href) && /needModifyPasswordOfPwdExpire/.test(href)) {
redirectToSso();
}
} catch (error) {}
}
if ((location.href || '').includes('id.scu.edu.cn')) {
checkAndRedirect();
origPushState = history.pushState;
origReplaceState = history.replaceState;
history.pushState = function() {
const result = origPushState.apply(this, arguments);
try { setTimeout(checkAndRedirect, 50); } catch (error) {}
return result;
};
history.replaceState = function() {
const result = origReplaceState.apply(this, arguments);
try { setTimeout(checkAndRedirect, 50); } catch (error) {}
return result;
};
window.addEventListener('popstate', checkAndRedirect);
window.addEventListener('hashchange', checkAndRedirect);
try {
observer = new MutationObserver(checkAndRedirect);
observer.observe(document, { subtree: true, childList: true });
} catch (error) {}
}
function removeIfPasswordModal(modalDialog) {
try {
const text = modalDialog.textContent || '';
const hasKeywords = /修改密码|更改密码/.test(text);
const hasPasswordFields = Boolean(
modalDialog.querySelector &&
(modalDialog.querySelector('#oldPass') ||
modalDialog.querySelector('#newPass1') ||
modalDialog.querySelector('#newPass2'))
);
if (!hasKeywords && !hasPasswordFields) return;
const container = (modalDialog.closest && modalDialog.closest('.modal')) || modalDialog.parentElement || modalDialog;
if (container && typeof container.remove === 'function') {
container.remove();
}
} catch (error) {}
}
function removePasswordArtifacts() {
try {
document.querySelectorAll('.modal-dialog').forEach(removeIfPasswordModal);
document.querySelectorAll('.modal-backdrop.fade.in, .modal-backdrop').forEach((node) => {
try { node.remove(); } catch (error) {}
});
} catch (error) {}
}
removePasswordArtifacts();
try {
const modalObserver = new MutationObserver(removePasswordArtifacts);
modalObserver.observe(document, { subtree: true, childList: true });
} catch (error) {}
})();"""
@dataclass(slots=True)
class BrowserProxyConfig:
scheme: str
host: str
port: int
username: str = ""
password: str = ""
@property
def requires_auth_relay(self) -> bool:
return self.scheme in {"socks5", "socks5h"} and bool(self.username or self.password)
@property
def display_label(self) -> str:
return f"{self.scheme}://{self.host}:{self.port}"
class Socks5AuthRelay:
    """Local SOCKS5 relay that authenticates to an upstream SOCKS5 proxy.

    The relay listens on an ephemeral port of ``127.0.0.1`` and accepts
    unauthenticated SOCKS5 CONNECT requests. Each request is replayed against
    the upstream proxy described by ``proxy`` after the relay has performed
    the upstream method negotiation (username/password when credentials are
    configured, no-auth otherwise). Once the upstream accepts the CONNECT,
    bytes are shuttled in both directions until either side closes.

    Attributes:
        proxy: Upstream proxy settings (host, port, username, password).
        port: Local listening port; ``0`` until :meth:`start` has run.
    """

    def __init__(self, proxy: BrowserProxyConfig) -> None:
        self.proxy = proxy
        self.port = 0
        self._server: socket.socket | None = None
        self._stop_event = threading.Event()
        self._thread: threading.Thread | None = None

    def start(self) -> None:
        """Bind the local listener and start the accept thread.

        Raises:
            OSError: If the listening socket cannot be set up. The socket is
                closed before the error propagates.
        """
        server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            server.bind(("127.0.0.1", 0))
            server.listen(64)
            # Short timeout so the accept loop can notice the stop flag.
            server.settimeout(0.5)
            port = int(server.getsockname()[1])
        except OSError:
            server.close()
            raise
        self._server = server
        self.port = port
        self._thread = threading.Thread(
            target=self._accept_loop,
            name=f"sacc-socks5-relay-{self.port}",
            daemon=True,
        )
        self._thread.start()

    def close(self) -> None:
        """Signal shutdown, close the listener and briefly wait for the accept thread."""
        self._stop_event.set()
        if self._server is not None:
            try:
                self._server.close()
            except OSError:
                pass
        if self._thread is not None and self._thread.is_alive():
            self._thread.join(timeout=1)

    def _accept_loop(self) -> None:
        """Accept local clients until the stop flag is set; one daemon thread per client."""
        server = self._server
        if server is None:
            return
        while not self._stop_event.is_set():
            try:
                client, _addr = server.accept()
            except TimeoutError:
                # Regular wake-up caused by the listener timeout.
                continue
            except OSError:
                # Closed listener or a persistent accept failure: back off
                # instead of spinning. Returns at once when stopping.
                self._stop_event.wait(0.1)
                continue
            threading.Thread(target=self._handle_client, args=(client,), daemon=True).start()

    def _handle_client(self, client: socket.socket) -> None:
        """Serve one local client: handshake, open the upstream tunnel, relay bytes.

        Both sockets are always closed on exit. Any upstream failure is
        reported to the client with a SOCKS5 failure reply.
        """
        upstream: socket.socket | None = None
        try:
            client.settimeout(20)
            request = self._read_client_connect_request(client)
            if request is None:
                return
            upstream = self._open_upstream_tunnel(request)
            if upstream is None:
                self._send_client_failure(client)
                return
            # Success reply with a bound address of 0.0.0.0:0.
            client.sendall(b"\x05\x00\x00\x01\x00\x00\x00\x00\x00\x00")
            self._relay(client, upstream)
        except OSError:
            return
        finally:
            for sock in (client, upstream):
                if sock is not None:
                    try:
                        sock.close()
                    except OSError:
                        pass

    def _open_upstream_tunnel(self, request: bytes) -> socket.socket | None:
        """Connect to the upstream proxy, authenticate and forward ``request``.

        Returns:
            The connected upstream socket when the proxy answered the CONNECT
            with reply code 0, otherwise ``None`` (the socket is closed).
        """
        upstream: socket.socket | None = None
        try:
            upstream = socket.create_connection((self.proxy.host, self.proxy.port), timeout=20)
            upstream.settimeout(20)
            if self._authenticate_upstream(upstream):
                upstream.sendall(request)
                response = self._read_socks5_response(upstream)
                if response is not None and response[1] == 0:
                    return upstream
        except OSError:
            # Connection, timeout or I/O error: reported to the caller as
            # "no tunnel" so the client receives a proper failure reply.
            pass
        if upstream is not None:
            try:
                upstream.close()
            except OSError:
                pass
        return None

    def _read_client_connect_request(self, client: socket.socket) -> bytes | None:
        """Run the client-side greeting and read its CONNECT request.

        Returns:
            The raw request bytes (header + address + port) ready to be
            forwarded upstream, or ``None`` when the client sent something
            unsupported or disconnected.
        """
        header = self._recv_exact(client, 2)
        if not header or header[0] != 5:
            return None
        methods = self._recv_exact(client, header[1])
        if methods is None:
            return None
        if 0 not in methods:
            # The relay only speaks "no authentication" (0x00) to the local
            # client; 0xFF tells it that none of its offered methods is
            # acceptable.
            client.sendall(b"\x05\xff")
            return None
        client.sendall(b"\x05\x00")
        request_header = self._recv_exact(client, 4)
        # Only SOCKS version 5 with command 1 (CONNECT) is supported.
        if not request_header or request_header[0] != 5 or request_header[1] != 1:
            self._send_client_failure(client)
            return None
        address = self._read_socks5_address(client, request_header[3])
        port = self._recv_exact(client, 2)
        if address is None or port is None:
            self._send_client_failure(client)
            return None
        return request_header + address + port

    def _authenticate_upstream(self, upstream: socket.socket) -> bool:
        """Negotiate the auth method with the upstream proxy.

        Uses username/password sub-negotiation when either credential is set,
        otherwise the no-auth method. Returns ``False`` when a credential is
        longer than 255 bytes (it would not fit the one-byte length field) or
        when the proxy rejects the negotiation.
        """
        username = self.proxy.username.encode("utf-8")
        password = self.proxy.password.encode("utf-8")
        if len(username) > 255 or len(password) > 255:
            return False
        if username or password:
            upstream.sendall(b"\x05\x01\x02")
            if self._recv_exact(upstream, 2) != b"\x05\x02":
                return False
            upstream.sendall(b"\x01" + bytes([len(username)]) + username + bytes([len(password)]) + password)
            return self._recv_exact(upstream, 2) == b"\x01\x00"
        upstream.sendall(b"\x05\x01\x00")
        return self._recv_exact(upstream, 2) == b"\x05\x00"

    def _read_socks5_response(self, sock: socket.socket) -> bytes | None:
        """Read one complete SOCKS5 reply (header, bound address, port) or ``None``."""
        header = self._recv_exact(sock, 4)
        if not header:
            return None
        address = self._read_socks5_address(sock, header[3])
        port = self._recv_exact(sock, 2)
        if address is None or port is None:
            return None
        return header + address + port

    def _read_socks5_address(self, sock: socket.socket, address_type: int) -> bytes | None:
        """Read the address field for ``address_type``.

        Type 1 is 4 bytes, type 4 is 16 bytes, and type 3 is a length byte
        followed by that many bytes (the length byte is included in the
        result). Unknown types and short reads yield ``None``.
        """
        if address_type == 1:
            return self._recv_exact(sock, 4)
        if address_type == 3:
            length = self._recv_exact(sock, 1)
            if not length:
                return None
            domain = self._recv_exact(sock, length[0])
            return None if domain is None else length + domain
        if address_type == 4:
            return self._recv_exact(sock, 16)
        return None

    @staticmethod
    def _recv_exact(sock: socket.socket, size: int) -> bytes | None:
        """Receive exactly ``size`` bytes, or ``None`` if the peer closes first."""
        data = bytearray()
        while len(data) < size:
            chunk = sock.recv(size - len(data))
            if not chunk:
                return None
            data.extend(chunk)
        return bytes(data)

    @staticmethod
    def _send_client_failure(client: socket.socket) -> None:
        """Best-effort failure reply (reply code 1) to the local client."""
        try:
            client.sendall(b"\x05\x01\x00\x01\x00\x00\x00\x00\x00\x00")
        except OSError:
            pass

    def _relay(self, client: socket.socket, upstream: socket.socket) -> None:
        """Copy bytes between the two sockets until EOF, an error or shutdown."""
        client.settimeout(None)
        upstream.settimeout(None)
        sockets = [client, upstream]
        while not self._stop_event.is_set():
            # The 30 s timeout bounds how long an idle tunnel takes to notice
            # the stop flag.
            readable, _writable, errored = select.select(sockets, [], sockets, 30)
            if errored:
                return
            if not readable:
                continue
            for source in readable:
                target = upstream if source is client else client
                data = source.recv(65536)
                if not data:
                    return
                target.sendall(data)
def parse_browser_proxy(proxy_url: str) -> BrowserProxyConfig | None:
    """Parse a proxy URL into a :class:`BrowserProxyConfig`.

    Accepted forms are ``scheme://[user[:password]@]host:port`` and the
    scheme-less ``[user[:password]@]host:port`` (optionally prefixed with
    ``//``), which defaults to ``socks5``. Credentials are percent-decoded.

    Args:
        proxy_url: Proxy URL; ``None``, empty or whitespace-only means "no
            proxy".

    Returns:
        The parsed configuration, or ``None`` when no proxy was given.

    Raises:
        ValueError: If the scheme is not one of ``socks5``, ``socks5h``,
            ``http`` or ``https``, if host or port is missing, or if the port
            is not a valid number.
    """
    raw_url = str(proxy_url or "").strip()
    if not raw_url:
        return None
    if "://" not in raw_url:
        # urlsplit() only recognises a network location after "//". A bare
        # "host:port" would be read as scheme "host" (or as a plain path), so
        # the socks5 default below could never apply to it.
        raw_url = "socks5://" + raw_url.lstrip("/")
    parts = urlsplit(raw_url)
    scheme = (parts.scheme or "socks5").lower()
    if scheme not in {"socks5", "socks5h", "http", "https"}:
        raise ValueError(f"Unsupported browser proxy scheme: {scheme}")
    # Accessing .port raises ValueError itself for a non-numeric or
    # out-of-range port; port 0 is treated as missing.
    if not parts.hostname or not parts.port:
        raise ValueError("Browser proxy URL must include host and port.")
    return BrowserProxyConfig(
        scheme=scheme,
        host=parts.hostname,
        port=int(parts.port),
        username=unquote(parts.username or ""),
        password=unquote(parts.password or ""),
    )
def probe_browser_proxy(
    proxy_url: str,
    *,
    target_host: str = "id.scu.edu.cn",
    target_port: int = 80,
    timeout_seconds: float = 8.0,
) -> tuple[bool, str]:
    """Check whether the configured proxy is usable.

    For SOCKS5 proxies a full handshake is performed and a CONNECT to
    ``target_host:target_port`` is requested; for other schemes only a TCP
    connection to the proxy endpoint is attempted.

    Args:
        proxy_url: Proxy URL as accepted by ``parse_browser_proxy``.
        target_host: Host the SOCKS5 proxy is asked to reach.
        target_port: Port the SOCKS5 proxy is asked to reach.
        timeout_seconds: Timeout for connecting and for each socket operation.

    Returns:
        ``(ok, message)``; ``(True, "proxy disabled")`` when no proxy is
        configured.

    Raises:
        ValueError: Propagated from ``parse_browser_proxy`` for a malformed
            proxy URL.
    """
    proxy = parse_browser_proxy(proxy_url)
    if proxy is None:
        return True, "proxy disabled"

    is_socks5 = proxy.scheme in {"socks5", "socks5h"}
    if not is_socks5:
        return _probe_tcp_proxy(proxy, timeout_seconds=timeout_seconds)

    try:
        with socket.create_connection((proxy.host, proxy.port), timeout=timeout_seconds) as sock:
            sock.settimeout(timeout_seconds)
            authenticated = _authenticate_socks5(sock, proxy)
            if not authenticated:
                return False, f"{proxy.display_label} authentication failed"
            sock.sendall(_build_socks5_connect_request(proxy, target_host, int(target_port)))
            response = _read_socks5_response(sock)
    except OSError as exc:
        return False, f"{proxy.display_label} probe failed: {exc}"

    # The reply is evaluated after the socket has been closed; nothing below
    # touches the network.
    if not response or len(response) < 2:
        return False, f"{proxy.display_label} did not return a SOCKS5 response"
    if response[1] != 0:
        return False, f"{proxy.display_label} returned SOCKS5 status {response[1]} for {target_host}:{target_port}"
    return True, f"{proxy.display_label} can reach {target_host}:{target_port}"
def _probe_tcp_proxy(proxy: BrowserProxyConfig, *, timeout_seconds: float) -> tuple[bool, str]:
    """Report whether a plain TCP connection to the proxy endpoint succeeds.

    Returns:
        ``(True, message)`` when the connection is established (it is closed
        again immediately), ``(False, message)`` with the error text otherwise.
    """
    endpoint = (proxy.host, proxy.port)
    try:
        connection = socket.create_connection(endpoint, timeout=timeout_seconds)
    except OSError as exc:
        return False, f"{proxy.display_label} TCP probe failed: {exc}"
    connection.close()
    return True, f"{proxy.display_label} TCP endpoint is reachable"
def _authenticate_socks5(sock: socket.socket, proxy: BrowserProxyConfig) -> bool:
    """Run the SOCKS5 method negotiation on ``sock`` for ``proxy``.

    Username/password sub-negotiation is used when either credential is set,
    otherwise the no-auth method is requested.

    Returns:
        ``True`` when the proxy accepted the negotiation; ``False`` when it
        refused, closed the connection, or a credential exceeds 255 bytes.
    """
    user = proxy.username.encode("utf-8")
    secret = proxy.password.encode("utf-8")
    # Each credential is prefixed by a single length byte on the wire.
    if max(len(user), len(secret)) > 255:
        return False

    recv_exact = Socks5AuthRelay._recv_exact
    if not (user or secret):
        sock.sendall(b"\x05\x01\x00")
        return recv_exact(sock, 2) == b"\x05\x00"

    sock.sendall(b"\x05\x01\x02")
    if recv_exact(sock, 2) != b"\x05\x02":
        return False
    credentials = b"".join((b"\x01", bytes([len(user)]), user, bytes([len(secret)]), secret))
    sock.sendall(credentials)
    return recv_exact(sock, 2) == b"\x01\x00"
def _build_socks5_connect_request(proxy: BrowserProxyConfig, target_host: str, target_port: int) -> bytes:
    """Build a SOCKS5 CONNECT request for ``target_host:target_port``.

    With the ``socks5`` scheme the host is resolved locally and sent as an
    IPv4 address; if that resolution fails, or for any other scheme, the
    IDNA-encoded host name is sent as a domain-name address instead.
    """
    address = None
    if proxy.scheme == "socks5":
        try:
            address = b"\x01" + socket.inet_aton(socket.gethostbyname(target_host))
        except OSError:
            address = None
    if address is None:
        host_bytes = target_host.encode("idna")
        address = b"\x03" + bytes([len(host_bytes)]) + host_bytes
    port_bytes = int(target_port).to_bytes(2, "big")
    return b"".join((b"\x05\x01\x00", address, port_bytes))
def _read_socks5_response(sock: socket.socket) -> bytes | None:
    """Read one complete SOCKS5 reply from ``sock``.

    Returns:
        Header, bound address and port concatenated, or ``None`` when the
        peer closed early or the address type is not 1, 3 or 4.
    """
    recv_exact = Socks5AuthRelay._recv_exact
    header = recv_exact(sock, 4)
    if not header:
        return None

    fixed_sizes = {1: 4, 4: 16}
    kind = header[3]
    address = None
    if kind in fixed_sizes:
        address = recv_exact(sock, fixed_sizes[kind])
    elif kind == 3:
        # Domain name: one length byte followed by that many bytes.
        length = recv_exact(sock, 1)
        if length:
            domain = recv_exact(sock, length[0])
            if domain is not None:
                address = length + domain

    # The port is read even when the address could not be read.
    port = recv_exact(sock, 2)
    if address is None or port is None:
        return None
    return header + address + port
def configure_browser(
    *,
    chrome_binary: str,
    chromedriver_path: str,
    page_timeout: int = 40,
    proxy_url: str = "",
    enable_login_2fa_bypass: bool = False,
) -> WebDriver:
    """Start a headless Chrome session configured for this project.

    Args:
        chrome_binary: Path of the Chrome executable.
        chromedriver_path: Path or command name of chromedriver. When it is
            empty or cannot be found, the default ``ChromeService()`` is used.
        page_timeout: Page-load timeout in seconds; the script timeout is
            capped at 20 seconds.
        proxy_url: Optional proxy URL as accepted by ``parse_browser_proxy``.
            A SOCKS5 proxy with credentials is reached through a local
            ``Socks5AuthRelay``.
        enable_login_2fa_bypass: Also inject the login two-factor script into
            every new document.

    Returns:
        The ready driver. The relay (or ``None``) is stored on it as
        ``_sacc_proxy_relay`` so that ``quit_browser`` can shut it down.

    Raises:
        ValueError: Propagated from ``parse_browser_proxy``.
        Exception: Any error raised while starting or preparing the browser.
            The relay, and the driver if it was already created, are shut
            down before the error propagates.
    """
    options = webdriver.ChromeOptions()
    options.binary_location = chrome_binary
    options.page_load_strategy = "eager"
    for flag in (
        "--headless=new",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
        "--disable-background-networking",
        "--disable-background-timer-throttling",
        "--disable-backgrounding-occluded-windows",
        "--disable-renderer-backgrounding",
        "--disable-extensions",
        "--disable-default-apps",
        "--no-first-run",
        "--no-default-browser-check",
        "--mute-audio",
        "--window-size=1440,1280",
        "--lang=zh-CN",
        f"--user-agent={DEFAULT_USER_AGENT}",
        "--remote-debugging-pipe",
    ):
        options.add_argument(flag)
    options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
    options.add_experimental_option("useAutomationExtension", False)
    options.set_capability("goog:loggingPrefs", {"performance": "ALL", "browser": "ALL"})

    proxy_relay: Socks5AuthRelay | None = None
    proxy = parse_browser_proxy(proxy_url)
    if proxy is not None:
        if proxy.requires_auth_relay:
            proxy_relay = Socks5AuthRelay(proxy)
            proxy_relay.start()
            options.add_argument(f"--proxy-server=socks5://127.0.0.1:{proxy_relay.port}")
        else:
            # NOTE(review): credentials on http/https proxy URLs are not
            # forwarded here; only scheme, host and port reach Chrome.
            options.add_argument(f"--proxy-server={proxy.scheme}://{proxy.host}:{proxy.port}")

    resolved_driver_path = str(chromedriver_path or "").strip()
    if resolved_driver_path and not Path(resolved_driver_path).exists() and shutil.which(resolved_driver_path) is None:
        resolved_driver_path = ""
    service = ChromeService(executable_path=resolved_driver_path) if resolved_driver_path else ChromeService()

    try:
        driver = webdriver.Chrome(service=service, options=options)
    except Exception:
        if proxy_relay is not None:
            proxy_relay.close()
        raise

    try:
        driver._sacc_proxy_relay = proxy_relay
        driver.set_page_load_timeout(page_timeout)
        driver.set_script_timeout(min(page_timeout, 20))
        driver.implicitly_wait(6)
        try:
            driver.execute_cdp_cmd("Network.enable", {})
        except WebDriverException:
            # Network domain events are optional; continue without them.
            pass
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {
                "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"
            },
        )
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": _get_password_popup_blocker_script()},
        )
        if enable_login_2fa_bypass:
            driver.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {"source": _get_login_2fa_bypass_script()},
            )
    except Exception:
        # Setup failed after Chrome started: do not leave the browser process
        # or the relay behind. Cleanup errors must not mask the original
        # failure.
        try:
            driver.quit()
        except (WebDriverException, OSError):
            pass
        finally:
            if proxy_relay is not None:
                proxy_relay.close()
        raise
    return driver
def quit_browser(driver: WebDriver) -> None:
    """Quit the browser and shut down the proxy relay attached to it, if any.

    The relay is closed even when ``driver.quit()`` raises; that exception
    still propagates.
    """
    relay = getattr(driver, "_sacc_proxy_relay", None)
    if relay is None:
        driver.quit()
        return
    try:
        driver.quit()
    finally:
        relay.close()
def open_with_recovery(driver: WebDriver, url: str) -> bool:
    """Navigate to ``url`` and tolerate a page-load timeout.

    Returns:
        ``False`` when the page loaded normally, ``True`` when the load timed
        out and the page was asked to stop loading (best effort).
    """
    try:
        driver.get(url)
    except TimeoutException:
        timed_out = True
    else:
        timed_out = False

    if timed_out:
        try:
            driver.execute_script("window.stop();")
        except WebDriverException:
            # Stopping the load is optional; the caller is told about the
            # timeout either way.
            pass
    return timed_out
def wait_for_ready(driver_wait: WebDriverWait, *, allow_interactive: bool = True) -> str:
    """Block until ``document.readyState`` reaches an acceptable value.

    Args:
        driver_wait: Wait object used for polling; its ``_driver`` attribute
            is queried once more for the returned state.
        allow_interactive: Accept ``"interactive"`` in addition to
            ``"complete"``.

    Returns:
        The ready state read after the wait finished.
    """
    acceptable_states = {"complete"}
    if allow_interactive:
        acceptable_states.add("interactive")

    def _is_ready(web_driver):
        state = web_driver.execute_script("return document.readyState")
        return state in acceptable_states

    driver_wait.until(_is_ready, "The target page did not finish loading in time.")
    final_state = driver_wait._driver.execute_script("return document.readyState")
    return str(final_state)