| import os |
| import cv2 |
| import random |
| import numpy as np |
| from typing import List |
|
|
| from selenium import webdriver |
| from selenium.webdriver.chrome.service import Service as ChromeService |
| from selenium.webdriver.chrome.options import Options |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.support.ui import WebDriverWait |
| from selenium.webdriver.support import expected_conditions as EC |
| from webdriver_manager.chrome import ChromeDriverManager |
|
|
|
|
class RenderWorker:
    """Render HTML content in headless Chrome (via Selenium) and crop per-element images.

    The worker loads an HTML template once, asks the page's JavaScript
    ``render`` function to lay out a batch of contents (e.g. math formulas),
    takes one screenshot of the page and cuts out the region of every element
    carrying the ``screenshot`` CSS class.

    The instance owns a browser process; call :meth:`close` (or use the
    worker as a context manager) when done.
    """

    # Upper bound for the browser window height used when taking screenshots.
    MAX_WINDOW_HEIGHT = 10000

    def __init__(self, template_file: str, timeout: int = 15, driver_path: str = None):
        """Start headless Chrome and load the template page.

        Args:
            template_file: URL passed to ``driver.get`` (the template page).
            timeout: Seconds to wait for page elements in explicit waits.
            driver_path: Path to the ChromeDriver binary. When ``None`` the
                driver is installed/located through ``ChromeDriverManager``.

        Raises:
            FileNotFoundError: If ``driver_path`` is given but does not exist.
        """
        # Assigned first so close()/__del__ stay safe if start-up fails part-way.
        self.driver = None
        self.timeout = timeout

        # Fixed window width and the initial height used to measure the
        # difference between the window height and the page's inner height.
        self.window_fix_width = 2000
        self.window_init_height = 300

        opts = Options()
        for flag in (
            "--headless",
            "--no-sandbox",
            "--disable-gpu",
            "--hide-scrollbars",
            "--disable-dev-shm-usage",
            "--log-level=3",
            "--disable-font-antialiasing",
            "--allow-file-access-from-files",
        ):
            opts.add_argument(flag)
        opts.add_experimental_option("excludeSwitches", ["enable-logging"])

        if driver_path is None:
            driver_path = ChromeDriverManager().install()
            print(f"Installed driver_path: {driver_path}")
        elif not os.path.exists(driver_path):
            raise FileNotFoundError(f"ChromeDriver 未找到:{driver_path}")

        # The proxy variables are removed only while the driver starts
        # (presumably so the local ChromeDriver connection bypasses the proxy)
        # and are restored afterwards.
        saved_http_proxy = os.environ.pop("http_proxy", None)
        saved_https_proxy = os.environ.pop("https_proxy", None)
        try:
            service = ChromeService(driver_path)
            self.driver = webdriver.Chrome(service=service, options=opts)
        finally:
            # ``is not None`` so that an empty-string value is restored too.
            if saved_http_proxy is not None:
                os.environ["http_proxy"] = saved_http_proxy
            if saved_https_proxy is not None:
                os.environ["https_proxy"] = saved_https_proxy

        try:
            self.driver.get(template_file)
            self.driver.set_window_size(self.window_fix_width, self.window_init_height)

            # Difference between the requested window height and the height
            # actually available to the page.
            self.outer_height = self.window_init_height - self.driver.execute_script(
                "return window.innerHeight"
            )

            # The template is considered loaded once #container exists.
            WebDriverWait(self.driver, self.timeout).until(
                EC.presence_of_all_elements_located((By.ID, "container"))
            )
        except BaseException:
            # Do not leave a browser process behind when initialisation fails.
            self.close()
            raise

    def __enter__(self):
        """Return the worker itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser when leaving the ``with`` block."""
        self.close()

    def render(self, contents: List[str]) -> List[np.ndarray]:
        """Render a batch of contents and return one cropped image per element.

        Args:
            contents: Strings handed to the page's JavaScript ``render``
                function.

        Returns:
            A list with one entry per ``screenshot`` element found on the
            page: the cropped BGR image, or ``None`` for elements wider than
            the window (see :meth:`get_rects`).

        Raises:
            RuntimeError: If the browser screenshot cannot be decoded.
        """
        self.driver.execute_script(
            "document.body.classList.remove('rendering-complete');"
        )
        # Pass the data as a script argument instead of splicing its Python
        # repr into JavaScript source: Selenium serialises it safely.
        self.driver.execute_script("render(arguments[0], false)", list(contents))

        # The page signals completion by adding the 'rendering-complete' class.
        WebDriverWait(self.driver, self.timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "rendering-complete"))
        )

        # Grow the window so the container fits, within fixed bounds.
        scroll_height = self.driver.execute_script(
            "return document.getElementById('container').scrollHeight"
        )
        safe_outer_height = max(abs(self.outer_height), 100)
        target_height = min(
            max(scroll_height + safe_outer_height, 100), self.MAX_WINDOW_HEIGHT
        )
        self.driver.set_window_size(self.window_fix_width, target_height)

        png = self.driver.get_screenshot_as_png()
        fullpage_img = cv2.imdecode(np.frombuffer(png, np.uint8), cv2.IMREAD_COLOR)
        if fullpage_img is None:
            raise RuntimeError("Failed to decode the browser screenshot")

        return [
            None if rect is None else self._crop_with_border(fullpage_img, rect)
            for rect in self.get_rects()
        ]

    @staticmethod
    def _crop_with_border(image, rect):
        """Cut ``rect`` (x, y, w, h) out of ``image`` with a small random margin.

        The margin is about 3% of the longer side, jittered by a factor in
        [0.8, 1.2], and the crop is clipped to the image bounds.
        """
        x, y, w, h = rect
        base_border = int(max(w, h) * 0.03)
        border_size = int(base_border * random.uniform(0.8, 1.2))

        img_h, img_w = image.shape[:2]
        x1 = max(0, x - border_size)
        y1 = max(0, y - border_size)
        x2 = min(img_w, x + w + border_size)
        y2 = min(img_h, y + h + border_size)
        return image[y1:y2, x1:x2]

    def get_rects(self) -> list:
        """Return position and size of every ``screenshot`` element on the page.

        Returns:
            A list with one entry per element: an ``(x, y, w, h)`` tuple of
            ints, or ``None`` when the element is wider than the fixed window
            width.
        """
        elements = WebDriverWait(self.driver, self.timeout).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "screenshot"))
        )

        rects = []
        for element in elements:
            location = element.location
            size = element.size
            x = int(location["x"])
            y = int(location["y"])
            w = int(size["width"])
            h = int(size["height"])

            # Elements wider than the window are reported as None.
            rects.append(None if w > self.window_fix_width else (x, y, w, h))
        return rects

    def close(self):
        """Quit the browser driver and release its resources.

        Safe to call repeatedly, and on an instance whose ``__init__`` did
        not get as far as creating the driver.
        """
        driver = getattr(self, "driver", None)
        if driver is None:
            return
        try:
            driver.quit()
        finally:
            # Drop the reference even if quit() fails so later calls are no-ops.
            self.driver = None

    def __del__(self):
        """Best-effort shutdown of the browser when the object is destroyed."""
        try:
            self.close()
        except Exception:
            # Errors cannot be reported meaningfully during finalisation.
            pass
|
|