| | """Selenium web scraping module.""" |
| | from __future__ import annotations |
| |
|
| | import logging |
| | from pathlib import Path |
| | from sys import platform |
| |
|
| | from bs4 import BeautifulSoup |
| | from selenium import webdriver |
| | from selenium.webdriver.chrome.options import Options as ChromeOptions |
| | from selenium.webdriver.common.by import By |
| | from selenium.webdriver.firefox.options import Options as FirefoxOptions |
| | from selenium.webdriver.remote.webdriver import WebDriver |
| | from selenium.webdriver.safari.options import Options as SafariOptions |
| | from selenium.webdriver.support import expected_conditions as EC |
| | from selenium.webdriver.support.wait import WebDriverWait |
| | from webdriver_manager.chrome import ChromeDriverManager |
| | from webdriver_manager.firefox import GeckoDriverManager |
| |
|
| | import autogpt.processing.text as summary |
| | from autogpt.config import Config |
| | from autogpt.processing.html import extract_hyperlinks, format_hyperlinks |
| |
|
| | FILE_DIR = Path(__file__).parent.parent |
| | CFG = Config() |
| |
|
| |
|
def browse_website(url: str, question: str) -> tuple[str, WebDriver]:
    """Browse a website and return the answer and links to the user

    Args:
        url (str): The url of the website to browse
        question (str): The question asked by the user

    Returns:
        tuple[str, WebDriver]: The answer and links to the user and the webdriver
    """
    driver, text = scrape_text_with_selenium(url)
    add_header(driver)
    summary_text = summary.summarize_text(url, text, question, driver)
    links = scrape_links_with_selenium(driver, url)

    # Cap the number of links so the returned message stays short
    if len(links) > 5:
        links = links[:5]
    close_browser(driver)
    return f"Answer gathered from website: {summary_text} \n \n Links: {links}", driver
def scrape_text_with_selenium(url: str) -> tuple[WebDriver, str]:
    """Scrape text from a website using selenium

    Args:
        url (str): The url of the website to scrape

    Returns:
        tuple[WebDriver, str]: The webdriver and the text scraped from the website
    """
    logging.getLogger("selenium").setLevel(logging.CRITICAL)

    options_available = {
        "chrome": ChromeOptions,
        "safari": SafariOptions,
        "firefox": FirefoxOptions,
    }

    options = options_available[CFG.selenium_web_browser]()
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.49 Safari/537.36"
    )

    if CFG.selenium_web_browser == "firefox":
        driver = webdriver.Firefox(
            executable_path=GeckoDriverManager().install(), options=options
        )
    elif CFG.selenium_web_browser == "safari":
        # Safari ships with its own driver (safaridriver); no download is
        # needed, but remote automation must be enabled in Safari first.
        driver = webdriver.Safari(options=options)
    else:
        if platform in ("linux", "linux2"):
            options.add_argument("--disable-dev-shm-usage")
            options.add_argument("--remote-debugging-port=9222")

        options.add_argument("--no-sandbox")
        if CFG.selenium_headless:
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")

        driver = webdriver.Chrome(
            executable_path=ChromeDriverManager().install(), options=options
        )
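        # Note: Selenium 4.10+ removed the executable_path keyword used above.
        # On those releases the equivalent construction (a sketch, assuming
        # selenium>=4.10) would be:
        #
        #   from selenium.webdriver.chrome.service import Service
        #   driver = webdriver.Chrome(
        #       service=Service(ChromeDriverManager().install()), options=options
        #   )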
    driver.get(url)

    # Wait up to 10 seconds for the page body to be present before scraping
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )

    # Get the HTML content directly from the browser's DOM
    page_source = driver.execute_script("return document.body.outerHTML;")
    soup = BeautifulSoup(page_source, "html.parser")

    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    # Split on double spaces so multi-word phrases survive the cleanup
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = "\n".join(chunk for chunk in chunks if chunk)
    return driver, text
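
# Illustrative behaviour of the whitespace cleanup above (assuming the
# double-space split): "  Title  \n\n   Some   text  " -> "Title\nSome\ntext".
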
def scrape_links_with_selenium(driver: WebDriver, url: str) -> list[str]:
    """Scrape links from a website using selenium

    Args:
        driver (WebDriver): The webdriver to use to scrape the links
        url (str): The url the page was scraped from, used to resolve
            relative links

    Returns:
        list[str]: The links scraped from the website
    """
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")

    for script in soup(["script", "style"]):
        script.extract()

    hyperlinks = extract_hyperlinks(soup, url)

    return format_hyperlinks(hyperlinks)
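
# Each entry returned here is a formatted hyperlink string, roughly of the
# form "link text (url)" — see autogpt.processing.html for the exact format.
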
def close_browser(driver: WebDriver) -> None:
    """Close the browser

    Args:
        driver (WebDriver): The webdriver to close

    Returns:
        None
    """
    driver.quit()
def add_header(driver: WebDriver) -> None:
    """Add a header to the website

    Args:
        driver (WebDriver): The webdriver to use to add the header

    Returns:
        None
    """
    # Read the overlay script via a context manager so the file handle is closed
    with open(FILE_DIR / "js" / "overlay.js") as overlay_file:
        driver.execute_script(overlay_file.read())
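
# Minimal smoke test for manual runs (a sketch under assumptions: network
# access, a browser matching CFG.selenium_web_browser installed, a configured
# LLM backend for summarize_text, and "https://example.com" as a stand-in URL):
if __name__ == "__main__":
    demo_answer, demo_driver = browse_website(
        "https://example.com", "What is this page about?"
    )
    print(demo_answer)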