Spaces:
Sleeping
Sleeping
| import chromedriver_autoinstaller | |
| from selenium import webdriver | |
| from selenium.webdriver.chrome.options import Options | |
| from selenium.webdriver.common.by import By | |
| from selenium.common.exceptions import StaleElementReferenceException | |
| import json | |
# Install a compatible chromedriver, placing it under /tmp.
# (The original also kept a no-argument install() call as a commented-out
# alternative.)
chromedriver_autoinstaller.install(path="/tmp")

# Chrome options shared by every scraper() call (used as its default).
# A specific browser binary was previously selectable via
#   options.binary_location = "/usr/bin/chromium-browser"
options = Options()
for _flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(_flag)
def _attribute_values(elements, attribute):
    """Return the non-empty values of *attribute* across *elements*."""
    # Read each attribute exactly once: every get_attribute() call is a
    # separate request to the browser.
    values = (element.get_attribute(attribute) for element in elements)
    return [value for value in values if value]


def scraper(link: str, options=options) -> dict:
    """Load *link* in Chrome and collect its visible text and asset URLs.

    Args:
        link: URL of the page to open.
        options: Chrome options used to start the browser; defaults to the
            module-level ``options`` object.

    Returns:
        A dict with three keys:
        ``"page_text"`` - the text of the page's ``<body>`` element,
        ``"script_sources"`` - non-empty ``src`` values of ``<script>`` tags,
        ``"link_sources"`` - non-empty ``href`` values of ``<link>`` tags.

    Raises:
        Any exception raised by Selenium while starting the browser, loading
        the page, or locating elements is propagated to the caller.
    """
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(link)
        page_text = driver.find_element(By.TAG_NAME, "body").text
        script_sources = _attribute_values(
            driver.find_elements(By.TAG_NAME, "script"), "src"
        )
        link_sources = _attribute_values(
            driver.find_elements(By.TAG_NAME, "link"), "href"
        )
    finally:
        # Always shut the browser down, even when loading or scraping fails,
        # so a failed call does not leave a Chrome process behind.
        driver.quit()

    return {
        "page_text": page_text,
        "script_sources": script_sources,
        "link_sources": link_sources,
    }