# selenium-scraper / legacy_scraper.py — first commit (f2c46e7) by apexherbert200
"""
Legacy scraper implementation for performance comparison
"""
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
import json
import time
# Download (if needed) a chromedriver matching the locally installed Chrome at
# import time, so webdriver.Chrome() below can start without a manually
# managed driver binary on PATH.
chromedriver_autoinstaller.install()
def create_legacy_options():
    """Build the Chrome options used by the legacy scraper.

    Points Selenium at the system chromium binary and enables the flags
    required to run headless inside a container/CI environment.
    """
    opts = Options()
    opts.binary_location = "/usr/bin/chromium-browser"
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)
    return opts
def safe_get_attribute(elements, attr):
    """Collect the non-empty values of *attr* from each element.

    Elements that went stale between lookup and read are silently
    skipped, matching the legacy scraper's tolerant behaviour.
    """
    collected = []
    for element in elements:
        try:
            attr_value = element.get_attribute(attr)
        except StaleElementReferenceException:
            continue  # element detached from the DOM; ignore it
        if attr_value:
            collected.append(attr_value)
    return collected
def legacy_scraper(link: str):
    """Scrape *link* with a throwaway Chrome driver (legacy path).

    Returns a dict containing the page body text plus the ``src``/``href``
    values of every <script>/<link> tag. A fresh driver is created on each
    call and always quit, even when an exception propagates.
    """
    driver = webdriver.Chrome(options=create_legacy_options())
    try:
        driver.get(link)
        time.sleep(2)  # crude fixed wait for dynamically injected content
        try:
            body_text = driver.find_element(By.TAG_NAME, "body").text
        except StaleElementReferenceException:
            body_text = ""
        script_srcs = safe_get_attribute(
            driver.find_elements(By.TAG_NAME, "script"), "src"
        )
        link_hrefs = safe_get_attribute(
            driver.find_elements(By.TAG_NAME, "link"), "href"
        )
        return {
            "page_text": body_text,
            "script_sources": script_srcs,
            "link_sources": link_hrefs,
        }
    finally:
        driver.quit()  # never leak the browser process