# selenium-scraper / legacy_scraper.py — first commit (f2c46e7) by apexherbert200
"""
Legacy scraper implementation for performance comparison
"""
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
import json
import time
# Download (if needed) a chromedriver matching the locally installed Chrome at
# import time, so webdriver.Chrome() below can start without a manually
# managed driver binary on PATH.
chromedriver_autoinstaller.install()
def create_legacy_options():
    """Build the Chrome options used by the legacy scraper.

    Points Selenium at the system chromium binary and enables the flags
    required to run headless inside a container/CI environment.
    """
    opts = Options()
    opts.binary_location = "/usr/bin/chromium-browser"
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)
    return opts
def safe_get_attribute(elements, attr):
    """Collect the non-empty values of *attr* from each element.

    Elements that went stale between lookup and read are silently
    skipped, matching the legacy scraper's tolerant behaviour.
    """
    collected = []
    for element in elements:
        try:
            attr_value = element.get_attribute(attr)
        except StaleElementReferenceException:
            continue  # element detached from the DOM; ignore it
        if attr_value:
            collected.append(attr_value)
    return collected
def legacy_scraper(link: str):
    """Scrape *link* with a throwaway Chrome driver (legacy path).

    Returns a dict containing the page body text plus the ``src``/``href``
    values of every <script>/<link> tag. A fresh driver is created on each
    call and always quit, even when an exception propagates.
    """
    driver = webdriver.Chrome(options=create_legacy_options())
    try:
        driver.get(link)
        time.sleep(2)  # crude fixed wait for dynamically injected content
        try:
            body_text = driver.find_element(By.TAG_NAME, "body").text
        except StaleElementReferenceException:
            body_text = ""
        script_srcs = safe_get_attribute(
            driver.find_elements(By.TAG_NAME, "script"), "src"
        )
        link_hrefs = safe_get_attribute(
            driver.find_elements(By.TAG_NAME, "link"), "href"
        )
        return {
            "page_text": body_text,
            "script_sources": script_srcs,
            "link_sources": link_hrefs,
        }
    finally:
        driver.quit()  # never leak the browser process