Spaces:
No application file
No application file
| """ | |
| Legacy scraper implementation for performance comparison | |
| """ | |
| import chromedriver_autoinstaller | |
| from selenium import webdriver | |
| from selenium.webdriver.chrome.options import Options | |
| from selenium.webdriver.common.by import By | |
| from selenium.common.exceptions import StaleElementReferenceException | |
| import json | |
| import time | |
| chromedriver_autoinstaller.install() | |
def create_legacy_options():
    """Build the Chrome ``Options`` object used by the legacy scraper.

    The browser binary is pinned to ``/usr/bin/chromium-browser`` and the
    ``--headless``, ``--no-sandbox`` and ``--disable-dev-shm-usage``
    command-line switches are added, in that order.

    Returns:
        The configured ``Options`` instance.
    """
    chrome_options = Options()
    chrome_options.binary_location = "/usr/bin/chromium-browser"
    # Switches are applied in the same order as the legacy configuration.
    for switch in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(switch)
    return chrome_options
def safe_get_attribute(elements, attr):
    """Collect the non-empty values of ``attr`` from ``elements``.

    Each element is queried with ``get_attribute(attr)``. Elements that
    raise ``StaleElementReferenceException`` are skipped, as are elements
    whose attribute value is falsy (for example ``None`` or ``""``).

    Args:
        elements: Iterable of objects exposing ``get_attribute``.
        attr: Name of the attribute to read from every element.

    Returns:
        A list of the truthy attribute values, in iteration order.
    """
    collected = []
    for element in elements:
        try:
            attribute_value = element.get_attribute(attr)
        except StaleElementReferenceException:
            # The element went stale between lookup and read; ignore it.
            continue
        if not attribute_value:
            continue
        collected.append(attribute_value)
    return collected
def legacy_scraper(link: str):
    """Scrape ``link`` with a freshly created Chrome driver.

    A new driver is started on every call (the legacy behaviour), the page
    is loaded, and after a fixed two-second pause the body text plus the
    ``src`` of every ``<script>`` and the ``href`` of every ``<link>`` are
    collected. The driver is always quit, even if scraping fails.

    Args:
        link: URL of the page to load.

    Returns:
        A dict with the keys ``"page_text"``, ``"script_sources"`` and
        ``"link_sources"``.
    """
    browser = webdriver.Chrome(options=create_legacy_options())
    try:
        browser.get(link)
        time.sleep(2)  # Fixed delay for dynamic content

        try:
            body_text = browser.find_element(By.TAG_NAME, "body").text
        except StaleElementReferenceException:
            # Fall back to an empty text if the body element went stale.
            body_text = ""

        # Locate both element groups first, then read their attributes.
        script_elements = browser.find_elements(By.TAG_NAME, "script")
        link_elements = browser.find_elements(By.TAG_NAME, "link")

        return {
            "page_text": body_text,
            "script_sources": safe_get_attribute(script_elements, "src"),
            "link_sources": safe_get_attribute(link_elements, "href"),
        }
    finally:
        browser.quit()