Spaces:

AkshayStark
/

webscrapper

Sleeping

App Files Files Community

webscrapper / scraper /utils.py

AkshayStark

initial setup for scraper

95f7828 about 1 year ago

raw

history blame contribute delete

2.05 kB

	import logging
	from urllib.parse import urlparse

	from playwright.async_api import Locator, Page

	from .config import settings

	# Module-level logger shared by the helpers in this file.
	# NOTE(review): the name is hard-coded to 'fastapi_cli' rather than __name__,
	# presumably so messages reuse the FastAPI CLI logger's configuration — confirm.
	logger = logging.getLogger('fastapi_cli')

	async def navigate_to_link(page: Page, link: Locator) -> bool:
	    r"""
	    Navigate the page to the destination of a link.

	    The link is clicked directly unless its ``target`` attribute is
	    ``_blank`` (i.e. it would open in a new tab). When the link opens in a
	    new tab, or the click attempt raises, the link's ``href`` is resolved
	    against the current page URL and loaded in the current page instead.

	    Args:
	        page (Page): The page to navigate.
	        link (Locator): The link to navigate to.

	    Returns:
	        bool: True if navigated, False otherwise. Failures are logged as
	        warnings and never raised to the caller.
	    """
	    # Local import so this function does not depend on edits to the file's import block.
	    from urllib.parse import urljoin

	    # If the link does not open in a new tab, click it.
	    try:
	        target = await link.get_attribute("target", timeout=settings.PLAYWRIGHT_DEFAULT_TIMEOUT)

	        if target != "_blank":
	            await link.click(timeout=settings.PLAYWRIGHT_DEFAULT_TIMEOUT)
	            await page.wait_for_load_state("domcontentloaded")
	            return True

	    except Exception as e:
	        # Best effort: fall through to the href-based navigation below.
	        logger.warning("Failed to click the link: %s", e)

	    # The link opens in a new tab (or the click failed): load the link's url
	    # in the current page instead.
	    try:
	        href = await link.get_attribute("href", timeout=settings.PLAYWRIGHT_DEFAULT_TIMEOUT)
	        if not href:
	            logger.warning("No href attribute found while trying to navigate to the link.")
	            return False

	        # Resolve the href the same way a browser would, relative to the
	        # current page URL:
	        #   /contact            -> https://example.com/contact
	        #   otoiawase/          -> relative to the current page's directory
	        #   //cdn.example.com/x -> keeps the current scheme
	        #   https://example.com/contact is left untouched
	        destination = urljoin(page.url, href)

	        await page.goto(destination, wait_until="domcontentloaded")

	    except Exception as e:
	        logger.warning("Failed to navigate to the link: %s", e)
	        return False

	    return True