webscrapper / scraper /utils.py
AkshayStark's picture
initial setup for scrapper
95f7828
import logging
from urllib.parse import urlparse
from playwright.async_api import Locator, Page
from .config import settings
# Module-level logger shared by the helpers in this file. It is looked up under
# the fixed name 'fastapi_cli' rather than this module's own ``__name__``
# (presumably so messages surface through the FastAPI CLI's logging setup --
# TODO confirm).
logger = logging.getLogger('fastapi_cli')
async def navigate_to_link(page: Page, link: Locator) -> bool:
    """
    Navigate ``page`` to the destination of ``link``.

    Two strategies are tried in order:

    1. If the link does not declare ``target="_blank"``, click it and wait for
       the ``domcontentloaded`` load state.
    2. Otherwise (or if the click attempt raised), read the link's ``href``,
       resolve it against the current page URL the way a browser would, and
       load it in the current page with ``page.goto``.

    Every failure is logged as a warning and reported through the return
    value; no exception is propagated to the caller.

    Args:
        page (Page): The page to navigate.
        link (Locator): The link to navigate to.

    Returns:
        bool: True if navigated, False otherwise.
    """
    # Imported locally so this function does not rely on a new module-level import.
    from urllib.parse import urljoin

    timeout = settings.PLAYWRIGHT_DEFAULT_TIMEOUT
    # Schemes that never load a document, so handing them to page.goto is pointless.
    non_navigable_schemes = ("mailto", "tel", "javascript")

    # If the link does not open in a new tab, click it.
    try:
        target = await link.get_attribute("target", timeout=timeout)
        # The `_blank` keyword is matched case-insensitively (e.g. `_BLANK`).
        if (target or "").strip().lower() != "_blank":
            await link.click(timeout=timeout)
            await page.wait_for_load_state("domcontentloaded")
            return True
    except Exception as e:
        logger.warning("Failed to click the link: %s", e)

    # If the link opens in a new tab (or the click failed), load the link's url
    # in the current page instead.
    try:
        href = await link.get_attribute("href", timeout=timeout)
        if not href:
            logger.warning("No href attribute found while trying to navigate to the link.")
            return False

        # urljoin applies standard relative-reference resolution, e.g. with the
        # page at https://example.com/a/b:
        #   /contact                 -> https://example.com/contact
        #   otoiawase/               -> https://example.com/a/otoiawase/
        #   //cdn.example.com/x      -> https://cdn.example.com/x
        #   #section                 -> https://example.com/a/b#section
        #   https://other.test/page  -> unchanged
        destination = urljoin(page.url, href.strip())

        scheme = urlparse(destination).scheme.lower()
        if scheme in non_navigable_schemes:
            logger.warning("Skipping non-navigable link: %s", destination)
            return False

        await page.goto(destination, wait_until="domcontentloaded")
        return True
    except Exception as e:
        logger.warning("Failed to navigate to the link: %s", e)

    return False