webscrapper / scraper /navigator.py
AkshayStark's picture
initial setup for scrapper
95f7828
# Core web scraping logic
import asyncio
import logging
from playwright.async_api import Locator, Page
from urllib.parse import urlparse
from .locator import locate_home_page_link, locate_info_page_links, normalize_url, scrape_page_content
from .utils import navigate_to_link
from .config import settings
# Module-wide logger. It is registered under the fixed name 'fastapi_cli'
# rather than this module's own name -- presumably to reuse that logger's
# handlers/formatting (TODO confirm against the app's logging setup).
logger = logging.getLogger(name="fastapi_cli")
async def navigate_and_scrape(
    page: Page,
    *,
    verbose: bool = False,
) -> dict[str, dict[str, str]]:
    """Scrape the first reachable info page and the home page linked from ``page``.

    Starting from the URL ``page`` is currently on, the links returned by
    ``locate_info_page_links`` are tried in order until one of them is
    navigated to and scraped. The page is then sent back to the starting URL
    and the link returned by ``locate_home_page_link`` (if any) is tried the
    same way. Normalized link targets are remembered so the same URL is not
    visited twice.

    Args:
        page: The page to navigate from. It is navigated back to its starting
            URL before this function returns or raises.
        verbose: When True, log the starting URL, the candidate info links
            and the final scraped data.

    Returns:
        A dictionary that may contain an ``'info_page'`` entry and/or a
        ``'home_page'`` entry, each holding what ``scrape_page_content``
        returned for that page. A key is absent when the corresponding page
        could not be scraped.

    Raises:
        Exception: Failures on an individual link are logged and skipped.
            Errors raised while locating the links or while navigating back
            to the starting URL are not caught here and propagate.
    """
    original_url = page.url
    parsed_url = urlparse(original_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    scraped_data: dict[str, dict[str, str]] = {}
    seen_urls = set()

    try:
        # First try info/about page
        info_links = await locate_info_page_links(page)
        if verbose:
            logger.info(f"Current URL: {original_url}")
            # return_exceptions=True: an href that cannot be read must not
            # abort the whole scrape just because diagnostics were requested.
            hrefs = await asyncio.gather(
                *(link.get_attribute('href') for link in info_links),
                return_exceptions=True,
            )
            logger.info(f"Located the following info page links: {hrefs}")

        last_index = len(info_links) - 1
        for i, link in enumerate(info_links):
            try:
                # Playwright timeouts are expressed in milliseconds.
                href = await link.get_attribute("href", timeout=30000)
                absolute_url = normalize_url(href, base_url)
                if absolute_url in seen_urls:
                    # Nothing was navigated for a duplicate, so no reset needed.
                    continue
                seen_urls.add(absolute_url)
                if await navigate_to_link(page, link):
                    scraped_data['info_page'] = await scrape_page_content(page)
                    break
                logger.warning("Failed to navigate to the info page link.")
            except Exception as e:
                logger.warning(f"Failed to navigate to info page: {e}")

            # Only reached when this link failed (success breaks out above,
            # duplicates continue). The remaining links were located on the
            # original page, so go back there before trying the next one.
            # Previously this reset sat behind unconditional `continue`s and
            # could never run.
            if i < last_index:
                logger.warning("Failed to scrape info page. Returning to the original page.")
                await page.goto(original_url, wait_until="domcontentloaded")

        # Return to original page before trying home page
        await page.goto(original_url, wait_until="domcontentloaded")

        # Try home page
        home_link = await locate_home_page_link(page)
        if home_link:
            try:
                href = await home_link.get_attribute("href", timeout=30000)
                absolute_url = normalize_url(href, base_url)
                # Skip the home page if an info link already led to the same URL.
                if absolute_url not in seen_urls:
                    seen_urls.add(absolute_url)
                    if await navigate_to_link(page, home_link):
                        scraped_data['home_page'] = await scrape_page_content(page)
                    else:
                        logger.warning("Failed to navigate to the home page link.")
            except Exception as e:
                logger.warning(f"Failed to navigate to home page: {e}")
    finally:
        # Always return to original page
        await page.goto(original_url, wait_until="domcontentloaded")

    if verbose:
        logger.info(f"Scraped data: {scraped_data}")
    return scraped_data