Spaces:
Sleeping
Sleeping
| # Core web scraping logic | |
| import asyncio | |
| import logging | |
| from playwright.async_api import Locator, Page | |
| from urllib.parse import urlparse | |
| from .locator import locate_home_page_link, locate_info_page_links, normalize_url, scrape_page_content | |
| from .utils import navigate_to_link | |
| from .config import settings | |
| logger = logging.getLogger('fastapi_cli') | |
async def navigate_and_scrape(
    page: Page,
    *,
    verbose: bool = False,
) -> dict[str, dict[str, str]]:
    r"""
    Navigate to info and home pages and scrape their content.

    Tries each candidate info/about link in order until one can be navigated
    to and scraped, then tries the home-page link. The page is always
    restored to its original URL before returning.

    Args:
        page (Page): The page to navigate from.
        verbose (bool, optional): Whether to print verbose logs. Defaults to False.

    Returns:
        dict[str, dict[str, str]]: Dictionary containing scraped data under the
            keys ``'info_page'`` and/or ``'home_page'`` (either may be absent
            if the corresponding page could not be located or scraped).
    """
    original_url = page.url
    parsed_url = urlparse(original_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    scraped_data: dict[str, dict[str, str]] = {}
    # Tracks absolute URLs already attempted so the home-page phase does not
    # re-scrape a page the info-page phase already visited.
    seen_urls: set[str] = set()
    try:
        # First try info/about page candidates.
        info_links = await locate_info_page_links(page)
        if verbose:
            logger.info(f"Current URL: {original_url}")
            logger.info(
                f"Located the following info page links: {await asyncio.gather(*(link.get_attribute('href') for link in info_links))}"
            )
        for link in info_links:
            try:
                href = await link.get_attribute("href", timeout=30000)
                absolute_url = normalize_url(href, base_url)
                if absolute_url in seen_urls:
                    # Duplicate target; no navigation happened, so no reset needed.
                    continue
                seen_urls.add(absolute_url)
                if await navigate_to_link(page, link):
                    scraped_data['info_page'] = await scrape_page_content(page)
                    break
                logger.warning("Failed to navigate to the info page link.")
            except Exception as e:
                logger.warning(f"Failed to navigate to info page: {e}")
            # The failed attempt may have left the page in an unknown state;
            # return to the original page before trying the next candidate.
            # (Previously this reset ran after the loop, gated on a loop index
            # that was undefined for empty link lists and could fire a false
            # warning even after a successful scrape.)
            logger.warning("Failed to scrape info page. Returning to the original page.")
            await page.goto(original_url, wait_until="domcontentloaded")
        # Return to original page before trying home page.
        await page.goto(original_url, wait_until="domcontentloaded")
        # Try home page.
        home_link = await locate_home_page_link(page)
        if home_link:
            try:
                href = await home_link.get_attribute("href", timeout=30000)
                absolute_url = normalize_url(href, base_url)
                if absolute_url not in seen_urls:
                    seen_urls.add(absolute_url)
                    if not await navigate_to_link(page, home_link):
                        logger.warning("Failed to navigate to the home page link.")
                    else:
                        scraped_data['home_page'] = await scrape_page_content(page)
            except Exception as e:
                logger.warning(f"Failed to navigate to home page: {e}")
    finally:
        # Always return to original page, even if an unexpected error escaped.
        await page.goto(original_url, wait_until="domcontentloaded")
    if verbose:
        logger.info(f"Scraped data: {scraped_data}")
    return scraped_data