"""Playwright helpers for locating navigation links and scraping page text."""

import asyncio
import logging
from urllib.parse import urlsplit

from playwright.async_api import Locator, Page

logger = logging.getLogger('fastapi_cli')

# XPaths for navigation links
INFO_PAGE_XPATHS = [
    "//a[contains(@href, 'about') or contains(@href, 'company') or contains(@href, 'info') or contains(translate(text(), 'ABOUT', 'about'), 'about')]",
    "//a[contains(text(), '会社概要') or contains(text(), '企業情報')]"
]

HOME_PAGE_XPATHS = [
    "//a[contains(@href, 'home') or contains(@href, 'index') or @href='/']",
    "//a[contains(text(), 'ホーム') or contains(text(), 'HOME')]"
]

# href prefixes that never lead to another page (compared case-insensitively).
_NON_NAVIGABLE_PREFIXES = ("#", "javascript:", "mailto:", "tel:")

# Candidate containers for the main text, most specific first.
_MAIN_CONTENT_SELECTORS = (
    "main",
    "article",
    ".main-content",
    "#main-content",
    ".content",
    "#content",
    "body",  # fallback
)

# Candidate containers for the company-profile section.
_COMPANY_INFO_SELECTORS = (
    ".company-info",
    "#company-info",
    "section:has-text('会社概要')",
    "div:has-text('企業情報')",
    "table:has-text('会社概要')",
    "table:has-text('企業情報')",
)

# Main content must be strictly longer than this many characters to count.
_MIN_MAIN_CONTENT_LENGTH = 100


def normalize_url(url: str, base_url: str) -> str:
    """Normalize URL to absolute form.

    Args:
        url: The href to normalize; may be absolute, protocol-relative
            (``//host/path``), root-relative (``/path``) or relative.
        base_url: The URL that relative hrefs are appended to. It is treated
            as a prefix: any trailing slash is dropped and the href is added
            after it.

    Returns:
        The absolute URL, or ``""`` when ``url`` is empty or whitespace only.
        Absolute and protocol-relative results have trailing slashes removed.
    """
    url = url.strip() if url else ""
    if not url:
        return ""

    # Require the full scheme separator so that a relative path which merely
    # begins with "http" (e.g. "http-status.html") is not taken as absolute.
    if url.lower().startswith(("http://", "https://")):
        return url.rstrip("/")

    # Protocol-relative hrefs already name their host; they only borrow the
    # scheme from the base URL.
    if url.startswith("//"):
        scheme = urlsplit(base_url).scheme or "https"
        return f"{scheme}:{url}".rstrip("/")

    base = base_url.rstrip("/")
    if url.startswith("/"):
        return base + url
    return f"{base}/{url}"


def _is_navigable_href(href: str | None) -> bool:
    """Return True when *href* is non-empty and not a fragment/script/mail/tel link."""
    cleaned = (href or "").strip().lower()
    return bool(cleaned) and not cleaned.startswith(_NON_NAVIGABLE_PREFIXES)


async def locate_info_page_links(page: Page) -> list[Locator]:
    """Locate all possible info/about page links in parallel.

    Args:
        page: The page to search.

    Returns:
        Every link matched by ``INFO_PAGE_XPATHS`` whose href is navigable,
        in XPath order and then match order. A link matched by more than one
        XPath appears once per match.
    """
    per_xpath = await asyncio.gather(
        *(page.locator(xpath).all() for xpath in INFO_PAGE_XPATHS)
    )
    links = [link for matches in per_xpath for link in matches]
    hrefs = await asyncio.gather(*(link.get_attribute("href") for link in links))
    return [link for link, href in zip(links, hrefs) if _is_navigable_href(href)]


async def locate_home_page_link(page: Page) -> Locator | None:
    """Locate the home page link in parallel.

    Args:
        page: The page to search.

    Returns:
        The first link matched by ``HOME_PAGE_XPATHS`` (in XPath order) whose
        href is navigable, or ``None`` when there is no such link.
    """
    per_xpath = await asyncio.gather(
        *(page.locator(xpath).all() for xpath in HOME_PAGE_XPATHS)
    )
    for matches in per_xpath:
        for link in matches:
            if _is_navigable_href(await link.get_attribute("href")):
                return link
    return None


async def _read_meta_content(page: Page, name: str) -> str:
    """Return the ``content`` of ``<meta name=...>``, or ``""`` if absent or unreadable."""
    locator = page.locator(f'meta[name="{name}"]')
    try:
        # Reading an attribute through a locator that matches nothing waits
        # for the element until the timeout expires, so check for presence
        # first. ``.first`` avoids a strictness error on duplicated tags.
        if await locator.count() == 0:
            return ''
        return await locator.first.get_attribute('content') or ''
    except Exception as exc:  # best effort: a meta tag must not abort the scrape
        logger.warning('Failed to get meta %s: %s', name, exc)
        return ''


async def _first_visible_text(
    page: Page,
    selectors: tuple[str, ...],
    *,
    min_length: int,
    failure_message: str,
) -> str:
    """Return the text of the first visible element longer than *min_length*.

    Selectors are tried in order. A failure while handling one selector is
    logged with *failure_message* and the next selector is tried.
    """
    for selector in selectors:
        try:
            for element in await page.locator(selector).all():
                if not await element.is_visible():
                    continue
                text = await element.inner_text()
                if text and len(text) > min_length:
                    return text
        except Exception as exc:  # best effort: fall through to the next selector
            logger.warning('%s: %s', failure_message, exc)
    return ''


async def scrape_page_content(page: Page) -> dict[str, str]:
    """Scrape relevant textual content from the page in parallel.

    Args:
        page: The page to scrape.

    Returns:
        A dict with the keys ``url``, ``title``, ``main_content``,
        ``company_info``, ``meta_description`` and ``meta_keywords``. Any
        field that is missing or could not be read is ``''``.
    """
    url = page.url
    title = await page.title()

    # The four lookups are independent, so run them concurrently.
    meta_description, meta_keywords, main_content, company_info = await asyncio.gather(
        _read_meta_content(page, 'description'),
        _read_meta_content(page, 'keywords'),
        _first_visible_text(
            page,
            _MAIN_CONTENT_SELECTORS,
            min_length=_MIN_MAIN_CONTENT_LENGTH,
            failure_message='Failed to scrape the main page content',
        ),
        _first_visible_text(
            page,
            _COMPANY_INFO_SELECTORS,
            min_length=0,
            failure_message='Failed to scrape company info',
        ),
    )

    return {
        'url': url,
        'title': title,
        'main_content': main_content,
        'company_info': company_info,
        'meta_description': meta_description,
        'meta_keywords': meta_keywords,
    }