webscrapper / scraper /locator.py
AkshayStark's picture
initial setup for scraper
95f7828
import logging
import asyncio
from playwright.async_api import Locator, Page
logger = logging.getLogger('fastapi_cli')
# XPaths for navigation links
# Candidate "about/company" anchors: matches hrefs containing
# about/company/info, anchor text containing "about" (the translate() call
# lower-cases only the letters A,B,O,U,T, enough for that one word), and
# Japanese labels 会社概要 / 企業情報 (company overview / corporate info).
INFO_PAGE_XPATHS = [
"//a[contains(@href, 'about') or contains(@href, 'company') or contains(@href, 'info') or contains(translate(text(), 'ABOUT', 'about'), 'about')]",
"//a[contains(text(), '会社概要') or contains(text(), '企業情報')]"
]
# Candidate anchors pointing back to the site's home page, by href
# (home/index/root path) or by English/Japanese link text.
HOME_PAGE_XPATHS = [
"//a[contains(@href, 'home') or contains(@href, 'index') or @href='/']",
"//a[contains(text(), 'ホーム') or contains(text(), 'HOME')]"
]
def normalize_url(url: str, base_url: str) -> str:
    """Normalize a (possibly relative) URL to absolute form.

    Args:
        url: Raw href value; may be absolute, scheme-relative (``//host/x``),
            root-relative (``/x``), or path-relative (``x``).
        base_url: Absolute base URL of the site being scraped.

    Returns:
        The absolute URL with any trailing slash stripped (so equivalent
        pages compare equal during dedup), or ``""`` for an empty url.
    """
    if not url:
        return ""
    url = url.strip()
    # Explicit scheme prefixes instead of the loose startswith("http"),
    # which would also match e.g. "httpfoo".
    if url.startswith(("http://", "https://")):
        return url.rstrip("/")
    if url.startswith("//"):
        # Scheme-relative URL: borrow the scheme from base_url
        # (the old code would have produced "base//host/..." here).
        scheme = base_url.split("://", 1)[0] if "://" in base_url else "https"
        return (scheme + ":" + url).rstrip("/")
    base = base_url.rstrip("/")
    if url.startswith("/"):
        return (base + url).rstrip("/")
    return (base + "/" + url).rstrip("/")
async def locate_info_page_links(page: Page) -> list[Locator]:
    """Locate all possible info/about page links in parallel.

    Args:
        page: The Playwright page to search.

    Returns:
        Locators whose href is a real navigation target (fragment-only,
        javascript:, mailto: and tel: links are filtered out).
    """
    tasks = [page.locator(xpath).all() for xpath in INFO_PAGE_XPATHS]
    results = await asyncio.gather(*tasks)
    links = [link for result in results for link in result]
    # Fetch every href concurrently — the previous version awaited them
    # one at a time, defeating the "in parallel" intent of this function.
    hrefs = await asyncio.gather(*(link.get_attribute("href") for link in links))
    return [
        link
        for link, href in zip(links, hrefs)
        if href and not href.startswith(("#", "javascript:", "mailto:", "tel:"))
    ]
async def locate_home_page_link(page: Page) -> Locator | None:
    """Locate the home page link in parallel.

    All XPath queries run concurrently; the first candidate with a real
    navigation href is returned, or None when no candidate qualifies.
    """
    candidate_groups = await asyncio.gather(
        *(page.locator(xpath).all() for xpath in HOME_PAGE_XPATHS)
    )
    skip_prefixes = ("#", "javascript:", "mailto:", "tel:")
    for candidate in (link for group in candidate_groups for link in group):
        href = await candidate.get_attribute("href")
        if href and not href.startswith(skip_prefixes):
            return candidate
    return None
async def scrape_page_content(page: Page) -> dict[str, str]:
    """Scrape relevant textual content from the page in parallel.

    Args:
        page: The Playwright page to scrape.

    Returns:
        Dict with keys ``url``, ``title``, ``main_content``,
        ``company_info``, ``meta_description`` and ``meta_keywords``;
        any piece that cannot be scraped is left as ''.
    """
    content = {
        'url': page.url,
        'title': await page.title(),
        'main_content': '',
        'company_info': '',
        'meta_description': '',
        'meta_keywords': ''
    }
    async def get_meta_data():
        """Fetch meta description and keywords concurrently."""
        try:
            # return_exceptions=True keeps one missing/failing meta tag
            # from cancelling (and discarding) the other fetch, which is
            # what a bare gather did before.
            meta_desc, meta_keywords = await asyncio.gather(
                page.locator('meta[name="description"]').get_attribute('content'),
                page.locator('meta[name="keywords"]').get_attribute('content'),
                return_exceptions=True
            )
            if isinstance(meta_desc, BaseException):
                logger.warning(f'Failed to get meta description and keywords: {meta_desc}')
                meta_desc = None
            if isinstance(meta_keywords, BaseException):
                logger.warning(f'Failed to get meta description and keywords: {meta_keywords}')
                meta_keywords = None
            content['meta_description'] = meta_desc or ''
            content['meta_keywords'] = meta_keywords or ''
        except Exception as e:
            logger.warning(f'Failed to get meta description and keywords: {e}')
    async def _first_visible_text(selectors: list[str], min_length: int, label: str) -> str:
        """Return the first visible element text longer than min_length chars.

        Selectors are tried in priority order; visible matches of one
        selector have their texts fetched concurrently. A failing selector
        is logged (using *label* in the message) and the next one is tried.
        """
        for selector in selectors:
            try:
                elements = await page.locator(selector).all()
                tasks = [element.inner_text() for element in elements if await element.is_visible()]
                texts = await asyncio.gather(*tasks)
                valid_texts = [text for text in texts if text and len(text) > min_length]
                if valid_texts:
                    return valid_texts[0]
            except Exception as e:
                logger.warning(f'Failed to scrape {label}: {e}')
        return ''
    async def get_main_content():
        """Fetch the main page content (first sizable visible block)."""
        main_selectors = [
            "main", "article", ".main-content", "#main-content",
            ".content", "#content", "body"  # fallback
        ]
        # >100 chars filters out stray visible fragments (nav text etc.).
        content['main_content'] = await _first_visible_text(
            main_selectors, 100, 'the main page content')
    async def get_company_info():
        """Fetch company information (English classes + Japanese headings)."""
        company_selectors = [
            ".company-info", "#company-info",
            "section:has-text('会社概要')", "div:has-text('企業情報')",
            "table:has-text('会社概要')", "table:has-text('企業情報')"
        ]
        # min_length=0 keeps any non-empty text, matching the old `if text`.
        content['company_info'] = await _first_visible_text(
            company_selectors, 0, 'company info')
    # Run all scraping tasks concurrently
    await asyncio.gather(get_meta_data(), get_main_content(), get_company_info())
    return content