Spaces:
Running
Running
| import asyncio | |
| from typing import Optional | |
| import aiohttp | |
| from src.scraping.exceptions import ArticleNotFoundError, FetchError | |
async def fetch_html(
    url: str,
    timeout_s: float = 20.0,
    user_agent: Optional[str] = None,
) -> str:
    """Fetch the document at *url* with HTTP GET and return its body as text.

    Redirects are followed. A fresh ``aiohttp.ClientSession`` is opened for
    every call and closed before returning.

    Args:
        url: Address to request.
        timeout_s: Total time budget for the request, in seconds.
        user_agent: Value for the ``User-Agent`` header. When omitted or
            empty, a Chrome-on-Linux browser string is sent instead.

    Returns:
        The response body as ``str``.

    Raises:
        ArticleNotFoundError: The server answered with status 404.
        FetchError: Any other HTTP error status, aiohttp client/protocol
            error, or timeout. The underlying exception is attached as
            ``__cause__``.
    """
    headers = {
        "User-Agent": user_agent or (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
        )
    }
    timeout = aiohttp.ClientTimeout(total=timeout_s)
    try:
        async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
            async with session.get(url, allow_redirects=True) as resp:
                # 404 gets its own exception type so callers can tell a
                # missing article apart from a generic fetch failure.
                if resp.status == 404:
                    raise ArticleNotFoundError(f"記事が見つかりません: {url}")
                # Every other error status surfaces as ClientResponseError.
                resp.raise_for_status()
                return await resp.text()
    except ArticleNotFoundError:
        # Re-raise untouched so none of the handlers below can wrap it.
        raise
    except aiohttp.ClientResponseError as e:
        # Chain explicitly so the traceback names the HTTP error as the cause.
        raise FetchError(f"HTTPエラー {e.status}: {url}") from e
    except (
        aiohttp.ClientError,
        aiohttp.http_exceptions.HttpProcessingError,
        asyncio.TimeoutError,
    ) as e:
        # A bare asyncio.TimeoutError stringifies to "", which would leave the
        # message ending in a dangling " - "; fall back to the class name.
        detail = str(e) or type(e).__name__
        raise FetchError(f"ネットワークエラー: {url} - {detail}") from e