Spaces:
Sleeping
Sleeping
File size: 1,381 Bytes
fb05e78 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import asyncio
from typing import Optional
import aiohttp
from src.scraping.exceptions import ArticleNotFoundError, FetchError
async def fetch_html(
    url: str,
    timeout_s: float = 20.0,
    user_agent: Optional[str] = None,
) -> str:
    """Fetch the body of ``url`` as text over HTTP(S).

    A fresh ``aiohttp.ClientSession`` is opened for each call, redirects are
    followed, and the response body is decoded by ``resp.text()``.

    Args:
        url: Address of the page to download.
        timeout_s: Value passed as ``total`` to ``aiohttp.ClientTimeout``,
            in seconds.
        user_agent: Value for the ``User-Agent`` header. When ``None`` or
            empty, a desktop Chrome-style string is sent instead.

    Returns:
        The decoded response body.

    Raises:
        ArticleNotFoundError: The server answered with HTTP 404.
        FetchError: Any other error status reported by
            ``raise_for_status()``, or an aiohttp client error, HTTP
            processing error, or timeout. The underlying exception is
            attached as ``__cause__``.
    """
    headers = {
        "User-Agent": user_agent or (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
        )
    }
    timeout = aiohttp.ClientTimeout(total=timeout_s)

    try:
        async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
            async with session.get(url, allow_redirects=True) as resp:
                # 404 gets its own exception type so callers can tell a
                # missing article apart from other failures.
                if resp.status == 404:
                    raise ArticleNotFoundError(f"記事が見つかりません: {url}")
                resp.raise_for_status()
                return await resp.text()
    except ArticleNotFoundError:
        # Let the not-found signal through untouched; it must never be
        # rewrapped as FetchError by the handlers below.
        raise
    except aiohttp.ClientResponseError as e:
        # Must precede the ClientError handler: ClientResponseError is a
        # ClientError subclass and carries the HTTP status code.
        raise FetchError(f"HTTPエラー {e.status}: {url}") from e
    except (
        aiohttp.ClientError,
        aiohttp.http_exceptions.HttpProcessingError,
        asyncio.TimeoutError,
    ) as e:
        # Timeout exceptions commonly stringify to "", which would leave the
        # message ending in a bare " - "; fall back to the class name.
        detail = str(e) or type(e).__name__
        raise FetchError(f"ネットワークエラー: {url} - {detail}") from e
|