AutoWS / crawler /fetch.py
Roman190928's picture
Upload AutoWS app files without plan/readme
f55f92e verified
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Awaitable, Callable
from urllib.parse import urlsplit
import aiohttp
from .config import CrawlerConfig
from .models import FetchResult
from .rate_limit import RequestRateLimiter
from .robots import RobotsPolicy
from .utils import is_html_response, normalize_url
@dataclass
class FetchOutcome:
    """Envelope returned by ``fetch_url`` for a single fetch attempt."""

    # The fetched page record, or ``None`` when no record was produced.
    result: FetchResult | None
    # Set to True when a robots policy check refused the URL.
    robots_blocked: bool = False
def _hostname_of(url: str) -> str:
    """Return the lowercased hostname of *url* without edge dots, or "" if none."""
    try:
        host = urlsplit(url).hostname
    except ValueError:
        # urlsplit rejects some malformed netlocs (e.g. an unterminated
        # IPv6 literal); treat those the same as "no hostname".
        return ""
    return (host or "").lower().strip(".")


async def _read_capped(stream, limit: int) -> bytes:
    """Read from *stream* until EOF or until *limit* bytes have been collected."""
    chunks: list[bytes] = []
    remaining = limit
    while remaining > 0:
        # A single read(n) may return fewer than n bytes even when more data
        # is still on the way, so keep reading until EOF (empty chunk).
        chunk = await stream.read(remaining)
        if not chunk:
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return b"".join(chunks)


def _decode_body(raw: bytes, charset: str | None) -> str:
    """Decode *raw* with *charset*, falling back to UTF-8 if it is missing or unknown."""
    try:
        return raw.decode(charset or "utf-8", errors="ignore")
    except LookupError:
        # The server declared a codec Python does not know; do not drop the
        # whole page because of a bad header.
        return raw.decode("utf-8", errors="ignore")


async def fetch_url(
    session: aiohttp.ClientSession,
    url: str,
    *,
    config: CrawlerConfig,
    mark_seen: Callable[[str], Awaitable[None]],
    rate_limiter: RequestRateLimiter,
    robots_policy: RobotsPolicy,
) -> FetchOutcome:
    """Fetch *url* and return the outcome of the attempt.

    The URL is checked against ``robots_policy`` before the request and again
    (using the normalized post-redirect URL) after it. The rate limiter is
    acquired for the hostname of the requested URL only.

    Args:
        session: HTTP session used to issue the GET request (redirects followed).
        url: URL to fetch.
        config: Crawler configuration; ``max_response_bytes`` caps the body size read.
        mark_seen: Awaited with the normalized final URL once it passes the
            robots check, before the status code is inspected.
        rate_limiter: Awaited via ``acquire(hostname)`` before the request.
        robots_policy: Awaited via ``can_fetch(url)`` for both URLs.

    Returns:
        A ``FetchOutcome`` whose ``result`` is ``None`` when the URL has no
        hostname, is refused by robots (``robots_blocked=True``), the final URL
        does not normalize, the status is >= 400, or the request raises.
        Otherwise ``result`` is a ``FetchResult``; its ``html`` is empty for
        responses that ``is_html_response`` rejects.
    """
    fetched_at = datetime.now(timezone.utc).isoformat()

    requested_domain = _hostname_of(url)
    if not requested_domain:
        return FetchOutcome(result=None)

    if not await robots_policy.can_fetch(url):
        return FetchOutcome(result=None, robots_blocked=True)

    await rate_limiter.acquire(requested_domain)

    try:
        async with session.get(url, allow_redirects=True) as response:
            content_type = response.headers.get("content-type", "").lower()

            final_url = normalize_url(str(response.url))
            if not final_url or not _hostname_of(final_url):
                return FetchOutcome(result=None)

            # Redirects may land on a URL that robots rules disallow.
            if not await robots_policy.can_fetch(final_url):
                return FetchOutcome(result=None, robots_blocked=True)

            await mark_seen(final_url)

            if response.status >= 400:
                return FetchOutcome(result=None)

            html = ""
            if is_html_response(content_type, final_url):
                raw = await _read_capped(
                    response.content, config.max_response_bytes
                )
                # errors="ignore" also drops a multi-byte character that the
                # byte cap may have cut in half.
                html = _decode_body(raw, response.charset)

            return FetchOutcome(
                result=FetchResult(
                    url=final_url,
                    status=response.status,
                    fetched_at=fetched_at,
                    content_type=content_type,
                    html=html,
                )
            )
    except Exception:
        # Deliberate best-effort: any network or parsing failure yields an
        # empty outcome rather than aborting the crawl.
        return FetchOutcome(result=None)