from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Awaitable, Callable
from urllib.parse import urlsplit

import aiohttp

from .config import CrawlerConfig
from .models import FetchResult
from .rate_limit import RequestRateLimiter
from .robots import RobotsPolicy
from .utils import is_html_response, normalize_url


@dataclass
class FetchOutcome:
    """Outcome of a single ``fetch_url`` call."""

    # The fetched page, or None when nothing usable was retrieved.
    result: FetchResult | None
    # True when the robots policy refused the requested or the final URL.
    robots_blocked: bool = False


def _hostname(url: str) -> str:
    """Return the lower-cased hostname of *url* without edge dots, or ""."""
    try:
        host = urlsplit(url).hostname
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. unbalanced IPv6
        # brackets); treat them the same as "no hostname".
        return ""
    return (host or "").lower().strip(".")


async def _read_capped(response: aiohttp.ClientResponse, limit: int) -> bytes:
    """Read the response body until EOF or until *limit* bytes were read."""
    chunks: list[bytes] = []
    remaining = limit
    while remaining > 0:
        # StreamReader.read(n) returns *at most* n bytes and may return
        # fewer, so a single call is not enough to fetch the whole body.
        chunk = await response.content.read(remaining)
        if not chunk:
            break  # EOF
        chunks.append(chunk)
        remaining -= len(chunk)
    return b"".join(chunks)


async def fetch_url(
    session: aiohttp.ClientSession,
    url: str,
    *,
    config: CrawlerConfig,
    mark_seen: Callable[[str], Awaitable[None]],
    rate_limiter: RequestRateLimiter,
    robots_policy: RobotsPolicy,
) -> FetchOutcome:
    """Fetch *url* (following redirects) and return the outcome.

    Args:
        session: aiohttp session used to issue the GET request.
        url: URL to request.
        config: Supplies ``max_response_bytes``, the cap on the body size
            that is read and decoded.
        mark_seen: Awaited with the normalized final URL once that URL has
            passed the robots check (before the status code is inspected).
        rate_limiter: Its ``acquire`` is awaited with the requested
            hostname before the request is sent.
        robots_policy: Its ``can_fetch`` is awaited for the requested URL
            and again for the normalized final URL.

    Returns:
        A ``FetchOutcome`` whose ``result`` is:

        * ``None`` with ``robots_blocked=True`` when ``can_fetch`` refuses
          the requested or the final URL;
        * ``None`` when either URL has no usable hostname, the final URL
          does not normalize, the status is >= 400, or any exception is
          raised while performing the request;
        * a ``FetchResult`` with ``html=""`` for non-HTML responses;
        * a ``FetchResult`` carrying the decoded body (at most
          ``config.max_response_bytes`` bytes of it) otherwise.
    """
    # Timestamp is taken up front, i.e. before any rate-limit wait.
    fetched_at = datetime.now(timezone.utc).isoformat()

    requested_domain = _hostname(url)
    if not requested_domain:
        return FetchOutcome(result=None)

    if not await robots_policy.can_fetch(url):
        return FetchOutcome(result=None, robots_blocked=True)

    await rate_limiter.acquire(requested_domain)

    try:
        async with session.get(url, allow_redirects=True) as response:
            content_type = response.headers.get("content-type", "").lower()

            final_url = normalize_url(str(response.url))
            if not final_url:
                return FetchOutcome(result=None)
            if not _hostname(final_url):
                return FetchOutcome(result=None)

            # Redirects may land on a URL the robots policy disallows.
            if not await robots_policy.can_fetch(final_url):
                return FetchOutcome(result=None, robots_blocked=True)

            await mark_seen(final_url)

            if response.status >= 400:
                return FetchOutcome(result=None)

            if not is_html_response(content_type, final_url):
                # Non-HTML responses are reported, but the body is not read.
                return FetchOutcome(
                    result=FetchResult(
                        url=final_url,
                        status=response.status,
                        fetched_at=fetched_at,
                        content_type=content_type,
                        html="",
                    )
                )

            raw = await _read_capped(response, config.max_response_bytes)
            try:
                html = raw.decode(response.charset or "utf-8", errors="ignore")
            except LookupError:
                # The server declared a charset Python does not know;
                # fall back to UTF-8 instead of discarding the page.
                html = raw.decode("utf-8", errors="ignore")

            return FetchOutcome(
                result=FetchResult(
                    url=final_url,
                    status=response.status,
                    fetched_at=fetched_at,
                    content_type=content_type,
                    html=html,
                )
            )
    except Exception:
        # Deliberate best-effort: any failure while requesting or reading
        # the page is reported as "no result" rather than raised.
        return FetchOutcome(result=None)