Spaces:
Running
Running
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from datetime import datetime, timezone | |
| from typing import Awaitable, Callable | |
| from urllib.parse import urlsplit | |
| import aiohttp | |
| from .config import CrawlerConfig | |
| from .models import FetchResult | |
| from .rate_limit import RequestRateLimiter | |
| from .robots import RobotsPolicy | |
| from .utils import is_html_response, normalize_url | |
@dataclass
class FetchOutcome:
    """Result wrapper returned by ``fetch_url``.

    The class must be a dataclass: callers build it with keyword arguments
    (``FetchOutcome(result=None, robots_blocked=True)``), which requires the
    generated ``__init__``.
    """

    # The fetched page, or None when nothing usable was retrieved.
    result: FetchResult | None
    # True when the fetch was refused because the robots policy disallowed
    # the requested URL or the post-redirect URL.
    robots_blocked: bool = False
def _normalized_host(url: str) -> str:
    """Return the lowercased hostname of *url* with edge dots stripped, or ""."""
    return (urlsplit(url).hostname or "").lower().strip(".")


async def _read_capped(response: aiohttp.ClientResponse, limit: int) -> bytes:
    """Read at most *limit* bytes of the response body.

    ``StreamReader.read(n)`` returns as soon as *any* data is buffered, so a
    single call may yield only the first network chunk. Keep reading until
    EOF or until the cap is reached so large pages are not silently truncated.
    """
    chunks: list[bytes] = []
    remaining = limit
    while remaining > 0:
        chunk = await response.content.read(remaining)
        if not chunk:  # empty bytes signals EOF
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return b"".join(chunks)


async def fetch_url(
    session: aiohttp.ClientSession,
    url: str,
    *,
    config: CrawlerConfig,
    mark_seen: Callable[[str], Awaitable[None]],
    rate_limiter: RequestRateLimiter,
    robots_policy: RobotsPolicy,
) -> FetchOutcome:
    """Fetch *url* and return the outcome.

    Args:
        session: HTTP session used to issue the GET request (redirects are
            followed).
        url: URL to fetch.
        config: Supplies ``max_response_bytes``, the cap on body bytes read.
        mark_seen: Awaited with the normalized post-redirect URL once it has
            passed the robots check.
        rate_limiter: ``acquire`` is awaited with the requested hostname
            before the request is sent.
        robots_policy: ``can_fetch`` is awaited for both the requested URL and
            the post-redirect URL.

    Returns:
        A ``FetchOutcome`` whose ``result`` is ``None`` when the URL has no
        hostname, robots disallows it (``robots_blocked=True``), the status is
        >= 400, or any exception occurs during the request. For non-HTML
        responses ``result.html`` is the empty string; otherwise it holds the
        decoded body, truncated to ``config.max_response_bytes`` bytes.
    """
    fetched_at = datetime.now(timezone.utc).isoformat()

    requested_domain = _normalized_host(url)
    if not requested_domain:
        return FetchOutcome(result=None)

    if not await robots_policy.can_fetch(url):
        return FetchOutcome(result=None, robots_blocked=True)

    await rate_limiter.acquire(requested_domain)

    try:
        async with session.get(url, allow_redirects=True) as response:
            content_type = response.headers.get("content-type", "").lower()

            final_url = normalize_url(str(response.url))
            if not final_url:
                return FetchOutcome(result=None)
            if not _normalized_host(final_url):
                return FetchOutcome(result=None)

            # A redirect may have landed on a URL robots disallows, so the
            # policy is consulted again for the final location.
            if not await robots_policy.can_fetch(final_url):
                return FetchOutcome(result=None, robots_blocked=True)

            await mark_seen(final_url)

            if response.status >= 400:
                return FetchOutcome(result=None)

            if is_html_response(content_type, final_url):
                raw = await _read_capped(response, config.max_response_bytes)
                try:
                    html = raw.decode(response.charset or "utf-8", errors="ignore")
                except LookupError:
                    # The server declared a charset Python does not know;
                    # fall back to UTF-8 rather than dropping the page.
                    html = raw.decode("utf-8", errors="ignore")
            else:
                # Non-HTML responses are recorded without reading the body.
                html = ""

            return FetchOutcome(
                result=FetchResult(
                    url=final_url,
                    status=response.status,
                    fetched_at=fetched_at,
                    content_type=content_type,
                    html=html,
                )
            )
    except Exception:
        # Deliberate best-effort: any failure while fetching or processing
        # this URL is reported as "no result" instead of propagating.
        return FetchOutcome(result=None)