Spaces:
Running
Running
File size: 8,977 Bytes
0157ac7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 | """Outbound HTTP for web_search / web_fetch (client, body caps, logging)."""
from __future__ import annotations
import asyncio
import socket
from collections.abc import AsyncIterator
from urllib.parse import urljoin, urlparse
import aiohttp
import httpx
from aiohttp import ClientSession, ClientTimeout, TCPConnector
from aiohttp.abc import AbstractResolver, ResolveResult
from loguru import logger
from . import constants
from .constants import (
_MAX_FETCH_CHARS,
_MAX_SEARCH_RESULTS,
_REDIRECT_RESPONSE_BODY_CAP_BYTES,
_REQUEST_TIMEOUT_S,
_WEB_FETCH_REDIRECT_STATUSES,
_WEB_TOOL_HTTP_HEADERS,
)
from .egress import (
WebFetchEgressPolicy,
WebFetchEgressViolation,
get_validated_stream_addrinfos_for_egress,
)
from .parsers import HTMLTextParser, SearchResultParser
def _safe_public_host_for_logs(url: str) -> str:
host = urlparse(url).hostname or ""
return host[:253]
def _log_web_tool_failure(
tool_name: str,
error: BaseException,
*,
fetch_url: str | None = None,
) -> None:
exc_type = type(error).__name__
if isinstance(error, WebFetchEgressViolation):
host = _safe_public_host_for_logs(fetch_url) if fetch_url else ""
logger.warning(
"web_tool_egress_rejected tool={} exc_type={} host={!r}",
tool_name,
exc_type,
host,
)
return
if tool_name == "web_fetch" and fetch_url:
logger.warning(
"web_tool_failure tool={} exc_type={} host={!r}",
tool_name,
exc_type,
_safe_public_host_for_logs(fetch_url),
)
else:
logger.warning("web_tool_failure tool={} exc_type={}", tool_name, exc_type)
def _web_tool_client_error_summary(
tool_name: str,
error: BaseException,
*,
verbose: bool,
) -> str:
if verbose:
return f"{tool_name} failed: {type(error).__name__}"
return "Web tool request failed."
async def _iter_response_body_under_cap(
response: httpx.Response, max_bytes: int
) -> AsyncIterator[bytes]:
if max_bytes <= 0:
return
received = 0
async for chunk in response.aiter_bytes(chunk_size=65_536):
if received >= max_bytes:
break
remaining = max_bytes - received
if len(chunk) <= remaining:
received += len(chunk)
yield chunk
if received >= max_bytes:
break
else:
yield chunk[:remaining]
break
async def _drain_response_body_capped(response: httpx.Response, max_bytes: int) -> None:
async for _ in _iter_response_body_under_cap(response, max_bytes):
pass
async def _read_response_body_capped(response: httpx.Response, max_bytes: int) -> bytes:
return b"".join(
[piece async for piece in _iter_response_body_under_cap(response, max_bytes)]
)
_NUMERIC_RESOLVE_FLAGS = socket.AI_NUMERICHOST | socket.AI_NUMERICSERV
_NAME_RESOLVE_FLAGS = socket.NI_NUMERICHOST | socket.NI_NUMERICSERV
def getaddrinfo_rows_to_resolve_results(
host: str, addrinfos: list[tuple]
) -> list[ResolveResult]:
"""Map :func:`socket.getaddrinfo` rows to aiohttp :class:`ResolveResult` (ThreadedResolver logic)."""
out: list[ResolveResult] = []
for family, _type, proto, _canon, sockaddr in addrinfos:
if family == socket.AF_INET6:
if len(sockaddr) < 3:
continue
if sockaddr[3]:
resolved_host, port = socket.getnameinfo(sockaddr, _NAME_RESOLVE_FLAGS)
else:
resolved_host, port = sockaddr[:2]
else:
assert family == socket.AF_INET, family
resolved_host, port = sockaddr[0], sockaddr[1]
resolved_host = str(resolved_host)
port = int(port)
out.append(
ResolveResult(
hostname=host,
host=resolved_host,
port=int(port),
family=family,
proto=proto,
flags=_NUMERIC_RESOLVE_FLAGS,
)
)
return out
class _PinnedEgressStaticResolver(AbstractResolver):
"""Return only pre-validated :class:`ResolveResult` for the outbound request."""
def __init__(self, results: list[ResolveResult]) -> None:
self._results = results
async def resolve(
self, host: str, port: int = 0, family: int = socket.AF_INET
) -> list[ResolveResult]:
return self._results
async def close(self) -> None: # pragma: no cover - aiohttp contract
return
async def _read_aiohttp_body_capped(
response: aiohttp.ClientResponse, max_bytes: int
) -> bytes:
received = 0
parts: list[bytes] = []
async for chunk in response.content.iter_chunked(65_536):
if received >= max_bytes:
break
remaining = max_bytes - received
if len(chunk) <= remaining:
received += len(chunk)
parts.append(chunk)
else:
parts.append(chunk[:remaining])
break
return b"".join(parts)
async def _drain_aiohttp_body_capped(
response: aiohttp.ClientResponse, max_bytes: int
) -> None:
if max_bytes <= 0:
return
received = 0
async for chunk in response.content.iter_chunked(65_536):
received += len(chunk)
if received >= max_bytes:
break
async def _run_web_search(query: str) -> list[dict[str, str]]:
async with (
httpx.AsyncClient(
timeout=_REQUEST_TIMEOUT_S,
follow_redirects=True,
headers=_WEB_TOOL_HTTP_HEADERS,
) as client,
client.stream(
"GET",
"https://lite.duckduckgo.com/lite/",
params={"q": query},
) as response,
):
response.raise_for_status()
body_bytes = await _read_response_body_capped(
response, constants._MAX_WEB_FETCH_RESPONSE_BYTES
)
text = body_bytes.decode("utf-8", errors="replace")
parser = SearchResultParser()
parser.feed(text)
return parser.results[:_MAX_SEARCH_RESULTS]
async def _run_web_fetch(url: str, egress: WebFetchEgressPolicy) -> dict[str, str]:
"""Fetch URL with manual redirects; each hop is DNS-pinned to validated addresses."""
current_url = url
redirect_hops = 0
timeout = ClientTimeout(total=_REQUEST_TIMEOUT_S)
while True:
addr_infos = await asyncio.to_thread(
get_validated_stream_addrinfos_for_egress, current_url, egress
)
host = urlparse(current_url).hostname or ""
results = getaddrinfo_rows_to_resolve_results(host, addr_infos)
resolver = _PinnedEgressStaticResolver(results)
connector = TCPConnector(
resolver=resolver,
force_close=True,
)
try:
async with (
ClientSession(
timeout=timeout,
headers=_WEB_TOOL_HTTP_HEADERS,
connector=connector,
) as session,
session.get(current_url, allow_redirects=False) as response,
):
if response.status in _WEB_FETCH_REDIRECT_STATUSES:
await _drain_aiohttp_body_capped(
response, _REDIRECT_RESPONSE_BODY_CAP_BYTES
)
if redirect_hops >= constants._MAX_WEB_FETCH_REDIRECTS:
raise WebFetchEgressViolation(
"web_fetch exceeded maximum redirects "
f"({constants._MAX_WEB_FETCH_REDIRECTS})"
)
location = response.headers.get("location")
if not location or not location.strip():
raise WebFetchEgressViolation(
"web_fetch redirect response missing Location header"
)
current_url = urljoin(str(response.url), location.strip())
redirect_hops += 1
continue
response.raise_for_status()
content_type = response.headers.get("content-type", "text/plain")
final_url = str(response.url)
encoding = response.get_encoding() or "utf-8"
body_bytes = await _read_aiohttp_body_capped(
response, constants._MAX_WEB_FETCH_RESPONSE_BYTES
)
finally:
await connector.close()
break
text = body_bytes.decode(encoding, errors="replace")
title = final_url
data = text
if "html" in content_type.lower():
parser = HTMLTextParser()
parser.feed(text)
title = parser.title or final_url
data = "\n".join(parser.text_parts)
return {
"url": final_url,
"title": title,
"media_type": "text/plain",
"data": data[:_MAX_FETCH_CHARS],
}
|