| import axios, { type AxiosResponse } from 'axios' |
| import { LRUCache } from 'lru-cache' |
| import { |
| type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, |
| logEvent, |
| } from '../../services/analytics/index.js' |
| import { queryHaiku } from '../../services/api/claude.js' |
| import { AbortError } from '../../utils/errors.js' |
| import { getWebFetchUserAgent } from '../../utils/http.js' |
| import { logError } from '../../utils/log.js' |
| import { |
| isBinaryContentType, |
| persistBinaryContent, |
| } from '../../utils/mcpOutputStorage.js' |
| import { getSettings_DEPRECATED } from '../../utils/settings/settings.js' |
| import { asSystemPrompt } from '../../utils/systemPromptType.js' |
| import { isPreapprovedHost } from './preapproved.js' |
| import { makeSecondaryModelPrompt } from './prompt.js' |
|
|
| |
/**
 * Raised when the domain preflight check reports that a host must not be
 * fetched.
 */
class DomainBlockedError extends Error {
  /**
   * @param domain Hostname that was refused; interpolated into the message.
   */
  constructor(domain: string) {
    const message = `Claude Code is unable to fetch from ${domain}`
    super(message)
    // Give the error a stable, class-specific name instead of the default "Error".
    this.name = 'DomainBlockedError'
  }
}
|
|
/**
 * Raised when the domain preflight check could not be completed, so it is
 * unknown whether the host is safe to fetch.
 */
class DomainCheckFailedError extends Error {
  /**
   * @param domain Hostname whose check failed; interpolated into the message.
   */
  constructor(domain: string) {
    const message = `Unable to verify if domain ${domain} is safe to fetch. This may be due to network restrictions or enterprise security policies blocking claude.ai.`
    super(message)
    // Give the error a stable, class-specific name instead of the default "Error".
    this.name = 'DomainCheckFailedError'
  }
}
|
|
/**
 * Raised when a request is rejected by the network egress proxy.
 *
 * The error message is a JSON document with the keys `error_type`, `domain`
 * and `message`, in that order.
 */
class EgressBlockedError extends Error {
  /** Hostname the proxy refused to reach. */
  public readonly domain: string

  /**
   * @param domain Hostname the proxy refused to reach.
   */
  constructor(domain: string) {
    const payload = {
      error_type: 'EGRESS_BLOCKED',
      domain,
      message: `Access to ${domain} is blocked by the network egress proxy.`,
    }
    super(JSON.stringify(payload))
    this.domain = domain
    this.name = 'EgressBlockedError'
  }
}
|
|
| |
/**
 * Value stored in the URL cache for one successfully fetched URL.
 */
type CacheEntry = {
  /** Body text: markdown for `text/html` responses, otherwise the body decoded as UTF-8. */
  content: string
  /** Value of the response `content-type` header, or `''` when the header is absent. */
  contentType: string
  /** Length in bytes of the raw response body. */
  bytes: number
  /** HTTP status code of the response. */
  code: number
  /** HTTP status text of the response. */
  codeText: string
  /** Path of the persisted binary copy; set only when binary content was persisted without error. */
  persistedPath?: string
  /** Size reported for the persisted binary copy; set together with `persistedPath`. */
  persistedSize?: number
}
|
|
| |
| |
/** How long a fetched URL stays in the cache: 15 minutes, in milliseconds. */
const CACHE_TTL_MS = 15 * 60_000

/** Upper bound on the summed entry sizes held by the URL cache: 50 MiB. */
const MAX_CACHE_SIZE_BYTES = 50 * 1024 ** 2

/**
 * Cache of fetched content keyed by the requested URL. Entries are
 * size-tracked: the size of each entry is supplied by the caller when the
 * entry is stored.
 */
const URL_CACHE = new LRUCache<string, CacheEntry>({
  ttl: CACHE_TTL_MS,
  maxSize: MAX_CACHE_SIZE_BYTES,
})
|
|
| |
| |
| |
| |
/**
 * Remembers hostnames whose preflight check came back "allowed", so repeat
 * fetches within five minutes skip the network round trip. Only positive
 * results are stored; at most 128 hostnames are kept.
 */
const DOMAIN_CHECK_CACHE = new LRUCache<string, true>({
  ttl: 5 * 60_000,
  max: 128,
})
|
|
/**
 * Empties both module-level caches: fetched URL content first, then the
 * domain preflight results.
 */
export function clearWebFetchCache(): void {
  for (const cache of [URL_CACHE, DOMAIN_CHECK_CACHE]) {
    cache.clear()
  }
}
|
|
| |
| |
| |
| |
| |
/** Constructor type of the `turndown` module's export. */
type TurndownCtor = typeof import('turndown')

/** Memoized promise for the shared converter; `undefined` until first requested. */
let turndownServicePromise: Promise<InstanceType<TurndownCtor>> | undefined

/**
 * Returns the shared Turndown instance, importing the `turndown` module on
 * the first call only. The promise itself is memoized, so concurrent callers
 * share a single import and a single instance.
 */
function getTurndownService(): Promise<InstanceType<TurndownCtor>> {
  if (turndownServicePromise === undefined) {
    turndownServicePromise = (async () => {
      const loaded = await import('turndown')
      // The constructor is read from the module namespace's `default` export.
      const Turndown = (loaded as unknown as { default: TurndownCtor }).default
      return new Turndown()
    })()
  }
  return turndownServicePromise
}
|
|
| |
| |
| |
| |
| |
| |
| |
/** URLs longer than this many characters are rejected by `validateURL`. */
const MAX_URL_LENGTH = 2_000

/** Largest response body accepted from a fetched URL: 10 MiB (passed to axios as `maxContentLength`). */
const MAX_HTTP_CONTENT_LENGTH = 10 * 1024 ** 2

/** Timeout for the main content request: 60 seconds, in milliseconds. */
const FETCH_TIMEOUT_MS = 60 * 1_000

/** Timeout for the domain preflight request: 10 seconds, in milliseconds. */
const DOMAIN_CHECK_TIMEOUT_MS = 10 * 1_000

/** Maximum redirect depth followed by `getWithPermittedRedirects` before it gives up. */
const MAX_REDIRECTS = 10

/** Content longer than this many characters is truncated before being sent to the secondary model. */
export const MAX_MARKDOWN_LENGTH = 100 * 1_000
|
|
/**
 * Reports whether `url` points at a preapproved host/path combination.
 *
 * @param url Absolute URL to test.
 * @returns The verdict of `isPreapprovedHost` for the URL's hostname and
 *   pathname, or `false` when the URL cannot be parsed or the lookup throws.
 */
export function isPreapprovedUrl(url: string): boolean {
  try {
    const { hostname, pathname } = new URL(url)
    return isPreapprovedHost(hostname, pathname)
  } catch {
    // Unparseable input is never preapproved.
    return false
  }
}
|
|
/**
 * Cheap structural validation of a URL before it is fetched.
 *
 * A URL is accepted when it is at most `MAX_URL_LENGTH` characters long,
 * parses as an absolute URL, carries no username or password, and has a
 * hostname made of at least two dot-separated parts. The protocol is not
 * inspected here.
 *
 * @param url Candidate URL string.
 * @returns `true` when every check passes, `false` otherwise.
 */
export function validateURL(url: string): boolean {
  if (url.length > MAX_URL_LENGTH) return false

  let candidate: URL
  try {
    candidate = new URL(url)
  } catch {
    return false
  }

  // Embedded credentials (user:pass@host) are never accepted.
  const hasCredentials = candidate.username !== '' || candidate.password !== ''
  if (hasCredentials) return false

  // Single-label hostnames such as "localhost" are rejected.
  const labels = candidate.hostname.split('.')
  return labels.length >= 2
}
|
|
/**
 * Outcome of the domain preflight check, discriminated by `status`.
 */
type DomainCheckResult =
  /** The domain may be fetched. */
  | { status: 'allowed' }
  /** The check completed and the domain must not be fetched. */
  | { status: 'blocked' }
  /** The check itself could not be completed; `error` carries the cause. */
  | { status: 'check_failed'; error: Error }
|
|
/**
 * Asks the remote `domain_info` endpoint whether `domain` may be fetched.
 *
 * Positive answers are remembered in `DOMAIN_CHECK_CACHE`; "blocked" and
 * failed checks are not cached. This function does not reject: any thrown
 * error is logged and reported as `check_failed`.
 *
 * @param domain Hostname to check.
 * @returns `allowed`, `blocked`, or `check_failed` with the underlying error.
 */
export async function checkDomainBlocklist(
  domain: string,
): Promise<DomainCheckResult> {
  // A cached hostname was allowed recently; skip the network call.
  if (DOMAIN_CHECK_CACHE.has(domain)) {
    return { status: 'allowed' }
  }

  try {
    const endpoint = `https://api.anthropic.com/api/web/domain_info?domain=${encodeURIComponent(domain)}`
    const { status, data } = await axios.get(endpoint, {
      timeout: DOMAIN_CHECK_TIMEOUT_MS,
    })

    if (status !== 200) {
      return {
        status: 'check_failed',
        error: new Error(`Domain check returned status ${status}`),
      }
    }

    // Anything other than an explicit `can_fetch: true` counts as blocked.
    if (data.can_fetch !== true) {
      return { status: 'blocked' }
    }

    DOMAIN_CHECK_CACHE.set(domain, true)
    return { status: 'allowed' }
  } catch (e) {
    logError(e)
    return { status: 'check_failed', error: e as Error }
  }
}
|
|
| |
| |
| |
| |
| |
| |
| |
/**
 * Decides whether a redirect may be followed automatically.
 *
 * A redirect is permitted only when the target keeps the same protocol and
 * port, carries no username or password, and has the same hostname as the
 * original once a leading `www.` is ignored on both sides.
 *
 * @param originalUrl URL that was requested.
 * @param redirectUrl Absolute URL the server redirected to.
 * @returns `true` when the redirect is permitted; `false` otherwise,
 *   including when either URL cannot be parsed.
 */
export function isPermittedRedirect(
  originalUrl: string,
  redirectUrl: string,
): boolean {
  let source: URL
  let target: URL
  try {
    source = new URL(originalUrl)
    target = new URL(redirectUrl)
  } catch {
    return false
  }

  const targetHasCredentials = target.username !== '' || target.password !== ''
  if (
    target.protocol !== source.protocol ||
    target.port !== source.port ||
    targetHasCredentials
  ) {
    return false
  }

  // Treat "www.example.com" and "example.com" as the same host.
  const withoutWww = (parsed: URL): string =>
    parsed.hostname.startsWith('www.')
      ? parsed.hostname.slice('www.'.length)
      : parsed.hostname

  return withoutWww(source) === withoutWww(target)
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/**
 * Returned instead of a response when the server redirected to a location
 * that the redirect checker did not permit, so the redirect was not followed.
 */
type RedirectInfo = {
  /** Discriminant that distinguishes this value from a regular response. */
  type: 'redirect'
  /** HTTP status code of the redirect response. */
  statusCode: number
  /** URL that was being requested when the redirect was received. */
  originalUrl: string
  /** Redirect target, resolved to an absolute URL against `originalUrl`. */
  redirectUrl: string
}
|
|
/**
 * Performs a GET request, following redirects one hop at a time and only
 * when `redirectChecker` approves each hop.
 *
 * Automatic redirect handling in axios is disabled (`maxRedirects: 0`), so
 * every 301/302/307/308 response surfaces as an error and is handled here.
 *
 * @param url URL to request.
 * @param signal Abort signal forwarded to every request.
 * @param redirectChecker Called with (URL just requested, resolved redirect
 *   target); returns whether the hop may be followed.
 * @param depth Number of redirects already followed; callers normally omit it.
 * @returns The final response, or a `RedirectInfo` describing the first
 *   redirect that `redirectChecker` refused.
 * @throws Error when more than `MAX_REDIRECTS` redirects would be followed,
 *   or when a redirect response has no `Location` header.
 * @throws EgressBlockedError when the response is a 403 carrying the header
 *   `x-proxy-error: blocked-by-allowlist`.
 * @throws Any other request error, unchanged.
 */
export async function getWithPermittedRedirects(
  url: string,
  signal: AbortSignal,
  redirectChecker: (originalUrl: string, redirectUrl: string) => boolean,
  depth = 0,
): Promise<AxiosResponse<ArrayBuffer> | RedirectInfo> {
  const followableStatuses = [301, 302, 307, 308]
  let currentUrl = url

  for (let hop = depth; ; hop++) {
    if (hop > MAX_REDIRECTS) {
      throw new Error(`Too many redirects (exceeded ${MAX_REDIRECTS})`)
    }

    try {
      return await axios.get(currentUrl, {
        signal,
        timeout: FETCH_TIMEOUT_MS,
        maxRedirects: 0,
        responseType: 'arraybuffer',
        maxContentLength: MAX_HTTP_CONTENT_LENGTH,
        headers: {
          Accept: 'text/markdown, text/html, */*',
          'User-Agent': getWebFetchUserAgent(),
        },
      })
    } catch (error) {
      // Only errors that carry an HTTP response get special treatment.
      const failedResponse = axios.isAxiosError(error)
        ? error.response
        : undefined
      if (!failedResponse) {
        throw error
      }

      if (followableStatuses.includes(failedResponse.status)) {
        const redirectLocation = failedResponse.headers.location
        if (!redirectLocation) {
          throw new Error('Redirect missing Location header')
        }

        // Location may be relative; resolve it against the URL just requested.
        const redirectUrl = new URL(redirectLocation, currentUrl).toString()

        if (!redirectChecker(currentUrl, redirectUrl)) {
          // Not permitted: hand the decision back to the caller.
          return {
            type: 'redirect',
            originalUrl: currentUrl,
            redirectUrl,
            statusCode: failedResponse.status,
          }
        }

        currentUrl = redirectUrl
        continue
      }

      if (
        failedResponse.status === 403 &&
        failedResponse.headers['x-proxy-error'] === 'blocked-by-allowlist'
      ) {
        throw new EgressBlockedError(new URL(currentUrl).hostname)
      }

      throw error
    }
  }
}
|
|
/**
 * Type guard that tells a refused-redirect description apart from a regular
 * axios response.
 *
 * @param response Value returned by `getWithPermittedRedirects`.
 * @returns `true` when `response` has a `type` property equal to `'redirect'`.
 */
function isRedirectInfo(
  response: AxiosResponse<ArrayBuffer> | RedirectInfo,
): response is RedirectInfo {
  if (!('type' in response)) {
    return false
  }
  return response.type === 'redirect'
}
|
|
/**
 * Result of successfully fetching a URL with `getURLMarkdownContent`.
 */
export type FetchedContent = {
  /** Body text: markdown for `text/html` responses, otherwise the body decoded as UTF-8. */
  content: string
  /** Value of the response `content-type` header, or `''` when the header is absent. */
  contentType: string
  /** Length in bytes of the raw response body. */
  bytes: number
  /** HTTP status code of the response. */
  code: number
  /** HTTP status text of the response. */
  codeText: string
  /** Path of the persisted binary copy; set only when binary content was persisted without error. */
  persistedPath?: string
  /** Size reported for the persisted binary copy; set together with `persistedPath`. */
  persistedSize?: number
}
|
|
/**
 * Fetches `url` and returns its body as text, converting HTML to markdown,
 * with results cached per requested URL.
 *
 * Steps:
 *  1. Validate the URL and serve from `URL_CACHE` when possible.
 *  2. Upgrade `http:` to `https:` and, unless the `skipWebFetchPreflight`
 *     setting is on, run the domain preflight check.
 *  3. Fetch with `getWithPermittedRedirects`; a refused redirect is returned
 *     to the caller as a `RedirectInfo` and is not cached.
 *  4. Persist binary content to disk (best effort), decode the body as
 *     UTF-8, convert `text/html` bodies to markdown, then cache and return.
 *
 * The returned object is always a fresh copy, never the object held by the
 * cache, so callers may mutate it without affecting later cache hits.
 *
 * @param url URL to fetch; also the cache key (before any https upgrade).
 * @param abortController Its signal is forwarded to the content request.
 * @returns The fetched content, or a `RedirectInfo` for a refused redirect.
 * @throws Error with message `Invalid URL` when `validateURL` rejects `url`.
 * @throws DomainBlockedError when the preflight check reports the host as blocked.
 * @throws DomainCheckFailedError when the preflight check cannot be completed.
 * @throws Whatever `getWithPermittedRedirects` throws (for example
 *   `EgressBlockedError` or the underlying request error).
 */
export async function getURLMarkdownContent(
  url: string,
  abortController: AbortController,
): Promise<FetchedContent | RedirectInfo> {
  if (!validateURL(url)) {
    throw new Error('Invalid URL')
  }

  // Serve from cache when possible. Hand out a copy so the caller cannot
  // mutate the cached object.
  const cachedEntry = URL_CACHE.get(url)
  if (cachedEntry) {
    return { ...cachedEntry }
  }

  let upgradedUrl = url

  try {
    const parsedUrl = new URL(url)

    // Always talk https, even when the caller supplied an http URL.
    if (parsedUrl.protocol === 'http:') {
      parsedUrl.protocol = 'https:'
      upgradedUrl = parsedUrl.toString()
    }

    const hostname = parsedUrl.hostname

    // Domain preflight, skippable through settings.
    const settings = getSettings_DEPRECATED()
    if (!settings.skipWebFetchPreflight) {
      const checkResult = await checkDomainBlocklist(hostname)
      switch (checkResult.status) {
        case 'allowed':
          break
        case 'blocked':
          throw new DomainBlockedError(hostname)
        case 'check_failed':
          throw new DomainCheckFailedError(hostname)
      }
    }

    if (process.env.USER_TYPE === 'ant') {
      logEvent('tengu_web_fetch_host', {
        hostname:
          hostname as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      })
    }
  } catch (e) {
    if (
      e instanceof DomainBlockedError ||
      e instanceof DomainCheckFailedError
    ) {
      // Preflight verdicts must reach the caller.
      throw e
    }
    // Anything else in this preamble is logged and the fetch still proceeds.
    logError(e)
  }

  const response = await getWithPermittedRedirects(
    upgradedUrl,
    abortController.signal,
    isPermittedRedirect,
  )

  // A refused redirect is reported to the caller as-is.
  if (isRedirectInfo(response)) {
    return response
  }

  const rawBuffer = Buffer.from(response.data)
  // Drop the response's own reference to the payload; presumably this lets
  // the original body be reclaimed while the copy is processed (TODO confirm).
  ;(response as { data: unknown }).data = null
  const contentType = response.headers['content-type'] ?? ''

  // Binary payloads are additionally written to disk. A failure to persist
  // is tolerated: the path/size fields simply stay undefined.
  let persistedPath: string | undefined
  let persistedSize: number | undefined
  if (isBinaryContentType(contentType)) {
    const persistId = `webfetch-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
    const result = await persistBinaryContent(rawBuffer, contentType, persistId)
    if (!('error' in result)) {
      persistedPath = result.filepath
      persistedSize = result.size
    }
  }

  const bytes = rawBuffer.length
  const decodedBody = rawBuffer.toString('utf-8')

  // Media types are case-insensitive, so `Text/HTML` must be treated like
  // `text/html` when deciding whether to convert to markdown.
  const isHtml = String(contentType).toLowerCase().includes('text/html')

  let markdownContent: string
  let contentBytes: number
  if (isHtml) {
    markdownContent = (await getTurndownService()).turndown(decodedBody)
    contentBytes = Buffer.byteLength(markdownContent)
  } else {
    // Non-HTML bodies are passed through unchanged.
    markdownContent = decodedBody
    contentBytes = bytes
  }

  const entry: CacheEntry = {
    bytes,
    code: response.status,
    codeText: response.statusText,
    content: markdownContent,
    contentType,
    persistedPath,
    persistedSize,
  }
  // The cache is size-tracked and is given a size of at least 1, so empty
  // bodies still count.
  URL_CACHE.set(url, entry, { size: Math.max(1, contentBytes) })

  // Return a copy rather than the cached object itself, matching the
  // cache-hit path above.
  return { ...entry }
}
|
|
/**
 * Cuts `markdown` down to at most `maxLength` UTF-16 code units and appends
 * a truncation notice. Text that already fits is returned unchanged. The cut
 * never lands between the two halves of a surrogate pair; in that case one
 * code unit fewer is kept.
 */
function truncateMarkdownForPrompt(markdown: string, maxLength: number): string {
  if (markdown.length <= maxLength) {
    return markdown
  }

  let end = maxLength
  const lastKept = markdown.charCodeAt(end - 1)
  const firstDropped = markdown.charCodeAt(end)
  const splitsSurrogatePair =
    lastKept >= 0xd800 &&
    lastKept <= 0xdbff &&
    firstDropped >= 0xdc00 &&
    firstDropped <= 0xdfff
  if (splitsSurrogatePair) {
    // Keeping the high surrogate alone would leave an ill-formed string.
    end -= 1
  }

  return markdown.slice(0, end) + '\n\n[Content truncated due to length...]'
}

/**
 * Runs the user's prompt against fetched content using the secondary model
 * and returns the model's text answer.
 *
 * Content longer than `MAX_MARKDOWN_LENGTH` is truncated first, with a
 * notice appended.
 *
 * @param prompt What the user wants extracted from the content.
 * @param markdownContent Fetched content to run the prompt against.
 * @param signal Abort signal forwarded to the model call.
 * @param isNonInteractiveSession Forwarded to the model call options.
 * @param isPreapprovedDomain Forwarded to `makeSecondaryModelPrompt`.
 * @returns The text of the first content block of the reply, or
 *   `'No response from model'` when the reply has no blocks or its first
 *   block carries no text.
 * @throws AbortError when `signal` is aborted by the time the model call returns.
 */
export async function applyPromptToMarkdown(
  prompt: string,
  markdownContent: string,
  signal: AbortSignal,
  isNonInteractiveSession: boolean,
  isPreapprovedDomain: boolean,
): Promise<string> {
  const truncatedContent = truncateMarkdownForPrompt(
    markdownContent,
    MAX_MARKDOWN_LENGTH,
  )

  const modelPrompt = makeSecondaryModelPrompt(
    truncatedContent,
    prompt,
    isPreapprovedDomain,
  )
  const assistantMessage = await queryHaiku({
    systemPrompt: asSystemPrompt([]),
    userPrompt: modelPrompt,
    signal,
    options: {
      querySource: 'web_fetch_apply',
      agents: [],
      isNonInteractiveSession,
      hasAppendSystemPrompt: false,
      mcpTools: [],
    },
  })

  // The model call may return normally even though the request was aborted;
  // surface the abort explicitly instead of returning a partial answer.
  if (signal.aborted) {
    throw new AbortError()
  }

  const { content } = assistantMessage.message
  const firstBlock = content[0]
  if (firstBlock && 'text' in firstBlock) {
    return firstBlock.text
  }
  return 'No response from model'
}
|
|