import axios from 'axios'; import * as cheerio from 'cheerio'; import { HttpsProxyAgent } from 'https-proxy-agent'; import { getProxyUrl } from '../../config.js'; import { assertPublicHttpUrl } from '../../utils/urlSafety.js'; export interface FetchWebContentResult { url: string; finalUrl: string; contentType: string; title: string; truncated: boolean; content: string; } const DEFAULT_TIMEOUT_MS = 20000; const DEFAULT_MAX_CHARS = 30000; const MIN_MAX_CHARS = 1000; const MAX_MAX_CHARS = 200000; const MAX_DOWNLOAD_BYTES = 2 * 1024 * 1024; function normalizeText(text: string): string { return text .replace(/\r\n/g, '\n') .replace(/\u00a0/g, ' ') .replace(/[ \t]+\n/g, '\n') .replace(/\n{3,}/g, '\n\n') .trim(); } function clampMaxChars(value: number): number { return Math.max(MIN_MAX_CHARS, Math.min(MAX_MAX_CHARS, value)); } function looksLikeHtml(raw: string): boolean { return /]|]/i.test(raw); } function isMarkdownPath(url: URL): boolean { const pathname = url.pathname.toLowerCase(); return pathname.endsWith('.md') || pathname.endsWith('.markdown') || pathname.endsWith('.mdx'); } function isMarkdownContentType(contentType: string): boolean { const ct = contentType.toLowerCase(); return ct.includes('text/markdown') || ct.includes('application/markdown') || ct.includes('text/x-markdown'); } function extractMainTextFromHtml(html: string): { title: string; text: string } { const $ = cheerio.load(html); const title = $('title').first().text().trim(); const metaDescription = $('meta[name="description"]').attr('content')?.trim() || $('meta[property="og:description"]').attr('content')?.trim() || ''; $('script, style, noscript, template, iframe, svg, canvas').remove(); const preferredContainers = [ 'article', 'main', '[role="main"]', '.markdown-body', '.article-content', '.post-content', '.entry-content', '.content' ]; let selectedText = ''; for (const selector of preferredContainers) { const container = $(selector).first(); if (container.length === 0) { continue; } const candidate = normalizeText(container.text()); if (candidate.length >= 120) { selectedText = candidate; break; } } if (!selectedText) { const body = $('body'); selectedText = normalizeText((body.length > 0 ? body : $.root()).text()); } // SPA pages often render content by JS and leave body nearly empty. // Fall back to metadata so callers still get useful page info. if (!selectedText) { selectedText = normalizeText([title, metaDescription].filter(Boolean).join('\n\n')); } return { title, text: selectedText }; } export async function fetchWebContent(url: string, maxChars: number = DEFAULT_MAX_CHARS): Promise { const parsedUrl = new URL(url); assertPublicHttpUrl(parsedUrl, 'Request URL'); const effectiveProxyUrl = getProxyUrl(); const requestOptions: any = { timeout: DEFAULT_TIMEOUT_MS, maxRedirects: 5, responseType: 'text', maxContentLength: MAX_DOWNLOAD_BYTES, maxBodyLength: MAX_DOWNLOAD_BYTES, decompress: true, headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36', 'Accept': 'text/markdown,text/plain,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8' } }; if (effectiveProxyUrl) { const proxyAgent = new HttpsProxyAgent(effectiveProxyUrl); requestOptions.httpAgent = proxyAgent; requestOptions.httpsAgent = proxyAgent; } // Pre-flight check to avoid downloading oversized payloads when Content-Length is present. try { const headResponse = await axios.head(parsedUrl.toString(), { ...requestOptions, responseType: 'json', validateStatus: (status: number) => status >= 200 && status < 400 }); const headLength = Number(headResponse.headers['content-length']); if (Number.isFinite(headLength) && headLength > MAX_DOWNLOAD_BYTES) { const tooLargeError = new Error(`Response body too large (${headLength} bytes). Max allowed is ${MAX_DOWNLOAD_BYTES} bytes`); (tooLargeError as any).code = 'ERR_RESPONSE_TOO_LARGE'; throw tooLargeError; } } catch (error: any) { if (error?.code === 'ERR_RESPONSE_TOO_LARGE') { throw error; } const status = error?.response?.status; // Some servers don't support HEAD correctly; continue and rely on GET download limits. if (status !== undefined && ![400, 403, 404, 405, 406, 501].includes(status)) { throw error; } } const response = await axios.get(parsedUrl.toString(), requestOptions); const contentType = String(response.headers['content-type'] || '').toLowerCase(); const finalUrl = response.request?.res?.responseUrl || parsedUrl.toString(); assertPublicHttpUrl(finalUrl, 'Final URL'); const contentLength = Number(response.headers['content-length']); if (Number.isFinite(contentLength) && contentLength > MAX_DOWNLOAD_BYTES) { throw new Error(`Response body too large (${contentLength} bytes). Max allowed is ${MAX_DOWNLOAD_BYTES} bytes`); } const raw = typeof response.data === 'string' ? response.data : JSON.stringify(response.data, null, 2); let title = ''; let extractedContent = ''; // Keep raw markdown behavior for explicit markdown paths. if (isMarkdownPath(parsedUrl)) { extractedContent = normalizeText(raw); } else if (contentType.includes('text/html') || looksLikeHtml(raw)) { const parsed = extractMainTextFromHtml(raw); title = parsed.title; extractedContent = parsed.text; } else if (isMarkdownContentType(contentType)) { extractedContent = normalizeText(raw); } else { extractedContent = normalizeText(raw); } if (!extractedContent) { throw new Error('No readable content was extracted from this URL'); } const targetMaxChars = clampMaxChars(maxChars); const truncated = extractedContent.length > targetMaxChars; const content = truncated ? `${extractedContent.slice(0, targetMaxChars)}\n\n[...truncated ${extractedContent.length - targetMaxChars} characters]` : extractedContent; return { url: parsedUrl.toString(), finalUrl, contentType: contentType || 'unknown', title, truncated, content }; }