| import axios from 'axios'; |
| import * as cheerio from 'cheerio'; |
| import { HttpsProxyAgent } from 'https-proxy-agent'; |
| import { getProxyUrl } from '../../config.js'; |
| import { assertPublicHttpUrl } from '../../utils/urlSafety.js'; |
|
|
/**
 * Result of fetching a URL and reducing the response to readable text.
 */
export interface FetchWebContentResult {
  /** The requested URL, as serialized by the `URL` parser. */
  url: string;

  /** URL reported by the response after redirects; equals `url` when none is reported. */
  finalUrl: string;

  /** Lower-cased `Content-Type` response header, or `'unknown'` when the header is absent. */
  contentType: string;

  /** Text of the document's `<title>` for HTML responses; empty for every other kind of response. */
  title: string;

  /** `true` when the extracted text exceeded the character limit and was cut. */
  truncated: boolean;

  /** Extracted text; when `truncated` is set, a trailing marker states how many characters were dropped. */
  content: string;
}
|
|
/** Timeout applied to each HTTP request, in milliseconds. */
const DEFAULT_TIMEOUT_MS = 20_000;

/** Character limit used when the caller does not supply one. */
const DEFAULT_MAX_CHARS = 30_000;

/** Smallest character limit a caller can effectively request. */
const MIN_MAX_CHARS = 1_000;

/** Largest character limit a caller can effectively request. */
const MAX_MAX_CHARS = 200_000;

/** Upper bound on the size of a downloaded response body, in bytes (2 MiB). */
const MAX_DOWNLOAD_BYTES = 2 * 1024 ** 2;
|
|
/**
 * Normalizes whitespace in extracted text.
 *
 * Steps, in order:
 * 1. Convert CRLF and bare CR line endings to LF. Bare CRs were previously
 *    left in place, which also stopped the later rules from matching.
 * 2. Replace non-breaking spaces (U+00A0) with ordinary spaces.
 * 3. Strip spaces/tabs that directly precede a line break.
 * 4. Collapse runs of three or more line breaks into a single blank line.
 * 5. Trim leading and trailing whitespace.
 *
 * @param text Raw text to clean up.
 * @returns The normalized text; may be empty.
 */
function normalizeText(text: string): string {
  return text
    // `\r\n?` handles CRLF as one unit, so a CRLF never becomes two line breaks.
    .replace(/\r\n?/g, '\n')
    .replace(/\u00a0/g, ' ')
    .replace(/[ \t]+\n/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
|
|
/**
 * Restricts a requested character limit to the supported range.
 *
 * - `NaN` falls back to `DEFAULT_MAX_CHARS`. Without this, `NaN` passes
 *   through `Math.min`/`Math.max` unchanged and any later `length > limit`
 *   comparison is always false, so the limit is never applied.
 * - Values outside `[MIN_MAX_CHARS, MAX_MAX_CHARS]` (including `±Infinity`)
 *   are pulled to the nearest bound.
 * - Fractional values are truncated toward zero so the limit is a whole
 *   number of characters.
 *
 * @param value Requested maximum number of characters.
 * @returns An integer within `[MIN_MAX_CHARS, MAX_MAX_CHARS]`.
 */
function clampMaxChars(value: number): number {
  if (Number.isNaN(value)) {
    return DEFAULT_MAX_CHARS;
  }

  const bounded = Math.max(MIN_MAX_CHARS, Math.min(MAX_MAX_CHARS, value));
  return Math.trunc(bounded);
}
|
|
/**
 * Heuristically detects HTML by looking for an HTML doctype, an `<html` tag
 * or a `<body` tag anywhere in the text (case-insensitive). The tag name must
 * be followed by whitespace or `>`.
 *
 * @param raw Response body to inspect.
 * @returns `true` when any of the markers is present.
 */
function looksLikeHtml(raw: string): boolean {
  const htmlMarker = /<!doctype html|<html[\s>]|<body[\s>]/i;
  return raw.search(htmlMarker) !== -1;
}
|
|
/**
 * Reports whether the URL's path ends with a Markdown file extension
 * (`.md`, `.markdown` or `.mdx`), ignoring case. Only the pathname is
 * inspected; the query string and fragment are not.
 *
 * @param url Parsed URL to inspect.
 * @returns `true` when the pathname has a Markdown extension.
 */
function isMarkdownPath(url: URL): boolean {
  const loweredPath = url.pathname.toLowerCase();
  const markdownExtensions = ['.md', '.markdown', '.mdx'];
  return markdownExtensions.some((extension) => loweredPath.endsWith(extension));
}
|
|
/**
 * Reports whether a `Content-Type` header value names a Markdown media type
 * (`text/markdown`, `application/markdown` or `text/x-markdown`). Matching is
 * case-insensitive and by substring, so parameters such as `charset` are
 * tolerated.
 *
 * @param contentType Raw `Content-Type` header value.
 * @returns `true` when a Markdown media type is present.
 */
function isMarkdownContentType(contentType: string): boolean {
  const lowered = contentType.toLowerCase();
  const markdownMediaTypes = ['text/markdown', 'application/markdown', 'text/x-markdown'];
  return markdownMediaTypes.some((mediaType) => lowered.includes(mediaType));
}
|
|
/**
 * Extracts the page title and the most relevant readable text from an HTML
 * document.
 *
 * The text is chosen by the first strategy that yields a non-empty result:
 * 1. The first element matching one of a fixed list of "main content"
 *    selectors, tried in order, whose normalized text is at least 120
 *    characters long.
 * 2. The normalized text of `<body>` (or of the document root when no body
 *    element is found).
 * 3. The title and meta description joined by a blank line.
 *
 * @param html HTML source to parse.
 * @returns The trimmed `<title>` text and the selected text; either may be empty.
 */
function extractMainTextFromHtml(html: string): { title: string; text: string } {
  const $ = cheerio.load(html);

  // Metadata is read before the document is pruned below.
  const title = $('title').first().text().trim();
  const standardDescription = $('meta[name="description"]').attr('content')?.trim();
  const metaDescription =
    standardDescription ||
    $('meta[property="og:description"]').attr('content')?.trim() ||
    '';

  // Drop elements whose text is not readable page content.
  $('script, style, noscript, template, iframe, svg, canvas').remove();

  const preferredContainers = [
    'article',
    'main',
    '[role="main"]',
    '.markdown-body',
    '.article-content',
    '.post-content',
    '.entry-content',
    '.content'
  ];

  // Strategy 1: first preferred container with enough text to be the main content.
  const textFromPreferredContainer = (): string => {
    for (const selector of preferredContainers) {
      const firstMatch = $(selector).first();
      if (firstMatch.length > 0) {
        const candidate = normalizeText(firstMatch.text());
        if (candidate.length >= 120) {
          return candidate;
        }
      }
    }
    return '';
  };

  // Strategy 2: everything in <body>, or in the whole document when there is no body.
  const textFromWholeDocument = (): string => {
    const body = $('body');
    return normalizeText((body.length > 0 ? body : $.root()).text());
  };

  // Strategy 3: fall back to whatever metadata is available.
  const textFromMetadata = (): string =>
    normalizeText([title, metaDescription].filter(Boolean).join('\n\n'));

  const text = textFromPreferredContainer() || textFromWholeDocument() || textFromMetadata();

  return { title, text };
}
|
|
/** Request options accepted by axios, derived from `axios.get` so no extra import is needed. */
type FetchRequestOptions = NonNullable<Parameters<typeof axios.get>[1]>;

/** Error code attached to every "response too large" error raised by this module. */
const RESPONSE_TOO_LARGE_CODE = 'ERR_RESPONSE_TOO_LARGE';

/** HEAD response statuses that are ignored so the GET request can still be attempted. */
const HEAD_PROBE_TOLERATED_STATUSES: readonly number[] = [400, 403, 404, 405, 406, 501];

/** Builds the error thrown when a response declares a body larger than `MAX_DOWNLOAD_BYTES`. */
function createResponseTooLargeError(byteLength: number): Error {
  return Object.assign(
    new Error(`Response body too large (${byteLength} bytes). Max allowed is ${MAX_DOWNLOAD_BYTES} bytes`),
    { code: RESPONSE_TOO_LARGE_CODE }
  );
}

/**
 * Sends a HEAD request and rejects when the declared `Content-Length` exceeds
 * `MAX_DOWNLOAD_BYTES`. The probe is best-effort: failures without an HTTP
 * status and the statuses in `HEAD_PROBE_TOLERATED_STATUSES` are ignored;
 * any other HTTP error status is rethrown.
 */
async function assertHeadSizeWithinLimit(targetUrl: string, requestOptions: FetchRequestOptions): Promise<void> {
  try {
    const headResponse = await axios.head(targetUrl, {
      ...requestOptions,
      responseType: 'json',
      validateStatus: (status: number) => status >= 200 && status < 400
    });
    const headLength = Number(headResponse.headers['content-length']);
    if (Number.isFinite(headLength) && headLength > MAX_DOWNLOAD_BYTES) {
      throw createResponseTooLargeError(headLength);
    }
  } catch (error: unknown) {
    const { code, response } = (error ?? {}) as { code?: unknown; response?: { status?: unknown } | null };

    // Our own size-limit error must always reach the caller.
    if (code === RESPONSE_TOO_LARGE_CODE) {
      throw error;
    }

    const status = response?.status;
    const tolerated = typeof status === 'number' && HEAD_PROBE_TOLERATED_STATUSES.includes(status);
    if (status !== undefined && !tolerated) {
      throw error;
    }
    // Otherwise: no HTTP status or a tolerated status — fall through to the GET request.
  }
}

/**
 * Downloads a URL and returns its readable text, truncated to a character limit.
 *
 * Flow:
 * 1. Parse the URL and check it with `assertPublicHttpUrl`.
 * 2. Probe with HEAD to reject bodies that declare a size above `MAX_DOWNLOAD_BYTES`.
 * 3. GET the resource (through the configured proxy, if any), then check the
 *    post-redirect URL with `assertPublicHttpUrl` as well.
 * 4. Extract text: Markdown (by URL path or content type) and other non-HTML
 *    bodies are only whitespace-normalized; HTML is reduced to its main text.
 * 5. Truncate to the clamped `maxChars` and append a marker with the number of
 *    characters dropped.
 *
 * @param url Absolute URL to fetch.
 * @param maxChars Maximum number of characters of content to return; passed through `clampMaxChars`.
 * @returns The extracted content together with URL, content-type and title metadata.
 * @throws TypeError when `url` cannot be parsed by `URL`.
 * @throws Error with `code === 'ERR_RESPONSE_TOO_LARGE'` when the HEAD or GET
 *   response declares a `Content-Length` above `MAX_DOWNLOAD_BYTES`.
 * @throws Error when no readable content could be extracted.
 * @throws Whatever `assertPublicHttpUrl` or axios throw (rejected URL, network
 *   or HTTP errors).
 */
export async function fetchWebContent(url: string, maxChars: number = DEFAULT_MAX_CHARS): Promise<FetchWebContentResult> {
  const parsedUrl = new URL(url);
  assertPublicHttpUrl(parsedUrl, 'Request URL');
  const targetUrl = parsedUrl.toString();

  const requestOptions: FetchRequestOptions = {
    timeout: DEFAULT_TIMEOUT_MS,
    maxRedirects: 5,
    responseType: 'text',
    maxContentLength: MAX_DOWNLOAD_BYTES,
    maxBodyLength: MAX_DOWNLOAD_BYTES,
    decompress: true,
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36',
      'Accept': 'text/markdown,text/plain,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
    }
  };

  const effectiveProxyUrl = getProxyUrl();
  if (effectiveProxyUrl) {
    // The same agent is used for both schemes so every request goes through the proxy.
    const proxyAgent = new HttpsProxyAgent(effectiveProxyUrl);
    requestOptions.httpAgent = proxyAgent;
    requestOptions.httpsAgent = proxyAgent;
  }

  await assertHeadSizeWithinLimit(targetUrl, requestOptions);

  // NOTE(review): redirects are followed before the final URL is validated below,
  // so a redirect target is still requested even if it is later rejected (its body
  // is not returned). Consider validating each hop if that matters here.
  const response = await axios.get(targetUrl, requestOptions);
  const contentType = String(response.headers['content-type'] || '').toLowerCase();
  const finalUrl: string = response.request?.res?.responseUrl || targetUrl;
  // Passed as a URL object, matching the request-URL check above.
  assertPublicHttpUrl(new URL(finalUrl), 'Final URL');

  const contentLength = Number(response.headers['content-length']);
  if (Number.isFinite(contentLength) && contentLength > MAX_DOWNLOAD_BYTES) {
    // Carries the same error code as the HEAD probe so callers can handle both uniformly.
    throw createResponseTooLargeError(contentLength);
  }

  const raw = typeof response.data === 'string'
    ? response.data
    : JSON.stringify(response.data, null, 2);

  let title = '';
  let extractedContent: string;

  // Markdown is kept as-is. The content-type check runs before HTML sniffing
  // because Markdown documents frequently embed `<html>`/`<body>` snippets that
  // would otherwise cause them to be parsed as HTML.
  const isMarkdown = isMarkdownPath(parsedUrl) || isMarkdownContentType(contentType);
  if (!isMarkdown && (contentType.includes('text/html') || looksLikeHtml(raw))) {
    const parsed = extractMainTextFromHtml(raw);
    title = parsed.title;
    extractedContent = parsed.text;
  } else {
    extractedContent = normalizeText(raw);
  }

  if (!extractedContent) {
    throw new Error('No readable content was extracted from this URL');
  }

  const targetMaxChars = clampMaxChars(maxChars);
  const truncated = extractedContent.length > targetMaxChars;
  const content = truncated
    ? `${extractedContent.slice(0, targetMaxChars)}\n\n[...truncated ${extractedContent.length - targetMaxChars} characters]`
    : extractedContent;

  return {
    url: targetUrl,
    finalUrl,
    contentType: contentType || 'unknown',
    title,
    truncated,
    content
  };
}
|
|