Spaces:

darkfire514
/

OpenClawBot

Paused

App Files Files Community

OpenClawBot / src /agents /tools /web-fetch-utils.ts

darkfire514

Upload 2526 files

fb4d8fe verified about 1 month ago

raw

history blame contribute delete

4.23 kB

	/** Output format a caller can request from the extractor: Markdown-style text or plain text. */
	export type ExtractMode = "markdown" | "text";

	/** Named character references recognised by `decodeEntities`, keyed by lower-cased name. */
	const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
		["nbsp", " "],
		["amp", "&"],
		["quot", '"'],
		["apos", "'"],
		["lt", "<"],
		["gt", ">"],
	]);

	/**
	 * Decodes a small set of HTML character references in `value`.
	 *
	 * Handles the named references `&nbsp;`, `&amp;`, `&quot;`, `&apos;`, `&lt;`, `&gt;`
	 * (case-insensitively) plus decimal (`&#39;`) and hexadecimal (`&#x27;`) numeric
	 * references. Anything else is left untouched.
	 *
	 * Decoding happens in a single pass, so already-escaped text such as `&amp;lt;`
	 * becomes `&lt;` rather than being decoded twice into `<`.
	 *
	 * @param value Text that may contain character references.
	 * @returns The text with recognised references replaced by their characters.
	 */
	function decodeEntities(value: string): string {
		return value.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (match: string, body: string) => {
			if (!body.startsWith("#")) {
				// A Map lookup avoids accidentally hitting Object.prototype keys
				// (e.g. "&constructor;").
				return NAMED_ENTITIES.get(body.toLowerCase()) ?? match;
			}
			const isHex = body[1] === "x" || body[1] === "X";
			const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
			// fromCodePoint throws above U+10FFFF; keep such references verbatim instead.
			// It is used instead of fromCharCode so astral characters are not truncated.
			return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
		});
	}

	/**
	 * Removes every `<...>` tag from `value` and then runs the remaining text
	 * through `decodeEntities`.
	 */
	function stripTags(value: string): string {
		const withoutTags = value.replace(/<[^>]+>/g, "");
		return decodeEntities(withoutTags);
	}

	/**
	 * Tidies whitespace in `value`: drops carriage returns, removes spaces/tabs
	 * that sit directly before a newline, caps runs of newlines at two, squeezes
	 * runs of spaces/tabs into one space, and trims both ends.
	 */
	function normalizeWhitespace(value: string): string {
		const withoutCarriageReturns = value.replace(/\r/g, "");
		const withoutTrailingBlanks = withoutCarriageReturns.replace(/[ \t]+\n/g, "\n");
		const withCappedBlankLines = withoutTrailingBlanks.replace(/\n{3,}/g, "\n\n");
		const withSingleSpaces = withCappedBlankLines.replace(/[ \t]{2,}/g, " ");
		return withSingleSpaces.trim();
	}

	/**
	 * Converts an HTML string into a lightweight Markdown-style rendering using
	 * regular expressions only (no DOM parsing).
	 *
	 * Steps, in order:
	 * 1. Capture the `<title>` contents (tags stripped, whitespace normalized).
	 * 2. Drop `<script>`, `<style>` and `<noscript>` elements entirely.
	 * 3. Rewrite `<a href="...">label</a>` as `[label](href)`, or as the bare href
	 *    when the label is empty after stripping.
	 * 4. Rewrite `<h1>`-`<h6>` as `#`-prefixed lines and `<li>` as `- ` bullets.
	 * 5. Turn `<br>`/`<hr>` and selected closing block tags into newlines.
	 * 6. Strip every remaining tag, decode entities and normalize whitespace.
	 *
	 * @param html Raw HTML markup.
	 * @returns The rendered text, plus `title` when a `<title>` element was found
	 *   (`title` is `undefined` otherwise).
	 */
	export function htmlToMarkdown(html: string): { text: string; title?: string } {
		// `[^>]*` allows any number of attributes; `[\s\S]*?` lazily spans newlines.
		const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
		const title = titleMatch ? normalizeWhitespace(stripTags(titleMatch[1] ?? "")) : undefined;
		let text = html
			.replace(/<script[\s\S]*?<\/script>/gi, "")
			.replace(/<style[\s\S]*?<\/style>/gi, "")
			.replace(/<noscript[\s\S]*?<\/noscript>/gi, "");
		text = text.replace(
			/<a\s+[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi,
			(_match: string, href: string, body: string) => {
				const label = normalizeWhitespace(stripTags(body));
				if (!label) {
					return href;
				}
				return `[${label}](${href})`;
			},
		);
		text = text.replace(
			/<h([1-6])[^>]*>([\s\S]*?)<\/h\1>/gi,
			(_match: string, level: string, body: string) => {
				const prefix = "#".repeat(Math.max(1, Math.min(6, Number.parseInt(level, 10))));
				const label = normalizeWhitespace(stripTags(body));
				return `\n${prefix} ${label}\n`;
			},
		);
		text = text.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_match: string, body: string) => {
			const label = normalizeWhitespace(stripTags(body));
			return label ? `\n- ${label}` : "";
		});
		// Unescaped `|` is regex alternation; an escaped `\|` would only match a literal pipe.
		text = text
			.replace(/<(br|hr)\s*\/?>/gi, "\n")
			.replace(/<\/(p|div|section|article|header|footer|table|tr|ul|ol)>/gi, "\n");
		text = stripTags(text);
		text = normalizeWhitespace(text);
		return { text, title };
	}

	/**
	 * Flattens Markdown into plain text.
	 *
	 * Removes image syntax, keeps only the label of `[label](target)` links,
	 * unwraps fenced code blocks and inline code spans, strips heading markers,
	 * bullet markers (`-`, `*`, `+`) and ordered-list numbers at line starts, and
	 * finally normalizes whitespace.
	 *
	 * @param markdown Markdown source text.
	 * @returns The plain-text rendering.
	 */
	export function markdownToText(markdown: string): string {
		let text = markdown;
		// Images: `![alt](target)` is removed entirely. The parentheses must be
		// escaped so they match literally.
		text = text.replace(/!\[[^\]]*]\([^)]+\)/g, "");
		// Links: `[label](target)` collapses to `label`.
		text = text.replace(/\[([^\]]+)]\([^)]+\)/g, "$1");
		// Fenced code: drop the opening fence (with its info string) and the closing fence.
		text = text.replace(/```[\s\S]*?```/g, (block) =>
			block.replace(/```[^\n]*\n?/g, "").replace(/```/g, ""),
		);
		text = text.replace(/`([^`]+)`/g, "$1");
		text = text.replace(/^#{1,6}\s+/gm, "");
		text = text.replace(/^\s*[-*+]\s+/gm, "");
		text = text.replace(/^\s*\d+\.\s+/gm, "");
		return normalizeWhitespace(text);
	}

	/**
	 * Limits `value` to at most `maxChars` UTF-16 code units.
	 *
	 * @param value Text to limit.
	 * @param maxChars Maximum length of the returned text. Negative or `NaN`
	 *   limits are treated as 0; fractional limits are rounded down.
	 * @returns The possibly shortened text and whether anything was cut off. The
	 *   cut never lands between the two halves of a surrogate pair; in that case
	 *   the text is one unit shorter than `maxChars`.
	 */
	export function truncateText(
		value: string,
		maxChars: number,
	): { text: string; truncated: boolean } {
		if (value.length <= maxChars) {
			return { text: value, truncated: false };
		}
		// Clamp first: `slice(0, negative)` counts from the end and would return
		// nearly the whole string instead of nothing.
		let end = Number.isNaN(maxChars) ? 0 : Math.max(0, Math.floor(maxChars));
		if (end > 0) {
			const last = value.charCodeAt(end - 1);
			const next = value.charCodeAt(end);
			const splitsPair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
			if (splitsPair) {
				// Drop the dangling high surrogate rather than emit half a character.
				end -= 1;
			}
		}
		return { text: value.slice(0, end), truncated: true };
	}

	/**
	 * Extracts the main content of an HTML page.
	 *
	 * Lazily loads `@mozilla/readability` and `linkedom`, parses the HTML, and
	 * runs Readability (with `charThreshold: 0`) over the resulting document.
	 * If loading, parsing or extraction throws, or Readability yields no content,
	 * the whole page is rendered with the regex-based `htmlToMarkdown` instead.
	 *
	 * @param params.html Raw HTML of the page.
	 * @param params.url Page URL, assigned as the document's `baseURI` on a
	 *   best-effort basis.
	 * @param params.extractMode `"text"` for plain text, `"markdown"` for the
	 *   Markdown-style rendering.
	 * @returns The extracted text and, when available, the page title. The
	 *   declared type allows `null`, but no path visible here returns it.
	 */
	export async function extractReadableContent(params: {
		html: string;
		url: string;
		extractMode: ExtractMode;
	}): Promise<{ text: string; title?: string } | null> {
		// Regex-only rendering of the full page, used whenever Readability is
		// unavailable or produces nothing.
		const fallback = (): { text: string; title?: string } => {
			const rendered = htmlToMarkdown(params.html);
			if (params.extractMode === "text") {
				// `||` on purpose: an empty string should also trigger the cruder strip.
				const text = markdownToText(rendered.text) || normalizeWhitespace(stripTags(params.html));
				return { text, title: rendered.title };
			}
			return rendered;
		};
		try {
			const [{ Readability }, { parseHTML }] = await Promise.all([
				import("@mozilla/readability"),
				import("linkedom"),
			]);
			const { document } = parseHTML(params.html);
			try {
				(document as { baseURI?: string }).baseURI = params.url;
			} catch {
				// Best-effort base URI for relative links.
			}
			const reader = new Readability(document, { charThreshold: 0 });
			const parsed = reader.parse();
			if (!parsed?.content) {
				return fallback();
			}
			// `||` on purpose: an empty title is treated as "no title".
			const title = parsed.title || undefined;
			if (params.extractMode === "text") {
				const text = normalizeWhitespace(parsed.textContent ?? "");
				return text ? { text, title } : fallback();
			}
			const rendered = htmlToMarkdown(parsed.content);
			return { text: rendered.text, title: title ?? rendered.title };
		} catch {
			// Deliberate best-effort: any failure degrades to the regex renderer.
			return fallback();
		}
	}