Spaces:

darkfire514
/

OpenClawBot

Running

App Files Files Community

OpenClawBot / src /media /input-files.ts

darkfire514

Upload 2526 files

fb4d8fe verified 2 months ago

raw

history blame contribute delete

12.5 kB

	import type { Dispatcher } from "undici";
	import {
	closeDispatcher,
	createPinnedDispatcher,
	resolvePinnedHostname,
	} from "../infra/net/ssrf.js";
	import { logWarn } from "../logger.js";

	// Module namespace types of the two optional dependencies, taken from their dynamic-import shape.
	type CanvasModule = typeof import("@napi-rs/canvas");
	type PdfJsModule = typeof import("pdfjs-dist/legacy/build/pdf.mjs");

	// Cached dynamic-import promises. `null` means "not requested yet"; the loaders in this
	// file also reset them to `null` when an import fails.
	let canvasModulePromise: Promise<CanvasModule> | null = null;
	let pdfJsModulePromise: Promise<PdfJsModule> | null = null;

	/**
	 * Lazily imports the optional `@napi-rs/canvas` dependency, sharing a single import
	 * promise between callers so non-PDF paths don't require native installs.
	 *
	 * @returns The loaded `@napi-rs/canvas` module.
	 * @throws Error when the dynamic import fails. The cached promise is cleared before
	 *   the error is thrown, so a later call attempts the import again.
	 */
	async function loadCanvasModule(): Promise<CanvasModule> {
	  if (canvasModulePromise) {
	    return canvasModulePromise;
	  }
	  const pending = import("@napi-rs/canvas").catch((cause) => {
	    // Drop the failed attempt from the cache so the next caller can retry.
	    canvasModulePromise = null;
	    throw new Error(
	      `Optional dependency @napi-rs/canvas is required for PDF image extraction: ${String(cause)}`,
	    );
	  });
	  canvasModulePromise = pending;
	  return pending;
	}

	/**
	 * Lazily imports the optional `pdfjs-dist` legacy build, sharing a single import
	 * promise between callers.
	 *
	 * @returns The loaded `pdfjs-dist/legacy/build/pdf.mjs` module.
	 * @throws Error when the dynamic import fails. The cached promise is cleared before
	 *   the error is thrown, so a later call attempts the import again.
	 */
	async function loadPdfJsModule(): Promise<PdfJsModule> {
	  if (pdfJsModulePromise) {
	    return pdfJsModulePromise;
	  }
	  const pending = import("pdfjs-dist/legacy/build/pdf.mjs").catch((cause) => {
	    // Drop the failed attempt from the cache so the next caller can retry.
	    pdfJsModulePromise = null;
	    throw new Error(
	      `Optional dependency pdfjs-dist is required for PDF extraction: ${String(cause)}`,
	    );
	  });
	  pdfJsModulePromise = pending;
	  return pending;
	}

	/**
	 * A single image produced by this module.
	 * `data` holds the image bytes as a base64 string and `mimeType` names their format.
	 */
	export type InputImageContent = {
	type: "image";
	data: string;
	mimeType: string;
	};

	/**
	 * Result of extracting content from an input file.
	 * `filename` is the source's filename (or "file" when none was given), `text` is the
	 * decoded/extracted text after clamping, and `images` is only set for PDFs whose pages
	 * were rendered to images.
	 */
	export type InputFileExtractResult = {
	filename: string;
	text?: string;
	images?: InputImageContent[];
	};

	/** Limits applied while extracting content from a PDF. */
	export type InputPdfLimits = {
	// Upper bound on the number of pages that are read and, if needed, rendered.
	maxPages: number;
	// Pixel budget per rendered page; pages larger than this are scaled down.
	maxPixels: number;
	// When the extracted text (trimmed) has at least this many characters, no page images are rendered.
	minTextChars: number;
	};

	/** Limits applied when loading and extracting an input file. */
	export type InputFileLimits = {
	// Whether sources of type "url" may be fetched at all.
	allowUrl: boolean;
	// Accepted MIME types; compared against the normalized (lower-cased, parameter-free) type.
	allowedMimes: Set<string>;
	// Maximum size of the file's bytes (decoded base64 or downloaded body).
	maxBytes: number;
	// Maximum length of the returned text; longer text is cut off.
	maxChars: number;
	// Maximum number of HTTP redirects followed for URL sources.
	maxRedirects: number;
	// Timeout, in milliseconds, passed to the URL fetch.
	timeoutMs: number;
	// PDF-specific limits.
	pdf: InputPdfLimits;
	};

	/** Limits applied when loading an input image. */
	export type InputImageLimits = {
	// Whether sources of type "url" may be fetched at all.
	allowUrl: boolean;
	// Accepted image MIME types (normalized form).
	allowedMimes: Set<string>;
	// Maximum size of the image's bytes (decoded base64 or downloaded body).
	maxBytes: number;
	// Maximum number of HTTP redirects followed for URL sources.
	maxRedirects: number;
	// Timeout, in milliseconds, passed to the URL fetch.
	timeoutMs: number;
	};

	/**
	 * Describes where an input image comes from.
	 *
	 * `type` selects which field is consulted: "base64" reads `data` (with `mediaType`
	 * naming its format), "url" reads `url`.
	 */
	export type InputImageSource = {
	  type: "base64" | "url";
	  /** Base64-encoded image bytes; required when `type` is "base64". */
	  data?: string;
	  /** Location to download the image from; required when `type` is "url". */
	  url?: string;
	  /** MIME type of `data`; only consulted for base64 sources. */
	  mediaType?: string;
	};

	/**
	 * Describes where an input file comes from.
	 *
	 * `type` selects which field is consulted: "base64" reads `data` (with `mediaType`
	 * naming its format), "url" reads `url`.
	 */
	export type InputFileSource = {
	  type: "base64" | "url";
	  /** Base64-encoded file bytes; required when `type` is "base64". */
	  data?: string;
	  /** Location to download the file from; required when `type` is "url". */
	  url?: string;
	  /** Content type of `data` (may include a `charset` parameter); only consulted for base64 sources. */
	  mediaType?: string;
	  /** Name reported back in the extraction result. */
	  filename?: string;
	};

	/**
	 * Outcome of a guarded URL fetch.
	 * `buffer` holds the response body, `mimeType` is the normalized type parsed from the
	 * Content-Type header ("application/octet-stream" when none could be parsed), and
	 * `contentType` is the raw header value when the server sent one.
	 */
	export type InputFetchResult = {
	buffer: Buffer;
	mimeType: string;
	contentType?: string;
	};

	/** Default allow-list of image MIME types. */
	export const DEFAULT_INPUT_IMAGE_MIMES = [
	  "image/jpeg",
	  "image/png",
	  "image/gif",
	  "image/webp",
	];

	/** Default allow-list of file MIME types: plain-text formats, JSON and PDF. */
	export const DEFAULT_INPUT_FILE_MIMES = [
	  "text/plain",
	  "text/markdown",
	  "text/html",
	  "text/csv",
	  "application/json",
	  "application/pdf",
	];

	/** Default byte limit for images (10 MiB). */
	export const DEFAULT_INPUT_IMAGE_MAX_BYTES = 10 * 1024 ** 2;
	/** Default byte limit for files (5 MiB). */
	export const DEFAULT_INPUT_FILE_MAX_BYTES = 5 * 1024 ** 2;
	/** Default cap on the number of characters of extracted text. */
	export const DEFAULT_INPUT_FILE_MAX_CHARS = 200_000;
	/** Default cap on followed HTTP redirects. */
	export const DEFAULT_INPUT_MAX_REDIRECTS = 3;
	/** Default fetch timeout in milliseconds. */
	export const DEFAULT_INPUT_TIMEOUT_MS = 10_000;
	/** Default cap on the number of PDF pages processed. */
	export const DEFAULT_INPUT_PDF_MAX_PAGES = 4;
	/** Default pixel budget per rendered PDF page. */
	export const DEFAULT_INPUT_PDF_MAX_PIXELS = 4_000_000;
	/** Default minimum amount of extracted PDF text below which pages are rendered to images. */
	export const DEFAULT_INPUT_PDF_MIN_TEXT_CHARS = 200;

	/**
	 * Reports whether an HTTP status code is one of the redirects this module follows.
	 *
	 * @param status - HTTP response status code.
	 * @returns `true` for 301, 302, 303, 307 and 308; `false` for everything else
	 *   (including 300 and 304, which carry no redirect to follow here).
	 */
	function isRedirectStatus(status: number): boolean {
	  switch (status) {
	    case 301:
	    case 302:
	    case 303:
	    case 307:
	    case 308:
	      return true;
	    default:
	      return false;
	  }
	}

	/**
	 * Reduces a MIME type or Content-Type value to its bare, lower-cased media type.
	 *
	 * Everything from the first `;` onward (parameters such as `charset`) is dropped and
	 * surrounding whitespace is trimmed.
	 *
	 * @param value - Raw MIME type / Content-Type value, if any.
	 * @returns The normalized media type, or `undefined` when the input is missing or
	 *   nothing remains after trimming.
	 */
	export function normalizeMimeType(value: string | undefined): string | undefined {
	  if (!value) {
	    return undefined;
	  }
	  const [raw] = value.split(";");
	  const normalized = raw?.trim().toLowerCase();
	  // `||` on purpose: an empty string counts as "no type".
	  return normalized || undefined;
	}

	/**
	 * Splits a Content-Type value into its media type and `charset` parameter.
	 *
	 * @param value - Raw Content-Type value, e.g. `text/html; charset="utf-8"`.
	 * @returns An empty object when `value` is missing; otherwise `mimeType` (normalized via
	 *   `normalizeMimeType`) and `charset` (the first non-empty `charset=` parameter, with
	 *   surrounding quotes removed), each `undefined` when absent.
	 */
	export function parseContentType(value: string | undefined): {
	  mimeType?: string;
	  charset?: string;
	} {
	  if (!value) {
	    return {};
	  }
	  const parts = value.split(";").map((part) => part.trim());
	  const mimeType = normalizeMimeType(parts[0]);
	  const charset = parts
	    .map((part) => {
	      const rawValue = part.match(/^charset=(.+)$/i)?.[1]?.trim();
	      // Parameter values may be sent as quoted strings (charset="utf-8"); the quotes are
	      // not part of the label and would make the decoder reject it. Single quotes are
	      // stripped too, as a leniency for non-conforming senders.
	      return rawValue?.replace(/^(["'])(.*)\1$/, "$2").trim();
	    })
	    .find((part) => part && part.length > 0);
	  return { mimeType, charset };
	}

	/**
	 * Builds a set of normalized MIME types from a configured list.
	 *
	 * @param values - Configured MIME types; when missing or empty, `fallback` is used instead.
	 * @param fallback - MIME types to use when `values` provides none.
	 * @returns The distinct normalized types. Entries that normalize to nothing are dropped;
	 *   note that a non-empty `values` whose entries are all dropped yields an empty set
	 *   (the fallback is not consulted in that case).
	 */
	export function normalizeMimeList(values: string[] | undefined, fallback: string[]): Set<string> {
	  const input = values && values.length > 0 ? values : fallback;
	  const normalized = new Set<string>();
	  for (const value of input) {
	    const mimeType = normalizeMimeType(value);
	    if (mimeType) {
	      normalized.add(mimeType);
	    }
	  }
	  return normalized;
	}

	/**
	 * Reads a response body into memory, giving up as soon as more than `maxBytes` arrive.
	 *
	 * Streaming (instead of `arrayBuffer()`) keeps the limit effective when the server sends
	 * no Content-Length header or one that understates the body size.
	 *
	 * @throws Error ("Content too large: …") once the received byte count exceeds `maxBytes`;
	 *   the reported size is the amount received so far, not the full body size.
	 */
	async function readBodyWithLimit(response: Response, maxBytes: number): Promise<Buffer> {
	  const stream = response.body;
	  if (!stream) {
	    // No stream is exposed (e.g. a body-less response); fall back to the buffered read.
	    const whole = Buffer.from(await response.arrayBuffer());
	    if (whole.byteLength > maxBytes) {
	      throw new Error(`Content too large: ${whole.byteLength} bytes (limit: ${maxBytes} bytes)`);
	    }
	    return whole;
	  }

	  const reader = stream.getReader();
	  const chunks: Uint8Array[] = [];
	  let received = 0;
	  try {
	    for (;;) {
	      const result = await reader.read();
	      if (result.done) {
	        break;
	      }
	      received += result.value.byteLength;
	      if (received > maxBytes) {
	        throw new Error(`Content too large: ${received} bytes (limit: ${maxBytes} bytes)`);
	      }
	      chunks.push(result.value);
	    }
	  } catch (err) {
	    // Stop the download; a failure to cancel must not hide the original error.
	    void reader.cancel().catch(() => undefined);
	    throw err;
	  }
	  return Buffer.concat(chunks, received);
	}

	/**
	 * Downloads a URL with protocol, redirect, size and time limits.
	 *
	 * Redirects are followed manually so that every hop goes through the same checks:
	 * the protocol must be HTTP or HTTPS, and the hostname is resolved through
	 * `resolvePinnedHostname` / `createPinnedDispatcher` (from `../infra/net/ssrf.js`;
	 * presumably pinning DNS results to guard against SSRF — confirm in that module).
	 * One timer covers the whole operation, including all redirects and the body download.
	 *
	 * @param params.url - URL to fetch.
	 * @param params.maxBytes - Maximum accepted body size in bytes.
	 * @param params.timeoutMs - Overall timeout in milliseconds; the request is aborted when it elapses.
	 * @param params.maxRedirects - Maximum number of redirects to follow.
	 * @returns The body, its normalized MIME type ("application/octet-stream" when the
	 *   Content-Type header is missing or unparsable) and the raw Content-Type header.
	 * @throws Error for a non-HTTP(S) URL, a redirect without a Location header, too many
	 *   redirects, a non-2xx response, or a body larger than `maxBytes`. Errors from URL
	 *   parsing, hostname resolution and the fetch itself (including abort) propagate unchanged.
	 */
	export async function fetchWithGuard(params: {
	  url: string;
	  maxBytes: number;
	  timeoutMs: number;
	  maxRedirects: number;
	}): Promise<InputFetchResult> {
	  let currentUrl = params.url;
	  let redirectCount = 0;

	  const controller = new AbortController();
	  const timeoutId = setTimeout(() => controller.abort(), params.timeoutMs);

	  try {
	    while (true) {
	      const parsedUrl = new URL(currentUrl);
	      if (!["http:", "https:"].includes(parsedUrl.protocol)) {
	        throw new Error(`Invalid URL protocol: ${parsedUrl.protocol}. Only HTTP/HTTPS allowed.`);
	      }
	      const pinned = await resolvePinnedHostname(parsedUrl.hostname);
	      const dispatcher = createPinnedDispatcher(pinned);

	      try {
	        const response = await fetch(parsedUrl, {
	          signal: controller.signal,
	          headers: { "User-Agent": "OpenClaw-Gateway/1.0" },
	          redirect: "manual",
	          dispatcher,
	        } as RequestInit & { dispatcher: Dispatcher });

	        if (isRedirectStatus(response.status)) {
	          const location = response.headers.get("location");
	          if (!location) {
	            throw new Error(`Redirect missing location header (${response.status})`);
	          }
	          redirectCount += 1;
	          if (redirectCount > params.maxRedirects) {
	            throw new Error(`Too many redirects (limit: ${params.maxRedirects})`);
	          }
	          // The redirect body is not needed; discard it without risking an unhandled rejection.
	          void response.body?.cancel().catch(() => undefined);
	          // Resolve relative Location values against the URL that produced them.
	          currentUrl = new URL(location, parsedUrl).toString();
	          continue;
	        }

	        if (!response.ok) {
	          throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
	        }

	        // Cheap early rejection when the server announces an oversized body.
	        const contentLength = response.headers.get("content-length");
	        if (contentLength) {
	          const size = parseInt(contentLength, 10);
	          if (size > params.maxBytes) {
	            throw new Error(`Content too large: ${size} bytes (limit: ${params.maxBytes} bytes)`);
	          }
	        }

	        // The header may be absent or wrong, so the limit is enforced again while reading.
	        const buffer = await readBodyWithLimit(response, params.maxBytes);

	        const contentType = response.headers.get("content-type") || undefined;
	        const parsed = parseContentType(contentType);
	        const mimeType = parsed.mimeType ?? "application/octet-stream";
	        return { buffer, mimeType, contentType };
	      } finally {
	        // Each hop gets its own dispatcher; always release it, including on errors and redirects.
	        await closeDispatcher(dispatcher);
	      }
	    }
	  } finally {
	    clearTimeout(timeoutId);
	  }
	}

	/**
	 * Decodes bytes to text using the given charset, defaulting to UTF-8.
	 *
	 * @param buffer - Bytes to decode.
	 * @param charset - Encoding label; trimmed and lower-cased before use. Missing or blank
	 *   values select UTF-8.
	 * @returns The decoded text. When the label is rejected by `TextDecoder`, the bytes are
	 *   decoded as UTF-8 instead.
	 */
	function decodeTextContent(buffer: Buffer, charset: string | undefined): string {
	  // `||` on purpose: a blank label must also fall back to UTF-8.
	  const encoding = charset?.trim().toLowerCase() || "utf-8";
	  try {
	    return new TextDecoder(encoding).decode(buffer);
	  } catch {
	    // The TextDecoder constructor throws for labels it does not support.
	    return new TextDecoder("utf-8").decode(buffer);
	  }
	}

	/**
	 * Cuts `text` down to at most `maxChars` UTF-16 code units.
	 *
	 * @param text - Text to clamp.
	 * @param maxChars - Maximum length of the result; negative or non-finite values
	 *   (other than `Infinity`) are treated as 0.
	 * @returns `text` unchanged when it already fits; otherwise its leading part. The cut
	 *   never lands between the two halves of a surrogate pair, so the result may be one
	 *   unit shorter than `maxChars`.
	 */
	function clampText(text: string, maxChars: number): string {
	  if (text.length <= maxChars) {
	    return text;
	  }
	  let end = Number.isFinite(maxChars) ? Math.max(0, Math.floor(maxChars)) : 0;
	  if (end > 0) {
	    const last = text.charCodeAt(end - 1);
	    const next = text.charCodeAt(end);
	    const isHighSurrogate = last >= 0xd800 && last <= 0xdbff;
	    const isLowSurrogate = next >= 0xdc00 && next <= 0xdfff;
	    // Keeping only the first half of a pair would leave an unpaired surrogate behind.
	    if (isHighSurrogate && isLowSurrogate) {
	      end -= 1;
	    }
	  }
	  return text.slice(0, end);
	}

	/**
	 * Extracts text from a PDF and, when the text is too sparse, renders its pages to PNG images.
	 *
	 * Only the first `limits.pdf.maxPages` pages are considered. If the extracted text
	 * (trimmed) has at least `limits.pdf.minTextChars` characters, it is returned without
	 * images. Otherwise each considered page is rendered; pages exceeding
	 * `limits.pdf.maxPixels` are scaled down, but never below a scale of 0.1.
	 *
	 * @param params.buffer - Raw PDF bytes.
	 * @param params.limits - File limits; only `limits.pdf` is read here.
	 * @returns The extracted text (pages separated by blank lines) and any rendered page
	 *   images as base64 PNGs. `images` is empty when rendering was not needed or when the
	 *   optional canvas dependency could not be loaded (a warning is logged in that case).
	 * @throws Error when `pdfjs-dist` cannot be loaded, or when parsing/rendering fails.
	 */
	async function extractPdfContent(params: {
	  buffer: Buffer;
	  limits: InputFileLimits;
	}): Promise<{ text: string; images: InputImageContent[] }> {
	  const { buffer, limits } = params;
	  const { getDocument } = await loadPdfJsModule();
	  const loadingTask = getDocument({
	    data: new Uint8Array(buffer),
	    disableWorker: true,
	  });

	  try {
	    const pdf = await loadingTask.promise;
	    const maxPages = Math.min(pdf.numPages, limits.pdf.maxPages);
	    const textParts: string[] = [];

	    for (let pageNum = 1; pageNum <= maxPages; pageNum += 1) {
	      const page = await pdf.getPage(pageNum);
	      const textContent = await page.getTextContent();
	      const pageText = textContent.items
	        .map((item) => ("str" in item ? String(item.str) : ""))
	        .filter(Boolean)
	        .join(" ");
	      if (pageText) {
	        textParts.push(pageText);
	      }
	    }

	    const text = textParts.join("\n\n");
	    // Enough text was found: skip the (expensive, optional-dependency) rendering step.
	    if (text.trim().length >= limits.pdf.minTextChars) {
	      return { text, images: [] };
	    }

	    let canvasModule: CanvasModule;
	    try {
	      canvasModule = await loadCanvasModule();
	    } catch (err) {
	      // Rendering is best-effort: without the canvas dependency, return the text alone.
	      logWarn(`media: PDF image extraction skipped; ${String(err)}`);
	      return { text, images: [] };
	    }
	    const { createCanvas } = canvasModule;
	    const pixelBudget = Math.max(1, limits.pdf.maxPixels);
	    const images: InputImageContent[] = [];
	    for (let pageNum = 1; pageNum <= maxPages; pageNum += 1) {
	      const page = await pdf.getPage(pageNum);
	      const viewport = page.getViewport({ scale: 1 });
	      const pagePixels = viewport.width * viewport.height;
	      // Shrink pages over the pixel budget; the square root converts an area ratio to a linear scale.
	      const scale = Math.min(1, Math.sqrt(pixelBudget / pagePixels));
	      const scaled = page.getViewport({ scale: Math.max(0.1, scale) });
	      const canvas = createCanvas(Math.ceil(scaled.width), Math.ceil(scaled.height));
	      await page.render({
	        canvas: canvas as unknown as HTMLCanvasElement,
	        viewport: scaled,
	      }).promise;
	      const png = canvas.toBuffer("image/png");
	      images.push({ type: "image", data: png.toString("base64"), mimeType: "image/png" });
	    }

	    return { text, images };
	  } finally {
	    // Release the document and its resources on every path, including parse failures.
	    // Cleanup problems are logged rather than thrown so they cannot replace the real outcome.
	    try {
	      await loadingTask.destroy();
	    } catch (err) {
	      logWarn(`media: PDF cleanup failed; ${String(err)}`);
	    }
	  }
	}

	/**
	 * Turns an image source (inline base64 or URL) into validated image content.
	 *
	 * Base64 sources are checked against the allowed MIME types (defaulting to "image/png"
	 * when no media type is given) and the byte limit, then returned with their original
	 * base64 string. URL sources are downloaded via `fetchWithGuard` and checked against
	 * the allowed MIME types using the response's type.
	 *
	 * @param source - Where the image comes from.
	 * @param limits - MIME, size, redirect and timeout limits, plus the URL on/off switch.
	 * @returns The image as base64 data with its MIME type.
	 * @throws Error when a required field is missing, the MIME type is not allowed, the
	 *   image exceeds `limits.maxBytes`, URL sources are disabled, or the download fails.
	 */
	export async function extractImageContentFromSource(
	  source: InputImageSource,
	  limits: InputImageLimits,
	): Promise<InputImageContent> {
	  const { type: kind, data: encoded, url, mediaType } = source;

	  if (kind === "base64") {
	    if (!encoded) {
	      throw new Error("input_image base64 source missing 'data' field");
	    }
	    const declaredMime = normalizeMimeType(mediaType) ?? "image/png";
	    if (!limits.allowedMimes.has(declaredMime)) {
	      throw new Error(`Unsupported image MIME type: ${declaredMime}`);
	    }
	    // Decode only to measure the real payload size; the original string is returned as-is.
	    const decodedSize = Buffer.from(encoded, "base64").byteLength;
	    if (decodedSize > limits.maxBytes) {
	      throw new Error(`Image too large: ${decodedSize} bytes (limit: ${limits.maxBytes} bytes)`);
	    }
	    return { type: "image", data: encoded, mimeType: declaredMime };
	  }

	  if (kind !== "url" || !url) {
	    throw new Error("input_image must have 'source.url' or 'source.data'");
	  }
	  if (!limits.allowUrl) {
	    throw new Error("input_image URL sources are disabled by config");
	  }

	  const { maxBytes, timeoutMs, maxRedirects } = limits;
	  const fetched = await fetchWithGuard({ url, maxBytes, timeoutMs, maxRedirects });
	  if (!limits.allowedMimes.has(fetched.mimeType)) {
	    throw new Error(`Unsupported image MIME type from URL: ${fetched.mimeType}`);
	  }
	  return {
	    type: "image",
	    data: fetched.buffer.toString("base64"),
	    mimeType: fetched.mimeType,
	  };
	}

	/**
	 * Loads an input file (inline base64 or URL) and extracts its textual content.
	 *
	 * The bytes and content type come from the source itself (base64) or from the HTTP
	 * response (URL, via `fetchWithGuard`). After the size and MIME checks, PDFs go through
	 * `extractPdfContent`; every other allowed type is decoded as text using the declared
	 * charset. Returned text is clamped to `limits.maxChars`.
	 *
	 * @param params.source - Where the file comes from.
	 * @param params.limits - MIME, size, text-length, redirect, timeout and PDF limits.
	 * @returns The filename ("file" when the source names none), the extracted text and,
	 *   for PDFs that were rendered, the page images.
	 * @throws Error when a required field is missing, URL sources are disabled, the file
	 *   exceeds `limits.maxBytes`, the media type is missing or not allowed, or the
	 *   download/extraction fails.
	 */
	export async function extractFileContentFromSource(params: {
	  source: InputFileSource;
	  limits: InputFileLimits;
	}): Promise<InputFileExtractResult> {
	  const { source, limits } = params;
	  // `||` on purpose: an empty filename also falls back to the default.
	  const filename = source.filename || "file";

	  let buffer: Buffer;
	  let mimeType: string | undefined;
	  let charset: string | undefined;

	  if (source.type === "base64") {
	    if (!source.data) {
	      throw new Error("input_file base64 source missing 'data' field");
	    }
	    const parsed = parseContentType(source.mediaType);
	    mimeType = parsed.mimeType;
	    charset = parsed.charset;
	    buffer = Buffer.from(source.data, "base64");
	  } else if (source.type === "url" && source.url) {
	    if (!limits.allowUrl) {
	      throw new Error("input_file URL sources are disabled by config");
	    }
	    const result = await fetchWithGuard({
	      url: source.url,
	      maxBytes: limits.maxBytes,
	      timeoutMs: limits.timeoutMs,
	      maxRedirects: limits.maxRedirects,
	    });
	    const parsed = parseContentType(result.contentType);
	    mimeType = parsed.mimeType ?? normalizeMimeType(result.mimeType);
	    charset = parsed.charset;
	    buffer = result.buffer;
	  } else {
	    throw new Error("input_file must have 'source.url' or 'source.data'");
	  }

	  // Applies to both branches; for URLs it repeats the limit the fetch already enforced.
	  if (buffer.byteLength > limits.maxBytes) {
	    throw new Error(`File too large: ${buffer.byteLength} bytes (limit: ${limits.maxBytes} bytes)`);
	  }

	  if (!mimeType) {
	    throw new Error("input_file missing media type");
	  }
	  if (!limits.allowedMimes.has(mimeType)) {
	    throw new Error(`Unsupported file MIME type: ${mimeType}`);
	  }

	  if (mimeType === "application/pdf") {
	    const extracted = await extractPdfContent({ buffer, limits });
	    const text = extracted.text ? clampText(extracted.text, limits.maxChars) : "";
	    return {
	      filename,
	      text,
	      images: extracted.images.length > 0 ? extracted.images : undefined,
	    };
	  }

	  const text = clampText(decodeTextContent(buffer, charset), limits.maxChars);
	  return { filename, text };
	}