OpenClawBot

Sleeping

App Files Files Community

OpenClawBot / src /media-understanding /apply.ts

darkfire514

Upload 2526 files

fb4d8fe verified 2 months ago

raw

history blame contribute delete

16.8 kB

	import path from "node:path";
	import type { MsgContext } from "../auto-reply/templating.js";
	import type { OpenClawConfig } from "../config/config.js";
	import type {
	MediaUnderstandingCapability,
	MediaUnderstandingDecision,
	MediaUnderstandingOutput,
	MediaUnderstandingProvider,
	} from "./types.js";
	import { finalizeInboundContext } from "../auto-reply/reply/inbound-context.js";
	import { logVerbose, shouldLogVerbose } from "../globals.js";
	import {
	DEFAULT_INPUT_FILE_MAX_BYTES,
	DEFAULT_INPUT_FILE_MAX_CHARS,
	DEFAULT_INPUT_FILE_MIMES,
	DEFAULT_INPUT_MAX_REDIRECTS,
	DEFAULT_INPUT_PDF_MAX_PAGES,
	DEFAULT_INPUT_PDF_MAX_PIXELS,
	DEFAULT_INPUT_PDF_MIN_TEXT_CHARS,
	DEFAULT_INPUT_TIMEOUT_MS,
	extractFileContentFromSource,
	normalizeMimeList,
	normalizeMimeType,
	} from "../media/input-files.js";
	import { resolveAttachmentKind } from "./attachments.js";
	import { runWithConcurrency } from "./concurrency.js";
	import {
	extractMediaUserText,
	formatAudioTranscripts,
	formatMediaUnderstandingBody,
	} from "./format.js";
	import { resolveConcurrency } from "./resolve.js";
	import {
	type ActiveMediaModel,
	buildProviderRegistry,
	createMediaAttachmentCache,
	normalizeMediaAttachments,
	runCapability,
	} from "./runner.js";

	/**
	 * Result returned by `applyMediaUnderstanding`.
	 */
	export type ApplyMediaUnderstandingResult = {
	/** Outputs collected from the capability runs. */
	outputs: MediaUnderstandingOutput[];
	/** Decisions collected from the capability runs. */
	decisions: MediaUnderstandingDecision[];
	/** True when at least one output has kind "image.description". */
	appliedImage: boolean;
	/** True when at least one output has kind "audio.transcription". */
	appliedAudio: boolean;
	/** True when at least one output has kind "video.description". */
	appliedVideo: boolean;
	/** True when at least one file block was extracted and appended to the body. */
	appliedFile: boolean;
	};

	/** Capabilities that `applyMediaUnderstanding` schedules, in task order. */
	const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
	/**
	 * Text-like MIME types added to the allowed set by `extractFileBlocks`
	 * when no explicit allow-list is configured.
	 */
	const EXTRA_TEXT_MIMES = [
	"application/xml",
	"text/xml",
	"application/x-yaml",
	"text/yaml",
	"application/yaml",
	"application/javascript",
	"text/javascript",
	"text/tab-separated-values",
	];
	/**
	 * Lower-case file extension (including the dot) -> MIME type.
	 * Looked up by `resolveTextMimeFromName` to force a text MIME from a file name.
	 */
	const TEXT_EXT_MIME = new Map<string, string>([
	[".csv", "text/csv"],
	[".tsv", "text/tab-separated-values"],
	[".txt", "text/plain"],
	[".md", "text/markdown"],
	[".log", "text/plain"],
	[".ini", "text/plain"],
	[".cfg", "text/plain"],
	[".conf", "text/plain"],
	[".env", "text/plain"],
	[".json", "application/json"],
	[".yaml", "text/yaml"],
	[".yml", "text/yaml"],
	[".xml", "application/xml"],
	]);

	/**
	 * Entity replacement for each character that is unsafe inside a quoted XML
	 * attribute value. The values must be the entity references themselves; an
	 * identity mapping would make `xmlEscapeAttr` a no-op.
	 */
	const XML_ESCAPE_MAP: Record<string, string> = {
	  "<": "&lt;",
	  ">": "&gt;",
	  "&": "&amp;",
	  '"': "&quot;",
	  "'": "&apos;",
	};

	/**
	 * Escapes special XML characters in attribute values to prevent injection.
	 *
	 * @param value - Raw attribute text (for example a file name or MIME type).
	 * @returns The text with `<`, `>`, `&`, `"` and `'` replaced by XML entities.
	 */
	function xmlEscapeAttr(value: string): string {
	  return value.replace(/[<>&"']/g, (char) => XML_ESCAPE_MAP[char] ?? char);
	}

	/**
	 * Neutralizes anything inside file content that could be read as an opening
	 * or closing `file` tag, so the content cannot terminate or spoof the
	 * surrounding file block.
	 *
	 * Whitespace inside the tag (`< / file >`) and any letter case are tolerated.
	 * Only the leading `<` (and the trailing `>` of a closing tag) is turned into
	 * an entity; the rest of the content is left untouched.
	 *
	 * @param value - Extracted file text.
	 * @returns The text with file-tag look-alikes escaped.
	 */
	function escapeFileBlockContent(value: string): string {
	  return value
	    .replace(/<\s*\/\s*file\s*>/gi, "&lt;/file&gt;")
	    .replace(/<\s*file\b/gi, "&lt;file");
	}

	/**
	 * Reduces a MIME string to a bare, lower-case `type/subtype` token.
	 *
	 * The value is trimmed and lower-cased, then only the leading
	 * `type/subtype` part is kept, so parameters such as `; charset=...` and any
	 * trailing characters outside the allowed token alphabet are dropped.
	 *
	 * @param value - Raw MIME string, possibly with parameters.
	 * @returns The `type/subtype` token, or `undefined` when the input is empty
	 *   or does not start with one.
	 */
	function sanitizeMimeType(value?: string): string | undefined {
	  const normalized = value?.trim().toLowerCase();
	  if (!normalized) {
	    return undefined;
	  }
	  const match = /^([a-z0-9!#$&^_.+-]+\/[a-z0-9!#$&^_.+-]+)/.exec(normalized);
	  return match?.[1];
	}

	/**
	 * Builds the effective file-input limits from
	 * `cfg.gateway.http.endpoints.responses.files`, substituting the
	 * `DEFAULT_INPUT_*` constants for every setting that is not configured.
	 *
	 * @param cfg - Application configuration.
	 * @returns The resolved limits, plus `allowedMimesConfigured`, which is true
	 *   only when the configuration supplies a non-empty `allowedMimes` list.
	 */
	function resolveFileLimits(cfg: OpenClawConfig) {
	  const fileCfg = cfg.gateway?.http?.endpoints?.responses?.files;
	  const configuredMimes = fileCfg?.allowedMimes;
	  const pdfCfg = fileCfg?.pdf;
	  const allowedMimesConfigured = Boolean(configuredMimes && configuredMimes.length > 0);

	  const pdf = {
	    maxPages: pdfCfg?.maxPages ?? DEFAULT_INPUT_PDF_MAX_PAGES,
	    maxPixels: pdfCfg?.maxPixels ?? DEFAULT_INPUT_PDF_MAX_PIXELS,
	    minTextChars: pdfCfg?.minTextChars ?? DEFAULT_INPUT_PDF_MIN_TEXT_CHARS,
	  };

	  return {
	    allowUrl: fileCfg?.allowUrl ?? true,
	    allowedMimes: normalizeMimeList(configuredMimes, DEFAULT_INPUT_FILE_MIMES),
	    allowedMimesConfigured,
	    maxBytes: fileCfg?.maxBytes ?? DEFAULT_INPUT_FILE_MAX_BYTES,
	    maxChars: fileCfg?.maxChars ?? DEFAULT_INPUT_FILE_MAX_CHARS,
	    maxRedirects: fileCfg?.maxRedirects ?? DEFAULT_INPUT_MAX_REDIRECTS,
	    timeoutMs: fileCfg?.timeoutMs ?? DEFAULT_INPUT_TIMEOUT_MS,
	    pdf,
	  };
	}

	/**
	 * Appends rendered file blocks to a message body.
	 *
	 * @param body - Existing body text, if any.
	 * @param blocks - File blocks to append; joined with a blank line between them.
	 * @returns `body` unchanged (or `""` when undefined) if there are no blocks;
	 *   otherwise the trimmed body followed by a blank line and the trimmed
	 *   blocks, or just the blocks when the body is empty.
	 */
	function appendFileBlocks(body: string | undefined, blocks: string[]): string {
	  if (!blocks || blocks.length === 0) {
	    return body ?? "";
	  }
	  const base = typeof body === "string" ? body.trim() : "";
	  const suffix = blocks.join("\n\n").trim();
	  if (!base) {
	    return suffix;
	  }
	  // Trim again so an all-whitespace suffix does not leave trailing newlines.
	  return `${base}\n\n${suffix}`.trim();
	}

	/**
	 * Guesses whether a buffer holds UTF-16 text and, if so, its byte order.
	 *
	 * A byte-order mark decides immediately. Without one, the first 2048 bytes
	 * (at most) are scanned for zero bytes: if more than 20% of the sample is
	 * zero, the buffer is treated as UTF-16, little-endian when the zeros sit
	 * mostly at odd offsets (ties included) and big-endian otherwise.
	 *
	 * @param buffer - Raw bytes to inspect.
	 * @returns `"utf-16le"`, `"utf-16be"`, or `undefined` when the buffer is
	 *   missing, shorter than two bytes, or does not look like UTF-16.
	 */
	function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefined {
	  if (!buffer || buffer.length < 2) {
	    return undefined;
	  }
	  const b0 = buffer[0];
	  const b1 = buffer[1];
	  if (b0 === 0xff && b1 === 0xfe) {
	    return "utf-16le";
	  }
	  if (b0 === 0xfe && b1 === 0xff) {
	    return "utf-16be";
	  }
	  const sampleLen = Math.min(buffer.length, 2048);
	  let zeroEven = 0;
	  let zeroOdd = 0;
	  for (let i = 0; i < sampleLen; i += 1) {
	    if (buffer[i] !== 0) {
	      continue;
	    }
	    if (i % 2 === 0) {
	      zeroEven += 1;
	    } else {
	      zeroOdd += 1;
	    }
	  }
	  const zeroCount = zeroEven + zeroOdd;
	  if (zeroCount / sampleLen > 0.2) {
	    return zeroOdd >= zeroEven ? "utf-16le" : "utf-16be";
	  }
	  return undefined;
	}

	/** Matches one Unicode letter or number; used to count "word-like" characters. */
	const WORDISH_CHAR = /[\p{L}\p{N}]/u;

	/**
	 * Windows-1252 characters for bytes 0x80-0x9f, indexed by `byte - 0x80`.
	 * `undefined` marks the bytes that have no entry in this table.
	 */
	const CP1252_MAP: Array<string | undefined> = [
	  "\u20ac",
	  undefined,
	  "\u201a",
	  "\u0192",
	  "\u201e",
	  "\u2026",
	  "\u2020",
	  "\u2021",
	  "\u02c6",
	  "\u2030",
	  "\u0160",
	  "\u2039",
	  "\u0152",
	  undefined,
	  "\u017d",
	  undefined,
	  undefined,
	  "\u2018",
	  "\u2019",
	  "\u201c",
	  "\u201d",
	  "\u2022",
	  "\u2013",
	  "\u2014",
	  "\u02dc",
	  "\u2122",
	  "\u0161",
	  "\u203a",
	  "\u0153",
	  undefined,
	  "\u017e",
	  "\u0178",
	];

	/**
	 * Decodes bytes as single-byte legacy text.
	 *
	 * Bytes 0x80-0x9f are translated through `CP1252_MAP` (falling back to the
	 * code unit equal to the byte when the table has no entry); every other byte
	 * becomes the code unit with the same value.
	 *
	 * @param buffer - Raw bytes.
	 * @returns One character per input byte.
	 */
	function decodeLegacyText(buffer: Buffer): string {
	  let output = "";
	  for (const byte of buffer) {
	    if (byte >= 0x80 && byte <= 0x9f) {
	      const mapped = CP1252_MAP[byte - 0x80];
	      output += mapped ?? String.fromCharCode(byte);
	      continue;
	    }
	    output += String.fromCharCode(byte);
	  }
	  return output;
	}

	/**
	 * Measures how text-like a string is.
	 *
	 * Tab, LF, CR and space count as both printable and word-like. Other code
	 * points below 32 and those in 0x7f-0x9f count as control characters.
	 * Everything else is printable, and additionally word-like when it matches
	 * `WORDISH_CHAR`.
	 *
	 * @param text - Decoded text to inspect.
	 * @returns The share of printable and of word-like characters among all
	 *   characters; both are 0 for empty input.
	 */
	function getTextStats(text: string): { printableRatio: number; wordishRatio: number } {
	  if (!text) {
	    return { printableRatio: 0, wordishRatio: 0 };
	  }
	  let printable = 0;
	  let control = 0;
	  let wordish = 0;
	  for (const char of text) {
	    const code = char.codePointAt(0) ?? 0;
	    if (code === 9 || code === 10 || code === 13 || code === 32) {
	      printable += 1;
	      wordish += 1;
	      continue;
	    }
	    if (code < 32 || (code >= 0x7f && code <= 0x9f)) {
	      control += 1;
	      continue;
	    }
	    printable += 1;
	    if (WORDISH_CHAR.test(char)) {
	      wordish += 1;
	    }
	  }
	  const total = printable + control;
	  if (total === 0) {
	    return { printableRatio: 0, wordishRatio: 0 };
	  }
	  return { printableRatio: printable / total, wordishRatio: wordish / total };
	}

	/** Returns true when more than 85% of the characters in `text` are printable. */
	function isMostlyPrintable(text: string): boolean {
	  return getTextStats(text).printableRatio > 0.85;
	}

	/**
	 * Returns true when the bytes, decoded with `decodeLegacyText`, are more than
	 * 95% printable and more than 30% word-like. Empty buffers are never text.
	 */
	function looksLikeLegacyTextBytes(buffer: Buffer): boolean {
	  if (buffer.length === 0) {
	    return false;
	  }
	  const text = decodeLegacyText(buffer);
	  const { printableRatio, wordishRatio } = getTextStats(text);
	  return printableRatio > 0.95 && wordishRatio > 0.3;
	}

	/**
	 * Heuristically decides whether a buffer contains text.
	 *
	 * The first 4096 bytes (at most) are decoded as strict UTF-8 and accepted
	 * when mostly printable. If they are not valid UTF-8, the same bytes are
	 * re-checked as single-byte legacy text.
	 *
	 * @param buffer - Raw bytes to inspect.
	 * @returns True when the sample looks like text; false for a missing or
	 *   empty buffer.
	 */
	function looksLikeUtf8Text(buffer?: Buffer): boolean {
	  if (!buffer || buffer.length === 0) {
	    return false;
	  }
	  const sample = buffer.subarray(0, Math.min(buffer.length, 4096));
	  // When the sample is only a prefix of the buffer, the cut can land inside a
	  // multi-byte sequence. Streaming mode holds such a trailing partial sequence
	  // back instead of rejecting it, so valid UTF-8 is not mistaken for malformed
	  // input. A buffer that fits entirely in the sample is still decoded strictly.
	  const truncated = sample.length < buffer.length;
	  try {
	    const decoder = new TextDecoder("utf-8", { fatal: true });
	    const text = decoder.decode(sample, { stream: truncated });
	    return isMostlyPrintable(text);
	  } catch {
	    return looksLikeLegacyTextBytes(sample);
	  }
	}

	/**
	 * Decodes the first 8192 bytes (at most) of a buffer to a string for
	 * content sniffing.
	 *
	 * The charset comes from `resolveUtf16Charset`; anything it does not
	 * recognize as UTF-16 is decoded as (lenient) UTF-8.
	 *
	 * @param buffer - Raw bytes.
	 * @returns The decoded sample, or `""` for a missing or empty buffer.
	 */
	function decodeTextSample(buffer?: Buffer): string {
	  if (!buffer || buffer.length === 0) {
	    return "";
	  }
	  const sample = buffer.subarray(0, Math.min(buffer.length, 8192));
	  const utf16Charset = resolveUtf16Charset(sample);
	  if (utf16Charset === "utf-16be") {
	    // Swap each byte pair into a zero-filled copy and decode as little-endian.
	    // With an odd-length sample the final byte is not copied and stays 0.
	    const swapped = Buffer.alloc(sample.length);
	    for (let i = 0; i + 1 < sample.length; i += 2) {
	      swapped[i] = sample[i + 1];
	      swapped[i + 1] = sample[i];
	    }
	    return new TextDecoder("utf-16le").decode(swapped);
	  }
	  if (utf16Charset === "utf-16le") {
	    return new TextDecoder("utf-16le").decode(sample);
	  }
	  return new TextDecoder("utf-8").decode(sample);
	}

	/**
	 * Guesses a delimited-text MIME type from the first line of a text sample.
	 *
	 * Only the text before the first LF or CRLF is inspected. A comma anywhere
	 * in that line wins over tabs.
	 *
	 * @param text - Decoded text sample.
	 * @returns `"text/csv"` when the first line contains a comma,
	 *   `"text/tab-separated-values"` when it contains a tab but no comma, and
	 *   `undefined` otherwise (including empty input).
	 */
	function guessDelimitedMime(text: string): string | undefined {
	  if (!text) {
	    return undefined;
	  }
	  // Slice out the first line instead of splitting the whole sample.
	  const lineEnd = text.search(/\r?\n/);
	  const firstLine = lineEnd === -1 ? text : text.slice(0, lineEnd);
	  if (firstLine.includes(",")) {
	    return "text/csv";
	  }
	  if (firstLine.includes("\t")) {
	    return "text/tab-separated-values";
	  }
	  return undefined;
	}

	/**
	 * Looks up a text MIME type from the extension of a file name, path or URL.
	 *
	 * The extension is whatever `path.extname` returns, lower-cased, and must be
	 * a key of `TEXT_EXT_MIME`.
	 *
	 * @param name - File name, path or URL.
	 * @returns The mapped MIME type, or `undefined` when the name is empty or
	 *   its extension is not in the table.
	 */
	function resolveTextMimeFromName(name?: string): string | undefined {
	  if (!name) {
	    return undefined;
	  }
	  const ext = path.extname(name).toLowerCase();
	  return TEXT_EXT_MIME.get(ext);
	}

	/**
	 * Turns document-like attachments into `<file name="..." mime="...">` text
	 * blocks.
	 *
	 * For each attachment, in order:
	 * 1. Skip it when its index is in `skipAttachmentIndexes`, when it resolves
	 *    to an image or video (unless its name forces a text MIME), or when it
	 *    is URL-only and URLs are disabled.
	 * 2. Load its bytes through the cache; a load failure skips the attachment.
	 * 3. Pick a MIME type: one forced by the file name, else a guessed delimited
	 *    type, else `text/plain` for text-looking bytes, else the reported MIME.
	 *    Audio attachments that do not look like text are skipped.
	 * 4. Check the MIME type against the allow-list. Without a configured list,
	 *    `EXTRA_TEXT_MIMES` and any `text/*` type are also allowed.
	 * 5. Extract the content and wrap it in an escaped file block.
	 *
	 * Failures never throw; the attachment is skipped and, in verbose mode, logged.
	 *
	 * @param params.attachments - Normalized attachments to consider.
	 * @param params.cache - Attachment cache used to load bytes.
	 * @param params.limits - Limits produced by `resolveFileLimits`.
	 * @param params.skipAttachmentIndexes - Attachment indexes to ignore.
	 * @returns One rendered block per accepted attachment, in attachment order.
	 */
	async function extractFileBlocks(params: {
	  attachments: ReturnType<typeof normalizeMediaAttachments>;
	  cache: ReturnType<typeof createMediaAttachmentCache>;
	  limits: ReturnType<typeof resolveFileLimits>;
	  skipAttachmentIndexes?: Set<number>;
	}): Promise<string[]> {
	  const { attachments, cache, limits, skipAttachmentIndexes } = params;
	  if (!attachments || attachments.length === 0) {
	    return [];
	  }
	  // Loop-invariant: the extractor gets every limit except this local-only flag.
	  const { allowedMimesConfigured, ...baseLimits } = limits;
	  const blocks: string[] = [];
	  for (const attachment of attachments) {
	    if (!attachment) {
	      continue;
	    }
	    if (skipAttachmentIndexes?.has(attachment.index)) {
	      continue;
	    }
	    // A known text extension overrides whatever kind the attachment reports.
	    const forcedTextMime = resolveTextMimeFromName(attachment.path ?? attachment.url ?? "");
	    const kind = forcedTextMime ? "document" : resolveAttachmentKind(attachment);
	    if (!forcedTextMime && (kind === "image" || kind === "video")) {
	      continue;
	    }
	    if (!limits.allowUrl && attachment.url && !attachment.path) {
	      if (shouldLogVerbose()) {
	        logVerbose(`media: file attachment skipped (url disabled) index=${attachment.index}`);
	      }
	      continue;
	    }
	    let bufferResult: Awaited<ReturnType<typeof cache.getBuffer>>;
	    try {
	      bufferResult = await cache.getBuffer({
	        attachmentIndex: attachment.index,
	        maxBytes: limits.maxBytes,
	        timeoutMs: limits.timeoutMs,
	      });
	    } catch (err) {
	      if (shouldLogVerbose()) {
	        logVerbose(`media: file attachment skipped (buffer): ${String(err)}`);
	      }
	      continue;
	    }
	    // The loaded file name may reveal an extension the path/URL did not.
	    const nameHint = bufferResult?.fileName ?? attachment.path ?? attachment.url;
	    const forcedTextMimeResolved = forcedTextMime ?? resolveTextMimeFromName(nameHint ?? "");
	    const utf16Charset = resolveUtf16Charset(bufferResult?.buffer);
	    const textSample = decodeTextSample(bufferResult?.buffer);
	    const textLike = Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer);
	    if (!forcedTextMimeResolved && kind === "audio" && !textLike) {
	      continue;
	    }
	    const guessedDelimited = textLike ? guessDelimitedMime(textSample) : undefined;
	    const textHint =
	      forcedTextMimeResolved ?? guessedDelimited ?? (textLike ? "text/plain" : undefined);
	    const rawMime = bufferResult?.mime ?? attachment.mime;
	    const mimeType = sanitizeMimeType(textHint ?? normalizeMimeType(rawMime));
	    // Log when MIME type is overridden from non-text to text for auditability
	    if (textHint && rawMime && !rawMime.startsWith("text/")) {
	      logVerbose(
	        `media: MIME override from "${rawMime}" to "${textHint}" for index=${attachment.index}`,
	      );
	    }
	    if (!mimeType) {
	      if (shouldLogVerbose()) {
	        logVerbose(`media: file attachment skipped (unknown mime) index=${attachment.index}`);
	      }
	      continue;
	    }
	    // Fresh set per attachment: without a configured allow-list it is widened
	    // with the extra text types and with this attachment's own text/* type.
	    const allowedMimes = new Set(limits.allowedMimes);
	    if (!allowedMimesConfigured) {
	      for (const extra of EXTRA_TEXT_MIMES) {
	        allowedMimes.add(extra);
	      }
	      if (mimeType.startsWith("text/")) {
	        allowedMimes.add(mimeType);
	      }
	    }
	    if (!allowedMimes.has(mimeType)) {
	      if (shouldLogVerbose()) {
	        logVerbose(
	          `media: file attachment skipped (unsupported mime ${mimeType}) index=${attachment.index}`,
	        );
	      }
	      continue;
	    }
	    let extracted: Awaited<ReturnType<typeof extractFileContentFromSource>>;
	    try {
	      // Pass the detected UTF-16 charset along with the media type.
	      const mediaType = utf16Charset ? `${mimeType}; charset=${utf16Charset}` : mimeType;
	      extracted = await extractFileContentFromSource({
	        source: {
	          type: "base64",
	          data: bufferResult.buffer.toString("base64"),
	          mediaType,
	          filename: bufferResult.fileName,
	        },
	        limits: {
	          ...baseLimits,
	          allowedMimes,
	        },
	      });
	    } catch (err) {
	      if (shouldLogVerbose()) {
	        logVerbose(`media: file attachment skipped (extract): ${String(err)}`);
	      }
	      continue;
	    }
	    const text = extracted?.text?.trim() ?? "";
	    let blockText = text;
	    if (!blockText) {
	      if (extracted?.images && extracted.images.length > 0) {
	        blockText = "[PDF content rendered to images; images not forwarded to model]";
	      } else {
	        blockText = "[No extractable text]";
	      }
	    }
	    // Collapse line breaks and tabs so the name stays on one attribute line.
	    const safeName = (bufferResult.fileName ?? `file-${attachment.index + 1}`)
	      .replace(/[\r\n\t]+/g, " ")
	      .trim();
	    // Escape XML special characters in attributes to prevent injection
	    blocks.push(
	      `<file name="${xmlEscapeAttr(safeName)}" mime="${xmlEscapeAttr(mimeType)}">\n${escapeFileBlockContent(blockText)}\n</file>`,
	    );
	  }
	  return blocks;
	}

	/**
	 * Runs media understanding over the attachments of an inbound message and
	 * folds the results into the message context.
	 *
	 * Steps:
	 * 1. Remember the user's own text from `CommandBody`, `RawBody` or `Body`
	 *    (first non-blank result of `extractMediaUserText`).
	 * 2. Run one `runCapability` task per entry of `CAPABILITY_ORDER`, with the
	 *    concurrency given by `resolveConcurrency(cfg)`.
	 * 3. Append decisions and outputs to `ctx`; when there are outputs, rebuild
	 *    `ctx.Body`, set `ctx.Transcript` for audio, and set
	 *    `CommandBody`/`RawBody` to the user's text (or to the transcript when
	 *    there is audio and no user text).
	 * 4. Extract file blocks for the remaining attachments (skipping those that
	 *    were transcribed as audio) and append them to `ctx.Body`.
	 * 5. Finalize the inbound context if anything was added.
	 *
	 * The attachment cache is always cleaned up, including when a step throws.
	 *
	 * @param params.ctx - Message context; mutated in place.
	 * @param params.cfg - Application configuration.
	 * @param params.agentDir - Forwarded to `runCapability`.
	 * @param params.providers - Provider overrides passed to `buildProviderRegistry`.
	 * @param params.activeModel - Forwarded to `runCapability`.
	 * @returns The collected outputs and decisions plus flags for what was applied.
	 */
	export async function applyMediaUnderstanding(params: {
	  ctx: MsgContext;
	  cfg: OpenClawConfig;
	  agentDir?: string;
	  providers?: Record<string, MediaUnderstandingProvider>;
	  activeModel?: ActiveMediaModel;
	}): Promise<ApplyMediaUnderstandingResult> {
	  const { ctx, cfg } = params;
	  const commandCandidates = [ctx.CommandBody, ctx.RawBody, ctx.Body];
	  const originalUserText =
	    commandCandidates
	      .map((value) => extractMediaUserText(value))
	      .find((value) => value && value.trim()) ?? undefined;

	  const attachments = normalizeMediaAttachments(ctx);
	  const providerRegistry = buildProviderRegistry(params.providers);
	  const cache = createMediaAttachmentCache(attachments);

	  try {
	    const tasks = CAPABILITY_ORDER.map((capability) => async () => {
	      const config = cfg.tools?.media?.[capability];
	      return await runCapability({
	        capability,
	        cfg,
	        ctx,
	        attachments: cache,
	        media: attachments,
	        agentDir: params.agentDir,
	        providerRegistry,
	        config,
	        activeModel: params.activeModel,
	      });
	    });

	    const results = await runWithConcurrency(tasks, resolveConcurrency(cfg));
	    const outputs: MediaUnderstandingOutput[] = [];
	    const decisions: MediaUnderstandingDecision[] = [];
	    for (const entry of results) {
	      if (!entry) {
	        continue;
	      }
	      for (const output of entry.outputs) {
	        outputs.push(output);
	      }
	      decisions.push(entry.decision);
	    }

	    if (decisions.length > 0) {
	      ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
	    }

	    // Computed once; used for the transcript and for the skip set below.
	    const audioOutputs = outputs.filter((output) => output.kind === "audio.transcription");

	    if (outputs.length > 0) {
	      ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs });
	      if (audioOutputs.length > 0) {
	        const transcript = formatAudioTranscripts(audioOutputs);
	        ctx.Transcript = transcript;
	        if (originalUserText) {
	          ctx.CommandBody = originalUserText;
	          ctx.RawBody = originalUserText;
	        } else {
	          // No typed text: the transcript stands in as the command text.
	          ctx.CommandBody = transcript;
	          ctx.RawBody = transcript;
	        }
	      } else if (originalUserText) {
	        ctx.CommandBody = originalUserText;
	        ctx.RawBody = originalUserText;
	      }
	      ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs];
	    }

	    // Transcribed audio attachments must not be re-read as file blocks.
	    const audioAttachmentIndexes = new Set(audioOutputs.map((output) => output.attachmentIndex));
	    const fileBlocks = await extractFileBlocks({
	      attachments,
	      cache,
	      limits: resolveFileLimits(cfg),
	      skipAttachmentIndexes: audioAttachmentIndexes.size > 0 ? audioAttachmentIndexes : undefined,
	    });
	    if (fileBlocks.length > 0) {
	      ctx.Body = appendFileBlocks(ctx.Body, fileBlocks);
	    }

	    const bodyChanged = outputs.length > 0 || fileBlocks.length > 0;
	    if (bodyChanged) {
	      finalizeInboundContext(ctx, {
	        forceBodyForAgent: true,
	        forceBodyForCommands: bodyChanged,
	      });
	    }

	    return {
	      outputs,
	      decisions,
	      appliedImage: outputs.some((output) => output.kind === "image.description"),
	      appliedAudio: audioOutputs.length > 0,
	      appliedVideo: outputs.some((output) => output.kind === "video.description"),
	      appliedFile: fileBlocks.length > 0,
	    };
	  } finally {
	    await cache.cleanup();
	  }
	}