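// Spaces:
// Sleeping
// Sleeping
// NOTE(review): the three lines above look like hosting-page residue, not module source — confirm and drop.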
import path from "node:path";
import type { MsgContext } from "../auto-reply/templating.js";
import type { OpenClawConfig } from "../config/config.js";
import type {
  MediaUnderstandingCapability,
  MediaUnderstandingDecision,
  MediaUnderstandingOutput,
  MediaUnderstandingProvider,
} from "./types.js";
import { finalizeInboundContext } from "../auto-reply/reply/inbound-context.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import {
  DEFAULT_INPUT_FILE_MAX_BYTES,
  DEFAULT_INPUT_FILE_MAX_CHARS,
  DEFAULT_INPUT_FILE_MIMES,
  DEFAULT_INPUT_MAX_REDIRECTS,
  DEFAULT_INPUT_PDF_MAX_PAGES,
  DEFAULT_INPUT_PDF_MAX_PIXELS,
  DEFAULT_INPUT_PDF_MIN_TEXT_CHARS,
  DEFAULT_INPUT_TIMEOUT_MS,
  extractFileContentFromSource,
  normalizeMimeList,
  normalizeMimeType,
} from "../media/input-files.js";
import { resolveAttachmentKind } from "./attachments.js";
import { runWithConcurrency } from "./concurrency.js";
import {
  extractMediaUserText,
  formatAudioTranscripts,
  formatMediaUnderstandingBody,
} from "./format.js";
import { resolveConcurrency } from "./resolve.js";
import {
  type ActiveMediaModel,
  buildProviderRegistry,
  createMediaAttachmentCache,
  normalizeMediaAttachments,
  runCapability,
} from "./runner.js";
/**
 * Summary returned by `applyMediaUnderstanding`.
 */
export type ApplyMediaUnderstandingResult = {
  /** Every output collected from the capability runs. */
  outputs: MediaUnderstandingOutput[];
  /** The decision recorded for each capability run that returned an entry. */
  decisions: MediaUnderstandingDecision[];
  /** True when at least one output has kind `image.description`. */
  appliedImage: boolean;
  /** True when at least one output has kind `audio.transcription`. */
  appliedAudio: boolean;
  /** True when at least one output has kind `video.description`. */
  appliedVideo: boolean;
  /** True when at least one `<file>` block was appended to the message body. */
  appliedFile: boolean;
};
/** Capabilities for which `applyMediaUnderstanding` builds a task, in this order. */
const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
/**
 * Text-like MIME types that do not start with `text/` (plus a few that do) which are
 * added to the allow-list when the configuration does not define its own `allowedMimes`.
 */
const EXTRA_TEXT_MIMES = [
  "application/xml",
  "text/xml",
  "application/x-yaml",
  "text/yaml",
  "application/yaml",
  "application/javascript",
  "text/javascript",
  "text/tab-separated-values",
];
/**
 * Lower-case file extension (including the leading dot) → MIME type to force for that
 * attachment, regardless of the MIME type reported alongside it.
 */
const TEXT_EXT_MIME = new Map<string, string>([
  [".csv", "text/csv"],
  [".tsv", "text/tab-separated-values"],
  [".txt", "text/plain"],
  [".md", "text/markdown"],
  [".log", "text/plain"],
  [".ini", "text/plain"],
  [".cfg", "text/plain"],
  [".conf", "text/plain"],
  [".env", "text/plain"],
  [".json", "application/json"],
  [".yaml", "text/yaml"],
  [".yml", "text/yaml"],
  [".xml", "application/xml"],
]);
/** The five XML predefined entities, keyed by the character they replace. */
const XML_ESCAPE_MAP: Record<string, string> = {
  "<": "&lt;",
  ">": "&gt;",
  "&": "&amp;",
  '"': "&quot;",
  "'": "&apos;",
};

/**
 * Escapes special XML characters in attribute values to prevent injection.
 *
 * @param value - Raw attribute value (for example a file name or MIME type).
 * @returns The value with `<`, `>`, `&`, `"` and `'` replaced by their entity references.
 */
function xmlEscapeAttr(value: string): string {
  return value.replace(/[<>&"']/g, (char) => XML_ESCAPE_MAP[char] ?? char);
}
/**
 * Neutralizes `<file ...>` and `</file>` tags inside extracted file text so the content
 * cannot close the surrounding `<file>` block early or open a fake one.
 *
 * Matching is case-insensitive and tolerates whitespace inside the tag (`< / file >`).
 * Only the opening `<` (and the closing `>` of an end tag) is turned into an entity;
 * all other text is left untouched.
 *
 * @param value - Extracted file text.
 * @returns The text with file-tag delimiters replaced by `&lt;` / `&gt;` forms.
 */
function escapeFileBlockContent(value: string): string {
  return value
    .replace(/<\s*\/\s*file\s*>/gi, "&lt;/file&gt;")
    .replace(/<\s*file\b/gi, "&lt;file");
}
/**
 * Reduces a MIME string to its bare, lower-cased `type/subtype` part.
 *
 * Anything after the subtype (parameters such as `; charset=...`) is dropped.
 *
 * @param value - Raw MIME string, possibly with surrounding whitespace and parameters.
 * @returns The `type/subtype` prefix, or `undefined` when the input is missing, blank,
 *   or does not start with a `type/subtype` token pair.
 */
function sanitizeMimeType(value?: string): string | undefined {
  if (!value) {
    return undefined;
  }
  const trimmed = value.trim().toLowerCase();
  if (!trimmed) {
    return undefined;
  }
  const match = trimmed.match(/^([a-z0-9!#$&^_.+-]+\/[a-z0-9!#$&^_.+-]+)/);
  return match?.[1];
}
/**
 * Builds the effective file-extraction limits from
 * `cfg.gateway.http.endpoints.responses.files`, falling back to the `DEFAULT_INPUT_*`
 * constants for every value that is not configured.
 *
 * @param cfg - Application configuration.
 * @returns The resolved limits. `allowedMimesConfigured` is true only when the
 *   configuration provides a non-empty `allowedMimes` list; `allowUrl` defaults to true.
 */
function resolveFileLimits(cfg: OpenClawConfig) {
  const files = cfg.gateway?.http?.endpoints?.responses?.files;
  const allowedMimesConfigured = Boolean(files?.allowedMimes && files.allowedMimes.length > 0);
  return {
    allowUrl: files?.allowUrl ?? true,
    allowedMimes: normalizeMimeList(files?.allowedMimes, DEFAULT_INPUT_FILE_MIMES),
    allowedMimesConfigured,
    maxBytes: files?.maxBytes ?? DEFAULT_INPUT_FILE_MAX_BYTES,
    maxChars: files?.maxChars ?? DEFAULT_INPUT_FILE_MAX_CHARS,
    maxRedirects: files?.maxRedirects ?? DEFAULT_INPUT_MAX_REDIRECTS,
    timeoutMs: files?.timeoutMs ?? DEFAULT_INPUT_TIMEOUT_MS,
    pdf: {
      maxPages: files?.pdf?.maxPages ?? DEFAULT_INPUT_PDF_MAX_PAGES,
      maxPixels: files?.pdf?.maxPixels ?? DEFAULT_INPUT_PDF_MAX_PIXELS,
      minTextChars: files?.pdf?.minTextChars ?? DEFAULT_INPUT_PDF_MIN_TEXT_CHARS,
    },
  };
}
/**
 * Appends file blocks to a message body, separated by blank lines.
 *
 * @param body - Existing body text, if any.
 * @param blocks - File blocks to append.
 * @returns The body unchanged (or `""` when it is undefined) if there are no blocks;
 *   otherwise the trimmed body followed by the blocks, each separated by a blank line.
 */
function appendFileBlocks(body: string | undefined, blocks: string[]): string {
  if (!blocks || blocks.length === 0) {
    return body ?? "";
  }
  const base = typeof body === "string" ? body.trim() : "";
  const suffix = blocks.join("\n\n").trim();
  if (!base) {
    return suffix;
  }
  return `${base}\n\n${suffix}`.trim();
}
/**
 * Guesses whether a buffer holds UTF-16 text and, if so, in which byte order.
 *
 * A byte-order mark decides immediately. Without one, the first 2048 bytes are scanned:
 * when more than 20% of them are zero, the position of the zeros picks the byte order
 * (zeros mostly at odd offsets → little-endian, otherwise big-endian).
 *
 * Note that the zero-byte heuristic looks only at byte values, so any data with many
 * zero bytes satisfies it, not just UTF-16 text.
 *
 * @param buffer - Raw file bytes.
 * @returns `"utf-16le"`, `"utf-16be"`, or `undefined` when the buffer is missing,
 *   shorter than two bytes, or does not look like UTF-16.
 */
function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefined {
  if (!buffer || buffer.length < 2) {
    return undefined;
  }
  const b0 = buffer[0];
  const b1 = buffer[1];
  if (b0 === 0xff && b1 === 0xfe) {
    return "utf-16le";
  }
  if (b0 === 0xfe && b1 === 0xff) {
    return "utf-16be";
  }
  const sampleLen = Math.min(buffer.length, 2048);
  let zeroEven = 0;
  let zeroOdd = 0;
  for (let i = 0; i < sampleLen; i += 1) {
    if (buffer[i] !== 0) {
      continue;
    }
    if (i % 2 === 0) {
      zeroEven += 1;
    } else {
      zeroOdd += 1;
    }
  }
  const zeroCount = zeroEven + zeroOdd;
  if (zeroCount / sampleLen > 0.2) {
    // ASCII-range UTF-16LE text stores the zero high byte second (odd offsets).
    return zeroOdd >= zeroEven ? "utf-16le" : "utf-16be";
  }
  return undefined;
}
/** Matches a single Unicode letter or number; used to measure how word-like a text is. */
const WORDISH_CHAR = /[\p{L}\p{N}]/u;

/**
 * Characters for bytes 0x80–0x9f in Windows-1252, indexed by `byte - 0x80`.
 * `undefined` marks the byte values that have no entry in this table.
 */
const CP1252_MAP: Array<string | undefined> = [
  "\u20ac",
  undefined,
  "\u201a",
  "\u0192",
  "\u201e",
  "\u2026",
  "\u2020",
  "\u2021",
  "\u02c6",
  "\u2030",
  "\u0160",
  "\u2039",
  "\u0152",
  undefined,
  "\u017d",
  undefined,
  undefined,
  "\u2018",
  "\u2019",
  "\u201c",
  "\u201d",
  "\u2022",
  "\u2013",
  "\u2014",
  "\u02dc",
  "\u2122",
  "\u0161",
  "\u203a",
  "\u0153",
  undefined,
  "\u017e",
  "\u0178",
];

/**
 * Decodes bytes as a single-byte legacy encoding: bytes 0x80–0x9f go through
 * `CP1252_MAP`, every other byte maps to the code point with the same value.
 * Bytes without a table entry are kept as the matching C1 control character.
 */
function decodeLegacyText(buffer: Buffer): string {
  let output = "";
  for (const byte of buffer) {
    if (byte >= 0x80 && byte <= 0x9f) {
      const mapped = CP1252_MAP[byte - 0x80];
      output += mapped ?? String.fromCharCode(byte);
      continue;
    }
    output += String.fromCharCode(byte);
  }
  return output;
}

/**
 * Measures how text-like a string is.
 *
 * Tab, line feed, carriage return and space count as both printable and wordish.
 * Other C0 controls, DEL and the C1 range (0x7f–0x9f) count as control characters.
 * Everything else is printable, and additionally wordish when it is a letter or number.
 *
 * @returns Both ratios relative to the number of code points; `0` for empty input.
 */
function getTextStats(text: string): { printableRatio: number; wordishRatio: number } {
  if (!text) {
    return { printableRatio: 0, wordishRatio: 0 };
  }
  let printable = 0;
  let control = 0;
  let wordish = 0;
  for (const char of text) {
    const code = char.codePointAt(0) ?? 0;
    if (code === 9 || code === 10 || code === 13 || code === 32) {
      printable += 1;
      wordish += 1;
      continue;
    }
    if (code < 32 || (code >= 0x7f && code <= 0x9f)) {
      control += 1;
      continue;
    }
    printable += 1;
    if (WORDISH_CHAR.test(char)) {
      wordish += 1;
    }
  }
  const total = printable + control;
  if (total === 0) {
    return { printableRatio: 0, wordishRatio: 0 };
  }
  return { printableRatio: printable / total, wordishRatio: wordish / total };
}

/** True when more than 85% of the code points in `text` are printable. */
function isMostlyPrintable(text: string): boolean {
  return getTextStats(text).printableRatio > 0.85;
}

/**
 * True when the bytes, read as a single-byte legacy encoding, are more than 95%
 * printable and more than 30% letters, digits or whitespace.
 */
function looksLikeLegacyTextBytes(buffer: Buffer): boolean {
  if (buffer.length === 0) {
    return false;
  }
  const text = decodeLegacyText(buffer);
  const { printableRatio, wordishRatio } = getTextStats(text);
  return printableRatio > 0.95 && wordishRatio > 0.3;
}

/**
 * Decides whether a buffer looks like text, based on its first 4096 bytes.
 *
 * The sample is decoded as strict UTF-8 and accepted when it is mostly printable.
 * If it is not valid UTF-8, the single-byte legacy heuristic is used instead.
 *
 * @param buffer - Raw file bytes.
 * @returns `false` for a missing or empty buffer, otherwise the heuristic's verdict.
 */
function looksLikeUtf8Text(buffer?: Buffer): boolean {
  if (!buffer || buffer.length === 0) {
    return false;
  }
  const sample = buffer.subarray(0, Math.min(buffer.length, 4096));
  // When the sample is a prefix of a longer buffer, the cut can land inside a
  // multi-byte character. Decoding in streaming mode keeps such an incomplete trailing
  // sequence from being reported as invalid, so valid UTF-8 is not misjudged.
  const truncated = sample.length < buffer.length;
  try {
    const text = new TextDecoder("utf-8", { fatal: true }).decode(sample, { stream: truncated });
    return isMostlyPrintable(text);
  } catch {
    return looksLikeLegacyTextBytes(sample);
  }
}
/**
 * Decodes up to the first 8192 bytes of a buffer into a string for content sniffing.
 *
 * The sample is decoded as UTF-16 when `resolveUtf16Charset` detects it, otherwise as
 * (non-strict) UTF-8.
 *
 * @param buffer - Raw file bytes.
 * @returns The decoded sample, or `""` for a missing or empty buffer.
 */
function decodeTextSample(buffer?: Buffer): string {
  if (!buffer || buffer.length === 0) {
    return "";
  }
  const sample = buffer.subarray(0, Math.min(buffer.length, 8192));
  const utf16Charset = resolveUtf16Charset(sample);
  if (utf16Charset === "utf-16be") {
    // Swap each byte pair into a copy so the big-endian sample can go through the
    // little-endian decoder. With an odd-length sample the final byte of the copy
    // stays zero.
    const swapped = Buffer.alloc(sample.length);
    for (let i = 0; i + 1 < sample.length; i += 2) {
      swapped[i] = sample[i + 1];
      swapped[i + 1] = sample[i];
    }
    return new TextDecoder("utf-16le").decode(swapped);
  }
  if (utf16Charset === "utf-16le") {
    return new TextDecoder("utf-16le").decode(sample);
  }
  return new TextDecoder("utf-8").decode(sample);
}
/**
 * Guesses a delimited-text MIME type from the first line of a text sample.
 *
 * A comma anywhere in the first line wins over tabs.
 *
 * @param text - Decoded text sample.
 * @returns `"text/csv"` when the first line contains a comma,
 *   `"text/tab-separated-values"` when it contains a tab but no comma,
 *   otherwise `undefined`.
 */
function guessDelimitedMime(text: string): string | undefined {
  if (!text) {
    return undefined;
  }
  const line = text.split(/\r?\n/)[0] ?? "";
  if (line.includes(",")) {
    return "text/csv";
  }
  if (line.includes("\t")) {
    return "text/tab-separated-values";
  }
  return undefined;
}
/**
 * Looks up a forced text MIME type from a file name, path or URL by its extension.
 *
 * @param name - File name, path or URL.
 * @returns The MIME type registered in `TEXT_EXT_MIME` for the lower-cased extension,
 *   or `undefined` when the name is empty or the extension is not registered.
 */
function resolveTextMimeFromName(name?: string): string | undefined {
  if (!name) {
    return undefined;
  }
  // NOTE(review): for a URL with a query string or fragment, `path.extname` returns
  // e.g. ".csv?x=1", which is not in the table — confirm whether that miss is intended.
  const ext = path.extname(name).toLowerCase();
  return TEXT_EXT_MIME.get(ext);
}
/**
 * Turns document-like attachments into `<file name="..." mime="...">` text blocks.
 *
 * For each attachment that is not skipped, the function loads its bytes through the
 * cache, decides on a MIME type (extension table first, then content sniffing, then the
 * reported MIME), checks it against the allow-list, extracts the text and wraps it in
 * an escaped `<file>` block. Attachments that fail any step are skipped; the reason is
 * logged when verbose logging is enabled.
 *
 * @param params.attachments - Normalized attachments of the message.
 * @param params.cache - Attachment cache used to load the bytes.
 * @param params.limits - Limits as produced by `resolveFileLimits`.
 * @param params.skipAttachmentIndexes - Attachment indexes to leave out entirely.
 * @returns One block per successfully processed attachment, in attachment order.
 */
async function extractFileBlocks(params: {
  attachments: ReturnType<typeof normalizeMediaAttachments>;
  cache: ReturnType<typeof createMediaAttachmentCache>;
  limits: ReturnType<typeof resolveFileLimits>;
  skipAttachmentIndexes?: Set<number>;
}): Promise<string[]> {
  const { attachments, cache, limits, skipAttachmentIndexes } = params;
  if (!attachments || attachments.length === 0) {
    return [];
  }
  // `allowedMimesConfigured` is internal bookkeeping and is not passed to the extractor.
  const { allowedMimesConfigured, ...baseLimits } = limits;
  // The allow-list shared by every attachment; the defaults are widened with the extra
  // text types only when the configuration did not set its own list.
  const baseAllowedMimes = new Set(limits.allowedMimes);
  if (!allowedMimesConfigured) {
    for (const extra of EXTRA_TEXT_MIMES) {
      baseAllowedMimes.add(extra);
    }
  }
  const blocks: string[] = [];
  for (const attachment of attachments) {
    if (!attachment) {
      continue;
    }
    if (skipAttachmentIndexes?.has(attachment.index)) {
      continue;
    }
    // A known text extension forces the attachment to be handled as a document.
    const forcedTextMime = resolveTextMimeFromName(attachment.path ?? attachment.url ?? "");
    const kind = forcedTextMime ? "document" : resolveAttachmentKind(attachment);
    if (!forcedTextMime && (kind === "image" || kind === "video")) {
      continue;
    }
    if (!limits.allowUrl && attachment.url && !attachment.path) {
      if (shouldLogVerbose()) {
        logVerbose(`media: file attachment skipped (url disabled) index=${attachment.index}`);
      }
      continue;
    }
    let bufferResult: Awaited<ReturnType<typeof cache.getBuffer>>;
    try {
      bufferResult = await cache.getBuffer({
        attachmentIndex: attachment.index,
        maxBytes: limits.maxBytes,
        timeoutMs: limits.timeoutMs,
      });
    } catch (err) {
      if (shouldLogVerbose()) {
        logVerbose(`media: file attachment skipped (buffer): ${String(err)}`);
      }
      continue;
    }
    // Retry the extension lookup with the file name reported for the loaded bytes.
    const nameHint = bufferResult?.fileName ?? attachment.path ?? attachment.url;
    const forcedTextMimeResolved = forcedTextMime ?? resolveTextMimeFromName(nameHint ?? "");
    const utf16Charset = resolveUtf16Charset(bufferResult?.buffer);
    const textSample = decodeTextSample(bufferResult?.buffer);
    const textLike = Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer);
    // Audio that is neither named like text nor looks like text is not a file block.
    if (!forcedTextMimeResolved && kind === "audio" && !textLike) {
      continue;
    }
    const guessedDelimited = textLike ? guessDelimitedMime(textSample) : undefined;
    const textHint =
      forcedTextMimeResolved ?? guessedDelimited ?? (textLike ? "text/plain" : undefined);
    const rawMime = bufferResult?.mime ?? attachment.mime;
    const mimeType = sanitizeMimeType(textHint ?? normalizeMimeType(rawMime));
    // Log when MIME type is overridden from non-text to text for auditability
    if (textHint && rawMime && !rawMime.startsWith("text/") && shouldLogVerbose()) {
      logVerbose(
        `media: MIME override from "${rawMime}" to "${textHint}" for index=${attachment.index}`,
      );
    }
    if (!mimeType) {
      if (shouldLogVerbose()) {
        logVerbose(`media: file attachment skipped (unknown mime) index=${attachment.index}`);
      }
      continue;
    }
    const allowedMimes = new Set(baseAllowedMimes);
    if (!allowedMimesConfigured && mimeType.startsWith("text/")) {
      // Without an explicit allow-list, any text/* type is accepted.
      allowedMimes.add(mimeType);
    }
    if (!allowedMimes.has(mimeType)) {
      if (shouldLogVerbose()) {
        logVerbose(
          `media: file attachment skipped (unsupported mime ${mimeType}) index=${attachment.index}`,
        );
      }
      continue;
    }
    let extracted: Awaited<ReturnType<typeof extractFileContentFromSource>>;
    try {
      // Pass the detected UTF-16 byte order along as a charset parameter.
      const mediaType = utf16Charset ? `${mimeType}; charset=${utf16Charset}` : mimeType;
      extracted = await extractFileContentFromSource({
        source: {
          type: "base64",
          data: bufferResult.buffer.toString("base64"),
          mediaType,
          filename: bufferResult.fileName,
        },
        limits: {
          ...baseLimits,
          allowedMimes,
        },
      });
    } catch (err) {
      if (shouldLogVerbose()) {
        logVerbose(`media: file attachment skipped (extract): ${String(err)}`);
      }
      continue;
    }
    const text = extracted?.text?.trim() ?? "";
    let blockText = text;
    if (!blockText) {
      if (extracted?.images && extracted.images.length > 0) {
        blockText = "[PDF content rendered to images; images not forwarded to model]";
      } else {
        blockText = "[No extractable text]";
      }
    }
    // Collapse line breaks and tabs so the name stays on the opening tag's line.
    const safeName = (bufferResult.fileName ?? `file-${attachment.index + 1}`)
      .replace(/[\r\n\t]+/g, " ")
      .trim();
    // Escape XML special characters in attributes to prevent injection
    blocks.push(
      `<file name="${xmlEscapeAttr(safeName)}" mime="${xmlEscapeAttr(mimeType)}">\n${escapeFileBlockContent(blockText)}\n</file>`,
    );
  }
  return blocks;
}
/**
 * Runs media understanding over the attachments of an inbound message and writes the
 * results back into `ctx`.
 *
 * 1. Runs one `runCapability` task per entry of `CAPABILITY_ORDER` through
 *    `runWithConcurrency` and collects the outputs and decisions.
 * 2. Appends decisions and outputs to `ctx`, rewrites `ctx.Body` with
 *    `formatMediaUnderstandingBody`, and sets `ctx.CommandBody` / `ctx.RawBody` to the
 *    user's own text, or to the transcript when there is audio but no user text.
 * 3. Extracts `<file>` blocks from the attachments that were not transcribed as audio
 *    and appends them to `ctx.Body`.
 * 4. Calls `finalizeInboundContext` when any output or file block was produced.
 *
 * The attachment cache is cleaned up in all cases, including when a step throws.
 *
 * @param params.ctx - Message context; mutated in place.
 * @param params.cfg - Application configuration.
 * @param params.agentDir - Forwarded to `runCapability`.
 * @param params.providers - Forwarded to `buildProviderRegistry`.
 * @param params.activeModel - Forwarded to `runCapability`.
 * @returns The collected outputs and decisions plus flags for what was applied.
 */
export async function applyMediaUnderstanding(params: {
  ctx: MsgContext;
  cfg: OpenClawConfig;
  agentDir?: string;
  providers?: Record<string, MediaUnderstandingProvider>;
  activeModel?: ActiveMediaModel;
}): Promise<ApplyMediaUnderstandingResult> {
  const { ctx, cfg } = params;
  // Capture the user's own text before the body is rewritten below.
  const commandCandidates = [ctx.CommandBody, ctx.RawBody, ctx.Body];
  const originalUserText =
    commandCandidates
      .map((value) => extractMediaUserText(value))
      .find((value) => value && value.trim()) ?? undefined;
  const attachments = normalizeMediaAttachments(ctx);
  const providerRegistry = buildProviderRegistry(params.providers);
  const cache = createMediaAttachmentCache(attachments);
  try {
    const tasks = CAPABILITY_ORDER.map((capability) => async () => {
      const config = cfg.tools?.media?.[capability];
      return await runCapability({
        capability,
        cfg,
        ctx,
        attachments: cache,
        media: attachments,
        agentDir: params.agentDir,
        providerRegistry,
        config,
        activeModel: params.activeModel,
      });
    });
    const results = await runWithConcurrency(tasks, resolveConcurrency(cfg));
    const outputs: MediaUnderstandingOutput[] = [];
    const decisions: MediaUnderstandingDecision[] = [];
    for (const entry of results) {
      if (!entry) {
        continue;
      }
      for (const output of entry.outputs) {
        outputs.push(output);
      }
      decisions.push(entry.decision);
    }
    const audioOutputs = outputs.filter((output) => output.kind === "audio.transcription");
    if (decisions.length > 0) {
      ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
    }
    if (outputs.length > 0) {
      ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs });
      if (audioOutputs.length > 0) {
        const transcript = formatAudioTranscripts(audioOutputs);
        ctx.Transcript = transcript;
        // The user's own text wins; the transcript is only a fallback for commands.
        if (originalUserText) {
          ctx.CommandBody = originalUserText;
          ctx.RawBody = originalUserText;
        } else {
          ctx.CommandBody = transcript;
          ctx.RawBody = transcript;
        }
      } else if (originalUserText) {
        ctx.CommandBody = originalUserText;
        ctx.RawBody = originalUserText;
      }
      ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs];
    }
    // Attachments already transcribed as audio are not extracted again as files.
    const audioAttachmentIndexes = new Set(audioOutputs.map((output) => output.attachmentIndex));
    const fileBlocks = await extractFileBlocks({
      attachments,
      cache,
      limits: resolveFileLimits(cfg),
      skipAttachmentIndexes: audioAttachmentIndexes.size > 0 ? audioAttachmentIndexes : undefined,
    });
    if (fileBlocks.length > 0) {
      ctx.Body = appendFileBlocks(ctx.Body, fileBlocks);
    }
    if (outputs.length > 0 || fileBlocks.length > 0) {
      finalizeInboundContext(ctx, {
        forceBodyForAgent: true,
        forceBodyForCommands: true,
      });
    }
    return {
      outputs,
      decisions,
      appliedImage: outputs.some((output) => output.kind === "image.description"),
      appliedAudio: audioOutputs.length > 0,
      appliedVideo: outputs.some((output) => output.kind === "video.description"),
      appliedFile: fileBlocks.length > 0,
    };
  } finally {
    await cache.cleanup();
  }
}