import { type Context, complete } from "@mariozechner/pi-ai";
import { Type } from "@sinclair/typebox";
import type { OpenClawConfig } from "../../config/config.js";
import { resolveUserPath } from "../../utils.js";
import { loadWebMedia } from "../../web/media.js";
import { isMinimaxVlmModel, isMinimaxVlmProvider, minimaxUnderstandImage } from "../minimax-vlm.js";
import {
  coerceImageAssistantText,
  coerceImageModelConfig,
  decodeDataUrl,
  type ImageModelConfig,
  resolveProviderVisionModelFromConfig,
} from "./image-tool.helpers.js";
import {
  applyImageModelConfigDefaults,
  buildTextToolResult,
  resolveMediaToolLocalRoots,
  resolveModelFromRegistry,
  resolveModelRuntimeApiKey,
  resolvePromptAndModelOverride,
} from "./media-tool-shared.js";
import { hasAuthForProvider, resolveDefaultModelRef } from "./model-config.helpers.js";
import {
  type AnyAgentTool,
  createSandboxBridgeReadFile,
  discoverAuthStorage,
  discoverModels,
  ensureOpenClawModelsJson,
  resolveSandboxedBridgeMediaPath,
  runWithImageModelFallback,
  type SandboxedBridgeMediaPathConfig,
  type SandboxFsBridge,
  type ToolFsPolicy,
} from "./tool-runtime.helpers.js";

const DEFAULT_PROMPT = "Describe the image.";
const ANTHROPIC_IMAGE_PRIMARY = "anthropic/claude-opus-4-6";
const ANTHROPIC_IMAGE_FALLBACK = "anthropic/claude-opus-4-5";
const DEFAULT_MAX_IMAGES = 20;

export const __testing = {
  decodeDataUrl,
  coerceImageAssistantText,
  resolveImageToolMaxTokens,
} as const;

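/**
 * Clamp the completion budget to the model's advertised token limit.
 * Falls back to `requestedMaxTokens` when the model reports no usable
 * limit, e.g. `resolveImageToolMaxTokens(undefined)` → 4096 and
 * `resolveImageToolMaxTokens(1024)` → 1024.
 */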
function resolveImageToolMaxTokens(modelMaxTokens: number | undefined, requestedMaxTokens = 4096) {
  if (
    typeof modelMaxTokens !== "number" ||
    !Number.isFinite(modelMaxTokens) ||
    modelMaxTokens <= 0
  ) {
    return requestedMaxTokens;
  }
  return Math.min(requestedMaxTokens, modelMaxTokens);
}

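/**
 * Resolve which vision model the image tool should use.
 *
 * Resolution order:
 * 1. An explicit image model config (primary and/or fallbacks) wins.
 * 2. Otherwise, prefer a vision model served by the same provider as the
 *    default model (MiniMax VLM, a configured provider vision model, zai,
 *    OpenAI, or Anthropic), provided credentials exist for it.
 * 3. Otherwise fall back to OpenAI, then Anthropic.
 *
 * Returns null when no authenticated provider can serve images, in which
 * case the tool is disabled.
 */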
export function resolveImageModelConfigForTool(params: {
  cfg?: OpenClawConfig;
  agentDir: string;
}): ImageModelConfig | null {
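  // An explicit image model configuration always wins.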
  const explicit = coerceImageModelConfig(params.cfg);
  if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) {
    return explicit;
  }

  const primary = resolveDefaultModelRef(params.cfg);
  const openaiOk = hasAuthForProvider({
    provider: "openai",
    agentDir: params.agentDir,
  });
  const anthropicOk = hasAuthForProvider({
    provider: "anthropic",
    agentDir: params.agentDir,
  });

  const fallbacks: string[] = [];
  const addFallback = (modelRef: string | null) => {
    const ref = (modelRef ?? "").trim();
    if (!ref) {
      return;
    }
    if (fallbacks.includes(ref)) {
      return;
    }
    fallbacks.push(ref);
  };

  const providerVisionFromConfig = resolveProviderVisionModelFromConfig({
    cfg: params.cfg,
    provider: primary.provider,
  });
  const providerOk = hasAuthForProvider({
    provider: primary.provider,
    agentDir: params.agentDir,
  });

  let preferred: string | null = null;

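  // Prefer a vision model served by the same provider as the default model.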
  if (isMinimaxVlmProvider(primary.provider) && providerOk) {
    preferred = `${primary.provider}/MiniMax-VL-01`;
  } else if (providerOk && providerVisionFromConfig) {
    preferred = providerVisionFromConfig;
  } else if (primary.provider === "zai" && providerOk) {
    preferred = "zai/glm-4.6v";
  } else if (primary.provider === "openai" && openaiOk) {
    preferred = "openai/gpt-5-mini";
  } else if (primary.provider === "anthropic" && anthropicOk) {
    preferred = ANTHROPIC_IMAGE_PRIMARY;
  }

  if (preferred?.trim()) {
    if (openaiOk) {
      addFallback("openai/gpt-5-mini");
    }
    if (anthropicOk) {
      addFallback(ANTHROPIC_IMAGE_FALLBACK);
    }
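    // Never list the preferred model as its own fallback.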
    const pruned = fallbacks.filter((ref) => ref !== preferred);
    return {
      primary: preferred,
      ...(pruned.length > 0 ? { fallbacks: pruned } : {}),
    };
  }

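  // No provider-specific preference: fall back to OpenAI, then Anthropic.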
  if (openaiOk) {
    if (anthropicOk) {
      addFallback(ANTHROPIC_IMAGE_FALLBACK);
    }
    return {
      primary: "openai/gpt-5-mini",
      ...(fallbacks.length ? { fallbacks } : {}),
    };
  }
  if (anthropicOk) {
    return {
      primary: ANTHROPIC_IMAGE_PRIMARY,
      fallbacks: [ANTHROPIC_IMAGE_FALLBACK],
    };
  }

  return null;
}

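/**
 * Byte budget for loading an image: an explicit `maxBytesMb` argument wins,
 * then `agents.defaults.mediaMaxMb`; `undefined` means no limit is applied.
 */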
function pickMaxBytes(cfg?: OpenClawConfig, maxBytesMb?: number): number | undefined {
  if (typeof maxBytesMb === "number" && Number.isFinite(maxBytesMb) && maxBytesMb > 0) {
    return Math.floor(maxBytesMb * 1024 * 1024);
  }
  const configured = cfg?.agents?.defaults?.mediaMaxMb;
  if (typeof configured === "number" && Number.isFinite(configured) && configured > 0) {
    return Math.floor(configured * 1024 * 1024);
  }
  return undefined;
}

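/**
 * Build a single-turn completion context: one user message whose content is
 * the text prompt followed by one image block per input, i.e.
 * `[{ type: "text", text }, { type: "image", data, mimeType }, ...]`.
 */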
function buildImageContext(
  prompt: string,
  images: Array<{ base64: string; mimeType: string }>,
): Context {
  const content: Array<
    { type: "text"; text: string } | { type: "image"; data: string; mimeType: string }
  > = [{ type: "text", text: prompt }];
  for (const img of images) {
    content.push({ type: "image", data: img.base64, mimeType: img.mimeType });
  }
  return {
    messages: [
      {
        role: "user",
        content,
        timestamp: Date.now(),
      },
    ],
  };
}

type ImageSandboxConfig = {
  root: string;
  bridge: SandboxFsBridge;
};

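/**
 * Run the prompt against the resolved vision model, trying the primary
 * model first and each configured fallback in turn; failed attempts are
 * returned alongside the successful result.
 */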
async function runImagePrompt(params: {
  cfg?: OpenClawConfig;
  agentDir: string;
  imageModelConfig: ImageModelConfig;
  modelOverride?: string;
  prompt: string;
  images: Array<{ base64: string; mimeType: string }>;
}): Promise<{
  text: string;
  provider: string;
  model: string;
  attempts: Array<{ provider: string; model: string; error: string }>;
}> {
  const effectiveCfg = applyImageModelConfigDefaults(params.cfg, params.imageModelConfig);

  await ensureOpenClawModelsJson(effectiveCfg, params.agentDir);
  const authStorage = discoverAuthStorage(params.agentDir);
  const modelRegistry = discoverModels(authStorage, params.agentDir);

  const result = await runWithImageModelFallback({
    cfg: effectiveCfg,
    modelOverride: params.modelOverride,
    run: async (provider, modelId) => {
      const model = resolveModelFromRegistry({ modelRegistry, provider, modelId });
      if (!model.input?.includes("image")) {
        throw new Error(`Model does not support images: ${provider}/${modelId}`);
      }
      const apiKey = await resolveModelRuntimeApiKey({
        model,
        cfg: effectiveCfg,
        agentDir: params.agentDir,
        authStorage,
      });

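      // MiniMax VLM goes through its dedicated image-understanding endpoint
      // and only receives the first image; extras are ignored.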
      if (isMinimaxVlmModel(model.provider, model.id)) {
        const first = params.images[0];
        const imageDataUrl = `data:${first.mimeType};base64,${first.base64}`;
        const text = await minimaxUnderstandImage({
          apiKey,
          prompt: params.prompt,
          imageDataUrl,
          modelBaseUrl: model.baseUrl,
        });
        return { text, provider: model.provider, model: model.id };
      }

      const context = buildImageContext(params.prompt, params.images);
      const message = await complete(model, context, {
        apiKey,
        maxTokens: resolveImageToolMaxTokens(model.maxTokens),
      });
      const text = coerceImageAssistantText({
        message,
        provider: model.provider,
        model: model.id,
      });
      return { text, provider: model.provider, model: model.id };
    },
  });

  return {
    text: result.result.text,
    provider: result.result.provider,
    model: result.result.model,
    attempts: result.attempts.map((attempt) => ({
      provider: attempt.provider,
      model: attempt.model,
      error: attempt.error,
    })),
  };
}

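/**
 * Create the `image` agent tool, or return null when no vision-capable
 * model can be resolved (and none was explicitly configured).
 *
 * A minimal usage sketch, assuming a loaded config and agent directory:
 *
 *   const tool = createImageTool({ config, agentDir });
 *   const result = await tool?.execute("call-1", {
 *     image: "~/photos/receipt.png",
 *     prompt: "What is the total?",
 *   });
 */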
export function createImageTool(options?: {
  config?: OpenClawConfig;
  agentDir?: string;
  workspaceDir?: string;
  sandbox?: ImageSandboxConfig;
  fsPolicy?: ToolFsPolicy;
  /** If true, the model has native vision capability and images in the prompt are auto-injected. */
  modelHasVision?: boolean;
}): AnyAgentTool | null {
  const agentDir = options?.agentDir?.trim();
  if (!agentDir) {
    const explicit = coerceImageModelConfig(options?.config);
    if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) {
      throw new Error("createImageTool requires agentDir when enabled");
    }
    return null;
  }
  const imageModelConfig = resolveImageModelConfigForTool({
    cfg: options?.config,
    agentDir,
  });
  if (!imageModelConfig) {
    return null;
  }

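  // Vision-capable models already see images mentioned in the prompt, so
  // the description steers them away from redundant tool calls.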
  const description = options?.modelHasVision
    ? "Analyze one or more images with a vision model. Use image for a single path/URL, or images for multiple (up to maxImages, default 20). Only use this tool when images were NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you."
    : "Analyze one or more images with the configured image model (agents.defaults.imageModel). Use image for a single path/URL, or images for multiple (up to maxImages, default 20). Provide a prompt describing what to analyze.";

  const localRoots = resolveMediaToolLocalRoots(options?.workspaceDir, {
    workspaceOnly: options?.fsPolicy?.workspaceOnly === true,
  });

  return {
    label: "Image",
    name: "image",
    description,
    parameters: Type.Object({
      prompt: Type.Optional(Type.String()),
      image: Type.Optional(Type.String({ description: "Single image path or URL." })),
      images: Type.Optional(
        Type.Array(Type.String(), {
          description: "Multiple image paths or URLs (up to maxImages, default 20).",
        }),
      ),
      model: Type.Optional(Type.String()),
      maxBytesMb: Type.Optional(Type.Number()),
      maxImages: Type.Optional(Type.Number()),
    }),
    execute: async (_toolCallId, args) => {
      const record = args && typeof args === "object" ? (args as Record<string, unknown>) : {};

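      // Accept both the single `image` field and the `images` array,
      // deduplicating by the reference with any leading "@" mention stripped.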
      const imageCandidates: string[] = [];
      if (typeof record.image === "string") {
        imageCandidates.push(record.image);
      }
      if (Array.isArray(record.images)) {
        imageCandidates.push(...record.images.filter((v): v is string => typeof v === "string"));
      }

      const seenImages = new Set<string>();
      const imageInputs: string[] = [];
      for (const candidate of imageCandidates) {
        const trimmedCandidate = candidate.trim();
        const normalizedForDedupe = trimmedCandidate.startsWith("@")
          ? trimmedCandidate.slice(1).trim()
          : trimmedCandidate;
        if (!normalizedForDedupe || seenImages.has(normalizedForDedupe)) {
          continue;
        }
        seenImages.add(normalizedForDedupe);
        imageInputs.push(trimmedCandidate);
      }
      if (imageInputs.length === 0) {
        throw new Error("image required");
      }

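      // Enforce the image-count cap before any file is read or fetched.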
      const maxImagesRaw = typeof record.maxImages === "number" ? record.maxImages : undefined;
      const maxImages =
        typeof maxImagesRaw === "number" && Number.isFinite(maxImagesRaw) && maxImagesRaw > 0
          ? Math.floor(maxImagesRaw)
          : DEFAULT_MAX_IMAGES;
      if (imageInputs.length > maxImages) {
        return {
          content: [
            {
              type: "text",
              text: `Too many images: ${imageInputs.length} provided, maximum is ${maxImages}. Please reduce the number of images.`,
            },
          ],
          details: { error: "too_many_images", count: imageInputs.length, max: maxImages },
        };
      }

      const { prompt: promptRaw, modelOverride } = resolvePromptAndModelOverride(
        record,
        DEFAULT_PROMPT,
      );
      const maxBytesMb = typeof record.maxBytesMb === "number" ? record.maxBytesMb : undefined;
      const maxBytes = pickMaxBytes(options?.config, maxBytesMb);

      const sandboxConfig: SandboxedBridgeMediaPathConfig | null =
        options?.sandbox && options.sandbox.root.trim()
          ? {
              root: options.sandbox.root.trim(),
              bridge: options.sandbox.bridge,
              workspaceOnly: options.fsPolicy?.workspaceOnly === true,
            }
          : null;

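      // Load every image up front so the model call happens only once all
      // inputs have been validated and read.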
      const loadedImages: Array<{
        base64: string;
        mimeType: string;
        resolvedImage: string;
        rewrittenFrom?: string;
      }> = [];

      for (const imageRawInput of imageInputs) {
        const trimmed = imageRawInput.trim();
        const imageRaw = trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed;
        if (!imageRaw) {
          throw new Error("image required (empty string in array)");
        }

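        // Reject unknown URL schemes, taking care not to mistake a Windows
        // drive path such as C:\images\cat.png for a scheme.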
        const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(imageRaw);
        const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(imageRaw);
        const isFileUrl = /^file:/i.test(imageRaw);
        const isHttpUrl = /^https?:\/\//i.test(imageRaw);
        const isDataUrl = /^data:/i.test(imageRaw);
        if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) {
          return {
            content: [
              {
                type: "text",
                text: `Unsupported image reference: ${imageRawInput}. Use a file path, a file:// URL, a data: URL, or an http(s) URL.`,
              },
            ],
            details: {
              error: "unsupported_image_reference",
              image: imageRawInput,
            },
          };
        }

        if (sandboxConfig && isHttpUrl) {
          throw new Error("Sandboxed image tool does not allow remote URLs.");
        }

        const resolvedImage = (() => {
          if (sandboxConfig) {
            return imageRaw;
          }
          if (imageRaw.startsWith("~")) {
            return resolveUserPath(imageRaw);
          }
          return imageRaw;
        })();
        const resolvedPathInfo: { resolved: string; rewrittenFrom?: string } = isDataUrl
          ? { resolved: "" }
          : sandboxConfig
            ? await resolveSandboxedBridgeMediaPath({
                sandbox: sandboxConfig,
                mediaPath: resolvedImage,
                inboundFallbackDir: "media/inbound",
              })
            : {
                resolved: resolvedImage.startsWith("file://")
                  ? resolvedImage.slice("file://".length)
                  : resolvedImage,
              };
        const resolvedPath = isDataUrl ? null : resolvedPathInfo.resolved;

        const media = isDataUrl
          ? decodeDataUrl(resolvedImage)
          : sandboxConfig
            ? await loadWebMedia(resolvedPath ?? resolvedImage, {
                maxBytes,
                sandboxValidated: true,
                readFile: createSandboxBridgeReadFile({ sandbox: sandboxConfig }),
              })
            : await loadWebMedia(resolvedPath ?? resolvedImage, {
                maxBytes,
                localRoots,
              });
        if (media.kind !== "image") {
          throw new Error(`Unsupported media type: ${media.kind}`);
        }

        const mimeType =
          ("contentType" in media && media.contentType) ||
          ("mimeType" in media && media.mimeType) ||
          "image/png";
        const base64 = media.buffer.toString("base64");
        loadedImages.push({
          base64,
          mimeType,
          resolvedImage,
          ...(resolvedPathInfo.rewrittenFrom
            ? { rewrittenFrom: resolvedPathInfo.rewrittenFrom }
            : {}),
        });
      }

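      // A single prompt covers all images; the model sees them in input order.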
      const result = await runImagePrompt({
        cfg: options?.config,
        agentDir,
        imageModelConfig,
        modelOverride,
        prompt: promptRaw,
        images: loadedImages.map((img) => ({ base64: img.base64, mimeType: img.mimeType })),
      });

      const imageDetails =
        loadedImages.length === 1
          ? {
              image: loadedImages[0].resolvedImage,
              ...(loadedImages[0].rewrittenFrom
                ? { rewrittenFrom: loadedImages[0].rewrittenFrom }
                : {}),
            }
          : {
              images: loadedImages.map((img) => ({
                image: img.resolvedImage,
                ...(img.rewrittenFrom ? { rewrittenFrom: img.rewrittenFrom } : {}),
              })),
            };

      return buildTextToolResult(result, imageDetails);
    },
  };
}
|
|