Spaces:
Paused
Paused
| import { type Api, type Context, complete, type Model } from "@mariozechner/pi-ai"; | |
| import { Type } from "@sinclair/typebox"; | |
| import fs from "node:fs/promises"; | |
| import path from "node:path"; | |
| import type { OpenClawConfig } from "../../config/config.js"; | |
| import type { AnyAgentTool } from "./common.js"; | |
| import { resolveUserPath } from "../../utils.js"; | |
| import { loadWebMedia } from "../../web/media.js"; | |
| import { ensureAuthProfileStore, listProfilesForProvider } from "../auth-profiles.js"; | |
| import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js"; | |
| import { minimaxUnderstandImage } from "../minimax-vlm.js"; | |
| import { getApiKeyForModel, requireApiKey, resolveEnvApiKey } from "../model-auth.js"; | |
| import { runWithImageModelFallback } from "../model-fallback.js"; | |
| import { resolveConfiguredModelRef } from "../model-selection.js"; | |
| import { ensureOpenClawModelsJson } from "../models-config.js"; | |
| import { discoverAuthStorage, discoverModels } from "../pi-model-discovery.js"; | |
| import { assertSandboxPath } from "../sandbox-paths.js"; | |
| import { | |
| coerceImageAssistantText, | |
| coerceImageModelConfig, | |
| decodeDataUrl, | |
| type ImageModelConfig, | |
| resolveProviderVisionModelFromConfig, | |
| } from "./image-tool.helpers.js"; | |
// Fallback prompt used when the tool caller supplies no `prompt` argument.
const DEFAULT_PROMPT = "Describe the image.";
// Internal helpers re-exported for unit tests only; not part of the public API.
export const __testing = {
  decodeDataUrl,
  coerceImageAssistantText,
} as const;
| function resolveDefaultModelRef(cfg?: OpenClawConfig): { | |
| provider: string; | |
| model: string; | |
| } { | |
| if (cfg) { | |
| const resolved = resolveConfiguredModelRef({ | |
| cfg, | |
| defaultProvider: DEFAULT_PROVIDER, | |
| defaultModel: DEFAULT_MODEL, | |
| }); | |
| return { provider: resolved.provider, model: resolved.model }; | |
| } | |
| return { provider: DEFAULT_PROVIDER, model: DEFAULT_MODEL }; | |
| } | |
| function hasAuthForProvider(params: { provider: string; agentDir: string }): boolean { | |
| if (resolveEnvApiKey(params.provider)?.apiKey) { | |
| return true; | |
| } | |
| const store = ensureAuthProfileStore(params.agentDir, { | |
| allowKeychainPrompt: false, | |
| }); | |
| return listProfilesForProvider(store, params.provider).length > 0; | |
| } | |
| /** | |
| * Resolve the effective image model config for the `image` tool. | |
| * | |
| * - Prefer explicit config (`agents.defaults.imageModel`). | |
| * - Otherwise, try to "pair" the primary model with an image-capable model: | |
| * - same provider (best effort) | |
| * - fall back to OpenAI/Anthropic when available | |
| */ | |
| export function resolveImageModelConfigForTool(params: { | |
| cfg?: OpenClawConfig; | |
| agentDir: string; | |
| }): ImageModelConfig | null { | |
| // Note: We intentionally do NOT gate based on primarySupportsImages here. | |
| // Even when the primary model supports images, we keep the tool available | |
| // because images are auto-injected into prompts (see attempt.ts detectAndLoadPromptImages). | |
| // The tool description is adjusted via modelHasVision to discourage redundant usage. | |
| const explicit = coerceImageModelConfig(params.cfg); | |
| if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) { | |
| return explicit; | |
| } | |
| const primary = resolveDefaultModelRef(params.cfg); | |
| const openaiOk = hasAuthForProvider({ | |
| provider: "openai", | |
| agentDir: params.agentDir, | |
| }); | |
| const anthropicOk = hasAuthForProvider({ | |
| provider: "anthropic", | |
| agentDir: params.agentDir, | |
| }); | |
| const fallbacks: string[] = []; | |
| const addFallback = (modelRef: string | null) => { | |
| const ref = (modelRef ?? "").trim(); | |
| if (!ref) { | |
| return; | |
| } | |
| if (fallbacks.includes(ref)) { | |
| return; | |
| } | |
| fallbacks.push(ref); | |
| }; | |
| const providerVisionFromConfig = resolveProviderVisionModelFromConfig({ | |
| cfg: params.cfg, | |
| provider: primary.provider, | |
| }); | |
| const providerOk = hasAuthForProvider({ | |
| provider: primary.provider, | |
| agentDir: params.agentDir, | |
| }); | |
| let preferred: string | null = null; | |
| // MiniMax users: always try the canonical vision model first when auth exists. | |
| if (primary.provider === "minimax" && providerOk) { | |
| preferred = "minimax/MiniMax-VL-01"; | |
| } else if (providerOk && providerVisionFromConfig) { | |
| preferred = providerVisionFromConfig; | |
| } else if (primary.provider === "openai" && openaiOk) { | |
| preferred = "openai/gpt-5-mini"; | |
| } else if (primary.provider === "anthropic" && anthropicOk) { | |
| preferred = "anthropic/claude-opus-4-5"; | |
| } | |
| if (preferred?.trim()) { | |
| if (openaiOk) { | |
| addFallback("openai/gpt-5-mini"); | |
| } | |
| if (anthropicOk) { | |
| addFallback("anthropic/claude-opus-4-5"); | |
| } | |
| // Don't duplicate primary in fallbacks. | |
| const pruned = fallbacks.filter((ref) => ref !== preferred); | |
| return { | |
| primary: preferred, | |
| ...(pruned.length > 0 ? { fallbacks: pruned } : {}), | |
| }; | |
| } | |
| // Cross-provider fallback when we can't pair with the primary provider. | |
| if (openaiOk) { | |
| if (anthropicOk) { | |
| addFallback("anthropic/claude-opus-4-5"); | |
| } | |
| return { | |
| primary: "openai/gpt-5-mini", | |
| ...(fallbacks.length ? { fallbacks } : {}), | |
| }; | |
| } | |
| if (anthropicOk) { | |
| return { primary: "anthropic/claude-opus-4-5" }; | |
| } | |
| return null; | |
| } | |
| function pickMaxBytes(cfg?: OpenClawConfig, maxBytesMb?: number): number | undefined { | |
| if (typeof maxBytesMb === "number" && Number.isFinite(maxBytesMb) && maxBytesMb > 0) { | |
| return Math.floor(maxBytesMb * 1024 * 1024); | |
| } | |
| const configured = cfg?.agents?.defaults?.mediaMaxMb; | |
| if (typeof configured === "number" && Number.isFinite(configured) && configured > 0) { | |
| return Math.floor(configured * 1024 * 1024); | |
| } | |
| return undefined; | |
| } | |
| function buildImageContext(prompt: string, base64: string, mimeType: string): Context { | |
| return { | |
| messages: [ | |
| { | |
| role: "user", | |
| content: [ | |
| { type: "text", text: prompt }, | |
| { type: "image", data: base64, mimeType }, | |
| ], | |
| timestamp: Date.now(), | |
| }, | |
| ], | |
| }; | |
| } | |
| async function resolveSandboxedImagePath(params: { | |
| sandboxRoot: string; | |
| imagePath: string; | |
| }): Promise<{ resolved: string; rewrittenFrom?: string }> { | |
| const normalize = (p: string) => (p.startsWith("file://") ? p.slice("file://".length) : p); | |
| const filePath = normalize(params.imagePath); | |
| try { | |
| const out = await assertSandboxPath({ | |
| filePath, | |
| cwd: params.sandboxRoot, | |
| root: params.sandboxRoot, | |
| }); | |
| return { resolved: out.resolved }; | |
| } catch (err) { | |
| const name = path.basename(filePath); | |
| const candidateRel = path.join("media", "inbound", name); | |
| const candidateAbs = path.join(params.sandboxRoot, candidateRel); | |
| try { | |
| await fs.stat(candidateAbs); | |
| } catch { | |
| throw err; | |
| } | |
| const out = await assertSandboxPath({ | |
| filePath: candidateRel, | |
| cwd: params.sandboxRoot, | |
| root: params.sandboxRoot, | |
| }); | |
| return { resolved: out.resolved, rewrittenFrom: filePath }; | |
| } | |
| } | |
/**
 * Send `prompt` plus the base64 image to the configured image model, retrying
 * through the fallback chain on failure.
 *
 * Returns the assistant text, the provider/model that finally answered, and a
 * record of every failed attempt (provider, model, error message).
 */
async function runImagePrompt(params: {
  cfg?: OpenClawConfig;
  agentDir: string;
  imageModelConfig: ImageModelConfig;
  modelOverride?: string;
  prompt: string;
  base64: string;
  mimeType: string;
}): Promise<{
  text: string;
  provider: string;
  model: string;
  attempts: Array<{ provider: string; model: string; error: string }>;
}> {
  // Overlay the resolved image-model config onto the caller's config so the
  // fallback runner sees it under agents.defaults.imageModel.
  const effectiveCfg: OpenClawConfig | undefined = params.cfg
    ? {
        ...params.cfg,
        agents: {
          ...params.cfg.agents,
          defaults: {
            ...params.cfg.agents?.defaults,
            imageModel: params.imageModelConfig,
          },
        },
      }
    : undefined;
  await ensureOpenClawModelsJson(effectiveCfg, params.agentDir);
  const authStorage = discoverAuthStorage(params.agentDir);
  const modelRegistry = discoverModels(authStorage, params.agentDir);
  const result = await runWithImageModelFallback({
    cfg: effectiveCfg,
    modelOverride: params.modelOverride,
    // Invoked once per candidate model; throwing advances the fallback chain.
    run: async (provider, modelId) => {
      const model = modelRegistry.find(provider, modelId) as Model<Api> | null;
      if (!model) {
        throw new Error(`Unknown model: ${provider}/${modelId}`);
      }
      // Reject non-vision models up front so the chain moves to the next one.
      if (!model.input?.includes("image")) {
        throw new Error(`Model does not support images: ${provider}/${modelId}`);
      }
      const apiKeyInfo = await getApiKeyForModel({
        model,
        cfg: effectiveCfg,
        agentDir: params.agentDir,
      });
      const apiKey = requireApiKey(apiKeyInfo, model.provider);
      // Make the key available to provider clients resolved via authStorage.
      authStorage.setRuntimeApiKey(model.provider, apiKey);
      const imageDataUrl = `data:${params.mimeType};base64,${params.base64}`;
      // MiniMax uses a dedicated VLM endpoint rather than the generic chat API.
      if (model.provider === "minimax") {
        const text = await minimaxUnderstandImage({
          apiKey,
          prompt: params.prompt,
          imageDataUrl,
          modelBaseUrl: model.baseUrl,
        });
        return { text, provider: model.provider, model: model.id };
      }
      const context = buildImageContext(params.prompt, params.base64, params.mimeType);
      const message = await complete(model, context, {
        apiKey,
        maxTokens: 512,
      });
      // Normalize the assistant message into plain text (throws on empty).
      const text = coerceImageAssistantText({
        message,
        provider: model.provider,
        model: model.id,
      });
      return { text, provider: model.provider, model: model.id };
    },
  });
  return {
    text: result.result.text,
    provider: result.result.provider,
    model: result.result.model,
    attempts: result.attempts.map((attempt) => ({
      provider: attempt.provider,
      model: attempt.model,
      error: attempt.error,
    })),
  };
}
/**
 * Build the `image` agent tool, or return null when no image-capable model
 * can be resolved (no explicit config and no usable provider auth).
 *
 * Throws when an image model is explicitly configured but `agentDir` is
 * missing, since auth/model discovery requires it.
 */
export function createImageTool(options?: {
  config?: OpenClawConfig;
  agentDir?: string;
  sandboxRoot?: string;
  /** If true, the model has native vision capability and images in the prompt are auto-injected */
  modelHasVision?: boolean;
}): AnyAgentTool | null {
  const agentDir = options?.agentDir?.trim();
  if (!agentDir) {
    // Only error out when the user explicitly configured an image model;
    // otherwise silently disable the tool.
    const explicit = coerceImageModelConfig(options?.config);
    if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) {
      throw new Error("createImageTool requires agentDir when enabled");
    }
    return null;
  }
  const imageModelConfig = resolveImageModelConfigForTool({
    cfg: options?.config,
    agentDir,
  });
  if (!imageModelConfig) {
    return null;
  }
  // If model has native vision, images in the prompt are auto-injected
  // so this tool is only needed when image wasn't provided in the prompt
  const description = options?.modelHasVision
    ? "Analyze an image with a vision model. Only use this tool when the image was NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you."
    : "Analyze an image with the configured image model (agents.defaults.imageModel). Provide a prompt and image path or URL.";
  return {
    label: "Image",
    name: "image",
    description,
    parameters: Type.Object({
      prompt: Type.Optional(Type.String()),
      image: Type.String(),
      model: Type.Optional(Type.String()),
      maxBytesMb: Type.Optional(Type.Number()),
    }),
    execute: async (_toolCallId, args) => {
      const record = args && typeof args === "object" ? (args as Record<string, unknown>) : {};
      const imageRawInput = typeof record.image === "string" ? record.image.trim() : "";
      // Strip a leading "@" (some callers pass file references as "@path").
      const imageRaw = imageRawInput.startsWith("@")
        ? imageRawInput.slice(1).trim()
        : imageRawInput;
      if (!imageRaw) {
        throw new Error("image required");
      }
      // The tool accepts file paths, file/data URLs, or http(s) URLs. In some
      // agent/model contexts, images can be referenced as pseudo-URIs like
      // `image:0` (e.g. "first image in the prompt"). We don't have access to a
      // shared image registry here, so fail gracefully instead of attempting to
      // `fs.readFile("image:0")` and producing a noisy ENOENT.
      const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(imageRaw);
      const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(imageRaw);
      const isFileUrl = /^file:/i.test(imageRaw);
      const isHttpUrl = /^https?:\/\//i.test(imageRaw);
      const isDataUrl = /^data:/i.test(imageRaw);
      // Any other scheme (e.g. "image:0") is unsupported — return a soft error
      // payload rather than throwing.
      if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) {
        return {
          content: [
            {
              type: "text",
              text: `Unsupported image reference: ${imageRawInput}. Use a file path, a file:// URL, a data: URL, or an http(s) URL.`,
            },
          ],
          details: {
            error: "unsupported_image_reference",
            image: imageRawInput,
          },
        };
      }
      const promptRaw =
        typeof record.prompt === "string" && record.prompt.trim()
          ? record.prompt.trim()
          : DEFAULT_PROMPT;
      const modelOverride =
        typeof record.model === "string" && record.model.trim() ? record.model.trim() : undefined;
      const maxBytesMb = typeof record.maxBytesMb === "number" ? record.maxBytesMb : undefined;
      const maxBytes = pickMaxBytes(options?.config, maxBytesMb);
      const sandboxRoot = options?.sandboxRoot?.trim();
      const isUrl = isHttpUrl;
      // Sandboxed agents may not fetch remote content.
      if (sandboxRoot && isUrl) {
        throw new Error("Sandboxed image tool does not allow remote URLs.");
      }
      // Expand "~" only outside the sandbox; sandbox paths are resolved below.
      const resolvedImage = (() => {
        if (sandboxRoot) {
          return imageRaw;
        }
        if (imageRaw.startsWith("~")) {
          return resolveUserPath(imageRaw);
        }
        return imageRaw;
      })();
      // Data URLs carry their own bytes, so no filesystem path is resolved.
      const resolvedPathInfo: { resolved: string; rewrittenFrom?: string } = isDataUrl
        ? { resolved: "" }
        : sandboxRoot
          ? await resolveSandboxedImagePath({
              sandboxRoot,
              imagePath: resolvedImage,
            })
          : {
              resolved: resolvedImage.startsWith("file://")
                ? resolvedImage.slice("file://".length)
                : resolvedImage,
            };
      const resolvedPath = isDataUrl ? null : resolvedPathInfo.resolved;
      const media = isDataUrl
        ? decodeDataUrl(resolvedImage)
        : await loadWebMedia(resolvedPath ?? resolvedImage, maxBytes);
      if (media.kind !== "image") {
        throw new Error(`Unsupported media type: ${media.kind}`);
      }
      // Media loaders differ in which field names the MIME type; default PNG.
      const mimeType =
        ("contentType" in media && media.contentType) ||
        ("mimeType" in media && media.mimeType) ||
        "image/png";
      const base64 = media.buffer.toString("base64");
      const result = await runImagePrompt({
        cfg: options?.config,
        agentDir,
        imageModelConfig,
        modelOverride,
        prompt: promptRaw,
        base64,
        mimeType,
      });
      return {
        content: [{ type: "text", text: result.text }],
        details: {
          model: `${result.provider}/${result.model}`,
          image: resolvedImage,
          ...(resolvedPathInfo.rewrittenFrom
            ? { rewrittenFrom: resolvedPathInfo.rewrittenFrom }
            : {}),
          attempts: result.attempts,
        },
      };
    },
  };
}