victor HF Staff committed on
Commit
bec283e
·
unverified ·
1 Parent(s): f87f7a2

MCP: Image refs (#1987)

Browse files

* Add image reference support for MCP tool calls

Introduces image reference resolution and attachment for MCP tool arguments, allowing tools to accept lightweight image reference strings (e.g. 'latest', 'image_1') and receive resolved image payloads. Updates tool invocation and flow logic to use these references, and improves tool prompt instructions for image input handling.

* Enable multiple file uploads in ChatInput

* Refactor multimodal model selection logic

* Remove 'latest' image reference support

Eliminates handling of the 'latest' image reference in image resolver logic and updates related comments and prompts to only support 'image_1', 'image_2', etc. This simplifies image reference usage and clarifies instructions for tool input parameters.

* Refactor imageRefs to fileRefs for tool payloads

.env CHANGED
@@ -72,9 +72,9 @@ LLM_ROUTER_MAX_ASSISTANT_LENGTH=500
72
  # Maximum length (in characters) for previous user messages sent to router (latest user message not trimmed, default 400)
73
  LLM_ROUTER_MAX_PREV_USER_LENGTH=400
74
 
75
- # Enable router multimodal fallback (set to true to allow image inputs via router)
76
  LLM_ROUTER_ENABLE_MULTIMODAL=
77
- # Optional: specific model to use for multimodal requests. If not set, uses first multimodal model
78
  LLM_ROUTER_MULTIMODAL_MODEL=
79
 
80
  # Enable router tool support (set to true to allow tool calling via router)
 
72
  # Maximum length (in characters) for previous user messages sent to router (latest user message not trimmed, default 400)
73
  LLM_ROUTER_MAX_PREV_USER_LENGTH=400
74
 
75
+ # Enable router multimodal handling (set to true to allow image inputs via router)
76
  LLM_ROUTER_ENABLE_MULTIMODAL=
77
+ # Required when LLM_ROUTER_ENABLE_MULTIMODAL=true: id or name of the multimodal model to use for image requests
78
  LLM_ROUTER_MULTIMODAL_MODEL=
79
 
80
  # Enable router tool support (set to true to allow tool calling via router)
README.md CHANGED
@@ -144,7 +144,7 @@ When you select Omni in the UI, Chat UI will:
144
 
145
  Tool and multimodal shortcuts:
146
 
147
- - Multimodal: If `LLM_ROUTER_ENABLE_MULTIMODAL=true` and the user sends an image, the router bypasses Arch and uses `LLM_ROUTER_MULTIMODAL_MODEL` (or the first multimodal model). Route name: `multimodal`.
148
  - Tools: If `LLM_ROUTER_ENABLE_TOOLS=true` and the user has at least one MCP server enabled, the router bypasses Arch and uses `LLM_ROUTER_TOOLS_MODEL`. If that model is missing or misconfigured, it falls back to Arch routing. Route name: `agentic`.
149
 
150
  ### MCP Tools (Optional)
 
144
 
145
  Tool and multimodal shortcuts:
146
 
147
+ - Multimodal: If `LLM_ROUTER_ENABLE_MULTIMODAL=true` and the user sends an image, the router bypasses Arch and uses the model specified in `LLM_ROUTER_MULTIMODAL_MODEL`. Route name: `multimodal`.
148
  - Tools: If `LLM_ROUTER_ENABLE_TOOLS=true` and the user has at least one MCP server enabled, the router bypasses Arch and uses `LLM_ROUTER_TOOLS_MODEL`. If that model is missing or misconfigured, it falls back to Arch routing. Route name: `agentic`.
149
 
150
  ### MCP Tools (Optional)
chart/env/dev.yaml CHANGED
@@ -67,7 +67,7 @@ envVars:
67
  LLM_ROUTER_OTHER_ROUTE: "casual_conversation"
68
  LLM_ROUTER_ARCH_TIMEOUT_MS: "10000"
69
  LLM_ROUTER_ENABLE_MULTIMODAL: "true"
70
- LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Thinking"
71
  LLM_ROUTER_ENABLE_TOOLS: "true"
72
  LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
73
  MCP_SERVERS: >
 
67
  LLM_ROUTER_OTHER_ROUTE: "casual_conversation"
68
  LLM_ROUTER_ARCH_TIMEOUT_MS: "10000"
69
  LLM_ROUTER_ENABLE_MULTIMODAL: "true"
70
+ LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
71
  LLM_ROUTER_ENABLE_TOOLS: "true"
72
  LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
73
  MCP_SERVERS: >
chart/env/prod.yaml CHANGED
@@ -77,7 +77,7 @@ envVars:
77
  LLM_ROUTER_OTHER_ROUTE: "casual_conversation"
78
  LLM_ROUTER_ARCH_TIMEOUT_MS: "10000"
79
  LLM_ROUTER_ENABLE_MULTIMODAL: "true"
80
- LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-30B-A3B-Instruct"
81
  LLM_ROUTER_ENABLE_TOOLS: "true"
82
  LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
83
  MCP_SERVERS: >
 
77
  LLM_ROUTER_OTHER_ROUTE: "casual_conversation"
78
  LLM_ROUTER_ARCH_TIMEOUT_MS: "10000"
79
  LLM_ROUTER_ENABLE_MULTIMODAL: "true"
80
+ LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
81
  LLM_ROUTER_ENABLE_TOOLS: "true"
82
  LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
83
  MCP_SERVERS: >
src/lib/components/chat/ChatInput.svelte CHANGED
@@ -241,6 +241,7 @@
241
  class="absolute hidden size-0"
242
  aria-label="Upload file"
243
  type="file"
 
244
  onchange={onFileChange}
245
  onclick={(e) => {
246
  if (requireAuthUser()) {
@@ -274,7 +275,7 @@
274
  onSelect={() => openFilePickerImage()}
275
  >
276
  <CarbonImage class="size-4 opacity-90 dark:opacity-80" />
277
- Add image
278
  </DropdownMenu.Item>
279
  {/if}
280
 
 
241
  class="absolute hidden size-0"
242
  aria-label="Upload file"
243
  type="file"
244
+ multiple
245
  onchange={onFileChange}
246
  onclick={(e) => {
247
  if (requireAuthUser()) {
 
275
  onSelect={() => openFilePickerImage()}
276
  >
277
  <CarbonImage class="size-4 opacity-90 dark:opacity-80" />
278
+ Add image(s)
279
  </DropdownMenu.Item>
280
  {/if}
281
 
src/lib/server/router/endpoint.ts CHANGED
@@ -18,6 +18,7 @@ import {
18
  pickToolsCapableModel,
19
  ROUTER_TOOLS_ROUTE,
20
  } from "./toolsRoute";
 
21
 
22
  const REASONING_BLOCK_REGEX = /<think>[\s\S]*?(?:<\/think>|$)/g;
23
 
@@ -176,43 +177,17 @@ export async function makeRouterEndpoint(routerModel: ProcessedModel): Promise<E
176
  for await (const ev of gen) yield ev;
177
  }
178
 
179
- async function findFirstMultimodalCandidateId(): Promise<string | undefined> {
 
180
  try {
181
  const all = await getModels();
182
-
183
- // Check if a specific multimodal model is configured via env variable
184
- const preferredModelId = config.LLM_ROUTER_MULTIMODAL_MODEL;
185
- if (preferredModelId) {
186
- const preferredModel = all?.find(
187
- (m) => (m.id === preferredModelId || m.name === preferredModelId) && m.multimodal
188
- );
189
- if (preferredModel) {
190
- logger.info(
191
- { model: preferredModel.id ?? preferredModel.name },
192
- "[router] using configured multimodal model"
193
- );
194
- return preferredModel.id ?? preferredModel.name;
195
- }
196
- logger.warn(
197
- { configuredModel: preferredModelId },
198
- "[router] configured multimodal model not found or not multimodal, falling back to first available"
199
- );
200
- }
201
-
202
- // Fallback to first multimodal model
203
- const first = all?.find((m) => !m.isRouter && m.multimodal);
204
- return first?.id ?? first?.name;
205
  } catch (e) {
206
  logger.warn({ err: String(e) }, "[router] failed to load models for multimodal lookup");
207
- return undefined;
208
  }
209
- }
210
-
211
- if (routerMultimodalEnabled && hasImageInput) {
212
- const multimodalCandidate = await findFirstMultimodalCandidateId();
213
  if (!multimodalCandidate) {
214
  throw new Error(
215
- "No multimodal models are configured for the router. Remove the image or enable a multimodal model."
216
  );
217
  }
218
 
 
18
  pickToolsCapableModel,
19
  ROUTER_TOOLS_ROUTE,
20
  } from "./toolsRoute";
21
+ import { getConfiguredMultimodalModelId } from "./multimodal";
22
 
23
  const REASONING_BLOCK_REGEX = /<think>[\s\S]*?(?:<\/think>|$)/g;
24
 
 
177
  for await (const ev of gen) yield ev;
178
  }
179
 
180
+ if (routerMultimodalEnabled && hasImageInput) {
181
+ let multimodalCandidate: string | undefined;
182
  try {
183
  const all = await getModels();
184
+ multimodalCandidate = getConfiguredMultimodalModelId(all);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  } catch (e) {
186
  logger.warn({ err: String(e) }, "[router] failed to load models for multimodal lookup");
 
187
  }
 
 
 
 
188
  if (!multimodalCandidate) {
189
  throw new Error(
190
+ "Router multimodal is enabled but LLM_ROUTER_MULTIMODAL_MODEL is not correctly configured. Remove the image or configure a multimodal model via LLM_ROUTER_MULTIMODAL_MODEL."
191
  );
192
  }
193
 
src/lib/server/router/multimodal.ts ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { config } from "$lib/server/config";
2
+ import type { ProcessedModel } from "../models";
3
+
4
+ /**
5
+ * Returns the configured multimodal model when it exists and is valid.
6
+ * - Requires LLM_ROUTER_MULTIMODAL_MODEL to be set (id or name).
7
+ * - Ignores router aliases and non-multimodal models.
8
+ */
9
+ export function findConfiguredMultimodalModel(
10
+ models: ProcessedModel[] | undefined
11
+ ): ProcessedModel | undefined {
12
+ const preferredModelId = (config.LLM_ROUTER_MULTIMODAL_MODEL || "").trim();
13
+ if (!preferredModelId || !models?.length) return undefined;
14
+
15
+ return models.find(
16
+ (candidate) =>
17
+ (candidate.id === preferredModelId || candidate.name === preferredModelId) &&
18
+ !candidate.isRouter &&
19
+ candidate.multimodal
20
+ );
21
+ }
22
+
23
+ export function getConfiguredMultimodalModelId(
24
+ models: ProcessedModel[] | undefined
25
+ ): string | undefined {
26
+ const model = findConfiguredMultimodalModel(models);
27
+ return model?.id ?? model?.name;
28
+ }
src/lib/server/textGeneration/mcp/fileRefs.ts ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { EndpointMessage } from "../../endpoints/endpoints";
2
+
3
+ export type FileRefPayload = {
4
+ name: string;
5
+ mime: string;
6
+ base64: string;
7
+ };
8
+
9
+ export type RefKind = {
10
+ prefix: string;
11
+ matches: (mime: string) => boolean;
12
+ toDataUrl?: (payload: FileRefPayload) => string;
13
+ };
14
+
15
+ export type ResolvedFileRef = FileRefPayload & { refKind: RefKind };
16
+ export type FileRefResolver = (ref: string) => ResolvedFileRef | undefined;
17
+
18
+ const IMAGE_REF_KIND: RefKind = {
19
+ prefix: "image",
20
+ matches: (mime) => typeof mime === "string" && mime.startsWith("image/"),
21
+ toDataUrl: (payload) => `data:${payload.mime};base64,${payload.base64}`,
22
+ };
23
+
24
+ const DEFAULT_REF_KINDS: RefKind[] = [IMAGE_REF_KIND];
25
+
26
+ /**
27
+ * Build a resolver that maps short ref strings (e.g. "image_1") to the
28
+ * corresponding file payload for the latest user message containing files of
29
+ * the allowed kinds. Currently only images are exposed to end users, but the
30
+ * plumbing supports additional kinds later.
31
+ */
32
+ export function buildFileRefResolver(
33
+ messages: EndpointMessage[],
34
+ refKinds: RefKind[] = DEFAULT_REF_KINDS
35
+ ): FileRefResolver | undefined {
36
+ if (!Array.isArray(refKinds) || refKinds.length === 0) return undefined;
37
+
38
+ // Find the newest user message that has at least one matching file
39
+ let lastUserWithFiles: EndpointMessage | undefined;
40
+ for (let i = messages.length - 1; i >= 0; i -= 1) {
41
+ const msg = messages[i];
42
+ if (msg.from !== "user") continue;
43
+ const hasMatch = (msg.files ?? []).some((file) => {
44
+ const mime = file?.mime;
45
+ return refKinds.some((kind) => kind.matches(mime ?? ""));
46
+ });
47
+ if (hasMatch) {
48
+ lastUserWithFiles = msg;
49
+ break;
50
+ }
51
+ }
52
+
53
+ if (!lastUserWithFiles) return undefined;
54
+
55
+ // Bucket matched files by ref kind while preserving order within the message
56
+ const buckets = new Map<RefKind, FileRefPayload[]>();
57
+ for (const file of lastUserWithFiles.files ?? []) {
58
+ const mime = file?.mime ?? "";
59
+ const kind = refKinds.find((k) => k.matches(mime));
60
+ if (!kind) continue;
61
+ const payload: FileRefPayload = { name: file.name, mime, base64: file.value };
62
+ const arr = buckets.get(kind) ?? [];
63
+ arr.push(payload);
64
+ buckets.set(kind, arr);
65
+ }
66
+
67
+ if (buckets.size === 0) return undefined;
68
+
69
+ const resolver: FileRefResolver = (ref) => {
70
+ if (!ref || typeof ref !== "string") return undefined;
71
+ const trimmed = ref.trim().toLowerCase();
72
+ for (const kind of refKinds) {
73
+ const match = new RegExp(`^${kind.prefix}_(\\d+)$`).exec(trimmed);
74
+ if (!match) continue;
75
+ const idx = Number(match[1]) - 1;
76
+ const files = buckets.get(kind) ?? [];
77
+ if (Number.isFinite(idx) && idx >= 0 && idx < files.length) {
78
+ const payload = files[idx];
79
+ return payload ? { ...payload, refKind: kind } : undefined;
80
+ }
81
+ }
82
+ return undefined;
83
+ };
84
+
85
+ return resolver;
86
+ }
87
+
88
+ export function buildImageRefResolver(messages: EndpointMessage[]): FileRefResolver | undefined {
89
+ return buildFileRefResolver(messages, [IMAGE_REF_KIND]);
90
+ }
91
+
92
+ type FieldRule = {
93
+ keys: string[];
94
+ action: "attachPayload" | "replaceWithDataUrl";
95
+ attachKey?: string;
96
+ allowedPrefixes?: string[]; // limit to specific ref kinds (e.g. ["image"])
97
+ };
98
+
99
+ const DEFAULT_FIELD_RULES: FieldRule[] = [
100
+ {
101
+ keys: ["image_ref"],
102
+ action: "attachPayload",
103
+ attachKey: "image",
104
+ allowedPrefixes: ["image"],
105
+ },
106
+ {
107
+ keys: ["input_image"],
108
+ action: "replaceWithDataUrl",
109
+ allowedPrefixes: ["image"],
110
+ },
111
+ ];
112
+
113
+ /**
114
+ * Walk tool args and hydrate known ref fields while keeping logging lightweight.
115
+ * Only image refs are recognized for now to preserve current behavior.
116
+ */
117
+ export function attachFileRefsToArgs(
118
+ argsObj: Record<string, unknown>,
119
+ resolveRef?: FileRefResolver,
120
+ fieldRules: FieldRule[] = DEFAULT_FIELD_RULES
121
+ ): void {
122
+ if (!resolveRef) return;
123
+
124
+ const visit = (node: unknown): void => {
125
+ if (!node || typeof node !== "object") return;
126
+ if (Array.isArray(node)) {
127
+ for (const v of node) visit(v);
128
+ return;
129
+ }
130
+
131
+ const obj = node as Record<string, unknown>;
132
+ for (const [key, value] of Object.entries(obj)) {
133
+ if (typeof value !== "string") {
134
+ if (value && typeof value === "object") visit(value);
135
+ continue;
136
+ }
137
+
138
+ const resolved = resolveRef(value);
139
+ if (!resolved) continue;
140
+
141
+ const rule = fieldRules.find((r) => r.keys.includes(key));
142
+ if (!rule) continue;
143
+ if (rule.allowedPrefixes && !rule.allowedPrefixes.includes(resolved.refKind.prefix)) continue;
144
+
145
+ if (rule.action === "attachPayload") {
146
+ const targetKey = rule.attachKey ?? "file";
147
+ if (
148
+ typeof obj[targetKey] !== "object" ||
149
+ obj[targetKey] === null ||
150
+ Array.isArray(obj[targetKey])
151
+ ) {
152
+ obj[targetKey] = {
153
+ name: resolved.name,
154
+ mime: resolved.mime,
155
+ base64: resolved.base64,
156
+ };
157
+ }
158
+ } else if (rule.action === "replaceWithDataUrl") {
159
+ const toUrl =
160
+ resolved.refKind.toDataUrl ??
161
+ ((p: FileRefPayload) => `data:${p.mime};base64,${p.base64}`);
162
+ obj[key] = toUrl(resolved);
163
+ }
164
+ }
165
+ };
166
+
167
+ visit(argsObj);
168
+ }
src/lib/server/textGeneration/mcp/routerResolution.ts CHANGED
@@ -7,6 +7,7 @@ import {
7
  pickToolsCapableModel,
8
  ROUTER_TOOLS_ROUTE,
9
  } from "$lib/server/router/toolsRoute";
 
10
  import type { EndpointMessage } from "../../endpoints/endpoints";
11
  import { stripReasoningFromMessageForRouting } from "../utils/routing";
12
  import type { ProcessedModel } from "../../models";
@@ -48,15 +49,17 @@ export async function resolveRouterTarget({
48
  const allModels = mod.models as ProcessedModel[];
49
 
50
  if (hasImageInput) {
51
- const multimodalCandidate = allModels?.find(
52
- (candidate) => !candidate.isRouter && candidate.multimodal
53
- );
54
- if (multimodalCandidate) {
 
 
 
 
55
  targetModel = multimodalCandidate;
56
  candidateModelId = multimodalCandidate.id ?? multimodalCandidate.name;
57
  resolvedRoute = "multimodal";
58
- } else {
59
- runMcp = false;
60
  }
61
  } else {
62
  // If tools are enabled and at least one MCP server is active, prefer a tools-capable model
 
7
  pickToolsCapableModel,
8
  ROUTER_TOOLS_ROUTE,
9
  } from "$lib/server/router/toolsRoute";
10
+ import { findConfiguredMultimodalModel } from "$lib/server/router/multimodal";
11
  import type { EndpointMessage } from "../../endpoints/endpoints";
12
  import { stripReasoningFromMessageForRouting } from "../utils/routing";
13
  import type { ProcessedModel } from "../../models";
 
49
  const allModels = mod.models as ProcessedModel[];
50
 
51
  if (hasImageInput) {
52
+ const multimodalCandidate = findConfiguredMultimodalModel(allModels);
53
+ if (!multimodalCandidate) {
54
+ runMcp = false;
55
+ logger.warn(
56
+ { configuredModel: config.LLM_ROUTER_MULTIMODAL_MODEL },
57
+ "[mcp] multimodal input but configured model missing or invalid; skipping MCP route"
58
+ );
59
+ } else {
60
  targetModel = multimodalCandidate;
61
  candidateModelId = multimodalCandidate.id ?? multimodalCandidate.name;
62
  resolvedRoute = "multimodal";
 
 
63
  }
64
  } else {
65
  // If tools are enabled and at least one MCP server is active, prefer a tools-capable model
src/lib/server/textGeneration/mcp/runMcpFlow.ts CHANGED
@@ -1,6 +1,5 @@
1
  import { config } from "$lib/server/config";
2
  import { MessageUpdateType, type MessageUpdate } from "$lib/types/MessageUpdate";
3
- import type { EndpointMessage } from "../../endpoints/endpoints";
4
  import { getMcpServers } from "$lib/server/mcp/registry";
5
  import { isValidUrl } from "$lib/server/urlSafety";
6
  import { resetMcpToolsCache } from "$lib/server/mcp/tools";
@@ -14,11 +13,13 @@ import type {
14
  } from "openai/resources/chat/completions";
15
  import type { Stream } from "openai/streaming";
16
  import { buildToolPreprompt } from "../utils/toolPrompt";
 
17
  import { resolveRouterTarget } from "./routerResolution";
18
  import { executeToolCalls, type NormalizedToolCall } from "./toolInvocation";
19
  import { drainPool } from "$lib/server/mcp/clientPool";
20
  import type { TextGenerationContext } from "../types";
21
  import { hasAuthHeader, isStrictHfMcpLogin, hasNonEmptyToken } from "$lib/server/mcp/hf";
 
22
 
23
  export type RunMcpFlowContext = Pick<
24
  TextGenerationContext,
@@ -200,6 +201,8 @@ export async function* runMcpFlow({
200
  // If anything goes wrong reading the flag, proceed (previous behavior)
201
  }
202
 
 
 
203
  const hasImageInput = messages.some((msg) =>
204
  (msg.files ?? []).some(
205
  (file) => typeof file?.mime === "string" && file.mime.startsWith("image/")
@@ -599,6 +602,7 @@ export async function* runMcpFlow({
599
  mapping,
600
  servers,
601
  parseArgs,
 
602
  toPrimitive,
603
  processToolOutput,
604
  abortSignal,
 
1
  import { config } from "$lib/server/config";
2
  import { MessageUpdateType, type MessageUpdate } from "$lib/types/MessageUpdate";
 
3
  import { getMcpServers } from "$lib/server/mcp/registry";
4
  import { isValidUrl } from "$lib/server/urlSafety";
5
  import { resetMcpToolsCache } from "$lib/server/mcp/tools";
 
13
  } from "openai/resources/chat/completions";
14
  import type { Stream } from "openai/streaming";
15
  import { buildToolPreprompt } from "../utils/toolPrompt";
16
+ import type { EndpointMessage } from "../../endpoints/endpoints";
17
  import { resolveRouterTarget } from "./routerResolution";
18
  import { executeToolCalls, type NormalizedToolCall } from "./toolInvocation";
19
  import { drainPool } from "$lib/server/mcp/clientPool";
20
  import type { TextGenerationContext } from "../types";
21
  import { hasAuthHeader, isStrictHfMcpLogin, hasNonEmptyToken } from "$lib/server/mcp/hf";
22
+ import { buildImageRefResolver } from "./fileRefs";
23
 
24
  export type RunMcpFlowContext = Pick<
25
  TextGenerationContext,
 
201
  // If anything goes wrong reading the flag, proceed (previous behavior)
202
  }
203
 
204
+ const resolveFileRef = buildImageRefResolver(messages);
205
+
206
  const hasImageInput = messages.some((msg) =>
207
  (msg.files ?? []).some(
208
  (file) => typeof file?.mime === "string" && file.mime.startsWith("image/")
 
602
  mapping,
603
  servers,
604
  parseArgs,
605
+ resolveFileRef,
606
  toPrimitive,
607
  processToolOutput,
608
  abortSignal,
src/lib/server/textGeneration/mcp/toolInvocation.ts CHANGED
@@ -8,6 +8,7 @@ import type { McpToolMapping } from "$lib/server/mcp/tools";
8
  import type { McpServerConfig } from "$lib/server/mcp/httpClient";
9
  import { callMcpTool, type McpToolTextResponse } from "$lib/server/mcp/httpClient";
10
  import { getClient } from "$lib/server/mcp/clientPool";
 
11
  import type { Client } from "@modelcontextprotocol/sdk/client";
12
 
13
  export type Primitive = string | number | boolean;
@@ -29,6 +30,7 @@ export interface ExecuteToolCallsParams {
29
  mapping: Record<string, McpToolMapping>;
30
  servers: McpServerConfig[];
31
  parseArgs: (raw: unknown) => Record<string, unknown>;
 
32
  toPrimitive: (value: unknown) => Primitive | undefined;
33
  processToolOutput: (text: string) => {
34
  annotated: string;
@@ -63,6 +65,7 @@ export async function* executeToolCalls({
63
  mapping,
64
  servers,
65
  parseArgs,
 
66
  toPrimitive,
67
  processToolOutput,
68
  abortSignal,
@@ -89,6 +92,11 @@ export async function* executeToolCalls({
89
  const prim = toPrimitive(v);
90
  if (prim !== undefined) paramsClean[k] = prim;
91
  }
 
 
 
 
 
92
  return { call, argsObj, paramsClean, uuid: randomUUID() };
93
  });
94
 
 
8
  import type { McpServerConfig } from "$lib/server/mcp/httpClient";
9
  import { callMcpTool, type McpToolTextResponse } from "$lib/server/mcp/httpClient";
10
  import { getClient } from "$lib/server/mcp/clientPool";
11
+ import { attachFileRefsToArgs, type FileRefResolver } from "./fileRefs";
12
  import type { Client } from "@modelcontextprotocol/sdk/client";
13
 
14
  export type Primitive = string | number | boolean;
 
30
  mapping: Record<string, McpToolMapping>;
31
  servers: McpServerConfig[];
32
  parseArgs: (raw: unknown) => Record<string, unknown>;
33
+ resolveFileRef?: FileRefResolver;
34
  toPrimitive: (value: unknown) => Primitive | undefined;
35
  processToolOutput: (text: string) => {
36
  annotated: string;
 
65
  mapping,
66
  servers,
67
  parseArgs,
68
+ resolveFileRef,
69
  toPrimitive,
70
  processToolOutput,
71
  abortSignal,
 
92
  const prim = toPrimitive(v);
93
  if (prim !== undefined) paramsClean[k] = prim;
94
  }
95
+ // Attach any resolved image payloads _after_ computing paramsClean so that
96
+ // logging / status updates continue to show only the lightweight primitive
97
+ // arguments (e.g. "image_1") while the full data: URLs or image blobs are
98
+ // only sent to the MCP tool server.
99
+ attachFileRefsToArgs(argsObj, resolveFileRef);
100
  return { call, argsObj, paramsClean, uuid: randomUUID() };
101
  });
102
 
src/lib/server/textGeneration/utils/toolPrompt.ts CHANGED
@@ -11,5 +11,11 @@ export function buildToolPreprompt(tools: OpenAiTool[]): string {
11
  month: "long",
12
  day: "numeric",
13
  });
14
- return `You can use the following tools if helpful: ${names.join(", ")}. Today's date: ${currentDate}. If a tool generates an image, you can inline it directly: ![alt text](image_url).`;
 
 
 
 
 
 
15
  }
 
11
  month: "long",
12
  day: "numeric",
13
  });
14
+ return [
15
+ `You can use the following tools if helpful: ${names.join(", ")}.`,
16
+ `Today's date: ${currentDate}.`,
17
+ `If a tool generates an image, you can inline it directly: ![alt text](image_url).`,
18
+ `If a tool needs to operate on an image, set its image input parameter (for example, "input_image") to an image reference string.`,
19
+ `Use "image_1", "image_2", etc. to point to a specific image from a user message with images. You can also reuse a direct image URL from a prior tool result instead of pasting new base64 data.`,
20
+ ].join(" ");
21
  }