victor HF Staff committed on
Commit
bec283e
·
unverified ·
1 Parent(s): f87f7a2

MCP: Image refs (#1987)

Browse files

* Add image reference support for MCP tool calls

Introduces image reference resolution and attachment for MCP tool arguments, allowing tools to accept lightweight image reference strings (e.g. 'latest', 'image_1') and receive resolved image payloads. Updates tool invocation and flow logic to use these references, and improves tool prompt instructions for image input handling.

* Enable multiple file uploads in ChatInput

* Refactor multimodal model selection logic

* Remove 'latest' image reference support

Eliminates handling of the 'latest' image reference in image resolver logic and updates related comments and prompts to only support 'image_1', 'image_2', etc. This simplifies image reference usage and clarifies instructions for tool input parameters.

* Refactor imageRefs to fileRefs for tool payloads

.env CHANGED
@@ -72,9 +72,9 @@ LLM_ROUTER_MAX_ASSISTANT_LENGTH=500
72
  # Maximum length (in characters) for previous user messages sent to router (latest user message not trimmed, default 400)
73
  LLM_ROUTER_MAX_PREV_USER_LENGTH=400
74
 
75
- # Enable router multimodal fallback (set to true to allow image inputs via router)
76
  LLM_ROUTER_ENABLE_MULTIMODAL=
77
- # Optional: specific model to use for multimodal requests. If not set, uses first multimodal model
78
  LLM_ROUTER_MULTIMODAL_MODEL=
79
 
80
  # Enable router tool support (set to true to allow tool calling via router)
 
72
  # Maximum length (in characters) for previous user messages sent to router (latest user message not trimmed, default 400)
73
  LLM_ROUTER_MAX_PREV_USER_LENGTH=400
74
 
75
+ # Enable router multimodal handling (set to true to allow image inputs via router)
76
  LLM_ROUTER_ENABLE_MULTIMODAL=
77
+ # Required when LLM_ROUTER_ENABLE_MULTIMODAL=true: id or name of the multimodal model to use for image requests
78
  LLM_ROUTER_MULTIMODAL_MODEL=
79
 
80
  # Enable router tool support (set to true to allow tool calling via router)
README.md CHANGED
@@ -144,7 +144,7 @@ When you select Omni in the UI, Chat UI will:
144
 
145
  Tool and multimodal shortcuts:
146
 
147
- - Multimodal: If `LLM_ROUTER_ENABLE_MULTIMODAL=true` and the user sends an image, the router bypasses Arch and uses `LLM_ROUTER_MULTIMODAL_MODEL` (or the first multimodal model). Route name: `multimodal`.
148
  - Tools: If `LLM_ROUTER_ENABLE_TOOLS=true` and the user has at least one MCP server enabled, the router bypasses Arch and uses `LLM_ROUTER_TOOLS_MODEL`. If that model is missing or misconfigured, it falls back to Arch routing. Route name: `agentic`.
149
 
150
  ### MCP Tools (Optional)
 
144
 
145
  Tool and multimodal shortcuts:
146
 
147
+ - Multimodal: If `LLM_ROUTER_ENABLE_MULTIMODAL=true` and the user sends an image, the router bypasses Arch and uses the model specified in `LLM_ROUTER_MULTIMODAL_MODEL`. Route name: `multimodal`.
148
  - Tools: If `LLM_ROUTER_ENABLE_TOOLS=true` and the user has at least one MCP server enabled, the router bypasses Arch and uses `LLM_ROUTER_TOOLS_MODEL`. If that model is missing or misconfigured, it falls back to Arch routing. Route name: `agentic`.
149
 
150
  ### MCP Tools (Optional)
chart/env/dev.yaml CHANGED
@@ -67,7 +67,7 @@ envVars:
67
  LLM_ROUTER_OTHER_ROUTE: "casual_conversation"
68
  LLM_ROUTER_ARCH_TIMEOUT_MS: "10000"
69
  LLM_ROUTER_ENABLE_MULTIMODAL: "true"
70
- LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Thinking"
71
  LLM_ROUTER_ENABLE_TOOLS: "true"
72
  LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
73
  MCP_SERVERS: >
 
67
  LLM_ROUTER_OTHER_ROUTE: "casual_conversation"
68
  LLM_ROUTER_ARCH_TIMEOUT_MS: "10000"
69
  LLM_ROUTER_ENABLE_MULTIMODAL: "true"
70
+ LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
71
  LLM_ROUTER_ENABLE_TOOLS: "true"
72
  LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
73
  MCP_SERVERS: >
chart/env/prod.yaml CHANGED
@@ -77,7 +77,7 @@ envVars:
77
  LLM_ROUTER_OTHER_ROUTE: "casual_conversation"
78
  LLM_ROUTER_ARCH_TIMEOUT_MS: "10000"
79
  LLM_ROUTER_ENABLE_MULTIMODAL: "true"
80
- LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-30B-A3B-Instruct"
81
  LLM_ROUTER_ENABLE_TOOLS: "true"
82
  LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
83
  MCP_SERVERS: >
 
77
  LLM_ROUTER_OTHER_ROUTE: "casual_conversation"
78
  LLM_ROUTER_ARCH_TIMEOUT_MS: "10000"
79
  LLM_ROUTER_ENABLE_MULTIMODAL: "true"
80
+ LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
81
  LLM_ROUTER_ENABLE_TOOLS: "true"
82
  LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
83
  MCP_SERVERS: >
src/lib/components/chat/ChatInput.svelte CHANGED
@@ -241,6 +241,7 @@
241
  class="absolute hidden size-0"
242
  aria-label="Upload file"
243
  type="file"
 
244
  onchange={onFileChange}
245
  onclick={(e) => {
246
  if (requireAuthUser()) {
@@ -274,7 +275,7 @@
274
  onSelect={() => openFilePickerImage()}
275
  >
276
  <CarbonImage class="size-4 opacity-90 dark:opacity-80" />
277
- Add image
278
  </DropdownMenu.Item>
279
  {/if}
280
 
 
241
  class="absolute hidden size-0"
242
  aria-label="Upload file"
243
  type="file"
244
+ multiple
245
  onchange={onFileChange}
246
  onclick={(e) => {
247
  if (requireAuthUser()) {
 
275
  onSelect={() => openFilePickerImage()}
276
  >
277
  <CarbonImage class="size-4 opacity-90 dark:opacity-80" />
278
+ Add image(s)
279
  </DropdownMenu.Item>
280
  {/if}
281
 
src/lib/server/router/endpoint.ts CHANGED
@@ -18,6 +18,7 @@ import {
18
  pickToolsCapableModel,
19
  ROUTER_TOOLS_ROUTE,
20
  } from "./toolsRoute";
 
21
 
22
  const REASONING_BLOCK_REGEX = /<think>[\s\S]*?(?:<\/think>|$)/g;
23
 
@@ -176,43 +177,17 @@ export async function makeRouterEndpoint(routerModel: ProcessedModel): Promise<E
176
  for await (const ev of gen) yield ev;
177
  }
178
 
179
- async function findFirstMultimodalCandidateId(): Promise<string | undefined> {
 
180
  try {
181
  const all = await getModels();
182
-
183
- // Check if a specific multimodal model is configured via env variable
184
- const preferredModelId = config.LLM_ROUTER_MULTIMODAL_MODEL;
185
- if (preferredModelId) {
186
- const preferredModel = all?.find(
187
- (m) => (m.id === preferredModelId || m.name === preferredModelId) && m.multimodal
188
- );
189
- if (preferredModel) {
190
- logger.info(
191
- { model: preferredModel.id ?? preferredModel.name },
192
- "[router] using configured multimodal model"
193
- );
194
- return preferredModel.id ?? preferredModel.name;
195
- }
196
- logger.warn(
197
- { configuredModel: preferredModelId },
198
- "[router] configured multimodal model not found or not multimodal, falling back to first available"
199
- );
200
- }
201
-
202
- // Fallback to first multimodal model
203
- const first = all?.find((m) => !m.isRouter && m.multimodal);
204
- return first?.id ?? first?.name;
205
  } catch (e) {
206
  logger.warn({ err: String(e) }, "[router] failed to load models for multimodal lookup");
207
- return undefined;
208
  }
209
- }
210
-
211
- if (routerMultimodalEnabled && hasImageInput) {
212
- const multimodalCandidate = await findFirstMultimodalCandidateId();
213
  if (!multimodalCandidate) {
214
  throw new Error(
215
- "No multimodal models are configured for the router. Remove the image or enable a multimodal model."
216
  );
217
  }
218
 
 
18
  pickToolsCapableModel,
19
  ROUTER_TOOLS_ROUTE,
20
  } from "./toolsRoute";
21
+ import { getConfiguredMultimodalModelId } from "./multimodal";
22
 
23
  const REASONING_BLOCK_REGEX = /<think>[\s\S]*?(?:<\/think>|$)/g;
24
 
 
177
  for await (const ev of gen) yield ev;
178
  }
179
 
180
+ if (routerMultimodalEnabled && hasImageInput) {
181
+ let multimodalCandidate: string | undefined;
182
  try {
183
  const all = await getModels();
184
+ multimodalCandidate = getConfiguredMultimodalModelId(all);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  } catch (e) {
186
  logger.warn({ err: String(e) }, "[router] failed to load models for multimodal lookup");
 
187
  }
 
 
 
 
188
  if (!multimodalCandidate) {
189
  throw new Error(
190
+ "Router multimodal is enabled but LLM_ROUTER_MULTIMODAL_MODEL is not correctly configured. Remove the image or configure a multimodal model via LLM_ROUTER_MULTIMODAL_MODEL."
191
  );
192
  }
193
 
src/lib/server/router/multimodal.ts ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { config } from "$lib/server/config";
2
+ import type { ProcessedModel } from "../models";
3
+
4
+ /**
5
+ * Returns the configured multimodal model when it exists and is valid.
6
+ * - Requires LLM_ROUTER_MULTIMODAL_MODEL to be set (id or name).
7
+ * - Ignores router aliases and non-multimodal models.
8
+ */
9
+ export function findConfiguredMultimodalModel(
10
+ models: ProcessedModel[] | undefined
11
+ ): ProcessedModel | undefined {
12
+ const preferredModelId = (config.LLM_ROUTER_MULTIMODAL_MODEL || "").trim();
13
+ if (!preferredModelId || !models?.length) return undefined;
14
+
15
+ return models.find(
16
+ (candidate) =>
17
+ (candidate.id === preferredModelId || candidate.name === preferredModelId) &&
18
+ !candidate.isRouter &&
19
+ candidate.multimodal
20
+ );
21
+ }
22
+
23
+ export function getConfiguredMultimodalModelId(
24
+ models: ProcessedModel[] | undefined
25
+ ): string | undefined {
26
+ const model = findConfiguredMultimodalModel(models);
27
+ return model?.id ?? model?.name;
28
+ }
src/lib/server/textGeneration/mcp/fileRefs.ts ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { EndpointMessage } from "../../endpoints/endpoints";
2
+
3
+ export type FileRefPayload = {
4
+ name: string;
5
+ mime: string;
6
+ base64: string;
7
+ };
8
+
9
+ export type RefKind = {
10
+ prefix: string;
11
+ matches: (mime: string) => boolean;
12
+ toDataUrl?: (payload: FileRefPayload) => string;
13
+ };
14
+
15
+ export type ResolvedFileRef = FileRefPayload & { refKind: RefKind };
16
+ export type FileRefResolver = (ref: string) => ResolvedFileRef | undefined;
17
+
18
+ const IMAGE_REF_KIND: RefKind = {
19
+ prefix: "image",
20
+ matches: (mime) => typeof mime === "string" && mime.startsWith("image/"),
21
+ toDataUrl: (payload) => `data:${payload.mime};base64,${payload.base64}`,
22
+ };
23
+
24
+ const DEFAULT_REF_KINDS: RefKind[] = [IMAGE_REF_KIND];
25
+
26
+ /**
27
+ * Build a resolver that maps short ref strings (e.g. "image_1") to the
28
+ * corresponding file payload for the latest user message containing files of
29
+ * the allowed kinds. Currently only images are exposed to end users, but the
30
+ * plumbing supports additional kinds later.
31
+ */
32
+ export function buildFileRefResolver(
33
+ messages: EndpointMessage[],
34
+ refKinds: RefKind[] = DEFAULT_REF_KINDS
35
+ ): FileRefResolver | undefined {
36
+ if (!Array.isArray(refKinds) || refKinds.length === 0) return undefined;
37
+
38
+ // Find the newest user message that has at least one matching file
39
+ let lastUserWithFiles: EndpointMessage | undefined;
40
+ for (let i = messages.length - 1; i >= 0; i -= 1) {
41
+ const msg = messages[i];
42
+ if (msg.from !== "user") continue;
43
+ const hasMatch = (msg.files ?? []).some((file) => {
44
+ const mime = file?.mime;
45
+ return refKinds.some((kind) => kind.matches(mime ?? ""));
46
+ });
47
+ if (hasMatch) {
48
+ lastUserWithFiles = msg;
49
+ break;
50
+ }
51
+ }
52
+
53
+ if (!lastUserWithFiles) return undefined;
54
+
55
+ // Bucket matched files by ref kind while preserving order within the message
56
+ const buckets = new Map<RefKind, FileRefPayload[]>();
57
+ for (const file of lastUserWithFiles.files ?? []) {
58
+ const mime = file?.mime ?? "";
59
+ const kind = refKinds.find((k) => k.matches(mime));
60
+ if (!kind) continue;
61
+ const payload: FileRefPayload = { name: file.name, mime, base64: file.value };
62
+ const arr = buckets.get(kind) ?? [];
63
+ arr.push(payload);
64
+ buckets.set(kind, arr);
65
+ }
66
+
67
+ if (buckets.size === 0) return undefined;
68
+
69
+ const resolver: FileRefResolver = (ref) => {
70
+ if (!ref || typeof ref !== "string") return undefined;
71
+ const trimmed = ref.trim().toLowerCase();
72
+ for (const kind of refKinds) {
73
+ const match = new RegExp(`^${kind.prefix}_(\\d+)$`).exec(trimmed);
74
+ if (!match) continue;
75
+ const idx = Number(match[1]) - 1;
76
+ const files = buckets.get(kind) ?? [];
77
+ if (Number.isFinite(idx) && idx >= 0 && idx < files.length) {
78
+ const payload = files[idx];
79
+ return payload ? { ...payload, refKind: kind } : undefined;
80
+ }
81
+ }
82
+ return undefined;
83
+ };
84
+
85
+ return resolver;
86
+ }
87
+
88
+ export function buildImageRefResolver(messages: EndpointMessage[]): FileRefResolver | undefined {
89
+ return buildFileRefResolver(messages, [IMAGE_REF_KIND]);
90
+ }
91
+
92
+ type FieldRule = {
93
+ keys: string[];
94
+ action: "attachPayload" | "replaceWithDataUrl";
95
+ attachKey?: string;
96
+ allowedPrefixes?: string[]; // limit to specific ref kinds (e.g. ["image"])
97
+ };
98
+
99
+ const DEFAULT_FIELD_RULES: FieldRule[] = [
100
+ {
101
+ keys: ["image_ref"],
102
+ action: "attachPayload",
103
+ attachKey: "image",
104
+ allowedPrefixes: ["image"],
105
+ },
106
+ {
107
+ keys: ["input_image"],
108
+ action: "replaceWithDataUrl",
109
+ allowedPrefixes: ["image"],
110
+ },
111
+ ];
112
+
113
+ /**
114
+ * Walk tool args and hydrate known ref fields while keeping logging lightweight.
115
+ * Only image refs are recognized for now to preserve current behavior.
116
+ */
117
+ export function attachFileRefsToArgs(
118
+ argsObj: Record<string, unknown>,
119
+ resolveRef?: FileRefResolver,
120
+ fieldRules: FieldRule[] = DEFAULT_FIELD_RULES
121
+ ): void {
122
+ if (!resolveRef) return;
123
+
124
+ const visit = (node: unknown): void => {
125
+ if (!node || typeof node !== "object") return;
126
+ if (Array.isArray(node)) {
127
+ for (const v of node) visit(v);
128
+ return;
129
+ }
130
+
131
+ const obj = node as Record<string, unknown>;
132
+ for (const [key, value] of Object.entries(obj)) {
133
+ if (typeof value !== "string") {
134
+ if (value && typeof value === "object") visit(value);
135
+ continue;
136
+ }
137
+
138
+ const resolved = resolveRef(value);
139
+ if (!resolved) continue;
140
+
141
+ const rule = fieldRules.find((r) => r.keys.includes(key));
142
+ if (!rule) continue;
143
+ if (rule.allowedPrefixes && !rule.allowedPrefixes.includes(resolved.refKind.prefix)) continue;
144
+
145
+ if (rule.action === "attachPayload") {
146
+ const targetKey = rule.attachKey ?? "file";
147
+ if (
148
+ typeof obj[targetKey] !== "object" ||
149
+ obj[targetKey] === null ||
150
+ Array.isArray(obj[targetKey])
151
+ ) {
152
+ obj[targetKey] = {
153
+ name: resolved.name,
154
+ mime: resolved.mime,
155
+ base64: resolved.base64,
156
+ };
157
+ }
158
+ } else if (rule.action === "replaceWithDataUrl") {
159
+ const toUrl =
160
+ resolved.refKind.toDataUrl ??
161
+ ((p: FileRefPayload) => `data:${p.mime};base64,${p.base64}`);
162
+ obj[key] = toUrl(resolved);
163
+ }
164
+ }
165
+ };
166
+
167
+ visit(argsObj);
168
+ }
src/lib/server/textGeneration/mcp/routerResolution.ts CHANGED
@@ -7,6 +7,7 @@ import {
7
  pickToolsCapableModel,
8
  ROUTER_TOOLS_ROUTE,
9
  } from "$lib/server/router/toolsRoute";
 
10
  import type { EndpointMessage } from "../../endpoints/endpoints";
11
  import { stripReasoningFromMessageForRouting } from "../utils/routing";
12
  import type { ProcessedModel } from "../../models";
@@ -48,15 +49,17 @@ export async function resolveRouterTarget({
48
  const allModels = mod.models as ProcessedModel[];
49
 
50
  if (hasImageInput) {
51
- const multimodalCandidate = allModels?.find(
52
- (candidate) => !candidate.isRouter && candidate.multimodal
53
- );
54
- if (multimodalCandidate) {
 
 
 
 
55
  targetModel = multimodalCandidate;
56
  candidateModelId = multimodalCandidate.id ?? multimodalCandidate.name;
57
  resolvedRoute = "multimodal";
58
- } else {
59
- runMcp = false;
60
  }
61
  } else {
62
  // If tools are enabled and at least one MCP server is active, prefer a tools-capable model
 
7
  pickToolsCapableModel,
8
  ROUTER_TOOLS_ROUTE,
9
  } from "$lib/server/router/toolsRoute";
10
+ import { findConfiguredMultimodalModel } from "$lib/server/router/multimodal";
11
  import type { EndpointMessage } from "../../endpoints/endpoints";
12
  import { stripReasoningFromMessageForRouting } from "../utils/routing";
13
  import type { ProcessedModel } from "../../models";
 
49
  const allModels = mod.models as ProcessedModel[];
50
 
51
  if (hasImageInput) {
52
+ const multimodalCandidate = findConfiguredMultimodalModel(allModels);
53
+ if (!multimodalCandidate) {
54
+ runMcp = false;
55
+ logger.warn(
56
+ { configuredModel: config.LLM_ROUTER_MULTIMODAL_MODEL },
57
+ "[mcp] multimodal input but configured model missing or invalid; skipping MCP route"
58
+ );
59
+ } else {
60
  targetModel = multimodalCandidate;
61
  candidateModelId = multimodalCandidate.id ?? multimodalCandidate.name;
62
  resolvedRoute = "multimodal";
 
 
63
  }
64
  } else {
65
  // If tools are enabled and at least one MCP server is active, prefer a tools-capable model
src/lib/server/textGeneration/mcp/runMcpFlow.ts CHANGED
@@ -1,6 +1,5 @@
1
  import { config } from "$lib/server/config";
2
  import { MessageUpdateType, type MessageUpdate } from "$lib/types/MessageUpdate";
3
- import type { EndpointMessage } from "../../endpoints/endpoints";
4
  import { getMcpServers } from "$lib/server/mcp/registry";
5
  import { isValidUrl } from "$lib/server/urlSafety";
6
  import { resetMcpToolsCache } from "$lib/server/mcp/tools";
@@ -14,11 +13,13 @@ import type {
14
  } from "openai/resources/chat/completions";
15
  import type { Stream } from "openai/streaming";
16
  import { buildToolPreprompt } from "../utils/toolPrompt";
 
17
  import { resolveRouterTarget } from "./routerResolution";
18
  import { executeToolCalls, type NormalizedToolCall } from "./toolInvocation";
19
  import { drainPool } from "$lib/server/mcp/clientPool";
20
  import type { TextGenerationContext } from "../types";
21
  import { hasAuthHeader, isStrictHfMcpLogin, hasNonEmptyToken } from "$lib/server/mcp/hf";
 
22
 
23
  export type RunMcpFlowContext = Pick<
24
  TextGenerationContext,
@@ -200,6 +201,8 @@ export async function* runMcpFlow({
200
  // If anything goes wrong reading the flag, proceed (previous behavior)
201
  }
202
 
 
 
203
  const hasImageInput = messages.some((msg) =>
204
  (msg.files ?? []).some(
205
  (file) => typeof file?.mime === "string" && file.mime.startsWith("image/")
@@ -599,6 +602,7 @@ export async function* runMcpFlow({
599
  mapping,
600
  servers,
601
  parseArgs,
 
602
  toPrimitive,
603
  processToolOutput,
604
  abortSignal,
 
1
  import { config } from "$lib/server/config";
2
  import { MessageUpdateType, type MessageUpdate } from "$lib/types/MessageUpdate";
 
3
  import { getMcpServers } from "$lib/server/mcp/registry";
4
  import { isValidUrl } from "$lib/server/urlSafety";
5
  import { resetMcpToolsCache } from "$lib/server/mcp/tools";
 
13
  } from "openai/resources/chat/completions";
14
  import type { Stream } from "openai/streaming";
15
  import { buildToolPreprompt } from "../utils/toolPrompt";
16
+ import type { EndpointMessage } from "../../endpoints/endpoints";
17
  import { resolveRouterTarget } from "./routerResolution";
18
  import { executeToolCalls, type NormalizedToolCall } from "./toolInvocation";
19
  import { drainPool } from "$lib/server/mcp/clientPool";
20
  import type { TextGenerationContext } from "../types";
21
  import { hasAuthHeader, isStrictHfMcpLogin, hasNonEmptyToken } from "$lib/server/mcp/hf";
22
+ import { buildImageRefResolver } from "./fileRefs";
23
 
24
  export type RunMcpFlowContext = Pick<
25
  TextGenerationContext,
 
201
  // If anything goes wrong reading the flag, proceed (previous behavior)
202
  }
203
 
204
+ const resolveFileRef = buildImageRefResolver(messages);
205
+
206
  const hasImageInput = messages.some((msg) =>
207
  (msg.files ?? []).some(
208
  (file) => typeof file?.mime === "string" && file.mime.startsWith("image/")
 
602
  mapping,
603
  servers,
604
  parseArgs,
605
+ resolveFileRef,
606
  toPrimitive,
607
  processToolOutput,
608
  abortSignal,
src/lib/server/textGeneration/mcp/toolInvocation.ts CHANGED
@@ -8,6 +8,7 @@ import type { McpToolMapping } from "$lib/server/mcp/tools";
8
  import type { McpServerConfig } from "$lib/server/mcp/httpClient";
9
  import { callMcpTool, type McpToolTextResponse } from "$lib/server/mcp/httpClient";
10
  import { getClient } from "$lib/server/mcp/clientPool";
 
11
  import type { Client } from "@modelcontextprotocol/sdk/client";
12
 
13
  export type Primitive = string | number | boolean;
@@ -29,6 +30,7 @@ export interface ExecuteToolCallsParams {
29
  mapping: Record<string, McpToolMapping>;
30
  servers: McpServerConfig[];
31
  parseArgs: (raw: unknown) => Record<string, unknown>;
 
32
  toPrimitive: (value: unknown) => Primitive | undefined;
33
  processToolOutput: (text: string) => {
34
  annotated: string;
@@ -63,6 +65,7 @@ export async function* executeToolCalls({
63
  mapping,
64
  servers,
65
  parseArgs,
 
66
  toPrimitive,
67
  processToolOutput,
68
  abortSignal,
@@ -89,6 +92,11 @@ export async function* executeToolCalls({
89
  const prim = toPrimitive(v);
90
  if (prim !== undefined) paramsClean[k] = prim;
91
  }
 
 
 
 
 
92
  return { call, argsObj, paramsClean, uuid: randomUUID() };
93
  });
94
 
 
8
  import type { McpServerConfig } from "$lib/server/mcp/httpClient";
9
  import { callMcpTool, type McpToolTextResponse } from "$lib/server/mcp/httpClient";
10
  import { getClient } from "$lib/server/mcp/clientPool";
11
+ import { attachFileRefsToArgs, type FileRefResolver } from "./fileRefs";
12
  import type { Client } from "@modelcontextprotocol/sdk/client";
13
 
14
  export type Primitive = string | number | boolean;
 
30
  mapping: Record<string, McpToolMapping>;
31
  servers: McpServerConfig[];
32
  parseArgs: (raw: unknown) => Record<string, unknown>;
33
+ resolveFileRef?: FileRefResolver;
34
  toPrimitive: (value: unknown) => Primitive | undefined;
35
  processToolOutput: (text: string) => {
36
  annotated: string;
 
65
  mapping,
66
  servers,
67
  parseArgs,
68
+ resolveFileRef,
69
  toPrimitive,
70
  processToolOutput,
71
  abortSignal,
 
92
  const prim = toPrimitive(v);
93
  if (prim !== undefined) paramsClean[k] = prim;
94
  }
95
+ // Attach any resolved image payloads _after_ computing paramsClean so that
96
+ // logging / status updates continue to show only the lightweight primitive
97
+ // arguments (e.g. "image_1") while the full data: URLs or image blobs are
98
+ // only sent to the MCP tool server.
99
+ attachFileRefsToArgs(argsObj, resolveFileRef);
100
  return { call, argsObj, paramsClean, uuid: randomUUID() };
101
  });
102
 
src/lib/server/textGeneration/utils/toolPrompt.ts CHANGED
@@ -11,5 +11,11 @@ export function buildToolPreprompt(tools: OpenAiTool[]): string {
11
  month: "long",
12
  day: "numeric",
13
  });
14
- return `You can use the following tools if helpful: ${names.join(", ")}. Today's date: ${currentDate}. If a tool generates an image, you can inline it directly: ![alt text](image_url).`;
 
 
 
 
 
 
15
  }
 
11
  month: "long",
12
  day: "numeric",
13
  });
14
+ return [
15
+ `You can use the following tools if helpful: ${names.join(", ")}.`,
16
+ `Today's date: ${currentDate}.`,
17
+ `If a tool generates an image, you can inline it directly: ![alt text](image_url).`,
18
+ `If a tool needs to operate on an image, set its image input parameter (for example, "input_image") to an image reference string.`,
19
+ `Use "image_1", "image_2", etc. to point to a specific image from a user message with images. You can also reuse a direct image URL from a prior tool result instead of pasting new base64 data.`,
20
+ ].join(" ");
21
  }