Refactor file handling for multimodal chat messages
Moved message and file preparation logic to a shared utility (prepareFiles.ts) for OpenAI-compatible multimodal payloads. Updated endpointOai and runMcpFlow to use the new prepareMessagesWithFiles function, improving code reuse and maintainability.
src/lib/server/endpoints/openai/endpointOai.ts
CHANGED
|
@@ -14,9 +14,7 @@ import { config } from "$lib/server/config";
|
|
| 14 |
import type { Endpoint } from "../endpoints";
|
| 15 |
import type OpenAI from "openai";
|
| 16 |
import { createImageProcessorOptionsValidator, makeImageProcessor } from "../images";
|
| 17 |
-
import {
|
| 18 |
-
import type { MessageFile } from "$lib/types/Message";
|
| 19 |
-
import type { EndpointMessage } from "../endpoints";
|
| 20 |
// uuid import removed (no tool call ids)
|
| 21 |
|
| 22 |
export const endpointOAIParametersSchema = z.object({
|
|
@@ -168,7 +166,7 @@ export async function endpointOai(
|
|
| 168 |
}) => {
|
| 169 |
// Format messages for the chat API, handling multimodal content if supported
|
| 170 |
let messagesOpenAI: OpenAI.Chat.Completions.ChatCompletionMessageParam[] =
|
| 171 |
-
await
|
| 172 |
|
| 173 |
// Normalize preprompt and handle empty values
|
| 174 |
const normalizedPreprompt = typeof preprompt === "string" ? preprompt.trim() : "";
|
|
@@ -245,88 +243,3 @@ export async function endpointOai(
|
|
| 245 |
throw new Error("Invalid completion type");
|
| 246 |
}
|
| 247 |
}
|
| 248 |
-
|
| 249 |
-
async function prepareMessages(
|
| 250 |
-
messages: EndpointMessage[],
|
| 251 |
-
imageProcessor: ReturnType<typeof makeImageProcessor>,
|
| 252 |
-
isMultimodal: boolean
|
| 253 |
-
): Promise<OpenAI.Chat.Completions.ChatCompletionMessageParam[]> {
|
| 254 |
-
return Promise.all(
|
| 255 |
-
messages.map(async (message) => {
|
| 256 |
-
if (message.from === "user" && message.files && message.files.length > 0) {
|
| 257 |
-
const { imageParts, textContent } = await prepareFiles(
|
| 258 |
-
imageProcessor,
|
| 259 |
-
message.files,
|
| 260 |
-
isMultimodal
|
| 261 |
-
);
|
| 262 |
-
|
| 263 |
-
// If we have text files, prepend their content to the message
|
| 264 |
-
let messageText = message.content;
|
| 265 |
-
if (textContent.length > 0) {
|
| 266 |
-
messageText = textContent + "\n\n" + message.content;
|
| 267 |
-
}
|
| 268 |
-
|
| 269 |
-
// If we have images and multimodal is enabled, use structured content
|
| 270 |
-
if (imageParts.length > 0 && isMultimodal) {
|
| 271 |
-
const parts = [{ type: "text" as const, text: messageText }, ...imageParts];
|
| 272 |
-
return { role: message.from, content: parts };
|
| 273 |
-
}
|
| 274 |
-
|
| 275 |
-
// Otherwise just use the text (possibly with injected file content)
|
| 276 |
-
return { role: message.from, content: messageText };
|
| 277 |
-
}
|
| 278 |
-
return { role: message.from, content: message.content };
|
| 279 |
-
})
|
| 280 |
-
);
|
| 281 |
-
}
|
| 282 |
-
|
| 283 |
-
async function prepareFiles(
|
| 284 |
-
imageProcessor: ReturnType<typeof makeImageProcessor>,
|
| 285 |
-
files: MessageFile[],
|
| 286 |
-
isMultimodal: boolean
|
| 287 |
-
): Promise<{
|
| 288 |
-
imageParts: OpenAI.Chat.Completions.ChatCompletionContentPartImage[];
|
| 289 |
-
textContent: string;
|
| 290 |
-
}> {
|
| 291 |
-
// Separate image and text files
|
| 292 |
-
const imageFiles = files.filter((file) => file.mime.startsWith("image/"));
|
| 293 |
-
const textFiles = files.filter((file) => {
|
| 294 |
-
const mime = (file.mime || "").toLowerCase();
|
| 295 |
-
const [fileType, fileSubtype] = mime.split("/");
|
| 296 |
-
return TEXT_MIME_ALLOWLIST.some((allowed) => {
|
| 297 |
-
const [type, subtype] = allowed.toLowerCase().split("/");
|
| 298 |
-
const typeOk = type === "*" || type === fileType;
|
| 299 |
-
const subOk = subtype === "*" || subtype === fileSubtype;
|
| 300 |
-
return typeOk && subOk;
|
| 301 |
-
});
|
| 302 |
-
});
|
| 303 |
-
|
| 304 |
-
// Process images if multimodal is enabled
|
| 305 |
-
let imageParts: OpenAI.Chat.Completions.ChatCompletionContentPartImage[] = [];
|
| 306 |
-
if (isMultimodal && imageFiles.length > 0) {
|
| 307 |
-
const processedFiles = await Promise.all(imageFiles.map(imageProcessor));
|
| 308 |
-
imageParts = processedFiles.map((file) => ({
|
| 309 |
-
type: "image_url" as const,
|
| 310 |
-
image_url: {
|
| 311 |
-
url: `data:${file.mime};base64,${file.image.toString("base64")}`,
|
| 312 |
-
// Improves compatibility with some OpenAI-compatible servers
|
| 313 |
-
// that expect an explicit detail setting.
|
| 314 |
-
detail: "auto",
|
| 315 |
-
},
|
| 316 |
-
}));
|
| 317 |
-
}
|
| 318 |
-
|
| 319 |
-
// Process text files - inject their content
|
| 320 |
-
let textContent = "";
|
| 321 |
-
if (textFiles.length > 0) {
|
| 322 |
-
const textParts = await Promise.all(
|
| 323 |
-
textFiles.map(async (file) => {
|
| 324 |
-
const content = Buffer.from(file.value, "base64").toString("utf-8");
|
| 325 |
-
return `<document name="${file.name}" type="${file.mime}">\n${content}\n</document>`;
|
| 326 |
-
})
|
| 327 |
-
);
|
| 328 |
-
textContent = textParts.join("\n\n");
|
| 329 |
-
}
|
| 330 |
-
|
| 331 |
-
return { imageParts, textContent };
|
| 332 |
-
}
|
|
|
|
| 14 |
import type { Endpoint } from "../endpoints";
|
| 15 |
import type OpenAI from "openai";
|
| 16 |
import { createImageProcessorOptionsValidator, makeImageProcessor } from "../images";
|
| 17 |
+
import { prepareMessagesWithFiles } from "$lib/server/textGeneration/utils/prepareFiles";
|
|
|
|
|
|
|
| 18 |
// uuid import removed (no tool call ids)
|
| 19 |
|
| 20 |
export const endpointOAIParametersSchema = z.object({
|
|
|
|
| 166 |
}) => {
|
| 167 |
// Format messages for the chat API, handling multimodal content if supported
|
| 168 |
let messagesOpenAI: OpenAI.Chat.Completions.ChatCompletionMessageParam[] =
|
| 169 |
+
await prepareMessagesWithFiles(messages, imageProcessor, isMultimodal ?? model.multimodal);
|
| 170 |
|
| 171 |
// Normalize preprompt and handle empty values
|
| 172 |
const normalizedPreprompt = typeof preprompt === "string" ? preprompt.trim() : "";
|
|
|
|
| 243 |
throw new Error("Invalid completion type");
|
| 244 |
}
|
| 245 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/lib/server/textGeneration/mcp/runMcpFlow.ts
CHANGED
|
@@ -8,7 +8,6 @@ import type {
|
|
| 8 |
ChatCompletionChunk,
|
| 9 |
ChatCompletionCreateParamsStreaming,
|
| 10 |
ChatCompletionMessageParam,
|
| 11 |
-
ChatCompletionContentPart,
|
| 12 |
ChatCompletionMessageToolCall,
|
| 13 |
} from "openai/resources/chat/completions";
|
| 14 |
import type { Stream } from "openai/streaming";
|
|
@@ -20,6 +19,8 @@ import { drainPool } from "$lib/server/mcp/clientPool";
|
|
| 20 |
import type { TextGenerationContext } from "../types";
|
| 21 |
import { hasAuthHeader, isStrictHfMcpLogin, hasNonEmptyToken } from "$lib/server/mcp/hf";
|
| 22 |
import { buildImageRefResolver } from "./fileRefs";
|
|
|
|
|
|
|
| 23 |
|
| 24 |
export type RunMcpFlowContext = Pick<
|
| 25 |
TextGenerationContext,
|
|
@@ -202,6 +203,13 @@ export async function* runMcpFlow({
|
|
| 202 |
}
|
| 203 |
|
| 204 |
const resolveFileRef = buildImageRefResolver(messages);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
const hasImageInput = messages.some((msg) =>
|
| 207 |
(msg.files ?? []).some(
|
|
@@ -270,34 +278,11 @@ export async function* runMcpFlow({
|
|
| 270 |
},
|
| 271 |
"[mcp] starting completion with tools"
|
| 272 |
);
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
const rawValue = file.value as unknown;
|
| 279 |
-
let encoded: string;
|
| 280 |
-
if (typeof rawValue === "string") {
|
| 281 |
-
encoded = rawValue;
|
| 282 |
-
} else if (rawValue instanceof Uint8Array) {
|
| 283 |
-
encoded = Buffer.from(rawValue).toString("base64");
|
| 284 |
-
} else if (rawValue instanceof ArrayBuffer) {
|
| 285 |
-
encoded = Buffer.from(rawValue).toString("base64");
|
| 286 |
-
} else {
|
| 287 |
-
encoded = String(rawValue ?? "");
|
| 288 |
-
}
|
| 289 |
-
const url = encoded.startsWith("data:")
|
| 290 |
-
? encoded
|
| 291 |
-
: `data:${file.mime};base64,${encoded}`;
|
| 292 |
-
parts.push({ type: "image_url", image_url: { url, detail: "auto" } });
|
| 293 |
-
}
|
| 294 |
-
}
|
| 295 |
-
return { role: msg.from, content: parts };
|
| 296 |
-
}
|
| 297 |
-
return { role: msg.from, content: msg.content };
|
| 298 |
-
};
|
| 299 |
-
|
| 300 |
-
let messagesOpenAI: ChatCompletionMessageParam[] = messages.map(toOpenAiMessage);
|
| 301 |
const toolPreprompt = buildToolPreprompt(oaTools);
|
| 302 |
const prepromptPieces: string[] = [];
|
| 303 |
if (toolPreprompt.trim().length > 0) {
|
|
|
|
| 8 |
ChatCompletionChunk,
|
| 9 |
ChatCompletionCreateParamsStreaming,
|
| 10 |
ChatCompletionMessageParam,
|
|
|
|
| 11 |
ChatCompletionMessageToolCall,
|
| 12 |
} from "openai/resources/chat/completions";
|
| 13 |
import type { Stream } from "openai/streaming";
|
|
|
|
| 19 |
import type { TextGenerationContext } from "../types";
|
| 20 |
import { hasAuthHeader, isStrictHfMcpLogin, hasNonEmptyToken } from "$lib/server/mcp/hf";
|
| 21 |
import { buildImageRefResolver } from "./fileRefs";
|
| 22 |
+
import { prepareMessagesWithFiles } from "$lib/server/textGeneration/utils/prepareFiles";
|
| 23 |
+
import { makeImageProcessor } from "$lib/server/endpoints/images";
|
| 24 |
|
| 25 |
export type RunMcpFlowContext = Pick<
|
| 26 |
TextGenerationContext,
|
|
|
|
| 203 |
}
|
| 204 |
|
| 205 |
const resolveFileRef = buildImageRefResolver(messages);
|
| 206 |
+
const imageProcessor = makeImageProcessor({
|
| 207 |
+
supportedMimeTypes: ["image/png", "image/jpeg"],
|
| 208 |
+
preferredMimeType: "image/jpeg",
|
| 209 |
+
maxSizeInMB: 1,
|
| 210 |
+
maxWidth: 1024,
|
| 211 |
+
maxHeight: 1024,
|
| 212 |
+
});
|
| 213 |
|
| 214 |
const hasImageInput = messages.some((msg) =>
|
| 215 |
(msg.files ?? []).some(
|
|
|
|
| 278 |
},
|
| 279 |
"[mcp] starting completion with tools"
|
| 280 |
);
|
| 281 |
+
let messagesOpenAI: ChatCompletionMessageParam[] = await prepareMessagesWithFiles(
|
| 282 |
+
messages,
|
| 283 |
+
imageProcessor,
|
| 284 |
+
mmEnabled
|
| 285 |
+
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
const toolPreprompt = buildToolPreprompt(oaTools);
|
| 287 |
const prepromptPieces: string[] = [];
|
| 288 |
if (toolPreprompt.trim().length > 0) {
|
src/lib/server/textGeneration/utils/prepareFiles.ts
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { MessageFile } from "$lib/types/Message";
|
| 2 |
+
import type { EndpointMessage } from "$lib/server/endpoints/endpoints";
|
| 3 |
+
import type { OpenAI } from "openai";
|
| 4 |
+
import { TEXT_MIME_ALLOWLIST } from "$lib/constants/mime";
|
| 5 |
+
import type { makeImageProcessor } from "$lib/server/endpoints/images";
|
| 6 |
+
|
| 7 |
+
/**
|
| 8 |
+
* Prepare chat messages for OpenAI-compatible multimodal payloads.
|
| 9 |
+
* - Processes images via the provided imageProcessor (resize/convert) when multimodal is enabled.
|
| 10 |
+
* - Injects text-file content into the user message text.
|
| 11 |
+
* - Leaves messages untouched when no files or multimodal disabled.
|
| 12 |
+
*/
|
| 13 |
+
export async function prepareMessagesWithFiles(
|
| 14 |
+
messages: EndpointMessage[],
|
| 15 |
+
imageProcessor: ReturnType<typeof makeImageProcessor>,
|
| 16 |
+
isMultimodal: boolean
|
| 17 |
+
): Promise<OpenAI.Chat.Completions.ChatCompletionMessageParam[]> {
|
| 18 |
+
return Promise.all(
|
| 19 |
+
messages.map(async (message) => {
|
| 20 |
+
if (message.from === "user" && message.files && message.files.length > 0) {
|
| 21 |
+
const { imageParts, textContent } = await prepareFiles(
|
| 22 |
+
imageProcessor,
|
| 23 |
+
message.files,
|
| 24 |
+
isMultimodal
|
| 25 |
+
);
|
| 26 |
+
|
| 27 |
+
let messageText = message.content;
|
| 28 |
+
if (textContent.length > 0) {
|
| 29 |
+
messageText = textContent + "\n\n" + message.content;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
if (imageParts.length > 0 && isMultimodal) {
|
| 33 |
+
const parts = [{ type: "text" as const, text: messageText }, ...imageParts];
|
| 34 |
+
return { role: message.from, content: parts };
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
return { role: message.from, content: messageText };
|
| 38 |
+
}
|
| 39 |
+
return { role: message.from, content: message.content };
|
| 40 |
+
})
|
| 41 |
+
);
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
async function prepareFiles(
|
| 45 |
+
imageProcessor: ReturnType<typeof makeImageProcessor>,
|
| 46 |
+
files: MessageFile[],
|
| 47 |
+
isMultimodal: boolean
|
| 48 |
+
): Promise<{
|
| 49 |
+
imageParts: OpenAI.Chat.Completions.ChatCompletionContentPartImage[];
|
| 50 |
+
textContent: string;
|
| 51 |
+
}> {
|
| 52 |
+
const imageFiles = files.filter((file) => file.mime.startsWith("image/"));
|
| 53 |
+
const textFiles = files.filter((file) => {
|
| 54 |
+
const mime = (file.mime || "").toLowerCase();
|
| 55 |
+
const [fileType, fileSubtype] = mime.split("/");
|
| 56 |
+
return TEXT_MIME_ALLOWLIST.some((allowed) => {
|
| 57 |
+
const [type, subtype] = allowed.toLowerCase().split("/");
|
| 58 |
+
const typeOk = type === "*" || type === fileType;
|
| 59 |
+
const subOk = subtype === "*" || subtype === fileSubtype;
|
| 60 |
+
return typeOk && subOk;
|
| 61 |
+
});
|
| 62 |
+
});
|
| 63 |
+
|
| 64 |
+
let imageParts: OpenAI.Chat.Completions.ChatCompletionContentPartImage[] = [];
|
| 65 |
+
if (isMultimodal && imageFiles.length > 0) {
|
| 66 |
+
const processedFiles = await Promise.all(imageFiles.map(imageProcessor));
|
| 67 |
+
imageParts = processedFiles.map((file) => ({
|
| 68 |
+
type: "image_url" as const,
|
| 69 |
+
image_url: {
|
| 70 |
+
url: `data:${file.mime};base64,${file.image.toString("base64")}`,
|
| 71 |
+
detail: "auto",
|
| 72 |
+
},
|
| 73 |
+
}));
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
let textContent = "";
|
| 77 |
+
if (textFiles.length > 0) {
|
| 78 |
+
const textParts = await Promise.all(
|
| 79 |
+
textFiles.map(async (file) => {
|
| 80 |
+
const content = Buffer.from(file.value, "base64").toString("utf-8");
|
| 81 |
+
return `<document name="${file.name}" type="${file.mime}">\n${content}\n</document>`;
|
| 82 |
+
})
|
| 83 |
+
);
|
| 84 |
+
textContent = textParts.join("\n\n");
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
return { imageParts, textContent };
|
| 88 |
+
}
|