victor HF Staff committed on
Commit
1b26311
·
1 Parent(s): 96117d2

Refactor file handling for multimodal chat messages

Browse files

Moved message and file preparation logic to a shared utility (prepareFiles.ts) for OpenAI-compatible multimodal payloads. Updated endpointOai and runMcpFlow to use the new prepareMessagesWithFiles function, improving code reuse and maintainability.

src/lib/server/endpoints/openai/endpointOai.ts CHANGED
@@ -14,9 +14,7 @@ import { config } from "$lib/server/config";
14
  import type { Endpoint } from "../endpoints";
15
  import type OpenAI from "openai";
16
  import { createImageProcessorOptionsValidator, makeImageProcessor } from "../images";
17
- import { TEXT_MIME_ALLOWLIST } from "$lib/constants/mime";
18
- import type { MessageFile } from "$lib/types/Message";
19
- import type { EndpointMessage } from "../endpoints";
20
  // uuid import removed (no tool call ids)
21
 
22
  export const endpointOAIParametersSchema = z.object({
@@ -168,7 +166,7 @@ export async function endpointOai(
168
  }) => {
169
  // Format messages for the chat API, handling multimodal content if supported
170
  let messagesOpenAI: OpenAI.Chat.Completions.ChatCompletionMessageParam[] =
171
- await prepareMessages(messages, imageProcessor, isMultimodal ?? model.multimodal);
172
 
173
  // Normalize preprompt and handle empty values
174
  const normalizedPreprompt = typeof preprompt === "string" ? preprompt.trim() : "";
@@ -245,88 +243,3 @@ export async function endpointOai(
245
  throw new Error("Invalid completion type");
246
  }
247
  }
248
-
249
- async function prepareMessages(
250
- messages: EndpointMessage[],
251
- imageProcessor: ReturnType<typeof makeImageProcessor>,
252
- isMultimodal: boolean
253
- ): Promise<OpenAI.Chat.Completions.ChatCompletionMessageParam[]> {
254
- return Promise.all(
255
- messages.map(async (message) => {
256
- if (message.from === "user" && message.files && message.files.length > 0) {
257
- const { imageParts, textContent } = await prepareFiles(
258
- imageProcessor,
259
- message.files,
260
- isMultimodal
261
- );
262
-
263
- // If we have text files, prepend their content to the message
264
- let messageText = message.content;
265
- if (textContent.length > 0) {
266
- messageText = textContent + "\n\n" + message.content;
267
- }
268
-
269
- // If we have images and multimodal is enabled, use structured content
270
- if (imageParts.length > 0 && isMultimodal) {
271
- const parts = [{ type: "text" as const, text: messageText }, ...imageParts];
272
- return { role: message.from, content: parts };
273
- }
274
-
275
- // Otherwise just use the text (possibly with injected file content)
276
- return { role: message.from, content: messageText };
277
- }
278
- return { role: message.from, content: message.content };
279
- })
280
- );
281
- }
282
-
283
- async function prepareFiles(
284
- imageProcessor: ReturnType<typeof makeImageProcessor>,
285
- files: MessageFile[],
286
- isMultimodal: boolean
287
- ): Promise<{
288
- imageParts: OpenAI.Chat.Completions.ChatCompletionContentPartImage[];
289
- textContent: string;
290
- }> {
291
- // Separate image and text files
292
- const imageFiles = files.filter((file) => file.mime.startsWith("image/"));
293
- const textFiles = files.filter((file) => {
294
- const mime = (file.mime || "").toLowerCase();
295
- const [fileType, fileSubtype] = mime.split("/");
296
- return TEXT_MIME_ALLOWLIST.some((allowed) => {
297
- const [type, subtype] = allowed.toLowerCase().split("/");
298
- const typeOk = type === "*" || type === fileType;
299
- const subOk = subtype === "*" || subtype === fileSubtype;
300
- return typeOk && subOk;
301
- });
302
- });
303
-
304
- // Process images if multimodal is enabled
305
- let imageParts: OpenAI.Chat.Completions.ChatCompletionContentPartImage[] = [];
306
- if (isMultimodal && imageFiles.length > 0) {
307
- const processedFiles = await Promise.all(imageFiles.map(imageProcessor));
308
- imageParts = processedFiles.map((file) => ({
309
- type: "image_url" as const,
310
- image_url: {
311
- url: `data:${file.mime};base64,${file.image.toString("base64")}`,
312
- // Improves compatibility with some OpenAI-compatible servers
313
- // that expect an explicit detail setting.
314
- detail: "auto",
315
- },
316
- }));
317
- }
318
-
319
- // Process text files - inject their content
320
- let textContent = "";
321
- if (textFiles.length > 0) {
322
- const textParts = await Promise.all(
323
- textFiles.map(async (file) => {
324
- const content = Buffer.from(file.value, "base64").toString("utf-8");
325
- return `<document name="${file.name}" type="${file.mime}">\n${content}\n</document>`;
326
- })
327
- );
328
- textContent = textParts.join("\n\n");
329
- }
330
-
331
- return { imageParts, textContent };
332
- }
 
14
  import type { Endpoint } from "../endpoints";
15
  import type OpenAI from "openai";
16
  import { createImageProcessorOptionsValidator, makeImageProcessor } from "../images";
17
+ import { prepareMessagesWithFiles } from "$lib/server/textGeneration/utils/prepareFiles";
 
 
18
  // uuid import removed (no tool call ids)
19
 
20
  export const endpointOAIParametersSchema = z.object({
 
166
  }) => {
167
  // Format messages for the chat API, handling multimodal content if supported
168
  let messagesOpenAI: OpenAI.Chat.Completions.ChatCompletionMessageParam[] =
169
+ await prepareMessagesWithFiles(messages, imageProcessor, isMultimodal ?? model.multimodal);
170
 
171
  // Normalize preprompt and handle empty values
172
  const normalizedPreprompt = typeof preprompt === "string" ? preprompt.trim() : "";
 
243
  throw new Error("Invalid completion type");
244
  }
245
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/lib/server/textGeneration/mcp/runMcpFlow.ts CHANGED
@@ -8,7 +8,6 @@ import type {
8
  ChatCompletionChunk,
9
  ChatCompletionCreateParamsStreaming,
10
  ChatCompletionMessageParam,
11
- ChatCompletionContentPart,
12
  ChatCompletionMessageToolCall,
13
  } from "openai/resources/chat/completions";
14
  import type { Stream } from "openai/streaming";
@@ -20,6 +19,8 @@ import { drainPool } from "$lib/server/mcp/clientPool";
20
  import type { TextGenerationContext } from "../types";
21
  import { hasAuthHeader, isStrictHfMcpLogin, hasNonEmptyToken } from "$lib/server/mcp/hf";
22
  import { buildImageRefResolver } from "./fileRefs";
 
 
23
 
24
  export type RunMcpFlowContext = Pick<
25
  TextGenerationContext,
@@ -202,6 +203,13 @@ export async function* runMcpFlow({
202
  }
203
 
204
  const resolveFileRef = buildImageRefResolver(messages);
 
 
 
 
 
 
 
205
 
206
  const hasImageInput = messages.some((msg) =>
207
  (msg.files ?? []).some(
@@ -270,34 +278,11 @@ export async function* runMcpFlow({
270
  },
271
  "[mcp] starting completion with tools"
272
  );
273
- const toOpenAiMessage = (msg: EndpointMessage): ChatCompletionMessageParam => {
274
- if (msg.from === "user" && mmEnabled) {
275
- const parts: ChatCompletionContentPart[] = [{ type: "text", text: msg.content }];
276
- for (const file of msg.files ?? []) {
277
- if (typeof file?.mime === "string" && file.mime.startsWith("image/")) {
278
- const rawValue = file.value as unknown;
279
- let encoded: string;
280
- if (typeof rawValue === "string") {
281
- encoded = rawValue;
282
- } else if (rawValue instanceof Uint8Array) {
283
- encoded = Buffer.from(rawValue).toString("base64");
284
- } else if (rawValue instanceof ArrayBuffer) {
285
- encoded = Buffer.from(rawValue).toString("base64");
286
- } else {
287
- encoded = String(rawValue ?? "");
288
- }
289
- const url = encoded.startsWith("data:")
290
- ? encoded
291
- : `data:${file.mime};base64,${encoded}`;
292
- parts.push({ type: "image_url", image_url: { url, detail: "auto" } });
293
- }
294
- }
295
- return { role: msg.from, content: parts };
296
- }
297
- return { role: msg.from, content: msg.content };
298
- };
299
-
300
- let messagesOpenAI: ChatCompletionMessageParam[] = messages.map(toOpenAiMessage);
301
  const toolPreprompt = buildToolPreprompt(oaTools);
302
  const prepromptPieces: string[] = [];
303
  if (toolPreprompt.trim().length > 0) {
 
8
  ChatCompletionChunk,
9
  ChatCompletionCreateParamsStreaming,
10
  ChatCompletionMessageParam,
 
11
  ChatCompletionMessageToolCall,
12
  } from "openai/resources/chat/completions";
13
  import type { Stream } from "openai/streaming";
 
19
  import type { TextGenerationContext } from "../types";
20
  import { hasAuthHeader, isStrictHfMcpLogin, hasNonEmptyToken } from "$lib/server/mcp/hf";
21
  import { buildImageRefResolver } from "./fileRefs";
22
+ import { prepareMessagesWithFiles } from "$lib/server/textGeneration/utils/prepareFiles";
23
+ import { makeImageProcessor } from "$lib/server/endpoints/images";
24
 
25
  export type RunMcpFlowContext = Pick<
26
  TextGenerationContext,
 
203
  }
204
 
205
  const resolveFileRef = buildImageRefResolver(messages);
206
+ const imageProcessor = makeImageProcessor({
207
+ supportedMimeTypes: ["image/png", "image/jpeg"],
208
+ preferredMimeType: "image/jpeg",
209
+ maxSizeInMB: 1,
210
+ maxWidth: 1024,
211
+ maxHeight: 1024,
212
+ });
213
 
214
  const hasImageInput = messages.some((msg) =>
215
  (msg.files ?? []).some(
 
278
  },
279
  "[mcp] starting completion with tools"
280
  );
281
+ let messagesOpenAI: ChatCompletionMessageParam[] = await prepareMessagesWithFiles(
282
+ messages,
283
+ imageProcessor,
284
+ mmEnabled
285
+ );
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  const toolPreprompt = buildToolPreprompt(oaTools);
287
  const prepromptPieces: string[] = [];
288
  if (toolPreprompt.trim().length > 0) {
src/lib/server/textGeneration/utils/prepareFiles.ts ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { MessageFile } from "$lib/types/Message";
2
+ import type { EndpointMessage } from "$lib/server/endpoints/endpoints";
3
+ import type { OpenAI } from "openai";
4
+ import { TEXT_MIME_ALLOWLIST } from "$lib/constants/mime";
5
+ import type { makeImageProcessor } from "$lib/server/endpoints/images";
6
+
7
/**
 * Prepare chat messages for OpenAI-compatible (optionally multimodal) payloads.
 *
 * Only user messages that carry at least one file are transformed; every other
 * message is mapped straight to `{ role, content }`.
 *
 * - Text-like attachments are decoded and injected ahead of the user's text.
 *   This happens whether or not `isMultimodal` is set.
 * - Image attachments are run through `imageProcessor` and sent as `image_url`
 *   parts, but only when `isMultimodal` is true; otherwise they are dropped.
 *
 * @param messages - Conversation messages in endpoint format.
 * @param imageProcessor - Processor built by `makeImageProcessor`, applied to each image file.
 * @param isMultimodal - Whether the target model accepts image parts.
 * @returns One chat-completion message per input message, in the same order.
 */
export async function prepareMessagesWithFiles(
	messages: EndpointMessage[],
	imageProcessor: ReturnType<typeof makeImageProcessor>,
	isMultimodal: boolean
): Promise<OpenAI.Chat.Completions.ChatCompletionMessageParam[]> {
	return Promise.all(
		messages.map(async (message) => {
			const files = message.files ?? [];

			// Nothing to inject: non-user messages and file-less messages pass through as plain text.
			if (message.from !== "user" || files.length === 0) {
				return { role: message.from, content: message.content };
			}

			const { imageParts, textContent } = await prepareFiles(imageProcessor, files, isMultimodal);

			// Prepend injected documents to the user's text. The separator is only added when
			// both sides are non-empty, so a file-only message does not end in a dangling "\n\n".
			let messageText = message.content;
			if (textContent.length > 0) {
				messageText =
					message.content.length > 0 ? textContent + "\n\n" + message.content : textContent;
			}

			// Structured (array) content is only needed when there are image parts to send.
			if (isMultimodal && imageParts.length > 0) {
				const parts = [{ type: "text" as const, text: messageText }, ...imageParts];
				return { role: message.from, content: parts };
			}

			// Otherwise send plain text (possibly with injected file content).
			return { role: message.from, content: messageText };
		})
	);
}
43
+
44
/**
 * Split a message's attachments into image parts and injected text.
 *
 * - Files whose MIME type starts with `image/` are passed through `imageProcessor`
 *   and returned as base64 `data:` URL parts, but only when `isMultimodal` is true.
 * - Files whose MIME type matches `TEXT_MIME_ALLOWLIST` are base64-decoded as UTF-8
 *   and wrapped in a `<document name="…" type="…">` block.
 *
 * @param imageProcessor - Processor built by `makeImageProcessor`; its result is read via `mime` and `image`.
 * @param files - Attachments of a single message; `value` is decoded as base64.
 * @param isMultimodal - Whether image parts should be produced at all.
 * @returns The image parts (possibly empty) and the concatenated document text (possibly "").
 */
async function prepareFiles(
	imageProcessor: ReturnType<typeof makeImageProcessor>,
	files: MessageFile[],
	isMultimodal: boolean
): Promise<{
	imageParts: OpenAI.Chat.Completions.ChatCompletionContentPartImage[];
	textContent: string;
}> {
	// Both filters use the same normalized MIME so a missing or oddly-cased `mime`
	// is handled consistently (the image check was previously unguarded).
	const imageFiles = files.filter((file) => normalizeMime(file.mime).startsWith("image/"));
	const textFiles = files.filter((file) => isAllowlistedTextMime(normalizeMime(file.mime)));

	let imageParts: OpenAI.Chat.Completions.ChatCompletionContentPartImage[] = [];
	if (isMultimodal && imageFiles.length > 0) {
		// Explicit arrow: do not leak Array.prototype.map's (index, array) arguments into the processor.
		const processedFiles = await Promise.all(imageFiles.map((file) => imageProcessor(file)));
		imageParts = processedFiles.map((file) => ({
			type: "image_url" as const,
			image_url: {
				url: `data:${file.mime};base64,${file.image.toString("base64")}`,
				// Improves compatibility with some OpenAI-compatible servers
				// that expect an explicit detail setting.
				detail: "auto",
			},
		}));
	}

	// Decoding is synchronous, so no Promise.all is needed here.
	const textContent = textFiles
		.map((file) => {
			const content = Buffer.from(file.value, "base64").toString("utf-8");
			// The name and MIME type come from the upload, so escape them before placing them
			// inside the double-quoted attributes of the wrapper tag.
			const name = escapeDocumentAttribute(file.name);
			const type = escapeDocumentAttribute(file.mime);
			return `<document name="${name}" type="${type}">\n${content}\n</document>`;
		})
		.join("\n\n");

	return { imageParts, textContent };
}

/** Lowercase a MIME type and strip any parameters (e.g. `; charset=utf-8`); missing values become "". */
function normalizeMime(mime: string | undefined): string {
	return (mime ?? "").split(";")[0].trim().toLowerCase();
}

/** Check a normalized `type/subtype` against `TEXT_MIME_ALLOWLIST`, honouring `*` wildcards on either side of the slash. */
function isAllowlistedTextMime(mime: string): boolean {
	const [fileType, fileSubtype] = mime.split("/");
	return TEXT_MIME_ALLOWLIST.some((allowed) => {
		const [type, subtype] = allowed.toLowerCase().split("/");
		const typeOk = type === "*" || type === fileType;
		const subOk = subtype === "*" || subtype === fileSubtype;
		return typeOk && subOk;
	});
}

/** Escape characters that would terminate or corrupt a double-quoted attribute value. */
function escapeDocumentAttribute(value: string): string {
	return String(value ?? "")
		.replace(/&/g, "&amp;")
		.replace(/"/g, "&quot;")
		.replace(/</g, "&lt;")
		.replace(/>/g, "&gt;");
}