import { config } from "$lib/server/config";
import {
	MessageReasoningUpdateType,
	MessageUpdateType,
	type MessageUpdate,
} from "$lib/types/MessageUpdate";
import { AbortedGenerations } from "../abortedGenerations";
import type { TextGenerationContext } from "./types";
import type { EndpointMessage } from "../endpoints/endpoints";
import { generateFromDefaultEndpoint } from "../generateFromDefaultEndpoint";
import { generateSummaryOfReasoning } from "./reasoning";
import { logger } from "../logger";

/** Text generation context whose `messages` have been replaced by endpoint-ready messages. */
type GenerateContext = Omit<TextGenerationContext, "messages"> & { messages: EndpointMessage[] };

/** Minimum delay, in milliseconds, between two background reasoning-summary requests. */
const REASONING_SUMMARY_INTERVAL_MS = 4000;

/** Builds the reasoning status update reporting the whole seconds elapsed since `startTime`. */
function reasoningDoneUpdate(startTime: Date): MessageUpdate {
	const elapsedSeconds = Math.round((Date.now() - startTime.getTime()) / 1000);
	return {
		type: MessageUpdateType.Reasoning,
		subtype: MessageReasoningUpdateType.Status,
		status: `Done in ${elapsedSeconds}s.`,
	};
}

/**
 * Streams the output of `endpoint` as a sequence of message updates.
 *
 * For each chunk produced by the endpoint this generator yields, depending on
 * the chunk and on `model.reasoning`:
 * - a `RouterMetadata` update when the chunk carries a route and a model,
 * - `Reasoning` stream/status updates while in reasoning mode,
 * - `Stream` updates for regular tokens,
 * - a `FinalAnswer` update when the chunk carries `generated_text`.
 *
 * After each streamed token the abort time registered for the conversation in
 * `AbortedGenerations` is checked; the loop stops if it is later than
 * `promptedAt`. Chunks handled through an early `continue` skip that check.
 *
 * @param ctx - Generation context (destructured in the signature).
 * @param preprompt - Optional preprompt forwarded to the endpoint as-is.
 * @returns An async iterable of message updates.
 */
export async function* generate(
	{
		model,
		endpoint,
		conv,
		messages,
		assistant,
		isContinue,
		promptedAt,
		forceMultimodal,
		authToken,
	}: GenerateContext,
	preprompt?: string
): AsyncIterable<MessageUpdate> {
	// reasoning mode is false by default
	let reasoning = false;
	let reasoningBuffer = "";
	let lastReasoningUpdate = new Date();
	// Latest background summary waiting to be emitted; "" means nothing pending.
	let status = "";
	const startTime = new Date();

	if (
		model.reasoning &&
		// if the beginToken is an empty string, the model starts in reasoning mode
		(model.reasoning.type === "regex" ||
			model.reasoning.type === "summarize" ||
			(model.reasoning.type === "tokens" && model.reasoning.beginToken === ""))
	) {
		// if the model has reasoning in regex or summarize mode, it starts in reasoning mode
		// and we extract the answer from the reasoning
		reasoning = true;
		yield {
			type: MessageUpdateType.Reasoning,
			subtype: MessageReasoningUpdateType.Status,
			status: "Started reasoning...",
		};
	}

	for await (const output of await endpoint({
		messages,
		preprompt,
		continueMessage: isContinue,
		generateSettings: assistant?.generateSettings,
		// Allow user-level override to force multimodal
		isMultimodal: (forceMultimodal ?? false) || model.multimodal,
		conversationId: conv._id,
	})) {
		// Check if this output contains router metadata
		if (
			"routerMetadata" in output &&
			output.routerMetadata &&
			output.routerMetadata.route &&
			output.routerMetadata.model
		) {
			yield {
				type: MessageUpdateType.RouterMetadata,
				route: output.routerMetadata.route,
				model: output.routerMetadata.model,
			};
			continue;
		}

		// text generation completed
		if (output.generated_text) {
			// Treated as interrupted unless the last token is special or is a stop token.
			let interrupted =
				!output.token.special && !model.parameters.stop?.includes(output.token.text);

			let text = output.generated_text.trimEnd();
			// Strip trailing stop tokens; ending on one means the generation finished normally.
			for (const stopToken of model.parameters.stop ?? []) {
				if (!text.endsWith(stopToken)) continue;

				interrupted = false;
				text = text.slice(0, text.length - stopToken.length);
			}

			let finalAnswer = text;
			if (model.reasoning && model.reasoning.type === "regex") {
				// The first capture group of the configured regex is the answer;
				// fall back to the full text when it does not match.
				const regex = new RegExp(model.reasoning.regex);
				finalAnswer = regex.exec(reasoningBuffer)?.[1] ?? text;
			} else if (model.reasoning && model.reasoning.type === "summarize") {
				yield {
					type: MessageUpdateType.Reasoning,
					subtype: MessageReasoningUpdateType.Status,
					status: "Summarizing reasoning...",
				};
				try {
					const summary = yield* generateFromDefaultEndpoint({
						messages: [
							{
								from: "user",
								content: `Question: ${messages[messages.length - 1].content} Reasoning: ${reasoningBuffer}`,
							},
						],
						preprompt: `Your task is to summarize concisely all your reasoning steps and then give the final answer. Keep it short, one short paragraph at most. If the reasoning steps explicitly include a code solution, make sure to include it in your answer. If the user is just having a casual conversation that doesn't require explanations, answer directly without explaining your steps, otherwise make sure to summarize step by step, make sure to skip dead-ends in your reasoning and removing excess detail. \nDo not use prefixes such as Response: or Answer: when answering to the user.`,
						generateSettings: {
							max_tokens: 1024,
						},
						modelId: model.id,
						apiKey: authToken,
					});
					finalAnswer = summary;
					yield reasoningDoneUpdate(startTime);
				} catch (e) {
					// Summarization is best-effort: fall back to the raw text.
					finalAnswer = text;
					logger.error(e);
				}
			} else if (model.reasoning && model.reasoning.type === "tokens") {
				// make sure to remove the content of the reasoning buffer from
				// the final answer to avoid duplication

				// if the beginToken is an empty string, we don't need to remove anything
				// NOTE(review): the indices are computed on reasoningBuffer but applied to
				// text; this only lines up if the reasoning section sits at the same offset
				// in both strings — confirm against the endpoint's generated_text.
				const beginIndex = model.reasoning.beginToken
					? reasoningBuffer.indexOf(model.reasoning.beginToken)
					: 0;
				const endIndex = reasoningBuffer.lastIndexOf(model.reasoning.endToken);

				if (beginIndex !== -1 && endIndex !== -1) {
					// Remove the reasoning section (including tokens) from final answer
					finalAnswer =
						text.slice(0, beginIndex) + text.slice(endIndex + model.reasoning.endToken.length);
				}
			}

			yield {
				type: MessageUpdateType.FinalAnswer,
				text: finalAnswer,
				interrupted,
			};
			continue;
		}

		if (model.reasoning && model.reasoning.type === "tokens") {
			// A token that is exactly the begin/end marker toggles reasoning mode.
			if (output.token.text === model.reasoning.beginToken) {
				reasoning = true;
				reasoningBuffer += output.token.text;
				continue;
			} else if (output.token.text === model.reasoning.endToken) {
				reasoning = false;
				reasoningBuffer += output.token.text;
				yield reasoningDoneUpdate(startTime);
				continue;
			}
		}

		// ignore special tokens
		if (output.token.special) continue;

		// pass down normal token
		if (reasoning) {
			reasoningBuffer += output.token.text;

			if (model.reasoning && model.reasoning.type === "tokens") {
				// split reasoning buffer so that anything that comes after the end token is separated
				// add it to the normal buffer, and yield two updates, one for the reasoning and one
				// for the normal content
				// also set reasoning to false
				const endTokenIndex = reasoningBuffer.lastIndexOf(model.reasoning.endToken);
				if (endTokenIndex !== -1) {
					const afterEndToken = endTokenIndex + model.reasoning.endToken.length;
					const textBuffer = reasoningBuffer.slice(afterEndToken);
					// Keep the reasoning up to and including the end token, nothing more.
					reasoningBuffer = reasoningBuffer.slice(0, afterEndToken);

					yield {
						type: MessageUpdateType.Reasoning,
						subtype: MessageReasoningUpdateType.Stream,
						token: output.token.text,
					};

					yield {
						type: MessageUpdateType.Stream,
						token: textBuffer,
					};

					yield reasoningDoneUpdate(startTime);

					reasoning = false;
					continue;
				}
			}

			// yield status update if it has changed
			if (status !== "") {
				yield {
					type: MessageUpdateType.Reasoning,
					subtype: MessageReasoningUpdateType.Status,
					status,
				};
				status = "";
			}

			// request a new status summary at most once every 4 seconds
			if (
				config.REASONING_SUMMARY === "true" &&
				Date.now() - lastReasoningUpdate.getTime() > REASONING_SUMMARY_INTERVAL_MS
			) {
				lastReasoningUpdate = new Date();
				try {
					// Fire-and-forget: the summary is picked up by a later iteration via `status`.
					generateSummaryOfReasoning(reasoningBuffer, model.id, { apiKey: authToken })
						.then((summary) => {
							status = summary;
						})
						// The surrounding try/catch cannot see an async rejection, so handle
						// it here to avoid an unhandled promise rejection.
						.catch((e: unknown) => {
							logger.error(e);
						});
				} catch (e) {
					// Only reached if the call throws synchronously.
					logger.error(e);
				}
			}

			yield {
				type: MessageUpdateType.Reasoning,
				subtype: MessageReasoningUpdateType.Stream,
				token: output.token.text,
			};
		} else {
			yield { type: MessageUpdateType.Stream, token: output.token.text };
		}

		// abort check
		const date = AbortedGenerations.getInstance().getAbortTime(conv._id.toString());

		if (date && date > promptedAt) {
			logger.info(`Aborting generation for conversation ${conv._id}`);
			break;
		}
	}
}