// Andrew — (feat) Forward auth token through reasoning flows (f407536)
import { config } from "$lib/server/config";
import {
MessageReasoningUpdateType,
MessageUpdateType,
type MessageUpdate,
} from "$lib/types/MessageUpdate";
import { AbortedGenerations } from "../abortedGenerations";
import type { TextGenerationContext } from "./types";
import type { EndpointMessage } from "../endpoints/endpoints";
import { generateFromDefaultEndpoint } from "../generateFromDefaultEndpoint";
import { generateSummaryOfReasoning } from "./reasoning";
import { logger } from "../logger";
type GenerateContext = Omit<TextGenerationContext, "messages"> & { messages: EndpointMessage[] };
/**
 * Runs text generation against `endpoint` and streams the result as
 * `MessageUpdate` events.
 *
 * Supports models with reasoning in three modes:
 *  - "regex":     the final answer is extracted from the reasoning buffer
 *                 via the first capture group of `model.reasoning.regex`;
 *  - "summarize": the reasoning is summarized by a second model call
 *                 (authenticated with `authToken`) to produce the answer;
 *  - "tokens":    reasoning is delimited in the stream by
 *                 `beginToken` / `endToken` (an empty `beginToken` means the
 *                 model starts in reasoning mode).
 *
 * @param context   generation context; `messages` are endpoint-ready messages
 * @param preprompt optional system prompt forwarded to the endpoint
 * @yields token streams, reasoning status/stream updates, router metadata,
 *         and a `FinalAnswer` update when generation completes
 */
export async function* generate(
	{
		model,
		endpoint,
		conv,
		messages,
		assistant,
		isContinue,
		promptedAt,
		forceMultimodal,
		authToken,
	}: GenerateContext,
	preprompt?: string
): AsyncIterable<MessageUpdate> {
	// reasoning mode is false by default
	let reasoning = false;
	let reasoningBuffer = "";
	let lastReasoningUpdate = new Date();
	let status = "";
	const startTime = new Date();

	if (
		model.reasoning &&
		// if the beginToken is an empty string, the model starts in reasoning mode
		(model.reasoning.type === "regex" ||
			model.reasoning.type === "summarize" ||
			(model.reasoning.type === "tokens" && model.reasoning.beginToken === ""))
	) {
		// regex and summarize models always start in reasoning mode;
		// the answer is extracted from the reasoning once generation completes
		reasoning = true;
		yield {
			type: MessageUpdateType.Reasoning,
			subtype: MessageReasoningUpdateType.Status,
			status: "Started reasoning...",
		};
	}

	for await (const output of await endpoint({
		messages,
		preprompt,
		continueMessage: isContinue,
		generateSettings: assistant?.generateSettings,
		// Allow user-level override to force multimodal
		isMultimodal: (forceMultimodal ?? false) || model.multimodal,
		conversationId: conv._id,
	})) {
		// Router metadata outputs carry no token: forward them and move on.
		if (
			"routerMetadata" in output &&
			output.routerMetadata &&
			output.routerMetadata.route &&
			output.routerMetadata.model
		) {
			yield {
				type: MessageUpdateType.RouterMetadata,
				route: output.routerMetadata.route,
				model: output.routerMetadata.model,
			};
			continue;
		}

		// text generation completed
		if (output.generated_text) {
			// generation is "interrupted" unless it ended on a special token or a
			// configured stop token
			let interrupted =
				!output.token.special && !model.parameters.stop?.includes(output.token.text);
			let text = output.generated_text.trimEnd();
			for (const stopToken of model.parameters.stop ?? []) {
				if (!text.endsWith(stopToken)) continue;
				interrupted = false;
				text = text.slice(0, text.length - stopToken.length);
			}

			let finalAnswer = text;
			if (model.reasoning && model.reasoning.type === "regex") {
				// the first capture group holds the answer; fall back to the raw
				// text when the regex does not match the reasoning buffer
				const regex = new RegExp(model.reasoning.regex);
				finalAnswer = regex.exec(reasoningBuffer)?.[1] ?? text;
			} else if (model.reasoning && model.reasoning.type === "summarize") {
				yield {
					type: MessageUpdateType.Reasoning,
					subtype: MessageReasoningUpdateType.Status,
					status: "Summarizing reasoning...",
				};
				try {
					const summary = yield* generateFromDefaultEndpoint({
						messages: [
							{
								from: "user",
								content: `Question: ${messages[messages.length - 1].content}
Reasoning: ${reasoningBuffer}`,
							},
						],
						preprompt: `Your task is to summarize concisely all your reasoning steps and then give the final answer. Keep it short, one short paragraph at most. If the reasoning steps explicitly include a code solution, make sure to include it in your answer.
If the user is just having a casual conversation that doesn't require explanations, answer directly without explaining your steps, otherwise make sure to summarize step by step, make sure to skip dead-ends in your reasoning and removing excess detail.
Do not use prefixes such as Response: or Answer: when answering to the user.`,
						generateSettings: {
							max_tokens: 1024,
						},
						modelId: model.id,
						apiKey: authToken,
					});
					finalAnswer = summary;
					yield {
						type: MessageUpdateType.Reasoning,
						subtype: MessageReasoningUpdateType.Status,
						status: `Done in ${Math.round((new Date().getTime() - startTime.getTime()) / 1000)}s.`,
					};
				} catch (e) {
					// summarization is best-effort: fall back to the raw text
					finalAnswer = text;
					logger.error(e);
				}
			} else if (model.reasoning && model.reasoning.type === "tokens") {
				// strip the delimited reasoning section (tokens included) from the
				// final answer to avoid duplicating it in the reply.
				// BUGFIX: indices must be computed on `text` — the string being
				// sliced — not on `reasoningBuffer`, whose offsets generally differ.
				const beginIndex = model.reasoning.beginToken
					? text.indexOf(model.reasoning.beginToken)
					: 0; // an empty beginToken means reasoning starts at position 0
				const endIndex = text.lastIndexOf(model.reasoning.endToken);
				if (beginIndex !== -1 && endIndex !== -1) {
					// Remove the reasoning section (including tokens) from final answer
					finalAnswer =
						text.slice(0, beginIndex) + text.slice(endIndex + model.reasoning.endToken.length);
				}
			}

			yield {
				type: MessageUpdateType.FinalAnswer,
				text: finalAnswer,
				interrupted,
			};
			continue;
		}

		// token-delimited reasoning: toggle reasoning mode on begin/end tokens
		if (model.reasoning && model.reasoning.type === "tokens") {
			if (output.token.text === model.reasoning.beginToken) {
				reasoning = true;
				reasoningBuffer += output.token.text;
				continue;
			} else if (output.token.text === model.reasoning.endToken) {
				reasoning = false;
				reasoningBuffer += output.token.text;
				yield {
					type: MessageUpdateType.Reasoning,
					subtype: MessageReasoningUpdateType.Status,
					status: `Done in ${Math.round((new Date().getTime() - startTime.getTime()) / 1000)}s.`,
				};
				continue;
			}
		}

		// ignore special tokens
		if (output.token.special) continue;

		// pass down normal token
		if (reasoning) {
			reasoningBuffer += output.token.text;
			if (model.reasoning && model.reasoning.type === "tokens") {
				// the end token can arrive glued inside a larger token: split the
				// buffer so anything after the end token is emitted as normal
				// content, yield reasoning + content + status updates, and leave
				// reasoning mode
				const endTokenIndex = reasoningBuffer.lastIndexOf(model.reasoning.endToken);
				if (endTokenIndex !== -1) {
					const textBuffer = reasoningBuffer.slice(
						endTokenIndex + model.reasoning.endToken.length
					);
					// BUGFIX: keep the buffer up to and including the end token; the
					// previous `+ 1` duplicated one character into both buffers
					reasoningBuffer = reasoningBuffer.slice(
						0,
						endTokenIndex + model.reasoning.endToken.length
					);
					yield {
						type: MessageUpdateType.Reasoning,
						subtype: MessageReasoningUpdateType.Stream,
						token: output.token.text,
					};
					yield {
						type: MessageUpdateType.Stream,
						token: textBuffer,
					};
					yield {
						type: MessageUpdateType.Reasoning,
						subtype: MessageReasoningUpdateType.Status,
						status: `Done in ${Math.round((new Date().getTime() - startTime.getTime()) / 1000)}s.`,
					};
					reasoning = false;
					continue;
				}
			}

			// flush a pending summary status if a background summary has landed
			if (status !== "") {
				yield {
					type: MessageUpdateType.Reasoning,
					subtype: MessageReasoningUpdateType.Status,
					status,
				};
				status = "";
			}

			// periodically (at most every ~4s) kick off a background summary of
			// the reasoning so far; the result is flushed on a later token
			if (
				config.REASONING_SUMMARY === "true" &&
				new Date().getTime() - lastReasoningUpdate.getTime() > 4000
			) {
				lastReasoningUpdate = new Date();
				// BUGFIX: a synchronous try/catch cannot catch the async rejection
				// of a floating promise; attach .catch() so a failed summary call
				// cannot surface as an unhandled rejection
				void generateSummaryOfReasoning(reasoningBuffer, model.id, { apiKey: authToken })
					.then((summary) => {
						status = summary;
					})
					.catch((e) => logger.error(e));
			}

			yield {
				type: MessageUpdateType.Reasoning,
				subtype: MessageReasoningUpdateType.Stream,
				token: output.token.text,
			};
		} else {
			yield { type: MessageUpdateType.Stream, token: output.token.text };
		}

		// stop streaming if the user aborted this conversation after prompting
		const date = AbortedGenerations.getInstance().getAbortTime(conv._id.toString());
		if (date && date > promptedAt) {
			logger.info(`Aborting generation for conversation ${conv._id}`);
			break;
		}
	}
}