const { NativeEmbedder } = require("../../EmbeddingEngines/native");
const {
  LLMPerformanceMonitor,
} = require("../../helpers/chat/LLMPerformanceMonitor");
const {
  handleDefaultStreamResponseV2,
  formatChatHistory,
} = require("../../helpers/chat/responses");
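
/**
 * LLM connector for a LiteLLM proxy server. LiteLLM exposes an
 * OpenAI-compatible API, so this class points the standard `openai` client
 * at LITE_LLM_BASE_PATH and forwards the configured model name as-is.
 */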
class LiteLLM {
  constructor(embedder = null, modelPreference = null) {
    const { OpenAI: OpenAIApi } = require("openai");
    if (!process.env.LITE_LLM_BASE_PATH)
      throw new Error(
        "LiteLLM must have a valid base path to use for the api."
      );

    this.basePath = process.env.LITE_LLM_BASE_PATH;
    this.openai = new OpenAIApi({
      baseURL: this.basePath,
      apiKey: process.env.LITE_LLM_API_KEY ?? null,
    });
    this.model = modelPreference ?? process.env.LITE_LLM_MODEL_PREF ?? null;
    this.maxTokens = process.env.LITE_LLM_MODEL_TOKEN_LIMIT ?? 1024;
    if (!this.model) throw new Error("LiteLLM must have a valid model set.");
    this.limits = {
      history: this.promptWindowLimit() * 0.15,
      system: this.promptWindowLimit() * 0.15,
      user: this.promptWindowLimit() * 0.7,
    };

    this.embedder = embedder ?? new NativeEmbedder();
    this.defaultTemp = 0.7;
    this.log(`Inference API: ${this.basePath} Model: ${this.model}`);
  }

  log(text, ...args) {
    console.log(`\x1b[36m[${this.constructor.name}]\x1b[0m ${text}`, ...args);
  }
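
  // Folds retrieved context snippets into a single "Context:" block that is
  // appended to the system prompt, wrapping each snippet in
  // [CONTEXT i]...[END CONTEXT i] markers.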
  #appendContext(contextTexts = []) {
    if (!contextTexts || !contextTexts.length) return "";
    return (
      "\nContext:\n" +
      contextTexts
        .map((text, i) => {
          return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`;
        })
        .join("")
    );
  }

  streamingEnabled() {
    return "streamGetChatCompletion" in this;
  }
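
  // Static variant so callers can look up the context window without
  // constructing an instance; reads the same LITE_LLM_MODEL_TOKEN_LIMIT
  // env var and falls back to 4096 tokens.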
  static promptWindowLimit(_modelName) {
    const limit = process.env.LITE_LLM_MODEL_TOKEN_LIMIT || 4096;
    if (!limit || isNaN(Number(limit)))
      throw new Error("No token context limit was set.");
    return Number(limit);
  }
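
  // Instance context window; falls back to a 4096-token window when
  // LITE_LLM_MODEL_TOKEN_LIMIT is not set.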
  promptWindowLimit() {
    const limit = process.env.LITE_LLM_MODEL_TOKEN_LIMIT || 4096;
    if (!limit || isNaN(Number(limit)))
      throw new Error("No token context limit was set.");
    return Number(limit);
  }
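
  // LiteLLM proxies requests to many upstream providers, so no model-name
  // validation is done here; any configured model is accepted.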
  isValidChatCompletionModel(_modelName = "") {
    return true;
  }
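
  /**
   * Builds the `content` payload for a user message. Text-only messages are
   * returned as a plain string; when image attachments are present the
   * content becomes an array of text and image_url parts.
   */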
  #generateContent({ userPrompt, attachments = [] }) {
    if (!attachments.length) {
      return userPrompt;
    }

    const content = [{ type: "text", text: userPrompt }];
    for (let attachment of attachments) {
      content.push({
        type: "image_url",
        image_url: {
          url: attachment.contentString,
        },
      });
    }
    return content.flat();
  }
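
  /**
   * Assembles the full message array for a chat request: a system message
   * (system prompt plus appended context), the formatted chat history, and
   * the current user prompt with any attachments.
   */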
  constructPrompt({
    systemPrompt = "",
    contextTexts = [],
    chatHistory = [],
    userPrompt = "",
    attachments = [],
  }) {
    const prompt = {
      role: "system",
      content: `${systemPrompt}${this.#appendContext(contextTexts)}`,
    };
    return [
      prompt,
      ...formatChatHistory(chatHistory, this.#generateContent),
      {
        role: "user",
        content: this.#generateContent({ userPrompt, attachments }),
      },
    ];
  }
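
  // Blocking (non-streaming) completion. The request is wrapped by the
  // performance monitor so token usage and duration can be reported in the
  // returned metrics.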
  async getChatCompletion(messages = null, { temperature = 0.7 }) {
    const result = await LLMPerformanceMonitor.measureAsyncFunction(
      this.openai.chat.completions
        .create({
          model: this.model,
          messages,
          temperature,
          max_tokens: parseInt(this.maxTokens),
        })
        .catch((e) => {
          throw new Error(e.message);
        })
    );

    if (
      !result.output.hasOwnProperty("choices") ||
      result.output.choices.length === 0
    )
      return null;

    return {
      textResponse: result.output.choices[0].message.content,
      metrics: {
        prompt_tokens: result.output.usage?.prompt_tokens || 0,
        completion_tokens: result.output.usage?.completion_tokens || 0,
        total_tokens: result.output.usage?.total_tokens || 0,
        outputTps:
          (result.output.usage?.completion_tokens || 0) / result.duration,
        duration: result.duration,
      },
    };
  }
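
  // Streaming completion over the OpenAI-compatible endpoint. The raw stream
  // is wrapped by LLMPerformanceMonitor.measureStream so timing/usage metrics
  // can be collected while chunks are consumed.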
  async streamGetChatCompletion(messages = null, { temperature = 0.7 }) {
    const measuredStreamRequest = await LLMPerformanceMonitor.measureStream(
      this.openai.chat.completions.create({
        model: this.model,
        stream: true,
        messages,
        temperature,
        max_tokens: parseInt(this.maxTokens),
      }),
      messages
    );

    return measuredStreamRequest;
  }

  handleStream(response, stream, responseProps) {
    return handleDefaultStreamResponseV2(response, stream, responseProps);
  }
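
  // Thin wrappers around the configured embedder so this provider exposes
  // the same embedding interface as the other LLM providers.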
  async embedTextInput(textInput) {
    return await this.embedder.embedTextInput(textInput);
  }
  async embedChunks(textChunks = []) {
    return await this.embedder.embedChunks(textChunks);
  }
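
  // Compresses the assembled message array (via helpers/chat) so the prompt
  // fits within this model's context window before it is sent.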
  async compressMessages(promptArgs = {}, rawHistory = []) {
    const { messageArrayCompressor } = require("../../helpers/chat");
    const messageArray = this.constructPrompt(promptArgs);
    return await messageArrayCompressor(this, messageArray, rawHistory);
  }
}

module.exports = {
  LiteLLM,
};