const { v4: uuidv4 } = require("uuid");
const { getVectorDbClass, getLLMProvider } = require("../helpers");
const { chatPrompt, sourceIdentifier } = require("./index");
const { EmbedChats } = require("../../models/embedChats");
const {
  convertToPromptHistory,
  writeResponseChunk,
} = require("../helpers/chat/responses");
const { DocumentManager } = require("../DocumentManager");

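/**
 * Stream a chat response for an embeddable widget session. Chunks are written
 * to the open HTTP response as they arrive, and the completed exchange is
 * persisted via EmbedChats at the end.
 */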
async function streamChatWithForEmbed(
  response,
  embed,
  message,
  sessionId,
  { promptOverride, modelOverride, temperatureOverride, username }
) {
  const chatMode = embed.chat_mode;
  const chatModel = embed.allow_model_override ? modelOverride : null;

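  // If overrides came in on the request and the embed config permits them,
  // apply them to the in-memory workspace reference before prompting.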
  if (embed.allow_prompt_override)
    embed.workspace.openAiPrompt = promptOverride;
  if (embed.allow_temperature_override)
    embed.workspace.openAiTemp = parseFloat(temperatureOverride);

  const uuid = uuidv4();
  const LLMConnector = getLLMProvider({
    provider: embed?.workspace?.chatProvider,
    model: chatModel ?? embed.workspace?.chatModel,
  });
  const VectorDb = getVectorDbClass();

  const messageLimit = embed.message_limit ?? 20;
  const hasVectorizedSpace = await VectorDb.hasNamespace(embed.workspace.slug);
  const embeddingsCount = await VectorDb.namespaceCount(embed.workspace.slug);

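  // Query mode requires embedded documents to ground a response, so exit
  // early if the workspace has no vectorized content.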
  if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
    writeResponseChunk(response, {
      id: uuid,
      type: "textResponse",
      textResponse:
        "I do not have enough information to answer that. Try another question.",
      sources: [],
      close: true,
      error: null,
    });
    return;
  }

  let completeText;
  let metrics = {};
  let contextTexts = [];
  let sources = [];
  let pinnedDocIdentifiers = [];
  const { rawHistory, chatHistory } = await recentEmbedChatHistory(
    sessionId,
    embed,
    messageLimit
  );

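  // Pinned documents always go into the context window ahead of search
  // results; their identifiers are collected so similarity search can skip them.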
  await new DocumentManager({
    workspace: embed.workspace,
    maxTokens: LLMConnector.promptWindowLimit(),
  })
    .pinnedDocs()
    .then((pinnedDocs) => {
      pinnedDocs.forEach((doc) => {
        const { pageContent, ...metadata } = doc;
        pinnedDocIdentifiers.push(sourceIdentifier(doc));
        contextTexts.push(pageContent);
        sources.push({
          text:
            pageContent.slice(0, 1_000) +
            "...continued on in source document...",
          ...metadata,
        });
      });
    });

  const vectorSearchResults =
    embeddingsCount !== 0
      ? await VectorDb.performSimilaritySearch({
          namespace: embed.workspace.slug,
          input: message,
          LLMConnector,
          similarityThreshold: embed.workspace?.similarityThreshold,
          topN: embed.workspace?.topN,
          filterIdentifiers: pinnedDocIdentifiers,
          rerank: embed.workspace?.vectorSearchMode === "rerank",
        })
      : {
          contextTexts: [],
          sources: [],
          message: null,
        };

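  // A populated `message` on the search result indicates the vector DB
  // lookup failed, so abort the stream.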
  if (!!vectorSearchResults.message) {
    writeResponseChunk(response, {
      id: uuid,
      type: "abort",
      textResponse: null,
      sources: [],
      close: true,
      error: "Failed to connect to vector database provider.",
    });
    return;
  }

  const { fillSourceWindow } = require("../helpers/chat");
  const filledSources = fillSourceWindow({
    nDocs: embed.workspace?.topN || 4,
    searchResults: vectorSearchResults.sources,
    history: rawHistory,
    filterIdentifiers: pinnedDocIdentifiers,
  });

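  // contextTexts gets both the current results and context backfilled from
  // chat history, but `sources` keeps only the current search results so the
  // citations shown to the user are not padded with older documents.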
  contextTexts = [...contextTexts, ...filledSources.contextTexts];
  sources = [...sources, ...vectorSearchResults.sources];

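  // In query mode with no relevant context found, return the refusal
  // response instead of letting the LLM answer from general knowledge.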
if (chatMode === "query" && contextTexts.length === 0) { |
|
|
writeResponseChunk(response, { |
|
|
id: uuid, |
|
|
type: "textResponse", |
|
|
textResponse: |
|
|
embed.workspace?.queryRefusalResponse ?? |
|
|
"There is no relevant information in this workspace to answer your query.", |
|
|
sources: [], |
|
|
close: true, |
|
|
error: null, |
|
|
}); |
|
|
return; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
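  // Compress the system prompt, context, and history so the final message
  // payload fits within the model's token window.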
  const messages = await LLMConnector.compressMessages(
    {
      systemPrompt: await chatPrompt(embed.workspace, username),
      userPrompt: message,
      contextTexts,
      chatHistory,
    },
    rawHistory
  );

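  // Connectors without streaming support fall back to a single blocking
  // completion sent as one closing chunk.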
  if (LLMConnector.streamingEnabled() !== true) {
    console.log(
      `\x1b[31m[STREAMING DISABLED]\x1b[0m Streaming is not available for ${LLMConnector.constructor.name}. Will use regular chat method.`
    );
    const { textResponse, metrics: performanceMetrics } =
      await LLMConnector.getChatCompletion(messages, {
        temperature: embed.workspace?.openAiTemp ?? LLMConnector.defaultTemp,
      });
    completeText = textResponse;
    metrics = performanceMetrics;
    writeResponseChunk(response, {
      uuid,
      sources: [],
      type: "textResponseChunk",
      textResponse: completeText,
      close: true,
      error: false,
    });
  } else {
    const stream = await LLMConnector.streamGetChatCompletion(messages, {
      temperature: embed.workspace?.openAiTemp ?? LLMConnector.defaultTemp,
    });
    completeText = await LLMConnector.handleStream(response, stream, {
      uuid,
      sources: [],
    });
    metrics = stream.metrics;
  }

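  // Persist the full exchange for this embed session, including any
  // connection metadata attached to response.locals.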
  await EmbedChats.new({
    embedId: embed.id,
    prompt: message,
    response: { text: completeText, type: chatMode, sources, metrics },
    connection_information: response.locals.connection
      ? {
          ...response.locals.connection,
          username: !!username ? String(username) : null,
        }
      : { username: !!username ? String(username) : null },
    sessionId,
  });
  return;
}

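// Fetch the most recent chats (up to messageLimit) for this embed session in
// chronological order, both raw and converted to prompt-history format.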
async function recentEmbedChatHistory(sessionId, embed, messageLimit = 20) {
  const rawHistory = (
    await EmbedChats.forEmbedByUser(embed.id, sessionId, messageLimit, {
      id: "desc",
    })
  ).reverse();
  return { rawHistory, chatHistory: convertToPromptHistory(rawHistory) };
}

module.exports = {
  streamChatWithForEmbed,
};