| | import { searchWeb } from "$lib/server/websearch/searchWeb"; |
| | import { generateQuery } from "$lib/server/websearch/generateQuery"; |
| | import { parseWeb } from "$lib/server/websearch/parseWeb"; |
| | import { chunk } from "$lib/utils/chunk"; |
| | import { findSimilarSentences } from "$lib/server/sentenceSimilarity"; |
| | import { getWebSearchProvider } from "./searchWeb"; |
| | import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels"; |
| | import { env } from "$env/dynamic/private"; |
| |
|
| | import type { Conversation } from "$lib/types/Conversation"; |
| | import type { MessageUpdate } from "$lib/types/MessageUpdate"; |
| | import type { Message } from "$lib/types/Message"; |
| | import type { WebSearch, WebSearchSource } from "$lib/types/WebSearch"; |
| | import type { Assistant } from "$lib/types/Assistant"; |
| |
|
| | import { z } from "zod"; |
| | import JSON5 from "json5"; |
| | import { isURLLocal } from "../isURLLocal"; |
| |
|
/** Maximum number of search results kept for scraping. */
const MAX_N_PAGES_SCRAPE = 10 as const;
/** Maximum number of scraped pages whose text chunks are passed on for embedding. */
const MAX_N_PAGES_EMBED = 5 as const;

/** A list of domain strings; zod substitutes `[]` only when the parsed input is `undefined`. */
const listSchema = z.array(z.string()).default([]);

// JSON5.parse throws on an unset or empty variable, which would crash the module at import
// time before the schema default could ever apply. Fall back to an empty JSON array instead.
// `||` (not `??`) is deliberate: an empty string is just as unparsable as `undefined`.
/** Domains that searches are restricted to (emitted as `site:` operators). */
const allowList = listSchema.parse(JSON5.parse(env.WEBSEARCH_ALLOWLIST || "[]"));
/** Domains excluded from searches (emitted as `-site:` operators and used to filter result links). */
const blockList = listSchema.parse(JSON5.parse(env.WEBSEARCH_BLOCKLIST || "[]"));
| |
|
/**
 * Builds the search-operator prefix that restricts a query to / away from given domains.
 *
 * Each non-empty group is separated from the next by a single space, so the
 * assistant-specific group can never run into the global allow-list group.
 *
 * @param assistantDomains domains configured on the assistant, OR-ed together as `site:` terms
 * @param allowedDomains globally allowed domains, OR-ed together as `site:` terms
 * @param blockedDomains globally blocked domains, each emitted as a `-site:` term
 * @returns the filter string, or `""` when all three lists are empty
 */
function buildSiteFilters(
	assistantDomains: readonly string[],
	allowedDomains: readonly string[],
	blockedDomains: readonly string[]
): string {
	const groups = [
		assistantDomains.map((domain) => "site:" + domain).join(" OR "),
		allowedDomains.map((domain) => "site:" + domain).join(" OR "),
		blockedDomains.map((domain) => "-site:" + domain).join(" "),
	];
	return groups.filter((group) => group.length > 0).join(" ");
}

/**
 * Runs a web search for the last message of a conversation and collects the text
 * passages most similar to that message.
 *
 * Steps: obtain candidate pages (either the links configured on the assistant, or the
 * results of a generated search query), fetch their text, split it into chunks, and keep
 * the chunks that `findSimilarSentences` ranks closest to the prompt, grouped by source.
 *
 * Failures inside the pipeline are not rethrown: they are reported through `updatePad`
 * as an `"error"` update and the partially filled `WebSearch` object is returned.
 *
 * @param conv conversation; its `embeddingModel` id selects the embedding model
 * @param messages conversation messages; must be non-empty, the last one is used as the prompt
 * @param updatePad callback receiving progress, error and final "sources" updates
 * @param ragSettings optional assistant settings providing `allowedLinks` / `allowedDomains`
 * @returns the `WebSearch` record, with `contextSources` filled on success
 */
export async function runWebSearch(
	conv: Conversation,
	messages: Message[],
	updatePad: (upd: MessageUpdate) => void,
	ragSettings?: Assistant["rag"]
) {
	const prompt = messages[messages.length - 1].content;
	const webSearch: WebSearch = {
		prompt,
		searchQuery: "",
		results: [],
		contextSources: [],
		createdAt: new Date(),
		updatedAt: new Date(),
	};

	/** Sends a web-search status line to the caller; `type` defaults to "update". */
	function appendUpdate(message: string, args?: string[], type?: "error" | "update") {
		updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
	}

	try {
		if (ragSettings && ragSettings.allowedLinks.length > 0) {
			// The assistant pins the pages to read: no search query is generated.
			appendUpdate("Using links specified in Assistant");

			const allowLocal = env.ENABLE_LOCAL_FETCH === "true";
			const candidates = await Promise.all(
				ragSettings.allowedLinks.map(async (link) => {
					try {
						const url = new URL(link);
						if (!allowLocal && (await isURLLocal(url))) {
							return null;
						}
						return { link, hostname: url.hostname, title: "", text: "" };
					} catch {
						// A malformed link (or a failed locality check) drops only this link,
						// regardless of ENABLE_LOCAL_FETCH, instead of aborting the whole search.
						return null;
					}
				})
			);
			webSearch.results = candidates.flatMap((candidate) => (candidate ? [candidate] : []));
		} else {
			webSearch.searchQuery = await generateQuery(messages);
			const searchProvider = getWebSearchProvider();
			appendUpdate(`Searching ${searchProvider}`, [webSearch.searchQuery]);

			let assistantDomains: string[] = [];
			if (ragSettings && ragSettings.allowedDomains.length > 0) {
				appendUpdate("Filtering on specified domains");
				assistantDomains = ragSettings.allowedDomains;
			}

			// Combine the assistant's domains with the global allow/block lists; skip the
			// prefix entirely when there is nothing to filter on so the stored query stays clean.
			const filters = buildSiteFilters(assistantDomains, allowList, blockList);
			if (filters) {
				webSearch.searchQuery = filters + " " + webSearch.searchQuery;
			}

			const results = await searchWeb(webSearch.searchQuery);
			webSearch.results = (results.organic_results ?? []).flatMap(
				(el: { title?: string; link: string; text?: string }) => {
					try {
						const { title, link, text } = el;
						const { hostname } = new URL(link);
						return [{ title, link, hostname, text }];
					} catch {
						// A result whose link is not a valid URL is skipped.
						return [];
					}
				}
			);
		}

		// Drop links containing a blocked entry, then cap the number of pages to scrape.
		webSearch.results = webSearch.results
			.filter(({ link }) => !blockList.some((el) => link.includes(el)))
			.slice(0, MAX_N_PAGES_SCRAPE);

		// Use the conversation's embedding model, falling back to the default one.
		const embeddingModel =
			embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;

		if (!embeddingModel) {
			throw new Error(`Embedding model ${conv.embeddingModel} not available anymore`);
		}

		if (webSearch.results.length === 0) {
			throw new Error("No results found for this search query");
		}

		appendUpdate("Browsing results");
		const MAX_N_CHUNKS = 100;
		const chunksPerPage = await Promise.all(
			webSearch.results.map(async (result) => {
				const { link } = result;
				// Some results already carry page text; only fetch when it is missing.
				let text = result.text ?? "";
				if (!text) {
					try {
						text = await parseWeb(link);
						appendUpdate("Browsing webpage", [link]);
					} catch (e) {
						// Best effort: report the failure and continue with empty text.
						const reason = e instanceof Error ? e.message : String(e);
						appendUpdate("Failed to parse webpage", [reason, link], "error");
					}
				}
				return chunk(text, embeddingModel.chunkCharLength)
					.slice(0, MAX_N_CHUNKS)
					.map((t) => ({ source: result, text: t }));
			})
		);

		// NOTE(review): every result is fetched, but only the first MAX_N_PAGES_EMBED pages are
		// kept even when some of them yielded no text — confirm this is intended.
		const paragraphChunks: { source: WebSearchSource; text: string }[] = chunksPerPage
			.slice(0, MAX_N_PAGES_EMBED)
			.flat();
		if (!paragraphChunks.length) {
			throw new Error(`No text found on the first ${MAX_N_PAGES_EMBED} results`);
		}

		appendUpdate("Extracting relevant information");
		const topKClosestParagraphs = 8;
		const texts = paragraphChunks.map(({ text }) => text);
		const indices = await findSimilarSentences(embeddingModel, prompt, texts, {
			topK: topKClosestParagraphs,
		});

		// Group the selected chunks by the page they came from (matched on link).
		for (const idx of indices) {
			const { source } = paragraphChunks[idx];
			const contextWithId = { idx, text: texts[idx] };
			const usedSource = webSearch.contextSources.find((cSource) => cSource.link === source.link);
			if (usedSource) {
				usedSource.context.push(contextWithId);
			} else {
				webSearch.contextSources.push({ ...source, context: [contextWithId] });
			}
		}
		updatePad({
			type: "webSearch",
			messageType: "sources",
			message: "sources",
			sources: webSearch.contextSources,
		});
	} catch (searchError) {
		// Report every failure, including non-Error throwables, rather than dropping it silently.
		const reason = searchError instanceof Error ? searchError.message : String(searchError);
		appendUpdate("An error occurred", [JSON.stringify(reason)], "error");
	}

	return webSearch;
}
| |
|