| import { SAIGA_TRUNCATE_WEB_CONTEXT, WEB_SEARCH_TEMPLATE, USE_CUSTOM_PARSER, MAX_N_PAGES_SCRAPE, SIMILARITY_API_URL, SIMILARITY_THRESHOLD } from "$env/static/private"; |
| import { searchWeb } from "$lib/server/websearch/searchWeb"; |
| import type { Message } from "$lib/types/Message"; |
| import type { WebSearch, WebSearchSource } from "$lib/types/WebSearch"; |
| import { generateQuery } from "$lib/server/websearch/generateQuery"; |
| import { parseWeb } from "$lib/server/websearch/parseWeb"; |
| import { chunk } from "$lib/utils/chunk"; |
| import { |
| MAX_SEQ_LEN as CHUNK_CAR_LEN |
| |
| } from "$lib/server/websearch/sentenceSimilarity"; |
| import type { Conversation } from "$lib/types/Conversation"; |
| import type { MessageUpdate } from "$lib/types/MessageUpdate"; |
| import { parseNalogGovRu } from "$lib/server/websearch/parseNalogGovRu"; |
| import { text } from "svelte/internal"; |
|
|
// Module-level constant with the literal value 10.
// NOTE(review): it is not referenced anywhere else in the visible source; presumably
// an upper bound on how many pages get embedded — TODO confirm, wire it in, or remove.
| const MAX_N_PAGES_EMBED = 10 as const; |
|
|
/**
 * Runs the web-search pipeline for a user prompt and returns the collected context.
 *
 * Steps: generate a search query from the conversation, run the search, download and
 * parse the result pages, rank the page texts against the prompt with the similarity
 * service, and concatenate the best pages into `context` (cut to
 * `SAIGA_TRUNCATE_WEB_CONTEXT` characters).
 *
 * Progress is reported through `updatePad` as `webSearch` updates; the selected sources
 * are sent as a final `sources` update.
 *
 * @param conv - Conversation whose messages are used to generate the search query.
 * @param prompt - Latest user prompt; appended to the conversation messages as a user message.
 * @param updatePad - Callback receiving progress, error and source updates.
 * @returns The populated `WebSearch` record. `context` stays empty when no page could be parsed.
 * @throws Rethrows any failure (query generation, search, similarity service, or an empty
 *   result list) after reporting its message exactly once through `updatePad`.
 */
export async function runWebSearch(
  conv: Conversation,
  prompt: string,
  updatePad: (upd: MessageUpdate) => void
): Promise<WebSearch> {
  type PageText = { source: WebSearch["contextSources"][number]; text: string };

  const messages: Message[] = [
    ...conv.messages,
    { content: prompt, from: "user", id: crypto.randomUUID() },
  ];

  const webSearch: WebSearch = {
    prompt: prompt,
    searchQuery: "",
    results: [],
    context: "",
    contextSources: [],
    createdAt: new Date(),
    updatedAt: new Date(),
  };

  function appendUpdate(message: string, args?: string[], type?: "error" | "update") {
    updatePad({ type: "webSearch", messageType: type ?? "update", message: message, args: args });
  }

  // Static env values are strings, so a plain truthiness check would treat "false" or "0"
  // as enabled. Only an empty value, "false" and "0" switch the custom parser off.
  const customParserFlag = String(USE_CUSTOM_PARSER ?? "").trim().toLowerCase();
  const useCustomParser =
    customParserFlag !== "" && customParserFlag !== "false" && customParserFlag !== "0";

  try {
    appendUpdate("Генерируем поисковый запрос...");

    const generatedQuery = await generateQuery(messages);
    // `||` so that an empty template falls back to the bare query; the function replacer
    // keeps `$&`, `$1`, ... inside the generated query from being treated as patterns.
    webSearch.searchQuery = (WEB_SEARCH_TEMPLATE || "{{query}}").replace(
      "{{query}}",
      () => generatedQuery
    );

    console.log('Web search query: ', webSearch.searchQuery)
    appendUpdate("Ищем в яндексе по запросу: ", [webSearch.searchQuery]);

    const results = await searchWeb(webSearch.searchQuery);
    console.log('search results', results)

    const organicResults: unknown = results?.organic_results;
    const candidates: { title: string; link: string }[] = Array.isArray(organicResults)
      ? organicResults
      : [];

    webSearch.results = candidates.flatMap(({ title, link }) => {
      // Videos and PDF documents are not parsed.
      if (typeof link !== "string" || link.includes("youtube.com") || link.includes(".pdf")) {
        return [];
      }
      try {
        return [{ title, link, hostname: new URL(link).hostname }];
      } catch {
        // One malformed link must not abort the whole search.
        console.error(`Skipping search result with invalid URL "${link}"`);
        return [];
      }
    });

    if (webSearch.results.length === 0) {
      appendUpdate("Поиск не вернул релевантных результатов");
      appendUpdate("Генерируем ответ на основе данных нейросети");
      // The catch block below reports this message as the error update.
      throw new Error("По вашему запросу ничего не найдено.");
    }

    appendUpdate("Обработка результатов");

    const texts: PageText[] = [];
    for (const { link, hostname, title } of webSearch.results) {
      // Stop downloading once more than 30 page texts have been collected.
      if (texts.length > 30) break;

      let text = "";
      try {
        if (useCustomParser) {
          try {
            text = await parseNalogGovRu(link);
          } catch (e) {
            console.log('Custom parser failed. link: ', link)
            console.log('Custom parser failed. error: ', e)
            console.log('Starting fallback parser')
            text = await parseWeb(link);
            console.log('Fallback returned: ', text)
          }
        } else {
          text = await parseWeb(link);
        }

        appendUpdate("Обработка страницы", [link]);
      } catch (e) {
        // A page that cannot be parsed is skipped; the remaining pages are still processed.
        console.error(`Error parsing webpage "${link}"`, e);
      }

      if (typeof text === "string" && text.length > 0) {
        texts.push({ source: { link: link, hostname: hostname, title: title }, text });
      }
    }

    if (texts.length > 0) {
      appendUpdate("Получение релевантной информации");

      const scored = await findSimilarSentences(
        prompt,
        texts.map((page) => page.text)
      );
      console.log('similarity check result:', scored);

      // Converted once up front. A missing or non-numeric value yields NaN, in which case
      // no page passes the filter/slice and the context stays empty.
      const threshold = Number(SIMILARITY_THRESHOLD);
      const maxPages = Number(MAX_N_PAGES_SCRAPE);
      const maxContextLength = Number(SAIGA_TRUNCATE_WEB_CONTEXT);

      const ranked = scored
        .filter((entry) => entry.score >= threshold)
        .sort((a, b) => b.score - a.score)
        .slice(0, maxPages);

      let fullText = "";
      for (const { i: pageIndex } of ranked) {
        if (fullText.length >= maxContextLength) break;

        const page = texts[pageIndex];
        // Ignore indices that do not correspond to a submitted text.
        if (!page) continue;

        fullText += page.text + "\n";
        webSearch.contextSources.push(page.source);
      }

      webSearch.context = fullText.slice(0, maxContextLength);
      console.log('webSearch.contextSources', webSearch.contextSources)
      console.log('web search context:', webSearch.context);
      updatePad({
        type: "webSearch",
        messageType: "sources",
        message: "sources",
        sources: webSearch.contextSources,
      });
    }
  } catch (searchError) {
    console.log('searchError', searchError)
    // Single place where failures are surfaced to the client, so each error shows up once.
    if (searchError instanceof Error) {
      appendUpdate(searchError.message, [], "error");
    }

    throw searchError;
  }

  return webSearch;
}
|
|
|
|
/**
 * Asks the similarity service (`SIMILARITY_API_URL`) to score `sentences` against `query`.
 *
 * The request is a JSON `POST` with the body `{ query, sentences }`. The response is
 * expected to be JSON of the form `{ result: [{ i, score }, ...] }`; anything else is
 * treated as a failed attempt. Failed attempts (network error, non-2xx status, invalid
 * or malformed JSON) are logged and retried with a linearly growing pause.
 *
 * @param query - Text the sentences are compared against.
 * @param sentences - Candidate texts. An empty list resolves to `[]` without a request.
 * @param options - Optional retry tuning: `maxAttempts` (total attempts, default 5) and
 *   `retryDelayMs` (base pause, multiplied by the attempt number; default 200, 0 disables).
 * @returns The `result` array of the service response.
 * @throws Error when every attempt failed.
 */
async function findSimilarSentences(
  query: string,
  sentences: string[],
  options: { maxAttempts?: number; retryDelayMs?: number } = {}
): Promise<{ i: number, score: number }[]> {
  // Nothing to score, so skip the network round trip.
  if (sentences.length === 0) {
    return [];
  }

  const maxAttempts = Math.max(1, options.maxAttempts ?? 5);
  const retryDelayMs = Math.max(0, options.retryDelayMs ?? 200);

  const requestOptions = {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      query: query,
      sentences: sentences,
    }),
  };

  const isScoreEntry = (value: unknown): value is { i: number; score: number } =>
    typeof value === "object" &&
    value !== null &&
    typeof (value as { i?: unknown }).i === "number" &&
    typeof (value as { score?: unknown }).score === "number";

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      const response = await fetch(SIMILARITY_API_URL, requestOptions);
      if (!response.ok) {
        throw new Error(`Similarity API responded with HTTP ${response.status}`);
      }

      const payload: unknown = await response.json();
      const result = (payload as { result?: unknown } | null)?.result;
      // Validate here so callers can safely filter/sort the returned entries.
      if (!Array.isArray(result) || !result.every(isScoreEntry)) {
        throw new Error("Similarity API returned a malformed result");
      }

      return result;
    } catch (error) {
      console.error('Error:', error);
      if (attempt < maxAttempts) {
        console.log(`Retrying similarity request (${attempt}/${maxAttempts})...`);
        if (retryDelayMs > 0) {
          // Give the service a moment to recover instead of retrying back-to-back.
          await new Promise<void>((resolve) => setTimeout(resolve, retryDelayMs * attempt));
        }
      }
    }
  }

  throw new Error(`Сервер генератора контекста недоступен.`);
}