Changes in search algorithm
Browse files
chat-ui/src/lib/server/websearch/runWebSearch.ts
CHANGED
|
@@ -52,26 +52,27 @@ export async function runWebSearch(
|
|
| 52 |
appendUpdate("Ищем в яндексе по запросу: ", [webSearch.searchQuery]);
|
| 53 |
|
| 54 |
const results = await searchWeb(webSearch.searchQuery);
|
| 55 |
-
|
|
|
|
| 56 |
webSearch.results =
|
| 57 |
-
(results.organic_results &&
|
| 58 |
results.organic_results.map((el: { title: string; link: string }) => {
|
| 59 |
const { title, link } = el;
|
| 60 |
const { hostname } = new URL(link);
|
| 61 |
return { title, link, hostname };
|
| 62 |
})) ??
|
| 63 |
[];
|
| 64 |
-
|
| 65 |
-
|
| 66 |
webSearch.results = webSearch.results
|
| 67 |
.filter(({ link }) => !link.includes("youtube.com") && !link.includes(".pdf")); // filter out YouTube links and links containing ".pdf"
|
| 68 |
//slice(0, Number(MAX_N_PAGES_SCRAPE)); // limit to first 10 links only
|
| 69 |
|
| 70 |
// let paragraphChunks: { source: WebSearchSource; text: string }[] = [];
|
|
|
|
| 71 |
let texts : {source: any, text: string}[] = [];
|
| 72 |
if (webSearch.results.length > 0) {
|
| 73 |
appendUpdate("Обработка результатов");
|
| 74 |
-
|
| 75 |
for(const i in webSearch.results) {
|
| 76 |
if(texts.length > 30) break;
|
| 77 |
const { link, hostname, title } = webSearch.results[i];
|
|
@@ -96,9 +97,21 @@ export async function runWebSearch(
|
|
| 96 |
} catch (e) {
|
| 97 |
console.error(`Error parsing webpage "${link}"`, e);
|
| 98 |
}
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
texts.push({source: { link: link, hostname: hostname, title: title }, text})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
// texts.push(...chunk(text, CHUNK_CAR_LEN).slice(0, MAX_N_CHUNKS));
|
| 103 |
}
|
| 104 |
} else {
|
|
@@ -107,28 +120,57 @@ export async function runWebSearch(
|
|
| 107 |
//throw new Error("No results found for this search query");
|
| 108 |
}
|
| 109 |
|
| 110 |
-
if(texts &&
|
| 111 |
appendUpdate("Получение релевантной информации");
|
| 112 |
-
|
| 113 |
-
const
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
.slice(0, Number(MAX_N_PAGES_SCRAPE))
|
| 120 |
-
.map(({i}) => texts[i].text)
|
| 121 |
-
.join(" ")
|
| 122 |
-
.slice(0, Number(SAIGA_TRUNCATE_WEB_CONTEXT));
|
| 123 |
|
| 124 |
-
console.log('web search context:', webSearch.context);
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
updatePad({
|
| 127 |
type: "webSearch",
|
| 128 |
messageType: "sources",
|
| 129 |
message: "sources",
|
| 130 |
-
sources:
|
| 131 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
// const usedSources = new Set<string>();
|
| 134 |
// for (const idx of indices) {
|
|
|
|
| 52 |
appendUpdate("Ищем в яндексе по запросу: ", [webSearch.searchQuery]);
|
| 53 |
|
| 54 |
const results = await searchWeb(webSearch.searchQuery);
|
| 55 |
+
console.log('search results', results)
|
| 56 |
+
|
| 57 |
webSearch.results =
|
| 58 |
+
(results && results.organic_results &&
|
| 59 |
results.organic_results.map((el: { title: string; link: string }) => {
|
| 60 |
const { title, link } = el;
|
| 61 |
const { hostname } = new URL(link);
|
| 62 |
return { title, link, hostname };
|
| 63 |
})) ??
|
| 64 |
[];
|
| 65 |
+
|
|
|
|
| 66 |
webSearch.results = webSearch.results
|
| 67 |
.filter(({ link }) => !link.includes("youtube.com") && !link.includes(".pdf")); // filter out YouTube links and links containing ".pdf"
|
| 68 |
//slice(0, Number(MAX_N_PAGES_SCRAPE)); // limit to first 10 links only
|
| 69 |
|
| 70 |
// let paragraphChunks: { source: WebSearchSource; text: string }[] = [];
|
| 71 |
+
|
| 72 |
let texts : {source: any, text: string}[] = [];
|
| 73 |
if (webSearch.results.length > 0) {
|
| 74 |
appendUpdate("Обработка результатов");
|
| 75 |
+
let fullText = '';
|
| 76 |
for(const i in webSearch.results) {
|
| 77 |
if(texts.length > 30) break;
|
| 78 |
const { link, hostname, title } = webSearch.results[i];
|
|
|
|
| 97 |
} catch (e) {
|
| 98 |
console.error(`Error parsing webpage "${link}"`, e);
|
| 99 |
}
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
if(text.length > 0){
|
| 104 |
+
webSearch.contextSources.push({ link: link, hostname: hostname, title: title });
|
| 105 |
+
|
| 106 |
texts.push({source: { link: link, hostname: hostname, title: title }, text})
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
fullText += text;
|
| 111 |
+
if(fullText.length >= Number(SAIGA_TRUNCATE_WEB_CONTEXT)){
|
| 112 |
+
break;
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
// texts.push(...chunk(text, CHUNK_CAR_LEN).slice(0, MAX_N_CHUNKS));
|
| 116 |
}
|
| 117 |
} else {
|
|
|
|
| 120 |
//throw new Error("No results found for this search query");
|
| 121 |
}
|
| 122 |
|
| 123 |
+
if(texts && texts.length > 0){
|
| 124 |
appendUpdate("Получение релевантной информации");
|
| 125 |
+
console.log('webSearch.contextSources', webSearch.contextSources)
|
| 126 |
+
// const allIndices = await findSimilarSentences(prompt, texts.map((t) => t.text));//, { topK: topKClosestParagraphs});
|
| 127 |
+
|
| 128 |
+
// console.log('similarity check result:', allIndices);
|
| 129 |
+
// const indices = allIndices.filter((r) => r.score >= Number(SIMILARITY_THRESHOLD))
|
| 130 |
+
// .sort((a, b) => b.score - a.score)
|
| 131 |
+
// .slice(0, Number(MAX_N_PAGES_SCRAPE));
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
|
|
|
| 133 |
|
| 134 |
+
// webSearch.context = indices.map(({i}) => texts[i].text)
|
| 135 |
+
// .join("\n")
|
| 136 |
+
// .slice(0, Number(SAIGA_TRUNCATE_WEB_CONTEXT));
|
| 137 |
+
|
| 138 |
+
// webSearch.contextSources = [
|
| 139 |
+
// {
|
| 140 |
+
// link: 'https://www.nalog.gov.ru/rn77/fl/interest/inn/calculation/',
|
| 141 |
+
// hostname: 'www.nalog.gov.ru',
|
| 142 |
+
// title: 'Я хочу встать на учёт в налоговый орган (получить...)'
|
| 143 |
+
// },
|
| 144 |
+
// {
|
| 145 |
+
// link: 'https://www.nalog.gov.ru/rn71/news/activities_fts/13690572/',
|
| 146 |
+
// hostname: 'www.nalog.gov.ru',
|
| 147 |
+
// title: 'Как получить свидетельство ИНН | ФНС России'
|
| 148 |
+
// },
|
| 149 |
+
// {
|
| 150 |
+
// link: 'https://www.nalog.gov.ru/rn10/news/activities_fts/13604016/',
|
| 151 |
+
// hostname: 'www.nalog.gov.ru',
|
| 152 |
+
// title: 'Как получить ИНН через электронные сервисы ФНС'
|
| 153 |
+
// }
|
| 154 |
+
// ];
|
| 155 |
updatePad({
|
| 156 |
type: "webSearch",
|
| 157 |
messageType: "sources",
|
| 158 |
message: "sources",
|
| 159 |
+
sources: webSearch.contextSources,
|
| 160 |
});
|
| 161 |
+
|
| 162 |
+
webSearch.context = texts.map((t) => t.text)
|
| 163 |
+
.join("\n")
|
| 164 |
+
.slice(0, Number(SAIGA_TRUNCATE_WEB_CONTEXT));
|
| 165 |
+
|
| 166 |
+
console.log('web search context:', webSearch.context);
|
| 167 |
+
|
| 168 |
+
// updatePad({
|
| 169 |
+
// type: "webSearch",
|
| 170 |
+
// messageType: "sources",
|
| 171 |
+
// message: "sources",
|
| 172 |
+
// sources: [],
|
| 173 |
+
// });
|
| 174 |
|
| 175 |
// const usedSources = new Set<string>();
|
| 176 |
// for (const idx of indices) {
|
chat-ui/src/lib/server/websearch/searchWeb.ts
CHANGED
|
@@ -72,9 +72,13 @@ export async function searchWebSerpApi(query: string) {
|
|
| 72 |
} satisfies YandexParameters;
|
| 73 |
|
| 74 |
// Show result as JSON
|
| 75 |
-
|
|
|
|
| 76 |
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
| 78 |
}
|
| 79 |
|
| 80 |
export async function searchYandex(query: string) {
|
|
@@ -103,7 +107,7 @@ export async function searchYandex(query: string) {
|
|
| 103 |
});
|
| 104 |
});
|
| 105 |
|
| 106 |
-
|
| 107 |
|
| 108 |
console.log('Yandex search result', dataArray)
|
| 109 |
return { organic_results: dataArray };
|
|
|
|
| 72 |
} satisfies YandexParameters;
|
| 73 |
|
| 74 |
// Show result as JSON
|
| 75 |
+
try {
|
| 76 |
+
const response = await getJson("yandex", params);
|
| 77 |
|
| 78 |
+
return response;
|
| 79 |
+
} catch (e) {
|
| 80 |
+
console.error('Error fetching yandex results:', e);
|
| 81 |
+
}
|
| 82 |
}
|
| 83 |
|
| 84 |
export async function searchYandex(query: string) {
|
|
|
|
| 107 |
});
|
| 108 |
});
|
| 109 |
|
| 110 |
+
|
| 111 |
|
| 112 |
console.log('Yandex search result', dataArray)
|
| 113 |
return { organic_results: dataArray };
|
chat-ui/src/routes/conversation/[id]/+server.ts
CHANGED
|
@@ -23,6 +23,10 @@ import { buildSearchPrompt } from "$lib/buildSearchPrompt.js";
|
|
| 23 |
import PastebinAPI from 'pastebin-ts';
|
| 24 |
|
| 25 |
function Log(data: any) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
console.log('uploading data to pastebin');
|
| 27 |
try {
|
| 28 |
const pastebin = new PastebinAPI({
|
|
|
|
| 23 |
import PastebinAPI from 'pastebin-ts';
|
| 24 |
|
| 25 |
function Log(data: any) {
|
| 26 |
+
if(!PASTEBIN_DEV_KEY || !PASTEBIN_USERNAME || !PASTEBIN_PASSWORD){
|
| 27 |
+
console.log('Skipping pastebin log')
|
| 28 |
+
return;
|
| 29 |
+
}
|
| 30 |
console.log('uploading data to pastebin');
|
| 31 |
try {
|
| 32 |
const pastebin = new PastebinAPI({
|