muryshev committed on
Commit
1d8089b
·
1 Parent(s): 5abb7ed

Changes in search algorithm

Browse files
chat-ui/src/lib/server/websearch/runWebSearch.ts CHANGED
@@ -52,26 +52,27 @@ export async function runWebSearch(
52
  appendUpdate("Ищем в яндексе по запросу: ", [webSearch.searchQuery]);
53
 
54
  const results = await searchWeb(webSearch.searchQuery);
55
-
 
56
  webSearch.results =
57
- (results.organic_results &&
58
  results.organic_results.map((el: { title: string; link: string }) => {
59
  const { title, link } = el;
60
  const { hostname } = new URL(link);
61
  return { title, link, hostname };
62
  })) ??
63
  [];
64
-
65
-
66
  webSearch.results = webSearch.results
67
  .filter(({ link }) => !link.includes("youtube.com") && !link.includes(".pdf")); // filter out youtube links
68
  //slice(0, Number(MAX_N_PAGES_SCRAPE)); // limit to first 10 links only
69
 
70
  // let paragraphChunks: { source: WebSearchSource; text: string }[] = [];
 
71
  let texts : {source: any, text: string}[] = [];
72
  if (webSearch.results.length > 0) {
73
  appendUpdate("Обработка результатов");
74
-
75
  for(const i in webSearch.results) {
76
  if(texts.length > 30) break;
77
  const { link, hostname, title } = webSearch.results[i];
@@ -96,9 +97,21 @@ export async function runWebSearch(
96
  } catch (e) {
97
  console.error(`Error parsing webpage "${link}"`, e);
98
  }
99
- // const MAX_N_CHUNKS = 20;
100
- if(text.length > 0)
 
 
 
 
101
  texts.push({source: { link: link, hostname: hostname, title: title }, text})
 
 
 
 
 
 
 
 
102
  // texts.push(...chunk(text, CHUNK_CAR_LEN).slice(0, MAX_N_CHUNKS));
103
  }
104
  } else {
@@ -107,28 +120,57 @@ export async function runWebSearch(
107
  //throw new Error("No results found for this search query");
108
  }
109
 
110
- if(texts && text.length > 0){
111
  appendUpdate("Получение релевантной информации");
112
-
113
- const indices = await findSimilarSentences(prompt, texts.map((t) => t.text));//, { topK: topKClosestParagraphs});
114
- console.log('similarity check result:', indices);
115
-
116
-
117
- webSearch.context = indices.filter((r) => r.score >= Number(SIMILARITY_THRESHOLD))
118
- .sort((a, b) => b.score - a.score)
119
- .slice(0, Number(MAX_N_PAGES_SCRAPE))
120
- .map(({i}) => texts[i].text)
121
- .join(" ")
122
- .slice(0, Number(SAIGA_TRUNCATE_WEB_CONTEXT));
123
 
124
- console.log('web search context:', webSearch.context);
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  updatePad({
127
  type: "webSearch",
128
  messageType: "sources",
129
  message: "sources",
130
- sources: [],
131
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  // const usedSources = new Set<string>();
134
  // for (const idx of indices) {
 
52
  appendUpdate("Ищем в яндексе по запросу: ", [webSearch.searchQuery]);
53
 
54
  const results = await searchWeb(webSearch.searchQuery);
55
+ console.log('search results', results)
56
+
57
  webSearch.results =
58
+ (results && results.organic_results &&
59
  results.organic_results.map((el: { title: string; link: string }) => {
60
  const { title, link } = el;
61
  const { hostname } = new URL(link);
62
  return { title, link, hostname };
63
  })) ??
64
  [];
65
+
 
66
  webSearch.results = webSearch.results
67
  .filter(({ link }) => !link.includes("youtube.com") && !link.includes(".pdf")); // filter out youtube links
68
  //slice(0, Number(MAX_N_PAGES_SCRAPE)); // limit to first 10 links only
69
 
70
  // let paragraphChunks: { source: WebSearchSource; text: string }[] = [];
71
+
72
  let texts : {source: any, text: string}[] = [];
73
  if (webSearch.results.length > 0) {
74
  appendUpdate("Обработка результатов");
75
+ let fullText = '';
76
  for(const i in webSearch.results) {
77
  if(texts.length > 30) break;
78
  const { link, hostname, title } = webSearch.results[i];
 
97
  } catch (e) {
98
  console.error(`Error parsing webpage "${link}"`, e);
99
  }
100
+
101
+
102
+
103
+ if(text.length > 0){
104
+ webSearch.contextSources.push({ link: link, hostname: hostname, title: title });
105
+
106
  texts.push({source: { link: link, hostname: hostname, title: title }, text})
107
+ }
108
+
109
+
110
+ fullText += text;
111
+ if(fullText.length >= Number(SAIGA_TRUNCATE_WEB_CONTEXT)){
112
+ break;
113
+ }
114
+
115
  // texts.push(...chunk(text, CHUNK_CAR_LEN).slice(0, MAX_N_CHUNKS));
116
  }
117
  } else {
 
120
  //throw new Error("No results found for this search query");
121
  }
122
 
123
+ if(texts && texts.length > 0){
124
  appendUpdate("Получение релевантной информации");
125
+ console.log('webSearch.contextSources', webSearch.contextSources)
126
+ // const allIndices = await findSimilarSentences(prompt, texts.map((t) => t.text));//, { topK: topKClosestParagraphs});
127
+
128
+ // console.log('similarity check result:', allIndices);
129
+ // const indices = allIndices.filter((r) => r.score >= Number(SIMILARITY_THRESHOLD))
130
+ // .sort((a, b) => b.score - a.score)
131
+ // .slice(0, Number(MAX_N_PAGES_SCRAPE));
 
 
 
 
132
 
 
133
 
134
+ // webSearch.context = indices.map(({i}) => texts[i].text)
135
+ // .join("\n")
136
+ // .slice(0, Number(SAIGA_TRUNCATE_WEB_CONTEXT));
137
+
138
+ // webSearch.contextSources = [
139
+ // {
140
+ // link: 'https://www.nalog.gov.ru/rn77/fl/interest/inn/calculation/',
141
+ // hostname: 'www.nalog.gov.ru',
142
+ // title: 'Я хочу встать на учёт в налоговый орган (получить...)'
143
+ // },
144
+ // {
145
+ // link: 'https://www.nalog.gov.ru/rn71/news/activities_fts/13690572/',
146
+ // hostname: 'www.nalog.gov.ru',
147
+ // title: 'Как получить свидетельство ИНН | ФНС России'
148
+ // },
149
+ // {
150
+ // link: 'https://www.nalog.gov.ru/rn10/news/activities_fts/13604016/',
151
+ // hostname: 'www.nalog.gov.ru',
152
+ // title: 'Как получить ИНН через электронные сервисы ФНС'
153
+ // }
154
+ // ];
155
  updatePad({
156
  type: "webSearch",
157
  messageType: "sources",
158
  message: "sources",
159
+ sources: webSearch.contextSources,
160
  });
161
+
162
+ webSearch.context = texts.map((t) => t.text)
163
+ .join("\n")
164
+ .slice(0, Number(SAIGA_TRUNCATE_WEB_CONTEXT));
165
+
166
+ console.log('web search context:', webSearch.context);
167
+
168
+ // updatePad({
169
+ // type: "webSearch",
170
+ // messageType: "sources",
171
+ // message: "sources",
172
+ // sources: [],
173
+ // });
174
 
175
  // const usedSources = new Set<string>();
176
  // for (const idx of indices) {
chat-ui/src/lib/server/websearch/searchWeb.ts CHANGED
@@ -72,9 +72,13 @@ export async function searchWebSerpApi(query: string) {
72
  } satisfies YandexParameters;
73
 
74
  // Show result as JSON
75
- const response = await getJson("yandex", params);
 
76
 
77
- return response;
 
 
 
78
  }
79
 
80
  export async function searchYandex(query: string) {
@@ -103,7 +107,7 @@ export async function searchYandex(query: string) {
103
  });
104
  });
105
 
106
-
107
 
108
  console.log('Yandex search result', dataArray)
109
  return { organic_results: dataArray };
 
72
  } satisfies YandexParameters;
73
 
74
  // Show result as JSON
75
+ try {
76
+ const response = await getJson("yandex", params);
77
 
78
+ return response;
79
+ } catch (e) {
80
+ console.error('Error fetching yandex results:', e);
81
+ }
82
  }
83
 
84
  export async function searchYandex(query: string) {
 
107
  });
108
  });
109
 
110
+
111
 
112
  console.log('Yandex search result', dataArray)
113
  return { organic_results: dataArray };
chat-ui/src/routes/conversation/[id]/+server.ts CHANGED
@@ -23,6 +23,10 @@ import { buildSearchPrompt } from "$lib/buildSearchPrompt.js";
23
  import PastebinAPI from 'pastebin-ts';
24
 
25
  function Log(data: any) {
 
 
 
 
26
  console.log('uploading data to pastebin');
27
  try {
28
  const pastebin = new PastebinAPI({
 
23
  import PastebinAPI from 'pastebin-ts';
24
 
25
  function Log(data: any) {
26
+ if(!PASTEBIN_DEV_KEY || !PASTEBIN_USERNAME || !PASTEBIN_PASSWORD){
27
+ console.log('Skipping pastebin log')
28
+ return;
29
+ }
30
  console.log('uploading data to pastebin');
31
  try {
32
  const pastebin = new PastebinAPI({