yuvrajsingh6 committed on
Commit
1c0197a
·
1 Parent(s): 33d94b3

fix: Scale web search scores to match PDF similarity scores

Browse files
Files changed (1) hide show
  1. backend/app/services/web_search.py +16 -5
backend/app/services/web_search.py CHANGED
@@ -16,7 +16,7 @@ from dataclasses import dataclass
16
  from enum import Enum
17
  import json
18
 
19
- from backend.app.config import settings
20
 
21
 
22
  class SearchProvider(Enum):
@@ -159,12 +159,17 @@ class TavilySearchProvider(BaseSearchProvider):
159
  search_results = data.get("results", [])
160
 
161
  for i, result in enumerate(search_results[: self.config.max_results]):
 
 
 
 
 
162
  results.append(
163
  SearchResult(
164
  title=result.get("title", ""),
165
  url=result.get("url", ""),
166
  snippet=result.get("content", ""),
167
- score=result.get("score", 0.9 - (i * 0.05)),
168
  provider="tavily",
169
  )
170
  )
@@ -205,13 +210,15 @@ class SerperSearchProvider(BaseSearchProvider):
205
 
206
  organic_results = data.get("organic_results", [])
207
 
 
208
  for i, result in enumerate(organic_results[: self.config.max_results]):
 
209
  results.append(
210
  SearchResult(
211
  title=result.get("title", ""),
212
  url=result.get("link", ""),
213
  snippet=result.get("snippet", ""),
214
- score=0.8 - (i * 0.05),
215
  provider="serper",
216
  )
217
  )
@@ -252,13 +259,15 @@ class BraveSearchProvider(BaseSearchProvider):
252
 
253
  web_results = data.get("web", {}).get("results", [])
254
 
 
255
  for i, result in enumerate(web_results[: self.config.max_results]):
 
256
  results.append(
257
  SearchResult(
258
  title=result.get("title", ""),
259
  url=result.get("url", ""),
260
  snippet=result.get("description", ""),
261
- score=0.85 - (i * 0.05),
262
  provider="brave",
263
  )
264
  )
@@ -296,13 +305,15 @@ class YouComSearchProvider(BaseSearchProvider):
296
 
297
  search_results = data.get("results", [])
298
 
 
299
  for i, result in enumerate(search_results[: self.config.max_results]):
 
300
  results.append(
301
  SearchResult(
302
  title=result.get("title", ""),
303
  url=result.get("url", ""),
304
  snippet=result.get("snippet", ""),
305
- score=result.get("score", 0.85 - (i * 0.05)),
306
  provider="youcom",
307
  )
308
  )
 
16
  from enum import Enum
17
  import json
18
 
19
+ from app.config import settings
20
 
21
 
22
  class SearchProvider(Enum):
 
159
  search_results = data.get("results", [])
160
 
161
  for i, result in enumerate(search_results[: self.config.max_results]):
162
+ # Scale scores to be comparable with PDF cosine similarity (0.3-0.7)
163
+ # Tavily already provides relevance scores, so we scale them
164
+ tavily_score = result.get("score", 0.7)
165
+ scaled_score = 0.3 + (tavily_score * 0.4) # Maps 0-1 to 0.3-0.7
166
+
167
  results.append(
168
  SearchResult(
169
  title=result.get("title", ""),
170
  url=result.get("url", ""),
171
  snippet=result.get("content", ""),
172
+ score=scaled_score - (i * 0.05),
173
  provider="tavily",
174
  )
175
  )
 
210
 
211
  organic_results = data.get("organic_results", [])
212
 
213
+ # Assign rank-based scores in the range of PDF cosine similarity (roughly 0.3-0.7)
214
  for i, result in enumerate(organic_results[: self.config.max_results]):
215
+ base_score = 0.65 - (i * 0.05) # Start at 0.65, decrease by 0.05
216
  results.append(
217
  SearchResult(
218
  title=result.get("title", ""),
219
  url=result.get("link", ""),
220
  snippet=result.get("snippet", ""),
221
+ score=base_score,
222
  provider="serper",
223
  )
224
  )
 
259
 
260
  web_results = data.get("web", {}).get("results", [])
261
 
262
+ # Assign rank-based scores in the range of PDF cosine similarity (roughly 0.3-0.7)
263
  for i, result in enumerate(web_results[: self.config.max_results]):
264
+ base_score = 0.65 - (i * 0.05) # Start at 0.65, decrease by 0.05
265
  results.append(
266
  SearchResult(
267
  title=result.get("title", ""),
268
  url=result.get("url", ""),
269
  snippet=result.get("description", ""),
270
+ score=base_score,
271
  provider="brave",
272
  )
273
  )
 
305
 
306
  search_results = data.get("results", [])
307
 
308
+ # Assign rank-based scores in the range of PDF cosine similarity (roughly 0.3-0.7)
309
  for i, result in enumerate(search_results[: self.config.max_results]):
310
+ base_score = 0.65 - (i * 0.05) # Start at 0.65, decrease by 0.05
311
  results.append(
312
  SearchResult(
313
  title=result.get("title", ""),
314
  url=result.get("url", ""),
315
  snippet=result.get("snippet", ""),
316
+ score=base_score,
317
  provider="youcom",
318
  )
319
  )