Spaces:
Sleeping
Sleeping
yuvrajsingh6
committed on
Commit
·
1c0197a
1
Parent(s):
33d94b3
fix: Scale web search scores to match PDF similarity scores
Browse files
backend/app/services/web_search.py
CHANGED
|
@@ -16,7 +16,7 @@ from dataclasses import dataclass
|
|
| 16 |
from enum import Enum
|
| 17 |
import json
|
| 18 |
|
| 19 |
-
from
|
| 20 |
|
| 21 |
|
| 22 |
class SearchProvider(Enum):
|
|
@@ -159,12 +159,17 @@ class TavilySearchProvider(BaseSearchProvider):
|
|
| 159 |
search_results = data.get("results", [])
|
| 160 |
|
| 161 |
for i, result in enumerate(search_results[: self.config.max_results]):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
results.append(
|
| 163 |
SearchResult(
|
| 164 |
title=result.get("title", ""),
|
| 165 |
url=result.get("url", ""),
|
| 166 |
snippet=result.get("content", ""),
|
| 167 |
-
score=
|
| 168 |
provider="tavily",
|
| 169 |
)
|
| 170 |
)
|
|
@@ -205,13 +210,15 @@ class SerperSearchProvider(BaseSearchProvider):
|
|
| 205 |
|
| 206 |
organic_results = data.get("organic_results", [])
|
| 207 |
|
|
|
|
| 208 |
for i, result in enumerate(organic_results[: self.config.max_results]):
|
|
|
|
| 209 |
results.append(
|
| 210 |
SearchResult(
|
| 211 |
title=result.get("title", ""),
|
| 212 |
url=result.get("link", ""),
|
| 213 |
snippet=result.get("snippet", ""),
|
| 214 |
-
score=
|
| 215 |
provider="serper",
|
| 216 |
)
|
| 217 |
)
|
|
@@ -252,13 +259,15 @@ class BraveSearchProvider(BaseSearchProvider):
|
|
| 252 |
|
| 253 |
web_results = data.get("web", {}).get("results", [])
|
| 254 |
|
|
|
|
| 255 |
for i, result in enumerate(web_results[: self.config.max_results]):
|
|
|
|
| 256 |
results.append(
|
| 257 |
SearchResult(
|
| 258 |
title=result.get("title", ""),
|
| 259 |
url=result.get("url", ""),
|
| 260 |
snippet=result.get("description", ""),
|
| 261 |
-
score=
|
| 262 |
provider="brave",
|
| 263 |
)
|
| 264 |
)
|
|
@@ -296,13 +305,15 @@ class YouComSearchProvider(BaseSearchProvider):
|
|
| 296 |
|
| 297 |
search_results = data.get("results", [])
|
| 298 |
|
|
|
|
| 299 |
for i, result in enumerate(search_results[: self.config.max_results]):
|
|
|
|
| 300 |
results.append(
|
| 301 |
SearchResult(
|
| 302 |
title=result.get("title", ""),
|
| 303 |
url=result.get("url", ""),
|
| 304 |
snippet=result.get("snippet", ""),
|
| 305 |
-
score=
|
| 306 |
provider="youcom",
|
| 307 |
)
|
| 308 |
)
|
|
|
|
| 16 |
from enum import Enum
|
| 17 |
import json
|
| 18 |
|
| 19 |
+
from app.config import settings
|
| 20 |
|
| 21 |
|
| 22 |
class SearchProvider(Enum):
|
|
|
|
| 159 |
search_results = data.get("results", [])
|
| 160 |
|
| 161 |
for i, result in enumerate(search_results[: self.config.max_results]):
|
| 162 |
+
# Scale scores to be comparable with PDF cosine similarity (0.3-0.7)
|
| 163 |
+
# Tavily already provides relevance scores, so we scale them
|
| 164 |
+
tavily_score = result.get("score", 0.7)
|
| 165 |
+
scaled_score = 0.3 + (tavily_score * 0.4) # Maps 0-1 to 0.3-0.7
|
| 166 |
+
|
| 167 |
results.append(
|
| 168 |
SearchResult(
|
| 169 |
title=result.get("title", ""),
|
| 170 |
url=result.get("url", ""),
|
| 171 |
snippet=result.get("content", ""),
|
| 172 |
+
score=scaled_score - (i * 0.05),
|
| 173 |
provider="tavily",
|
| 174 |
)
|
| 175 |
)
|
|
|
|
| 210 |
|
| 211 |
organic_results = data.get("organic_results", [])
|
| 212 |
|
| 213 |
+
# Scale scores to be comparable with PDF cosine similarity (0.3-0.7)
|
| 214 |
for i, result in enumerate(organic_results[: self.config.max_results]):
|
| 215 |
+
base_score = 0.65 - (i * 0.05) # Start at 0.65, decrease by 0.05
|
| 216 |
results.append(
|
| 217 |
SearchResult(
|
| 218 |
title=result.get("title", ""),
|
| 219 |
url=result.get("link", ""),
|
| 220 |
snippet=result.get("snippet", ""),
|
| 221 |
+
score=base_score,
|
| 222 |
provider="serper",
|
| 223 |
)
|
| 224 |
)
|
|
|
|
| 259 |
|
| 260 |
web_results = data.get("web", {}).get("results", [])
|
| 261 |
|
| 262 |
+
# Scale scores to be comparable with PDF cosine similarity (0.3-0.7)
|
| 263 |
for i, result in enumerate(web_results[: self.config.max_results]):
|
| 264 |
+
base_score = 0.65 - (i * 0.05) # Start at 0.65, decrease by 0.05
|
| 265 |
results.append(
|
| 266 |
SearchResult(
|
| 267 |
title=result.get("title", ""),
|
| 268 |
url=result.get("url", ""),
|
| 269 |
snippet=result.get("description", ""),
|
| 270 |
+
score=base_score,
|
| 271 |
provider="brave",
|
| 272 |
)
|
| 273 |
)
|
|
|
|
| 305 |
|
| 306 |
search_results = data.get("results", [])
|
| 307 |
|
| 308 |
+
# Scale scores to be comparable with PDF cosine similarity (0.3-0.7)
|
| 309 |
for i, result in enumerate(search_results[: self.config.max_results]):
|
| 310 |
+
base_score = 0.65 - (i * 0.05) # Start at 0.65, decrease by 0.05
|
| 311 |
results.append(
|
| 312 |
SearchResult(
|
| 313 |
title=result.get("title", ""),
|
| 314 |
url=result.get("url", ""),
|
| 315 |
snippet=result.get("snippet", ""),
|
| 316 |
+
score=base_score,
|
| 317 |
provider="youcom",
|
| 318 |
)
|
| 319 |
)
|