Spaces:
Runtime error
Runtime error
Lenient thresholds
Browse files: models/reranker.py +9 -9
models/reranker.py
CHANGED
|
@@ -42,7 +42,7 @@ class MedicalReranker:
|
|
| 42 |
'generic_health_site': 0.30
|
| 43 |
}
|
| 44 |
|
| 45 |
-
# Irrelevant content patterns
|
| 46 |
self.irrelevant_patterns = [
|
| 47 |
r'quiz|test|assessment|survey',
|
| 48 |
r'homepage|main page|index',
|
|
@@ -54,7 +54,7 @@ class MedicalReranker:
|
|
| 54 |
r'healthy-sleep/quiz', # Sleep quiz example
|
| 55 |
]
|
| 56 |
|
| 57 |
-
def rerank_results(self, query: str, results: List[Dict], min_score: float = 0.
|
| 58 |
"""Rerank search results based on medical relevance"""
|
| 59 |
if not results:
|
| 60 |
return []
|
|
@@ -105,10 +105,10 @@ class MedicalReranker:
|
|
| 105 |
logger.debug(f"Filtered irrelevant result: {url}")
|
| 106 |
continue
|
| 107 |
|
| 108 |
-
# Only skip if we have content and it's
|
| 109 |
# Don't filter based on content length if no content is available yet
|
| 110 |
-
if content and len(content) <
|
| 111 |
-
logger.debug(f"Filtered result with short content: {url}")
|
| 112 |
continue
|
| 113 |
|
| 114 |
filtered.append(result)
|
|
@@ -123,8 +123,8 @@ class MedicalReranker:
|
|
| 123 |
url = result.get('url', '')
|
| 124 |
domain = self._extract_domain(url)
|
| 125 |
|
| 126 |
-
# Get domain score - be more lenient with unknown domains
|
| 127 |
-
domain_score = self.domain_scores.get(domain, 0.
|
| 128 |
|
| 129 |
# Boost score for medical-specific content
|
| 130 |
title = result.get('title', '').lower()
|
|
@@ -243,8 +243,8 @@ class MedicalReranker:
|
|
| 243 |
if query.lower() in title:
|
| 244 |
title_relevance = min(title_relevance + 0.3, 1.0)
|
| 245 |
|
| 246 |
-
# Update composite score - be more lenient
|
| 247 |
-
domain_score = result.get('domain_score', 0.
|
| 248 |
result['title_relevance'] = title_relevance
|
| 249 |
result['composite_score'] = (domain_score * 0.3) + (title_relevance * 0.7) # Favor title relevance
|
| 250 |
|
|
|
|
| 42 |
'generic_health_site': 0.30
|
| 43 |
}
|
| 44 |
|
| 45 |
+
# Irrelevant content patterns - more specific to avoid false positives
|
| 46 |
self.irrelevant_patterns = [
|
| 47 |
r'quiz|test|assessment|survey',
|
| 48 |
r'homepage|main page|index',
|
|
|
|
| 54 |
r'healthy-sleep/quiz', # Sleep quiz example
|
| 55 |
]
|
| 56 |
|
| 57 |
+
def rerank_results(self, query: str, results: List[Dict], min_score: float = 0.05) -> List[Dict]:
|
| 58 |
"""Rerank search results based on medical relevance"""
|
| 59 |
if not results:
|
| 60 |
return []
|
|
|
|
| 105 |
logger.debug(f"Filtered irrelevant result: {url}")
|
| 106 |
continue
|
| 107 |
|
| 108 |
+
# Only skip if we have content and it's extremely short
|
| 109 |
# Don't filter based on content length if no content is available yet
|
| 110 |
+
if content and len(content) < 20: # Much more lenient - only filter very short content
|
| 111 |
+
logger.debug(f"Filtered result with very short content: {url}")
|
| 112 |
continue
|
| 113 |
|
| 114 |
filtered.append(result)
|
|
|
|
| 123 |
url = result.get('url', '')
|
| 124 |
domain = self._extract_domain(url)
|
| 125 |
|
| 126 |
+
# Get domain score - be much more lenient with unknown domains
|
| 127 |
+
domain_score = self.domain_scores.get(domain, 0.70) # Much higher default score
|
| 128 |
|
| 129 |
# Boost score for medical-specific content
|
| 130 |
title = result.get('title', '').lower()
|
|
|
|
| 243 |
if query.lower() in title:
|
| 244 |
title_relevance = min(title_relevance + 0.3, 1.0)
|
| 245 |
|
| 246 |
+
# Update composite score - be much more lenient
|
| 247 |
+
domain_score = result.get('domain_score', 0.7) # Much higher default
|
| 248 |
result['title_relevance'] = title_relevance
|
| 249 |
result['composite_score'] = (domain_score * 0.3) + (title_relevance * 0.7) # Favor title relevance
|
| 250 |
|