Spaces:
Running
Running
feat: implement 'latest=3 days' logic and Jina content date extraction
Browse files
src/core/orchestrator/query_orchestrator.py
CHANGED
|
@@ -67,7 +67,7 @@ class QueryOrchestrator:
|
|
| 67 |
TEMPORAL_KEYWORDS = [
|
| 68 |
"today", "now", "latest", "breaking", "just", "current",
|
| 69 |
"this morning", "this afternoon", "this evening", "tonight",
|
| 70 |
-
"yesterday", "recent", "recently", "new", "fresh"
|
| 71 |
]
|
| 72 |
|
| 73 |
# Historical keywords that indicate DB-only search
|
|
|
|
| 67 |
TEMPORAL_KEYWORDS = [
|
| 68 |
"today", "now", "latest", "breaking", "just", "current",
|
| 69 |
"this morning", "this afternoon", "this evening", "tonight",
|
| 70 |
+
"yesterday", "recent", "recently", "new", "fresh", "update"
|
| 71 |
]
|
| 72 |
|
| 73 |
# Historical keywords that indicate DB-only search
|
src/core/use_cases/rag_chat_use_case.py
CHANGED
|
@@ -162,6 +162,19 @@ Document:
|
|
| 162 |
|
| 163 |
return best_sentence.strip()[:250]
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
def _extract_intents_and_translate(self, query: str) -> Dict[str, Any]:
|
| 166 |
"""
|
| 167 |
Single LLM call: query understanding + multilingual translation.
|
|
@@ -272,8 +285,10 @@ JSON:"""
|
|
| 272 |
query_lower = query.lower()
|
| 273 |
|
| 274 |
# Temporal fallback - more aggressive keywords
|
| 275 |
-
if any(w in query_lower for w in ("today", "tonight", "now", "breaking", "
|
| 276 |
result["days_back"] = 1
|
|
|
|
|
|
|
| 277 |
elif "yesterday" in query_lower:
|
| 278 |
result["days_back"] = 2
|
| 279 |
elif any(w in query_lower for w in ("this week", "recently", "past few days", "days ago")):
|
|
@@ -331,16 +346,26 @@ JSON:"""
|
|
| 331 |
for hit in results:
|
| 332 |
score_multiplier = 1.0
|
| 333 |
pub_at = hit.metadata.get("published_at")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
if pub_at:
|
| 335 |
try:
|
| 336 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
days_old = (now.replace(tzinfo=None) - pub_date.replace(tzinfo=None)).days
|
| 338 |
# Aggressive news freshness multiplier
|
| 339 |
-
if days_old <= 0: score_multiplier = 1.
|
| 340 |
-
elif days_old =
|
| 341 |
elif days_old < 7: score_multiplier = 0.85 # Slight penalty this week
|
| 342 |
-
elif days_old < 30: score_multiplier = 0.
|
| 343 |
-
else: score_multiplier = 0.
|
| 344 |
except:
|
| 345 |
pass
|
| 346 |
|
|
|
|
| 162 |
|
| 163 |
return best_sentence.strip()[:250]
|
| 164 |
|
| 165 |
+
def _try_extract_date_from_content(self, content: str) -> Optional[str]:
|
| 166 |
+
"""
|
| 167 |
+
Attempts to find a date string (YYYY-MM-DD) in the first 500 chars of content.
|
| 168 |
+
Useful for Jina-extracted content where metadata might be missing.
|
| 169 |
+
"""
|
| 170 |
+
import re
|
| 171 |
+
# Look for YYYY-MM-DD or Month DD, YYYY
|
| 172 |
+
iso_match = re.search(r'(\d{4}-\d{2}-\d{2})', content[:500])
|
| 173 |
+
if iso_match:
|
| 174 |
+
return iso_match.group(1)
|
| 175 |
+
|
| 176 |
+
return None
|
| 177 |
+
|
| 178 |
def _extract_intents_and_translate(self, query: str) -> Dict[str, Any]:
|
| 179 |
"""
|
| 180 |
Single LLM call: query understanding + multilingual translation.
|
|
|
|
| 285 |
query_lower = query.lower()
|
| 286 |
|
| 287 |
# Temporal fallback - more aggressive keywords
|
| 288 |
+
if any(w in query_lower for w in ("today", "tonight", "now", "breaking", "just")):
|
| 289 |
result["days_back"] = 1
|
| 290 |
+
elif any(w in query_lower for w in ("latest", "recently", "current", "update")):
|
| 291 |
+
result["days_back"] = 3 # "Latest" = 3 days as requested
|
| 292 |
elif "yesterday" in query_lower:
|
| 293 |
result["days_back"] = 2
|
| 294 |
elif any(w in query_lower for w in ("this week", "recently", "past few days", "days ago")):
|
|
|
|
| 346 |
for hit in results:
|
| 347 |
score_multiplier = 1.0
|
| 348 |
pub_at = hit.metadata.get("published_at")
|
| 349 |
+
|
| 350 |
+
# ENHANCEMENT: If no date in metadata, try to extract from content (Jina fallback)
|
| 351 |
+
if not pub_at:
|
| 352 |
+
pub_at = self._try_extract_date_from_content(hit.content)
|
| 353 |
+
|
| 354 |
if pub_at:
|
| 355 |
try:
|
| 356 |
+
# Handle various date formats (ISO, or simple YYYY-MM-DD)
|
| 357 |
+
if "T" in pub_at:
|
| 358 |
+
pub_date = datetime.fromisoformat(pub_at.replace("Z", "+00:00"))
|
| 359 |
+
else:
|
| 360 |
+
pub_date = datetime.strptime(pub_at[:10], "%Y-%m-%d")
|
| 361 |
+
|
| 362 |
days_old = (now.replace(tzinfo=None) - pub_date.replace(tzinfo=None)).days
|
| 363 |
# Aggressive news freshness multiplier
|
| 364 |
+
if days_old <= 0: score_multiplier = 1.1 # Even more boost for today
|
| 365 |
+
elif days_old <= 3: score_multiplier = 1.0 # Full score for "latest" (3 days)
|
| 366 |
elif days_old < 7: score_multiplier = 0.85 # Slight penalty this week
|
| 367 |
+
elif days_old < 30: score_multiplier = 0.5 # Heavy penalty this month
|
| 368 |
+
else: score_multiplier = 0.2 # Archive data
|
| 369 |
except:
|
| 370 |
pass
|
| 371 |
|