Peterase committed on
Commit
470548c
·
1 Parent(s): ad5faef

feat: implement 'latest=3 days' logic and Jina content date extraction

Browse files
src/core/orchestrator/query_orchestrator.py CHANGED
@@ -67,7 +67,7 @@ class QueryOrchestrator:
67
  TEMPORAL_KEYWORDS = [
68
  "today", "now", "latest", "breaking", "just", "current",
69
  "this morning", "this afternoon", "this evening", "tonight",
70
- "yesterday", "recent", "recently", "new", "fresh"
71
  ]
72
 
73
  # Historical keywords that indicate DB-only search
 
67
  TEMPORAL_KEYWORDS = [
68
  "today", "now", "latest", "breaking", "just", "current",
69
  "this morning", "this afternoon", "this evening", "tonight",
70
+ "yesterday", "recent", "recently", "new", "fresh", "update"
71
  ]
72
 
73
  # Historical keywords that indicate DB-only search
src/core/use_cases/rag_chat_use_case.py CHANGED
@@ -162,6 +162,19 @@ Document:
162
 
163
  return best_sentence.strip()[:250]
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  def _extract_intents_and_translate(self, query: str) -> Dict[str, Any]:
166
  """
167
  Single LLM call: query understanding + multilingual translation.
@@ -272,8 +285,10 @@ JSON:"""
272
  query_lower = query.lower()
273
 
274
  # Temporal fallback - more aggressive keywords
275
- if any(w in query_lower for w in ("today", "tonight", "now", "breaking", "latest", "current", "update")):
276
  result["days_back"] = 1
 
 
277
  elif "yesterday" in query_lower:
278
  result["days_back"] = 2
279
  elif any(w in query_lower for w in ("this week", "recently", "past few days", "days ago")):
@@ -331,16 +346,26 @@ JSON:"""
331
  for hit in results:
332
  score_multiplier = 1.0
333
  pub_at = hit.metadata.get("published_at")
 
 
 
 
 
334
  if pub_at:
335
  try:
336
- pub_date = datetime.fromisoformat(pub_at.replace("Z", "+00:00"))
 
 
 
 
 
337
  days_old = (now.replace(tzinfo=None) - pub_date.replace(tzinfo=None)).days
338
  # Aggressive news freshness multiplier
339
- if days_old <= 0: score_multiplier = 1.05 # Boost today
340
- elif days_old == 1: score_multiplier = 1.0 # Normal yesterday
341
  elif days_old < 7: score_multiplier = 0.85 # Slight penalty this week
342
- elif days_old < 30: score_multiplier = 0.6 # Heavy penalty this month
343
- else: score_multiplier = 0.3 # Archive data
344
  except:
345
  pass
346
 
 
162
 
163
  return best_sentence.strip()[:250]
164
 
165
+ def _try_extract_date_from_content(self, content: str) -> Optional[str]:
166
+ """
167
+ Attempts to find a date string (YYYY-MM-DD) in the first 500 chars of content.
168
+ Useful for Jina-extracted content where metadata might be missing.
169
+ """
170
+ import re
171
+ # Look for YYYY-MM-DD or Month DD, YYYY
172
+ iso_match = re.search(r'(\d{4}-\d{2}-\d{2})', content[:500])
173
+ if iso_match:
174
+ return iso_match.group(1)
175
+
176
+ return None
177
+
178
  def _extract_intents_and_translate(self, query: str) -> Dict[str, Any]:
179
  """
180
  Single LLM call: query understanding + multilingual translation.
 
285
  query_lower = query.lower()
286
 
287
  # Temporal fallback - more aggressive keywords
288
+ if any(w in query_lower for w in ("today", "tonight", "now", "breaking", "just")):
289
  result["days_back"] = 1
290
+ elif any(w in query_lower for w in ("latest", "recently", "current", "update")):
291
+ result["days_back"] = 3 # "Latest" = 3 days as requested
292
  elif "yesterday" in query_lower:
293
  result["days_back"] = 2
294
  elif any(w in query_lower for w in ("this week", "recently", "past few days", "days ago")):
 
346
  for hit in results:
347
  score_multiplier = 1.0
348
  pub_at = hit.metadata.get("published_at")
349
+
350
+ # ENHANCEMENT: If no date in metadata, try to extract from content (Jina fallback)
351
+ if not pub_at:
352
+ pub_at = self._try_extract_date_from_content(hit.content)
353
+
354
  if pub_at:
355
  try:
356
+ # Handle various date formats (ISO, or simple YYYY-MM-DD)
357
+ if "T" in pub_at:
358
+ pub_date = datetime.fromisoformat(pub_at.replace("Z", "+00:00"))
359
+ else:
360
+ pub_date = datetime.strptime(pub_at[:10], "%Y-%m-%d")
361
+
362
  days_old = (now.replace(tzinfo=None) - pub_date.replace(tzinfo=None)).days
363
  # Aggressive news freshness multiplier
364
+ if days_old <= 0: score_multiplier = 1.1 # Even more boost for today
365
+ elif days_old <= 3: score_multiplier = 1.0 # Full score for "latest" (3 days)
366
  elif days_old < 7: score_multiplier = 0.85 # Slight penalty this week
367
+ elif days_old < 30: score_multiplier = 0.5 # Heavy penalty this month
368
+ else: score_multiplier = 0.2 # Archive data
369
  except:
370
  pass
371