Peterase committed on
Commit
2da1e29
·
1 Parent(s): 6246bba

fix(news): Add time-based filtering and cache-control to /latest endpoint

Browse files

PROBLEM:
- /api/v1/news/latest endpoint was returning old cached articles
- No time-based filtering on browse queries
- Browser/CDN caching causing stale data display
- Qdrant scroll returning results in insertion order, not by date

SOLUTION:
1. Added days_back parameter to browse() method (default: None = no time filter; the /latest endpoint passes 7 days by default)
2. Implemented Qdrant order_by for server-side sorting by published_at DESC
3. Added fallback for older Qdrant versions without order_by support
4. Added Cache-Control headers to prevent browser/CDN caching
5. Updated VectorStorePort interface to support time-based filtering

CHANGES:
- src/infrastructure/adapters/qdrant_adapter.py:
* Added days_back parameter to browse() method
* Added datetime range filter using published_at field
* Implemented order_by with published_at DESC for server-side sorting
* Added fallback logic for Qdrant versions without order_by support
* Enhanced error handling with retry logic

- src/core/ports/vector_store_port.py:
* Updated browse() interface to include days_back parameter
* Added documentation for time-based filtering

- src/api/routes/news.py:
* Added days_back query parameter (default: 7, range: 1-30)
* Added Cache-Control, Pragma, and Expires headers to prevent caching
* Removed redundant client-side sorting (now done by Qdrant)
* Imported Response from FastAPI for header manipulation

IMPACT:
- Fresh news articles always displayed (last 7 days by default)
- No browser/CDN caching of news results
- Better performance with server-side sorting
- Configurable time window (1-30 days)
- Backward compatible with older Qdrant versions

TESTING:
- Endpoint: GET /api/v1/news/latest?limit=6&days_back=7
- Expected: Articles from last 7 days, sorted newest first
- Headers: Cache-Control: no-cache, no-store, must-revalidate, max-age=0; Pragma: no-cache; Expires: 0

Version: v2.2

src/api/routes/news.py CHANGED
@@ -1,4 +1,4 @@
1
- from fastapi import APIRouter, Query, HTTPException, Depends
2
  from typing import Optional
3
  from src.core.ports.vector_store_port import VectorStorePort
4
  from src.core.ports.embedder_port import EmbedderPort
@@ -36,24 +36,34 @@ def _dict_to_article(payload: dict, score: float = None) -> NewsArticle:
36
 
37
  @router.get("/latest", response_model=BrowseResponse)
38
  def get_latest_news(
 
39
  limit: int = Query(10, le=50),
40
  source: Optional[str] = None,
41
  language: Optional[str] = None,
 
42
  vector_store: VectorStorePort = Depends(get_vector_store_port)
43
  ):
44
- """Get latest news articles sorted by publication date"""
 
 
 
 
 
45
  try:
46
- result = vector_store.browse(limit=limit, offset=0, source=source, language=language)
 
 
 
 
 
 
47
 
48
- # Convert Qdrant points to articles and sort by published_at
49
  articles = []
50
  for p in result["articles"]:
51
  article = _dict_to_article(p.payload or {}, getattr(p, "score", None))
52
  articles.append(article)
53
 
54
- # Sort by published_at descending (latest first)
55
- articles.sort(key=lambda x: x.published_at or "", reverse=True)
56
-
57
  # Ensure next_offset is an integer or None
58
  next_offset = result.get("next_offset")
59
  if next_offset is not None and not isinstance(next_offset, int):
 
1
+ from fastapi import APIRouter, Query, HTTPException, Depends, Response
2
  from typing import Optional
3
  from src.core.ports.vector_store_port import VectorStorePort
4
  from src.core.ports.embedder_port import EmbedderPort
 
36
 
37
  @router.get("/latest", response_model=BrowseResponse)
38
  def get_latest_news(
39
+ response: Response,
40
  limit: int = Query(10, le=50),
41
  source: Optional[str] = None,
42
  language: Optional[str] = None,
43
+ days_back: int = Query(7, ge=1, le=30, description="Number of days to look back (default: 7)"),
44
  vector_store: VectorStorePort = Depends(get_vector_store_port)
45
  ):
46
+ """Get latest news articles sorted by publication date (default: last 7 days)"""
47
+ # Prevent caching of news results - always fetch fresh data
48
+ response.headers["Cache-Control"] = "no-cache, no-store, must-revalidate, max-age=0"
49
+ response.headers["Pragma"] = "no-cache"
50
+ response.headers["Expires"] = "0"
51
+
52
  try:
53
+ result = vector_store.browse(
54
+ limit=limit,
55
+ offset=0,
56
+ source=source,
57
+ language=language,
58
+ days_back=days_back
59
+ )
60
 
61
+ # Convert Qdrant points to articles (already sorted by Qdrant)
62
  articles = []
63
  for p in result["articles"]:
64
  article = _dict_to_article(p.payload or {}, getattr(p, "score", None))
65
  articles.append(article)
66
 
 
 
 
67
  # Ensure next_offset is an integer or None
68
  next_offset = result.get("next_offset")
69
  if next_offset is not None and not isinstance(next_offset, int):
src/core/ports/vector_store_port.py CHANGED
@@ -42,8 +42,9 @@ class VectorStorePort(abc.ABC):
42
  pass
43
 
44
  @abc.abstractmethod
45
- def browse(self, limit: int = 20, offset: int = 0, source: Optional[str] = None, language: Optional[str] = None) -> Dict[str, Any]:
46
  """
47
  Browses the vector collection with pagination and filtering.
 
48
  """
49
  pass
 
42
  pass
43
 
44
  @abc.abstractmethod
45
+ def browse(self, limit: int = 20, offset: int = 0, source: Optional[str] = None, language: Optional[str] = None, days_back: Optional[int] = None) -> Dict[str, Any]:
46
  """
47
  Browses the vector collection with pagination and filtering.
48
+ days_back: Optional filter to only return articles from the last N days (default: None = all articles)
49
  """
50
  pass
src/infrastructure/adapters/qdrant_adapter.py CHANGED
@@ -316,6 +316,7 @@ class QdrantAdapter(VectorStorePort):
316
  offset: int = 0,
317
  source: Optional[str] = None,
318
  language: Optional[str] = None,
 
319
  ) -> Dict[str, Any]:
320
  if not self.client:
321
  return {"articles": [], "next_offset": None}
@@ -329,6 +330,15 @@ class QdrantAdapter(VectorStorePort):
329
  must.append(models.FieldCondition(
330
  key="language", match=models.MatchValue(value=language)
331
  ))
 
 
 
 
 
 
 
 
 
332
  filter_obj = models.Filter(must=must) if must else None
333
 
334
  try:
@@ -340,6 +350,10 @@ class QdrantAdapter(VectorStorePort):
340
  offset=offset,
341
  with_payload=True,
342
  with_vectors=False,
 
 
 
 
343
  )
344
 
345
  # Keep only the lowest chunk_index per doc_id (first chunk of each article)
@@ -352,6 +366,7 @@ class QdrantAdapter(VectorStorePort):
352
  seen_docs[doc_id] = (point, chunk_index)
353
 
354
  deduped = [v[0] for v in seen_docs.values()]
 
355
  deduped.sort(
356
  key=lambda p: (p.payload or {}).get("published_at") or "",
357
  reverse=True,
@@ -359,5 +374,33 @@ class QdrantAdapter(VectorStorePort):
359
  return {"articles": deduped[:limit], "next_offset": next_page_offset}
360
 
361
  except Exception as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  logger.error(f"Error browsing Qdrant: {e}")
363
  return {"articles": [], "next_offset": None}
 
316
  offset: int = 0,
317
  source: Optional[str] = None,
318
  language: Optional[str] = None,
319
+ days_back: Optional[int] = None,
320
  ) -> Dict[str, Any]:
321
  if not self.client:
322
  return {"articles": [], "next_offset": None}
 
330
  must.append(models.FieldCondition(
331
  key="language", match=models.MatchValue(value=language)
332
  ))
333
+
334
+ # Add time-based filter for fresh results, only when days_back is given (None = no time filter)
335
+ if days_back is not None:
336
+ min_date = datetime.utcnow() - timedelta(days=days_back)
337
+ must.append(models.FieldCondition(
338
+ key="published_at",
339
+ range=models.DatetimeRange(gte=min_date)
340
+ ))
341
+
342
  filter_obj = models.Filter(must=must) if must else None
343
 
344
  try:
 
350
  offset=offset,
351
  with_payload=True,
352
  with_vectors=False,
353
+ order_by=models.OrderBy(
354
+ key="published_at",
355
+ direction=models.Direction.DESC
356
+ )
357
  )
358
 
359
  # Keep only the lowest chunk_index per doc_id (first chunk of each article)
 
366
  seen_docs[doc_id] = (point, chunk_index)
367
 
368
  deduped = [v[0] for v in seen_docs.values()]
369
+ # Results are already sorted by Qdrant, but re-sort to ensure consistency after dedup
370
  deduped.sort(
371
  key=lambda p: (p.payload or {}).get("published_at") or "",
372
  reverse=True,
 
374
  return {"articles": deduped[:limit], "next_offset": next_page_offset}
375
 
376
  except Exception as e:
377
+ # If order_by fails (older Qdrant version), retry without it
378
+ if "order_by" in str(e).lower():
379
+ logger.warning(f"Qdrant order_by not supported, falling back to client-side sort: {e}")
380
+ try:
381
+ results, next_page_offset = self.client.scroll(
382
+ collection_name=settings.QDRANT_COLLECTION,
383
+ scroll_filter=filter_obj,
384
+ limit=limit * 8,
385
+ offset=offset,
386
+ with_payload=True,
387
+ with_vectors=False,
388
+ )
389
+ seen_docs: dict = {}
390
+ for point in results:
391
+ payload = point.payload or {}
392
+ doc_id = payload.get("doc_id", point.id)
393
+ chunk_index = payload.get("chunk_index", 0)
394
+ if doc_id not in seen_docs or chunk_index < seen_docs[doc_id][1]:
395
+ seen_docs[doc_id] = (point, chunk_index)
396
+ deduped = [v[0] for v in seen_docs.values()]
397
+ deduped.sort(
398
+ key=lambda p: (p.payload or {}).get("published_at") or "",
399
+ reverse=True,
400
+ )
401
+ return {"articles": deduped[:limit], "next_offset": next_page_offset}
402
+ except Exception as e2:
403
+ logger.error(f"Error browsing Qdrant (fallback): {e2}")
404
+ return {"articles": [], "next_offset": None}
405
  logger.error(f"Error browsing Qdrant: {e}")
406
  return {"articles": [], "next_offset": None}