Peterase committed on
Commit
35f6d98
·
1 Parent(s): 89eda61

feat: 100% live search-only top stories with multi-query parallelization

Browse files
Files changed (1) hide show
  1. src/api/routes/top_stories.py +47 -61
src/api/routes/top_stories.py CHANGED
@@ -14,9 +14,8 @@ from fastapi import APIRouter, Query, Depends
14
  from pydantic import BaseModel
15
  from datetime import datetime
16
 
17
- from src.api.dependencies import get_cache_port, get_live_search_port, get_vector_store_port
18
  from src.core.ports.cache_port import CachePort
19
- from src.core.ports.vector_store_port import VectorStorePort
20
  from src.infrastructure.adapters.duckduckgo_adapter import DuckDuckGoAdapter
21
 
22
  try:
@@ -232,35 +231,52 @@ async def fetch_live_stories(n: int = 6, adapter: DuckDuckGoAdapter = None) -> L
232
  if not adapter:
233
  return []
234
 
235
- # Focus on Ethiopia news
236
- query = "Ethiopia breaking news latest"
237
- results = await adapter.search(query)
 
 
 
 
 
 
 
 
 
238
 
239
- # If very few results, try a broader regional search
240
- if len(results) < 3:
241
- logger.info("Live search yielded few results, trying broader regional query")
242
- results += await adapter.search("East Africa breaking news today")
243
 
 
244
  stories = []
245
  seen_urls = set()
246
- for r in results:
247
- url = r.get("url", "#")
248
- if url in seen_urls: continue
249
- seen_urls.add(url)
250
-
251
- stories.append(TopStory(
252
- title=r.get("title", "Untitled"),
253
- url=url,
254
- source=r.get("source", "Live News"),
255
- published_at=r.get("published_at", datetime.utcnow().isoformat()),
256
- category="BREAKING",
257
- excerpt=r.get("content", "")[:150],
258
- image_url=r.get("image_url") or r.get("thumbnail"),
259
- origin="live",
260
- ))
261
 
262
- logger.info(f"Live top stories: fetched {len(stories)}")
263
- return stories
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  except Exception as e:
265
  logger.error(f"Live top stories error: {e}")
266
  return []
@@ -272,8 +288,7 @@ async def fetch_live_stories(n: int = 6, adapter: DuckDuckGoAdapter = None) -> L
272
  async def get_top_stories(
273
  force_refresh: bool = Query(default=False, description="Force cache refresh"),
274
  cache: CachePort = Depends(get_cache_port),
275
- adapter: DuckDuckGoAdapter = Depends(get_live_search_port),
276
- vector_store: VectorStorePort = Depends(get_vector_store_port)
277
  ):
278
  """
279
  Get top 6 news stories for the landing page.
@@ -314,41 +329,12 @@ async def get_top_stories(
314
  seen_titles.add(title_key)
315
  all_stories.append(story)
316
 
317
- # ── FALLBACK: If still less than 6, pull from Vector DB (Qdrant) ──────────
318
- if len(all_stories) < 6:
319
- needed = 6 - len(all_stories)
320
- logger.info(f"Top stories fallback: only have {len(all_stories)}, pulling {needed} from Vector DB")
321
- try:
322
- # Try 30 days back to be safe
323
- db_res = vector_store.browse(limit=needed * 4, days_back=30)
324
- db_articles = db_res.get("articles", [])
325
- logger.info(f"Vector DB returned {len(db_articles)} potential fallback articles")
326
-
327
- for p in db_articles:
328
- payload = p.payload or {}
329
- title = payload.get("title") or payload.get("content", "")[:100]
330
- url = payload.get("url") or "#"
331
-
332
- title_key = title.lower().strip()[:60]
333
- if title_key not in seen_titles and len(all_stories) < 6:
334
- seen_titles.add(title_key)
335
- all_stories.append(TopStory(
336
- title=title,
337
- url=url,
338
- source=payload.get("source", "ARKI Intelligence"),
339
- published_at=payload.get("published_at", datetime.utcnow().isoformat()),
340
- category="UPDATE",
341
- excerpt=payload.get("content", "")[:150],
342
- image_url=payload.get("image_url") or payload.get("thumbnail"),
343
- origin="db"
344
- ))
345
- except Exception as e:
346
- logger.error(f"Top stories DB fallback failed: {e}")
347
-
348
- logger.info(f"Final top stories count: {len(all_stories)}")
349
- now_iso = datetime.utcnow().isoformat()
350
  final_stories = all_stories[:6]
351
 
 
 
 
352
  payload = {
353
  "stories": [s.dict() for s in final_stories],
354
  "fetched_at": now_iso,
 
14
  from pydantic import BaseModel
15
  from datetime import datetime
16
 
17
+ from src.api.dependencies import get_cache_port, get_live_search_port
18
  from src.core.ports.cache_port import CachePort
 
19
  from src.infrastructure.adapters.duckduckgo_adapter import DuckDuckGoAdapter
20
 
21
  try:
 
231
  if not adapter:
232
  return []
233
 
234
+ async def fetch_live_stories(n: int = 6, adapter: DuckDuckGoAdapter = None) -> List[TopStory]:
235
+ """Fetch N live stories from DuckDuckGo using multiple queries to ensure high yield"""
236
+ if not adapter:
237
+ return []
238
+
239
+ try:
240
+ # Run multiple queries in parallel for better coverage
241
+ queries = [
242
+ "Ethiopia news breaking today",
243
+ "Addis Ababa latest updates",
244
+ "Ethiopia world news headlines"
245
+ ]
246
 
247
+ search_tasks = [adapter.search(q) for q in queries]
248
+ all_results_lists = await asyncio.gather(*search_tasks)
 
 
249
 
250
+ # Flatten and deduplicate
251
  stories = []
252
  seen_urls = set()
253
+ seen_titles = set()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
255
+ for results in all_results_lists:
256
+ for r in results:
257
+ url = r.get("url", "#")
258
+ title = r.get("title", "Untitled")
259
+ title_key = title.lower().strip()[:60]
260
+
261
+ if url in seen_urls or title_key in seen_titles:
262
+ continue
263
+
264
+ seen_urls.add(url)
265
+ seen_titles.add(title_key)
266
+
267
+ stories.append(TopStory(
268
+ title=title,
269
+ url=url,
270
+ source=r.get("source", "Live News"),
271
+ published_at=r.get("published_at", datetime.utcnow().isoformat()),
272
+ category="BREAKING",
273
+ excerpt=r.get("content", "")[:150],
274
+ image_url=r.get("image_url") or r.get("thumbnail"),
275
+ origin="live",
276
+ ))
277
+
278
+ logger.info(f"Multi-query live search: collected {len(stories)} unique stories")
279
+ return stories[:n]
280
  except Exception as e:
281
  logger.error(f"Live top stories error: {e}")
282
  return []
 
288
  async def get_top_stories(
289
  force_refresh: bool = Query(default=False, description="Force cache refresh"),
290
  cache: CachePort = Depends(get_cache_port),
291
+ adapter: DuckDuckGoAdapter = Depends(get_live_search_port)
 
292
  ):
293
  """
294
  Get top 6 news stories for the landing page.
 
329
  seen_titles.add(title_key)
330
  all_stories.append(story)
331
 
332
+ # Ensure exactly 6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  final_stories = all_stories[:6]
334
 
335
+ logger.info(f"Final top stories count: {len(final_stories)}")
336
+ now_iso = datetime.utcnow().isoformat()
337
+
338
  payload = {
339
  "stories": [s.dict() for s in final_stories],
340
  "fetched_at": now_iso,