Spaces:
Running
Running
feat: 100% live search-only top stories with multi-query parallelization
Browse files- src/api/routes/top_stories.py +47 -61
src/api/routes/top_stories.py
CHANGED
|
@@ -14,9 +14,8 @@ from fastapi import APIRouter, Query, Depends
|
|
| 14 |
from pydantic import BaseModel
|
| 15 |
from datetime import datetime
|
| 16 |
|
| 17 |
-
from src.api.dependencies import get_cache_port, get_live_search_port
|
| 18 |
from src.core.ports.cache_port import CachePort
|
| 19 |
-
from src.core.ports.vector_store_port import VectorStorePort
|
| 20 |
from src.infrastructure.adapters.duckduckgo_adapter import DuckDuckGoAdapter
|
| 21 |
|
| 22 |
try:
|
|
@@ -232,35 +231,52 @@ async def fetch_live_stories(n: int = 6, adapter: DuckDuckGoAdapter = None) -> L
|
|
| 232 |
if not adapter:
|
| 233 |
return []
|
| 234 |
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
logger.info("Live search yielded few results, trying broader regional query")
|
| 242 |
-
results += await adapter.search("East Africa breaking news today")
|
| 243 |
|
|
|
|
| 244 |
stories = []
|
| 245 |
seen_urls = set()
|
| 246 |
-
|
| 247 |
-
url = r.get("url", "#")
|
| 248 |
-
if url in seen_urls: continue
|
| 249 |
-
seen_urls.add(url)
|
| 250 |
-
|
| 251 |
-
stories.append(TopStory(
|
| 252 |
-
title=r.get("title", "Untitled"),
|
| 253 |
-
url=url,
|
| 254 |
-
source=r.get("source", "Live News"),
|
| 255 |
-
published_at=r.get("published_at", datetime.utcnow().isoformat()),
|
| 256 |
-
category="BREAKING",
|
| 257 |
-
excerpt=r.get("content", "")[:150],
|
| 258 |
-
image_url=r.get("image_url") or r.get("thumbnail"),
|
| 259 |
-
origin="live",
|
| 260 |
-
))
|
| 261 |
|
| 262 |
-
|
| 263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
except Exception as e:
|
| 265 |
logger.error(f"Live top stories error: {e}")
|
| 266 |
return []
|
|
@@ -272,8 +288,7 @@ async def fetch_live_stories(n: int = 6, adapter: DuckDuckGoAdapter = None) -> L
|
|
| 272 |
async def get_top_stories(
|
| 273 |
force_refresh: bool = Query(default=False, description="Force cache refresh"),
|
| 274 |
cache: CachePort = Depends(get_cache_port),
|
| 275 |
-
adapter: DuckDuckGoAdapter = Depends(get_live_search_port)
|
| 276 |
-
vector_store: VectorStorePort = Depends(get_vector_store_port)
|
| 277 |
):
|
| 278 |
"""
|
| 279 |
Get top 6 news stories for the landing page.
|
|
@@ -314,41 +329,12 @@ async def get_top_stories(
|
|
| 314 |
seen_titles.add(title_key)
|
| 315 |
all_stories.append(story)
|
| 316 |
|
| 317 |
-
#
|
| 318 |
-
if len(all_stories) < 6:
|
| 319 |
-
needed = 6 - len(all_stories)
|
| 320 |
-
logger.info(f"Top stories fallback: only have {len(all_stories)}, pulling {needed} from Vector DB")
|
| 321 |
-
try:
|
| 322 |
-
# Try 30 days back to be safe
|
| 323 |
-
db_res = vector_store.browse(limit=needed * 4, days_back=30)
|
| 324 |
-
db_articles = db_res.get("articles", [])
|
| 325 |
-
logger.info(f"Vector DB returned {len(db_articles)} potential fallback articles")
|
| 326 |
-
|
| 327 |
-
for p in db_articles:
|
| 328 |
-
payload = p.payload or {}
|
| 329 |
-
title = payload.get("title") or payload.get("content", "")[:100]
|
| 330 |
-
url = payload.get("url") or "#"
|
| 331 |
-
|
| 332 |
-
title_key = title.lower().strip()[:60]
|
| 333 |
-
if title_key not in seen_titles and len(all_stories) < 6:
|
| 334 |
-
seen_titles.add(title_key)
|
| 335 |
-
all_stories.append(TopStory(
|
| 336 |
-
title=title,
|
| 337 |
-
url=url,
|
| 338 |
-
source=payload.get("source", "ARKI Intelligence"),
|
| 339 |
-
published_at=payload.get("published_at", datetime.utcnow().isoformat()),
|
| 340 |
-
category="UPDATE",
|
| 341 |
-
excerpt=payload.get("content", "")[:150],
|
| 342 |
-
image_url=payload.get("image_url") or payload.get("thumbnail"),
|
| 343 |
-
origin="db"
|
| 344 |
-
))
|
| 345 |
-
except Exception as e:
|
| 346 |
-
logger.error(f"Top stories DB fallback failed: {e}")
|
| 347 |
-
|
| 348 |
-
logger.info(f"Final top stories count: {len(all_stories)}")
|
| 349 |
-
now_iso = datetime.utcnow().isoformat()
|
| 350 |
final_stories = all_stories[:6]
|
| 351 |
|
|
|
|
|
|
|
|
|
|
| 352 |
payload = {
|
| 353 |
"stories": [s.dict() for s in final_stories],
|
| 354 |
"fetched_at": now_iso,
|
|
|
|
| 14 |
from pydantic import BaseModel
|
| 15 |
from datetime import datetime
|
| 16 |
|
| 17 |
+
from src.api.dependencies import get_cache_port, get_live_search_port
|
| 18 |
from src.core.ports.cache_port import CachePort
|
|
|
|
| 19 |
from src.infrastructure.adapters.duckduckgo_adapter import DuckDuckGoAdapter
|
| 20 |
|
| 21 |
try:
|
|
|
|
| 231 |
if not adapter:
|
| 232 |
return []
|
| 233 |
|
| 234 |
+
async def fetch_live_stories(n: int = 6, adapter: DuckDuckGoAdapter = None) -> List[TopStory]:
    """Fetch up to ``n`` live stories from DuckDuckGo using several queries.

    The queries are awaited together and their results are merged in query
    order, dropping duplicates by URL and by normalised title (first 60
    characters, lower-cased).

    Args:
        n: Maximum number of stories to return.
        adapter: Live-search adapter whose ``search(query)`` is awaited and
            yields mapping-like results read with ``.get``. When it is
            missing no search is attempted.

    Returns:
        At most ``n`` unique stories. An empty list when no adapter is given,
        when ``n`` is not positive, or when an unexpected error occurs.
    """
    # A non-positive n can never yield stories; a negative slice bound would
    # otherwise drop items from the end instead of capping the list.
    if not adapter or n <= 0:
        return []

    try:
        # Run multiple queries together for better coverage
        queries = [
            "Ethiopia news breaking today",
            "Addis Ababa latest updates",
            "Ethiopia world news headlines",
        ]

        # return_exceptions=True keeps one failing query from discarding the
        # results of the queries that succeeded.
        outcomes = await asyncio.gather(
            *(adapter.search(q) for q in queries),
            return_exceptions=True,
        )

        # Flatten and deduplicate
        stories = []
        seen_urls = set()
        seen_titles = set()

        for query, results in zip(queries, outcomes):
            if isinstance(results, BaseException):
                logger.warning(f"Live search query failed ({query!r}): {results}")
                continue

            for r in results or []:
                # `or` (not a .get default) also covers keys present with None.
                url = r.get("url") or ""
                title = r.get("title") or "Untitled"
                title_key = title.lower().strip()[:60]

                # Only real URLs take part in URL dedup; the "#" placeholder
                # would otherwise collapse every URL-less result into one.
                if (url and url in seen_urls) or title_key in seen_titles:
                    continue

                if url:
                    seen_urls.add(url)
                seen_titles.add(title_key)

                stories.append(TopStory(
                    title=title,
                    url=url or "#",
                    source=r.get("source") or "Live News",
                    published_at=r.get("published_at") or datetime.utcnow().isoformat(),
                    category="BREAKING",
                    excerpt=(r.get("content") or "")[:150],
                    image_url=r.get("image_url") or r.get("thumbnail"),
                    origin="live",
                ))

        logger.info(f"Multi-query live search: collected {len(stories)} unique stories")
        return stories[:n]
    except Exception as e:
        # Best-effort endpoint: log and degrade to "no live stories".
        logger.error(f"Live top stories error: {e}")
        return []
|
|
|
|
| 288 |
async def get_top_stories(
|
| 289 |
force_refresh: bool = Query(default=False, description="Force cache refresh"),
|
| 290 |
cache: CachePort = Depends(get_cache_port),
|
| 291 |
+
adapter: DuckDuckGoAdapter = Depends(get_live_search_port)
|
|
|
|
| 292 |
):
|
| 293 |
"""
|
| 294 |
Get top 6 news stories for the landing page.
|
|
|
|
| 329 |
seen_titles.add(title_key)
|
| 330 |
all_stories.append(story)
|
| 331 |
|
| 332 |
+
# Cap at 6 stories (there may be fewer if live search returned fewer)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
final_stories = all_stories[:6]
|
| 334 |
|
| 335 |
+
logger.info(f"Final top stories count: {len(final_stories)}")
|
| 336 |
+
now_iso = datetime.utcnow().isoformat()
|
| 337 |
+
|
| 338 |
payload = {
|
| 339 |
"stories": [s.dict() for s in final_stories],
|
| 340 |
"fetched_at": now_iso,
|