Spaces:

Peterase
/

rag-api-node-1

Running

Peterase committed 15 days ago

Commit

c886471

1 Parent(s): 27034e2

feat(rag-api): hybrid top stories — 3 from Kafka news.processed + 3 from DuckDuckGo

- Kafka consumer reads N most recent messages from news.processed topic
by seeking each partition to N messages before its end and reading forward (no offset tracking)
- SSL cert resolution: env var content > file path env var > /app/certs/ default
- DuckDuckGo fetches 3 live Ethiopia stories in parallel
- Both sources run concurrently via asyncio.gather
- Deduplication by title prefix (60 chars) across both sources
- Fallback: if Kafka unavailable, fills all 6 slots from DuckDuckGo
- Cache TTL reduced to 2 minutes for freshness
- Response includes kafka_count and live_count for debugging
- origin field on each story: 'kafka' or 'live'

Files changed (3) hide show

.env +9 -0
src/api/routes/top_stories.py +273 -156
src/core/config.py +7 -0

.env CHANGED Viewed

@@ -90,6 +90,15 @@ ENABLE_JINA_READER=true
 JINA_READER_TIMEOUT=8.0
 JINA_READER_MAX_CONCURRENT=5
 # Live Search Engine Configuration
 LIVE_SEARCH_PRIMARY=searxng
 LIVE_SEARCH_FALLBACK=duckduckgo

 JINA_READER_TIMEOUT=8.0
 JINA_READER_MAX_CONCURRENT=5
+# --- Kafka Settings (Top Stories — read-only consumer) ---
+KAFKA_BOOTSTRAP_SERVERS=kafka-23e11337-weldamedhan2020-b406.i.aivencloud.com:20010
+KAFKA_TOPIC_PROCESSED=news.processed
+# SSL certs are read from /tmp/ — written there on each Kafka fetch from env vars or cert files
+# Set these to the content of your Aiven SSL certificates (newlines as \n)
+# KAFKA_SSL_CA=<content of ca.pem>
+# KAFKA_SSL_CERT=<content of service.cert>
+# KAFKA_SSL_KEY=<content of service.key>
 # Live Search Engine Configuration
 LIVE_SEARCH_PRIMARY=searxng
 LIVE_SEARCH_FALLBACK=duckduckgo

src/api/routes/top_stories.py CHANGED Viewed

@@ -2,22 +2,18 @@
 Top Stories API Endpoint
 Provides fresh news headlines for the landing page.
 Fast, cached, and optimized for frontend display.
-Features:
-- DuckDuckGo news search (fast, no API key)
-- 5-minute cache (reduce API calls)
-- Ethiopia-focused by default
-- Multiple categories support
-- Clean, frontend-ready format
 """
 import logging
 from typing import List, Optional
-from fastapi import APIRouter, Query, HTTPException
 from pydantic import BaseModel
 from datetime import datetime
-import asyncio
 logger = logging.getLogger(__name__)
@@ -32,6 +28,7 @@ class TopStory(BaseModel):
     published_at: str
     category: str = "NEWS"
     excerpt: Optional[str] = None
 class TopStoriesResponse(BaseModel):
@@ -39,175 +36,315 @@ class TopStoriesResponse(BaseModel):
     stories: List[TopStory]
     fetched_at: str
     cache_hit: bool = False
-# Simple in-memory cache (5 minutes)
-_cache = {}
-_cache_ttl = 300  # 5 minutes
-async def fetch_top_stories_from_ddg(
-    query: str = "Ethiopia",
-    max_results: int = 10,
-    region: str = "et-en"
-) -> List[TopStory]:
     """
-    Fetch top stories from DuckDuckGo news search.
-    Args:
-        query: Search query (default: "Ethiopia")
-        max_results: Number of results (default: 10)
-        region: DuckDuckGo region (default: "et-en" for Ethiopia)
-    Returns:
-        List of TopStory objects
     """
     try:
         from ddgs import DDGS
-        # Run DuckDuckGo search in thread pool (it's synchronous)
         loop = asyncio.get_event_loop()
         def _search():
             ddgs = DDGS()
-            results = ddgs.news(
-                query,
-                region=region,
-                max_results=max_results
-            )
-            return list(results)
-        raw_results = await asyncio.wait_for(
             loop.run_in_executor(None, _search),
-            timeout=5.0  # Fast timeout for landing page
         )
-        # Convert to TopStory format
         stories = []
-        for r in raw_results:
-            try:
-                story = TopStory(
-                    title=r.get("title", "").strip(),
-                    url=r.get("url", "").strip(),
-                    source=r.get("source", "Unknown").strip(),
-                    published_at=r.get("date", datetime.utcnow().isoformat()),
-                    category="NEWS",
-                    excerpt=r.get("body", "")[:150] if r.get("body") else None
-                )
-                # Validate required fields
-                if story.title and story.url:
-                    stories.append(story)
-            except Exception as e:
-                logger.warning(f"Failed to parse story: {e}")
                 continue
-        logger.info(f"Fetched {len(stories)} top stories from DuckDuckGo")
-        return stories
     except asyncio.TimeoutError:
-        logger.warning("DuckDuckGo timeout - returning empty stories")
         return []
     except Exception as e:
-        logger.error(f"Failed to fetch top stories: {e}")
         return []
 @router.get("/top-stories", response_model=TopStoriesResponse)
 async def get_top_stories(
-    query: str = Query(
-        default="Ethiopia",
-        description="Search query for top stories"
-    ),
-    max_results: int = Query(
-        default=10,
-        ge=1,
-        le=20,
-        description="Number of stories to return (1-20)"
-    ),
-    category: Optional[str] = Query(
-        default=None,
-        description="Filter by category (not implemented yet)"
-    ),
-    force_refresh: bool = Query(
-        default=False,
-        description="Force cache refresh"
-    )
 ):
     """
-    Get top news stories for the landing page.
-    **Features:**
-    - Fast response (< 2s)
-    - 5-minute cache
-    - Ethiopia-focused by default
-    - Clean, frontend-ready format
-    **Example:**
-    ```
-    GET /api/v1/top-stories?query=Ethiopia&max_results=10
-    ```
-    **Response:**
-    ```json
-    {
-      "stories": [
-        {
-          "title": "Ethiopia announces new economic reforms",
-          "url": "https://example.com/article",
-          "source": "BBC",
-          "published_at": "2026-05-04T10:30:00",
-          "category": "NEWS",
-          "excerpt": "Prime Minister announces..."
-        }
-      ],
-      "fetched_at": "2026-05-04T10:35:00",
-      "cache_hit": false
-    }
-    ```
     """
-    cache_key = f"{query}:{max_results}"
-    # Check cache (unless force refresh)
     if not force_refresh and cache_key in _cache:
         cached_data, cached_time = _cache[cache_key]
         age = (datetime.utcnow() - cached_time).total_seconds()
         if age < _cache_ttl:
-            logger.info(f"Cache HIT for top stories (age={age:.0f}s)")
             return TopStoriesResponse(
-                stories=cached_data,
                 fetched_at=cached_time.isoformat(),
-                cache_hit=True
             )
-    # Fetch fresh stories
-    logger.info(f"Cache MISS - fetching top stories for: {query}")
-    stories = await fetch_top_stories_from_ddg(
-        query=query,
-        max_results=max_results
     )
-    # Update cache
     now = datetime.utcnow()
-    _cache[cache_key] = (stories, now)
     return TopStoriesResponse(
-        stories=stories,
         fetched_at=now.isoformat(),
-        cache_hit=False
     )
 @router.get("/top-stories/categories")
 async def get_categories():
-    """
-    Get available story categories.
-    **Note:** Currently only "NEWS" is supported.
-    Future: POLITICS, ECONOMY, SPORTS, etc.
-    """
     return {
         "categories": [
             {"id": "news", "name": "News", "query": "Ethiopia"},
@@ -216,23 +353,3 @@ async def get_categories():
             {"id": "sports", "name": "Sports", "query": "Ethiopia sports"},
         ]
     }
-@router.post("/top-stories/refresh")
-async def refresh_top_stories():
-    """
-    Clear the top stories cache.
-    **Use case:** Admin wants to force refresh all cached stories.
-    """
-    global _cache
-    old_size = len(_cache)
-    _cache.clear()
-    logger.info(f"Cleared top stories cache ({old_size} entries)")
-    return {
-        "success": True,
-        "message": f"Cleared {old_size} cached entries",
-        "cleared_at": datetime.utcnow().isoformat()
-    }

 Top Stories API Endpoint
 Provides fresh news headlines for the landing page.
+Hybrid approach: 3 from Kafka news.processed (pipeline-fresh) + 3 from DuckDuckGo (live).
 Fast, cached, and optimized for frontend display.
 """
 import logging
+import asyncio
+import json
+import msgpack
 from typing import List, Optional
+from fastapi import APIRouter, Query
 from pydantic import BaseModel
 from datetime import datetime
 logger = logging.getLogger(__name__)
     published_at: str
     category: str = "NEWS"
     excerpt: Optional[str] = None
+    origin: str = "kafka"  # "kafka" or "live"
 class TopStoriesResponse(BaseModel):
     stories: List[TopStory]
     fetched_at: str
     cache_hit: bool = False
+    kafka_count: int = 0
+    live_count: int = 0
+# Simple in-memory cache (2 minutes — shorter TTL for freshness).
+# Maps a cache key to a (payload, stored_at) tuple; the /top-stories endpoint
+# ignores entries whose age exceeds _cache_ttl seconds.
+_cache: dict = {}
+_cache_ttl = 120  # 2 minutes
+# ── Kafka: read latest N messages from news.processed ────────────────────────
+def _fetch_kafka_stories_sync(n: int = 3) -> List[TopStory]:
+    """
+    Read the N most recent messages from the news.processed Kafka topic.
+    Uses a temporary, manually assigned consumer (no offsets are committed):
+    each non-empty partition is read forward from ``high_watermark - n`` and
+    the N newest messages across all partitions (by message timestamp) are kept.
+    Runs synchronously (called via executor). All broker calls share a single
+    5-second budget so the call can finish inside the async wrapper's timeout.
+    Args:
+        n: Maximum number of stories to return.
+    Returns:
+        Up to ``n`` stories, newest first. Empty when Kafka is not configured,
+        the topic is missing or empty, or a Kafka error occurs (logged).
+    Raises:
+        Exception: Failures while staging certificates or constructing the
+            consumer are not caught here and propagate to the caller.
+    """
+    import os
+    import time
+    from confluent_kafka import Consumer, TopicPartition
+    if n <= 0:
+        return []
+    bootstrap = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "")
+    topic = os.getenv("KAFKA_TOPIC_PROCESSED", "news.processed")
+    if not bootstrap:
+        logger.warning("KAFKA_BOOTSTRAP_SERVERS not set — skipping Kafka top stories")
+        return []
+    # SSL certs: support both env-var content and file paths
+    # Priority: env var content → file path env var → default cert dirs → skip SSL
+    def _write_cert(env_content_key: str, env_path_key: str, tmp_path: str) -> bool:
+        """Stage one PEM file at tmp_path; return False when no source exists."""
+        content = os.getenv(env_content_key, "")
+        if content:
+            # Env vars carry newlines as a literal backslash-n sequence
+            data = content.replace("\\n", "\n").encode("utf-8")
+        else:
+            name = os.path.basename(tmp_path)
+            candidates = [
+                os.getenv(env_path_key, ""),
+                # Default cert locations (HF Spaces mounts certs here)
+                f"/app/certs/{name}",
+                f"certs/{name}",
+            ]
+            src = next((p for p in candidates if p and os.path.exists(p)), None)
+            if src is None:
+                return False
+            with open(src, "rb") as f:
+                data = f.read()
+        # The private key is staged through this path too: keep files owner-only
+        fd = os.open(tmp_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
+        with os.fdopen(fd, "wb") as f:
+            f.write(data)
+        os.chmod(tmp_path, 0o600)  # also tighten a file left behind by an earlier run
+        return True
+    has_ca = _write_cert("KAFKA_SSL_CA", "KAFKA_SSL_CA_PATH", "/tmp/ca.pem")
+    has_cert = _write_cert("KAFKA_SSL_CERT", "KAFKA_SSL_CERT_PATH", "/tmp/service.cert")
+    has_key = _write_cert("KAFKA_SSL_KEY", "KAFKA_SSL_KEY_PATH", "/tmp/service.key")
+    conf = {
+        "bootstrap.servers": bootstrap,
+        "group.id": "top-stories-reader",
+        "auto.offset.reset": "latest",
+        "enable.auto.commit": False,
+        "log_level": 0,
+        "session.timeout.ms": 10000,
+    }
+    if has_ca and has_cert and has_key:
+        conf["security.protocol"] = "SSL"
+        conf["ssl.ca.location"] = "/tmp/ca.pem"
+        conf["ssl.certificate.location"] = "/tmp/service.cert"
+        conf["ssl.key.location"] = "/tmp/service.key"
+        logger.info("Kafka SSL configured for top stories consumer")
+    else:
+        logger.warning("Kafka SSL certs not found — connecting without SSL")
+    # One wall-clock budget shared by every broker round-trip below
+    deadline = time.monotonic() + 5.0
+    def _time_left(cap: float) -> float:
+        """Seconds to allow the next broker call (capped, never below 0.1)."""
+        return max(0.1, min(cap, deadline - time.monotonic()))
+    consumer = Consumer(conf)
+    stories: List[TopStory] = []
+    try:
+        # Get partition metadata
+        meta = consumer.list_topics(topic, timeout=_time_left(5.0))
+        topic_meta = meta.topics.get(topic)
+        # An unknown topic may be listed with an error attached rather than omitted
+        if topic_meta is None or topic_meta.error is not None:
+            logger.warning(f"Kafka topic '{topic}' not found")
+            return []
+        # Start each non-empty partition at (high - n) and remember where it ends
+        assigned = []
+        end_offsets = {}
+        for partition_id in topic_meta.partitions:
+            watermarks = consumer.get_watermark_offsets(
+                TopicPartition(topic, partition_id), timeout=_time_left(5.0)
+            )
+            if watermarks is None:  # no answer within the timeout
+                continue
+            low, high = watermarks
+            if high > low:  # high == low means empty, even when both are > 0
+                assigned.append(TopicPartition(topic, partition_id, max(low, high - n)))
+                end_offsets[partition_id] = high
+        if not assigned:
+            return []
+        consumer.assign(assigned)
+        # Drain every assigned partition up to its high watermark. An empty poll
+        # before the first message only means the fetch is still in flight, so keep
+        # waiting; an empty poll after data has arrived means we are caught up.
+        raw_messages = []
+        pending = set(end_offsets)
+        while pending and time.monotonic() < deadline:
+            msg = consumer.poll(timeout=_time_left(1.0))
+            if msg is None:
+                if raw_messages:
+                    break
+                continue
+            if msg.error():
+                continue
+            raw_messages.append(msg)
+            if msg.offset() + 1 >= end_offsets.get(msg.partition(), 0):
+                pending.discard(msg.partition())
+        # Newest first across partitions; timestamp() returns (type, epoch_ms)
+        raw_messages.sort(key=lambda m: m.timestamp()[1], reverse=True)
+        # Parse messages
+        seen_titles: set = set()
+        for msg in raw_messages:
+            if len(stories) >= n:
+                break
+            try:
+                value = msg.value()
+                if value is None:  # tombstone / empty payload
+                    continue
+                try:
+                    event = msgpack.unpackb(value, raw=False)
+                except Exception:
+                    # Not msgpack — fall back to JSON
+                    event = json.loads(value.decode("utf-8", errors="ignore"))
+                if not isinstance(event, dict):
+                    continue
+                content = event.get("content") or event.get("text") or ""
+                # `or ""` guards a content key that is present but None
+                title = (event.get("title") or (event.get("content") or "")[:80]).strip()
+                url = event.get("url") or event.get("link") or ""
+                source = event.get("source") or event.get("publisher") or "ARKI"
+                pub_at = event.get("published_at") or event.get("pub_date") or datetime.utcnow().isoformat()
+                if not title or title in seen_titles:
+                    continue
+                seen_titles.add(title)
+                stories.append(TopStory(
+                    title=title[:200],
+                    url=url.strip(),
+                    source=source.strip(),
+                    published_at=str(pub_at),
+                    category="NEWS",
+                    excerpt=content[:150] if content else None,
+                    origin="kafka",
+                ))
+            except Exception as e:
+                logger.debug(f"Failed to parse Kafka message: {e}")
+                continue
+    except Exception as e:
+        logger.error(f"Kafka top stories error: {e}")
+    finally:
+        consumer.close()
+    logger.info(f"Kafka top stories: fetched {len(stories)} from '{topic}'")
+    return stories[:n]
+async def fetch_kafka_stories(n: int = 3) -> List[TopStory]:
+    """
+    Async wrapper around the blocking Kafka reader.
+    Runs _fetch_kafka_stories_sync(n) on the loop's default executor and waits
+    at most 6 seconds; a timeout or any other failure is logged and yields [].
+    """
+    event_loop = asyncio.get_event_loop()
+    try:
+        pending_read = event_loop.run_in_executor(None, _fetch_kafka_stories_sync, n)
+        stories = await asyncio.wait_for(pending_read, timeout=6.0)
+    except asyncio.TimeoutError:
+        logger.warning("Kafka top stories timeout")
+        stories = []
+    except Exception as e:
+        logger.error(f"Kafka top stories async error: {e}")
+        stories = []
+    return stories
+# ── DuckDuckGo: fetch N live stories ─────────────────────────────────────────
+async def fetch_ddg_stories(n: int = 3) -> List[TopStory]:
+    """
+    Fetch up to N live Ethiopia news stories from DuckDuckGo.
+    The blocking DDGS client runs on the default executor with a 5-second cap.
+    Results without a title or URL, or that fail to convert, are skipped one
+    by one so a single bad record cannot discard the whole batch.
+    Returns [] on timeout or any search error (both are logged).
+    """
     try:
         from ddgs import DDGS
+        loop = asyncio.get_running_loop()
         def _search():
             ddgs = DDGS()
+            return list(ddgs.news("Ethiopia", region="et-en", max_results=n))
+        raw = await asyncio.wait_for(
             loop.run_in_executor(None, _search),
+            timeout=5.0
         )
         stories = []
+        for r in raw:
+            try:
+                # A key can be present with a None value, so guard with `or`
+                # instead of relying on the .get() default
+                title = (r.get("title") or "").strip()
+                url = (r.get("url") or "").strip()
+                if not title or not url:
+                    continue
+                body = r.get("body") or ""
+                stories.append(TopStory(
+                    title=title,
+                    url=url,
+                    source=(r.get("source") or "Unknown").strip(),
+                    published_at=r.get("date") or datetime.utcnow().isoformat(),
+                    category="NEWS",
+                    excerpt=body[:150] if body else None,
+                    origin="live",
+                ))
+            except Exception as e:
+                logger.warning(f"Failed to parse story: {e}")
                 continue
+        logger.info(f"DuckDuckGo top stories: fetched {len(stories)}")
+        return stories[:n]
     except asyncio.TimeoutError:
+        logger.warning("DuckDuckGo top stories timeout")
         return []
     except Exception as e:
+        logger.error(f"DuckDuckGo top stories error: {e}")
         return []
+# ── Endpoint ──────────────────────────────────────────────────────────────────
 @router.get("/top-stories", response_model=TopStoriesResponse)
 async def get_top_stories(
+    force_refresh: bool = Query(default=False, description="Force cache refresh"),
 ):
     """
+    Get top 6 news stories for the landing page.
+    **Sources:**
+    - 3 from Kafka `news.processed` topic (pipeline-fresh, multilingual)
+    - 3 from DuckDuckGo live search (real-time, English)
+    If Kafka returns nothing, the remaining slots are filled from DuckDuckGo.
+    **Cache:** 2-minute TTL for freshness.
+    `kafka_count` and `live_count` describe the stories actually returned,
+    i.e. after de-duplication and any DuckDuckGo fallback.
     """
+    cache_key = "top_stories_hybrid"
     if not force_refresh and cache_key in _cache:
         cached_data, cached_time = _cache[cache_key]
         age = (datetime.utcnow() - cached_time).total_seconds()
         if age < _cache_ttl:
+            logger.info(f"Top stories cache HIT (age={age:.0f}s)")
             return TopStoriesResponse(
+                stories=cached_data["stories"],
                 fetched_at=cached_time.isoformat(),
+                cache_hit=True,
+                kafka_count=cached_data["kafka_count"],
+                live_count=cached_data["live_count"],
             )
+    # Fetch both sources in parallel
+    kafka_stories, ddg_stories = await asyncio.gather(
+        fetch_kafka_stories(3),
+        fetch_ddg_stories(3),
     )
+    all_stories: List[TopStory] = []
+    seen_titles: set = set()
+    def _add_unique(candidates: List[TopStory]) -> None:
+        """Append stories whose title key is new, up to 6 stories in total."""
+        for story in candidates:
+            if len(all_stories) >= 6:
+                return
+            # De-duplicate on the lower-cased first 60 characters of the title
+            title_key = story.title.lower()[:60]
+            if title_key not in seen_titles:
+                seen_titles.add(title_key)
+                all_stories.append(story)
+    # Merge: Kafka first (pipeline-fresh), then DuckDuckGo (live)
+    _add_unique(kafka_stories + ddg_stories)
+    # Fallback: if Kafka returned nothing, fill with more DuckDuckGo
+    if not kafka_stories:
+        _add_unique(await fetch_ddg_stories(6))
+    # Count by origin on the final list so the debug counters always add up to
+    # the number of stories returned (fallback stories are live stories too)
+    kafka_count = sum(1 for story in all_stories if story.origin == "kafka")
+    live_count = len(all_stories) - kafka_count
     now = datetime.utcnow()
+    _cache[cache_key] = (
+        {
+            "stories": all_stories,
+            "kafka_count": kafka_count,
+            "live_count": live_count,
+        },
+        now,
+    )
+    logger.info(
+        f"Top stories: {kafka_count} Kafka + {live_count} DuckDuckGo "
+        f"= {len(all_stories)} total"
+    )
     return TopStoriesResponse(
+        stories=all_stories,
         fetched_at=now.isoformat(),
+        cache_hit=False,
+        kafka_count=kafka_count,
+        live_count=live_count,
     )
+@router.post("/top-stories/refresh")
+async def refresh_top_stories():
+    """Drop every cached top-stories entry and report when that happened."""
+    # The dict is emptied in place, so no `global` declaration is required
+    _cache.clear()
+    cleared_at = datetime.utcnow().isoformat()
+    return {"success": True, "cleared_at": cleared_at}
 @router.get("/top-stories/categories")
 async def get_categories():
     return {
         "categories": [
             {"id": "news", "name": "News", "query": "Ethiopia"},
             {"id": "sports", "name": "Sports", "query": "Ethiopia sports"},
         ]
     }

src/core/config.py CHANGED Viewed

@@ -93,5 +93,12 @@ class Settings(BaseSettings):
     # Security Settings
     SECRET_KEY: str = os.getenv("SECRET_KEY", "a_very_secret_key_change_me_in_production")
     ACCESS_TOKEN_EXPIRE_MINUTES: int = int(os.getenv("ACCESS_TOKEN_EXPIRE_MINUTES", "60"))
 settings = Settings()

     # Security Settings
     SECRET_KEY: str = os.getenv("SECRET_KEY", "a_very_secret_key_change_me_in_production")
     ACCESS_TOKEN_EXPIRE_MINUTES: int = int(os.getenv("ACCESS_TOKEN_EXPIRE_MINUTES", "60"))
+    # Kafka Settings (for Top Stories — read-only consumer)
+    KAFKA_BOOTSTRAP_SERVERS: str = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "")
+    KAFKA_SSL_CA: str = os.getenv("KAFKA_SSL_CA", "")
+    KAFKA_SSL_CERT: str = os.getenv("KAFKA_SSL_CERT", "")
+    KAFKA_SSL_KEY: str = os.getenv("KAFKA_SSL_KEY", "")
+    KAFKA_TOPIC_PROCESSED: str = os.getenv("KAFKA_TOPIC_PROCESSED", "news.processed")
 settings = Settings()