SHAFI committed on
Commit
ff4f05b
·
1 Parent(s): 7d4e625

Added massive tech news ingestion: more than 10 news providers added to the ingestion pipeline

Browse files
app/config.py CHANGED
@@ -29,6 +29,12 @@ class Settings(BaseSettings):
29
  GNEWS_API_KEY: str = ""
30
  NEWSAPI_API_KEY: str = ""
31
  NEWSDATA_API_KEY: str = ""
 
 
 
 
 
 
32
 
33
  # Provider priority (will try in order until successful)
34
  NEWS_PROVIDER_PRIORITY: List[str] = ["gnews", "newsapi", "newsdata", "google_rss"]
 
29
  GNEWS_API_KEY: str = ""
30
  NEWSAPI_API_KEY: str = ""
31
  NEWSDATA_API_KEY: str = ""
32
+ # Phase 5: TheNewsAPI.com β€” 100 req/day free tier, position 4 in PAID_CHAIN
33
+ THENEWSAPI_API_KEY: str = ""
34
+ # Phase 8: WorldNewsAI.com β€” point-based quota, position 5 in PAID_CHAIN
35
+ WORLDNEWS_API_KEY: str = ""
36
+ # Phase 10: Webz.io β€” 1,000 calls/month free tier, position 6 in PAID_CHAIN
37
+ WEBZ_API_KEY: str = ""
38
 
39
  # Provider priority (will try in order until successful)
40
  NEWS_PROVIDER_PRIORITY: List[str] = ["gnews", "newsapi", "newsdata", "google_rss"]
app/services/circuit_breaker.py CHANGED
@@ -79,8 +79,24 @@ class ProviderCircuitBreaker:
79
  self.circuit_open_time: Dict[str, float] = {}
80
  self.half_open_attempts: Dict[str, int] = defaultdict(int)
81
 
82
- # Known providers β€” used by the boot-time Redis restore
83
- self._known_providers = ["gnews", "newsapi", "newsdata", "google_rss", "medium", "official_cloud"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  logger.info("=" * 70)
86
  logger.info("⚑ [CIRCUIT BREAKER] Provider protection initialized")
 
79
  self.circuit_open_time: Dict[str, float] = {}
80
  self.half_open_attempts: Dict[str, int] = defaultdict(int)
81
 
82
+ # Known providers β€” used by the boot-time Redis restore.
83
+ # IMPORTANT: Every provider registered in news_aggregator.py MUST be
84
+ # listed here. If a provider is missing, a circuit that was OPEN before
85
+ # a server restart will not be restored β€” the Space will hammer a broken
86
+ # API on every restart until it fails 3 more times to re-open.
87
+ #
88
+ # Phases 1-2 (legacy): gnews, newsapi, newsdata, google_rss, medium, official_cloud
89
+ # Phases 3-11 (new modules): hacker_news, direct_rss, thenewsapi, inshorts,
90
+ # saurav_static, worldnewsai, openrss, webz, wikinews
91
+ self._known_providers = [
92
+ # ── Legacy providers (Phases 1-2) ────────────────────────────────
93
+ "gnews", "newsapi", "newsdata",
94
+ "google_rss", "medium", "official_cloud",
95
+ # ── New modular providers (Phases 3-11) ───────────────────────────
96
+ "hacker_news", "direct_rss", "thenewsapi",
97
+ "inshorts", "saurav_static", "worldnewsai",
98
+ "openrss", "webz", "wikinews",
99
+ ]
100
 
101
  logger.info("=" * 70)
102
  logger.info("⚑ [CIRCUIT BREAKER] Provider protection initialized")
app/services/news_aggregator.py CHANGED
@@ -18,6 +18,20 @@ from app.config import settings
18
  from app.services.api_quota import get_quota_tracker
19
  from app.services.circuit_breaker import get_circuit_breaker
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  class NewsAggregator:
22
  """Service for aggregating news from multiple sources with automatic failover"""
23
 
@@ -48,12 +62,54 @@ class NewsAggregator:
48
 
49
  # Official Cloud Provider (Strict Isolation)
50
  self.providers['official_cloud'] = OfficialCloudProvider()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  # ── Provider role lists ──────────────────────────────────────────────
53
  # PAID_CHAIN: tried in order, stop after the first success (save credits)
54
  # FREE_SOURCES: always tried, always in parallel (no cost, no limits)
55
- self.PAID_CHAIN = ['gnews', 'newsapi', 'newsdata']
56
- self.FREE_SOURCES = ['google_rss', 'medium', 'official_cloud']
57
 
58
  # Medium only publishes articles for a small set of topics.
59
  # Calling it for 'data-centers' or 'cloud-oracle' would return nothing.
@@ -70,6 +126,30 @@ class NewsAggregator:
70
  'cloud-huawei', 'cloud-cloudflare'
71
  ]
72
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  # Cloud provider RSS feeds
75
  self.cloud_rss_urls = {
@@ -227,6 +307,46 @@ class NewsAggregator:
227
  free_tasks.append(official.fetch_news(category, limit=10))
228
  free_names.append('official_cloud')
229
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  if free_tasks:
231
  print(f"[FREE] Launching {len(free_tasks)} free source(s) in parallel for '{category}'...")
232
  free_results = await asyncio.gather(*free_tasks, return_exceptions=True)
 
18
  from app.services.api_quota import get_quota_tracker
19
  from app.services.circuit_breaker import get_circuit_breaker
20
 
21
+ # ── Phases 3-11: New modular providers (Strangler Fig pattern) ──────────────
22
+ # These live in providers/ folder. The legacy news_providers.py is NOT touched.
23
+ # We import each new provider here and the aggregator runs both old and new
24
+ # providers side-by-side safely.
25
+ from app.services.providers.hackernews.client import HackerNewsProvider
26
+ from app.services.providers.direct_rss.client import DirectRSSProvider
27
+ from app.services.providers.thenewsapi.client import TheNewsAPIProvider
28
+ from app.services.providers.inshorts.client import InshortsProvider
29
+ from app.services.providers.sauravkanchan.client import SauravKanchanProvider
30
+ from app.services.providers.worldnewsai.client import WorldNewsAIProvider
31
+ from app.services.providers.openrss.client import OpenRSSProvider
32
+ from app.services.providers.webz.client import WebzProvider
33
+ from app.services.providers.wikinews.client import WikinewsProvider
34
+
35
  class NewsAggregator:
36
  """Service for aggregating news from multiple sources with automatic failover"""
37
 
 
62
 
63
  # Official Cloud Provider (Strict Isolation)
64
  self.providers['official_cloud'] = OfficialCloudProvider()
65
+
66
+ # Direct RSS from premium tech publications (TechCrunch, Wired, The Verge,
67
+ # Engadget, Ars Technica). Free, no key, great images and descriptions.
68
+ # Runs for ALL categories β€” the keyword gate filters off-topic results.
69
+ self.providers['direct_rss'] = DirectRSSProvider()
70
+
71
+ # TheNewsAPI.com β€” Position 4 in the PAID_CHAIN (failover after the
72
+ # existing 3 paid providers). 100 requests/day on the free tier.
73
+ # Only registered when the API key is present in the environment.
74
+ if settings.THENEWSAPI_API_KEY:
75
+ self.providers['thenewsapi'] = TheNewsAPIProvider(
76
+ api_key=settings.THENEWSAPI_API_KEY
77
+ )
78
+
79
+ # WorldNewsAI.com β€” Position 5 in the PAID_CHAIN (final paid failover).
80
+ # Point-based quota, conservative daily_limit = 50 calls.
81
+ # Gives global, non-US-centric news from tens of thousands of sources.
82
+ # Only registered when the API key is present in the environment.
83
+ if settings.WORLDNEWS_API_KEY:
84
+ self.providers['worldnewsai'] = WorldNewsAIProvider(
85
+ api_key=settings.WORLDNEWS_API_KEY
86
+ )
87
+
88
+ # OpenRSS.org β€” generates feeds for sites with no native RSS.
89
+ # Free, no key. Has strict 60-minute internal cooldown to avoid IP ban.
90
+ # Runs for ALL categories β€” no category guardrail needed.
91
+ # The cooldown timer is the only protection this provider needs.
92
+ self.providers['openrss'] = OpenRSSProvider()
93
+
94
+ # Webz.io β€” Position 6 in the PAID_CHAIN (deepest paid failover).
95
+ # Enterprise-grade crawl from 3.5M articles/day. Rich, global coverage.
96
+ # 1,000 calls/month free tier β€” paced to 30/day = ~900/month (10% margin).
97
+ # Only registered when the API key is present in the environment.
98
+ if settings.WEBZ_API_KEY:
99
+ self.providers['webz'] = WebzProvider(
100
+ api_key=settings.WEBZ_API_KEY
101
+ )
102
+
103
+ # Wikinews β€” Public Domain, copyright-bulletproof tech news.
104
+ # Free, no key. Searches 'Computing' and 'Internet' categories concurrently.
105
+ # Gated behind GENERAL_TECH_CATEGORIES (broad tech content only).
106
+ self.providers['wikinews'] = WikinewsProvider()
107
 
108
  # ── Provider role lists ──────────────────────────────────────────────
109
  # PAID_CHAIN: tried in order, stop after the first success (save credits)
110
  # FREE_SOURCES: always tried, always in parallel (no cost, no limits)
111
+ self.PAID_CHAIN = ['gnews', 'newsapi', 'newsdata', 'thenewsapi', 'worldnewsai', 'webz']
112
+ self.FREE_SOURCES = ['google_rss', 'medium', 'official_cloud', 'direct_rss', 'hacker_news', 'inshorts', 'saurav_static', 'openrss', 'wikinews']
113
 
114
  # Medium only publishes articles for a small set of topics.
115
  # Calling it for 'data-centers' or 'cloud-oracle' would return nothing.
 
126
  'cloud-huawei', 'cloud-cloudflare'
127
  ]
128
  }
129
+
130
+ # ── Phase 3: Hacker News Category Guardrail ──────────────────────────
131
+ # Hacker News gives broad tech news β€” it does NOT know about "cloud-alibaba"
132
+ # or "data-governance". Asking it for niche categories wastes CPU cycles
133
+ # and risks polluting those collections with off-topic articles.
134
+ # Only enable Hacker News for the broad categories below where it adds value.
135
+ self.GENERAL_TECH_CATEGORIES = {
136
+ 'ai', 'magazines', 'data-engineering', 'cloud-computing',
137
+ 'data-security', 'business-intelligence'
138
+ }
139
+
140
+ # Register the Hacker News provider (free, no key needed).
141
+ # It lives in providers/hackernews/client.py β€” completely isolated from
142
+ # the legacy news_providers.py file.
143
+ self.providers['hacker_news'] = HackerNewsProvider()
144
+
145
+ # Inshorts β€” 60-word tech summaries. Free, no key, broad tech topics.
146
+ # Gated behind GENERAL_TECH_CATEGORIES (same as Hacker News).
147
+ self.providers['inshorts'] = InshortsProvider()
148
+
149
+ # SauravKanchan static JSON β€” reads two GitHub Pages files (IN + US).
150
+ # Zero cost, zero rate limits, NewsAPI-format data structure.
151
+ # Gated behind GENERAL_TECH_CATEGORIES (broad tech news only).
152
+ self.providers['saurav_static'] = SauravKanchanProvider()
153
 
154
  # Cloud provider RSS feeds
155
  self.cloud_rss_urls = {
 
307
  free_tasks.append(official.fetch_news(category, limit=10))
308
  free_names.append('official_cloud')
309
 
310
+ # ── Phase 3: Hacker News Guardrail ────────────────────────────────────
311
+ # Only fire Hacker News when the category is a broad tech topic.
312
+ # For niche categories (e.g., cloud-alibaba), we skip it entirely.
313
+ if category in self.GENERAL_TECH_CATEGORIES:
314
+ hn = self.providers.get('hacker_news')
315
+ if hn and not self.circuit.should_skip('hacker_news'):
316
+ if hn.is_available():
317
+ free_tasks.append(hn.fetch_news(category, limit=30))
318
+ free_names.append('hacker_news')
319
+
320
+ # ── Phase 6: Inshorts Guardrail ─────────────────────────────────────
321
+ # Same rule as Hacker News: only fire for broad tech categories.
322
+ # Inshorts covers general tech, not niche cloud or governance topics.
323
+ if category in self.GENERAL_TECH_CATEGORIES:
324
+ inshorts = self.providers.get('inshorts')
325
+ if inshorts and not self.circuit.should_skip('inshorts'):
326
+ if inshorts.is_available():
327
+ free_tasks.append(inshorts.fetch_news(category, limit=20))
328
+ free_names.append('inshorts')
329
+
330
+ # ── Phase 7: SauravKanchan Guardrail ─────────────────────────────────
331
+ # Static JSON files (IN + US). Same guardrail as Hacker News and Inshorts.
332
+ # Broad tech content only β€” niche categories get no value from these files.
333
+ if category in self.GENERAL_TECH_CATEGORIES:
334
+ saurav = self.providers.get('saurav_static')
335
+ if saurav and not self.circuit.should_skip('saurav_static'):
336
+ if saurav.is_available():
337
+ free_tasks.append(saurav.fetch_news(category, limit=50))
338
+ free_names.append('saurav_static')
339
+
340
+ # ── Phase 11: Wikinews Guardrail ──────────────────────────────────
341
+ # Wikinews searches broad tech categories (Computing + Internet).
342
+ # No value for niche collections like cloud-alibaba or data-governance.
343
+ if category in self.GENERAL_TECH_CATEGORIES:
344
+ wikinews = self.providers.get('wikinews')
345
+ if wikinews and not self.circuit.should_skip('wikinews'):
346
+ if wikinews.is_available():
347
+ free_tasks.append(wikinews.fetch_news(category, limit=20))
348
+ free_names.append('wikinews')
349
+
350
  if free_tasks:
351
  print(f"[FREE] Launching {len(free_tasks)} free source(s) in parallel for '{category}'...")
352
  free_results = await asyncio.gather(*free_tasks, return_exceptions=True)
app/services/providers/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # providers/__init__.py
3
+ # ─────────────────────────────────────────────────────────────────────────────
4
+ # This file marks the 'providers' folder as a Python package so that
5
+ # Python knows it can import code from inside it.
6
+ #
7
+ # ── HOW TO ADD A NEW PROVIDER ──────────────────────────────────────────────
8
+ # 1. Create a new folder under providers/ (e.g., providers/hackernews/)
9
+ # 2. Inside that folder, create __init__.py (empty) and client.py
10
+ # 3. In client.py, write a class that inherits from base.NewsProvider
11
+ # 4. Add the import line below so the aggregator can find it easily:
12
+ # from app.services.providers.hackernews.client import HackerNewsProvider
13
+ #
14
+ # ── ROUTING RULE (CRITICAL) ────────────────────────────────────────────────
15
+ # Every provider MUST set a 'category' on each Article it returns.
16
+ # If a provider cannot determine a category, it MUST leave category as ""
17
+ # or "magazines". DO NOT LEAVE IT AS None.
18
+ #
19
+ # When category is empty or unrecognized, appwrite_db.get_collection_id()
20
+ # automatically routes the article to the DEFAULT 'News Articles' collection.
21
+ # This is intentional and safe. Never invent a category name that doesn't
22
+ # exist in config.py CATEGORIES β€” it will silently break routing.
23
+ # =============================================================================
app/services/providers/base.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/base.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The Foundation β€” every news provider in this system inherits from this file.
5
+
6
+ Think of this like a "job contract" for a news provider. Any class that wants
7
+ to act as a news provider MUST sign this contract by:
8
+ 1. Inheriting from the NewsProvider class below.
9
+ 2. Implementing the fetch_news() method with real logic.
10
+
11
+ If a class inherits from NewsProvider but does NOT implement fetch_news(),
898
+ Python will raise a TypeError the moment the class is instantiated — which is exactly what we want.
13
+ It forces every developer to write proper fetching logic.
14
+
15
+ ── RULE: THE CATEGORY ROUTING CONTRACT ─────────────────────────────────────
16
+
17
+ Every Article produced by a provider MUST have a 'category' field.
18
+ The category value routes the article to the correct Appwrite collection.
19
+
20
+ Current routing rules (defined in appwrite_db.get_collection_id):
21
+ "ai" β†’ AI collection
22
+ "cloud-*" β†’ Cloud collection
23
+ "data-*" / "business-*" / "customer-data-platform" β†’ Data collection
24
+ "magazines" β†’ Magazine collection
25
+ "medium-article" β†’ Medium collection
26
+ "" (empty)
27
+ or any unknown β†’ DEFAULT 'News Articles' collection ← SAFE FALLBACK
28
+
29
+ ⚠️ IMPORTANT FOR ALL PROVIDER DEVELOPERS:
30
+ If your provider fetches general tech news and cannot determine a specific
31
+ category, set category = "magazines".
32
+ If your provider truly cannot figure out a category, set category = "".
33
+ The default collection will catch it safely.
34
+ NEVER set category = None β€” that will cause a Pydantic validation error.
35
+ NEVER invent a category string that is not in config.py CATEGORIES list.
36
+
37
+ ── HOW CLIENT-SIDE FILTERING WORKS ─────────────────────────────────────────
38
+
39
+ Many providers (Hacker News, RSS Feeds, static files) do NOT support
40
+ filtering by date or keyword in their API request. That is okay.
41
+
42
+ Do NOT try to add date filters in the URL if the API doesn't support them.
43
+ Our data_validation pipeline enforces all constraints AFTER the fetch:
44
+ - Freshness gate: rejects articles older than midnight IST today
45
+ - Keyword gate: rejects articles with no matching category keywords
46
+ - Redis dedup: rejects URLs we have already saved in the last 48 hours
47
+
48
+ So your job in fetch_news() is simple: fetch as many articles as the
49
+ provider gives you, map them to Article objects, and return them.
50
+ The pipeline does the rest.
51
+ """
52
+
53
+ # ── Imports ──────────────────────────────────────────────────────────────────
54
+ # Standard library
55
+ from abc import ABC, abstractmethod # ABC = Abstract Base Class toolkit
56
+ from typing import List, Optional
57
+ from datetime import datetime, timezone, timedelta
58
+ from zoneinfo import ZoneInfo # Timezone handling (Python 3.9+ built-in)
59
+ from enum import Enum
60
+
61
+ # Third-party (all already in requirements.txt β€” no new installs needed)
62
+ import httpx # Async HTTP client for API calls
63
+
64
+ # Internal
65
+ from app.models import Article # The standard Article shape every provider must return
66
+
67
+
68
+ # ── Provider Status ────────────────────────────────────────────────────────────
69
+
70
+ class ProviderStatus(Enum):
71
+ """
72
+ Represents the health of a provider at any given moment.
73
+
74
+ ACTIVE β†’ Provider is working fine. Calls proceed normally.
75
+ RATE_LIMITED β†’ Provider hit its API limit. Calls are paused.
76
+ ERROR β†’ Provider had a hard failure. Circuit breaker may kick in.
77
+ """
78
+ ACTIVE = "active"
79
+ RATE_LIMITED = "rate_limited"
80
+ ERROR = "error"
81
+
82
+
83
+ # ── Abstract Base Class ────────────────────────────────────────────────────────
84
+
85
+ class NewsProvider(ABC):
86
+ """
87
+ The contract that every news provider must follow.
88
+
89
+ Subclass this, implement fetch_news(), and your provider
90
+ is automatically compatible with the NewsAggregator, circuit breaker,
91
+ quota tracker, and the full validation pipeline.
92
+
93
+ Example of a minimal valid provider:
94
+
95
+ from app.services.providers.base import NewsProvider, ProviderStatus
96
+ from app.models import Article
97
+ from typing import List
98
+
99
+ class MyProvider(NewsProvider):
100
+ async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
101
+ # 1. Call your API / RSS feed
102
+ # 2. Map the response to Article objects
103
+ # 3. Return the list (can be empty if nothing found)
104
+ return []
105
+ """
106
+
107
+ def __init__(self, api_key: Optional[str] = None):
108
+ # The API key for paid providers. Free providers leave this as None.
109
+ self.api_key = api_key
110
+
111
+ # Starts as ACTIVE. The aggregator or circuit breaker may change this.
112
+ self.status = ProviderStatus.ACTIVE
113
+
114
+ # Tracks how many API calls this provider has made today.
115
+ self.request_count: int = 0
116
+
117
+ # Maximum calls per day. 0 = no limit (used by free providers).
118
+ self.daily_limit: int = 0
119
+
120
+ # The name of this provider. Used in logging and circuit breaker tracking.
121
+ # Automatically takes the class name (e.g., "HackerNewsProvider").
122
+ self.name: str = self.__class__.__name__
123
+
124
+ @abstractmethod
125
+ async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
126
+ """
127
+ REQUIRED: Fetch news articles for the given category.
128
+
129
+ Args:
130
+ category (str): The internal Segmento Pulse category name.
131
+ Example: "ai", "cloud-aws", "magazines"
132
+ limit (int): Maximum number of articles to return.
133
+ This is a guideline β€” providers may return fewer.
134
+
135
+ Returns:
136
+ List[Article]: A list of Article objects. Return [] on failure.
137
+ Never raise an unhandled exception from here.
138
+ Wrap all network calls in try/except.
139
+
140
+ Remember the ROUTING RULE at the top of this file:
141
+ Every Article MUST have a category string.
142
+ Use "magazines" for general tech. Use "" for truly unknown.
143
+ """
144
+ pass
145
+
146
+ # ── Utility Methods (inherited by all providers, no need to override) ──────
147
+
148
+ def is_available(self) -> bool:
149
+ """
150
+ Check if this provider is ready to accept a fetch request.
151
+
152
+ Returns False if:
153
+ - It is currently rate-limited or in an error state.
154
+ - It has used up its daily API call limit.
155
+ """
156
+ return (
157
+ self.status == ProviderStatus.ACTIVE
158
+ and (self.daily_limit == 0 or self.request_count < self.daily_limit)
159
+ )
160
+
161
+ def mark_rate_limited(self):
162
+ """
163
+ Call this when the API returns a 429 (Too Many Requests).
164
+ The status changes to RATE_LIMITED so the aggregator knows to skip it.
165
+ """
166
+ self.status = ProviderStatus.RATE_LIMITED
167
+
168
+ def reset_daily_quota(self):
169
+ """
170
+ Reset this provider's call counter back to zero.
171
+ Called once per day (midnight UTC) by the scheduler to restore access.
172
+ """
173
+ self.request_count = 0
174
+ self.status = ProviderStatus.ACTIVE
app/services/providers/direct_rss/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/direct_rss/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'direct_rss' folder as a Python package.
4
+ # To use the Direct RSS provider, import it like this:
5
+ #
6
+ # from app.services.providers.direct_rss.client import DirectRSSProvider
7
+ #
8
+ # This provider fetches XML feeds from premium tech publications
9
+ # (TechCrunch, Wired, The Verge, Engadget, Ars Technica) completely for free.
10
+ # No API keys. No rate limits. Just clean, honest RSS.
app/services/providers/direct_rss/client.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/direct_rss/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The Direct RSS Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches the latest technology articles from the RSS feeds of the world's
8
+ best tech publications: TechCrunch, Wired, The Verge, Engadget, and
9
+ Ars Technica.
10
+
11
+ Why Direct RSS instead of using rss_parser.parse_provider_rss()?
12
+ The existing rss_parser.parse_provider_rss() function is built for a
13
+ specific use case: fetching official CLOUD PROVIDER blogs (AWS, GCP etc.)
14
+ It hardcodes category = f'cloud-{provider}' on every article it creates.
15
+
16
+ If we ran TechCrunch through that function, every TechCrunch article
17
+ would be tagged "category = cloud-TechCrunch". Appwrite would not know
18
+ where to route it, and articles would end up in the wrong collection β€”
19
+ or worse, be silently dropped.
20
+
21
+ So instead, we use the feedparser library directly (the same library
22
+ rss_parser.py uses internally). We follow the exact same parsing pattern
23
+ but set the category correctly from what the aggregator tells us.
24
+
25
+ We DO still reuse two helper methods from rss_parser.py for consistency:
26
+ - _extract_image_from_entry() β†’ finds images from media/enclosure tags
27
+ - _parse_date() β†’ handles all date format variations
28
+
29
+ How it works:
30
+ Step 1: Build a list of async HTTP tasks β€” one per RSS feed URL.
31
+ Step 2: Fire all tasks at the same time using asyncio.gather().
32
+ Step 3: Feed each successful XML response into feedparser.
33
+ Step 4: Map each feedparser entry to a Pulse Article object.
34
+ Step 5: Return the combined list from all feeds.
35
+
36
+ Client-side constraint note:
37
+ RSS feeds give us whatever was published recently by that outlet β€”
38
+ we cannot ask them for "only today's AI articles".
39
+ The freshness gate (is_valid_article) and keyword gate
40
+ (is_relevant_to_category) in data_validation.py handle all filtering
41
+ after we return these articles. That is by design.
42
+ """
43
+
44
+ # ── Standard Library ──────────────────────────────────────────────────────────
45
+ import asyncio
46
+ import logging
47
+ import re
48
+ import time
49
+ from typing import List
50
+
51
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
52
+ import feedparser # XML/RSS feed parser β€” already used by rss_parser.py
53
+ import httpx # Async HTTP client
54
+
55
+ # ── Internal ──────────────────────────────────────────────────────────────────
56
+ from app.services.providers.base import NewsProvider
57
+ from app.services.rss_parser import RSSParser # Reuse helper methods, not the methods with hardcoded categories
58
+ from app.models import Article
59
+
60
+ logger = logging.getLogger(__name__)
61
+
62
+ # ── RSS Feed Registry ──────────────────────────────────────────────────────────
63
+ #
64
+ # These are the direct RSS feed URLs for the most trusted tech publications.
65
+ # Each entry is a tuple of (feed_url, source_name).
66
+ #
67
+ # "source_name" is the human-readable name we store on every article.
68
+ # It appears in the Segmento Pulse UI next to the article headline.
69
+ #
70
+ # To add a new RSS feed in the future, just add a new line here.
71
+ # The rest of the code picks it up automatically.
72
+ #
73
+ TECH_RSS_FEEDS: List[tuple] = [
74
+ ("https://techcrunch.com/feed", "TechCrunch"),
75
+ ("https://www.wired.com/feed/rss", "Wired"),
76
+ ("https://www.theverge.com/rss/tech/index.xml", "The Verge"),
77
+ ("https://www.engadget.com/rss.xml", "Engadget"),
78
+ ("https://feeds.arstechnica.com/arstechnica/technology-lab", "Ars Technica"),
79
+ ]
80
+
81
+ # Maximum articles to take from each individual feed.
82
+ # 10 per feed Γ— 5 feeds = up to 50 articles total per aggregator run.
83
+ MAX_ARTICLES_PER_FEED = 10
84
+
85
+ # How long (in seconds) to wait for a feed to respond before giving up.
86
+ HTTP_TIMEOUT_SECONDS = 12.0
87
+
88
+
89
+ class DirectRSSProvider(NewsProvider):
90
+ """
91
+ Fetches articles directly from the RSS feeds of premium tech publications.
92
+
93
+ Free. No API key needed. No rate limits.
94
+ Provides the best descriptions and images of all our free providers,
95
+ because these are professionally edited by full-time journalists.
96
+
97
+ Usage (wired into the aggregator in Phase 4):
98
+ provider = DirectRSSProvider()
99
+ articles = await provider.fetch_news(category="ai", limit=50)
100
+ """
101
+
102
+ def __init__(self):
103
+ # Free provider β€” no API key, no daily limit.
104
+ super().__init__(api_key=None)
105
+ self.daily_limit = 0
106
+
107
+ # Phase 17: Fetch-Once, Fan-Out cache
108
+ #
109
+ # Direct RSS fetches TechCrunch, Wired, The Verge, Engadget, and
110
+ # Ars Technica. These do NOT change between categories β€” the same
111
+ # 5 XML files contain the same articles whether the category is
112
+ # "ai", "cloud-aws", or "data-security".
113
+ #
114
+ # Without a cache: 22 categories Γ— 5 feeds = 110 outbound HTTP requests
115
+ # per scheduler run, all downloading the exact same XML.
116
+ #
117
+ # With a cache: first category fetches 5 feeds once, stores results
118
+ # here. The other 21 categories get the list instantly from memory.
119
+ # Total outbound requests: 5. A 95% reduction.
120
+ self._cached_articles: List[Article] = []
121
+ self._cache_time: float = 0.0
122
+
123
+ # asyncio.Lock prevents a race condition during the first run.
124
+ # When the scheduler fires, asyncio.gather() calls fetch_news() for
125
+ # multiple categories at the same time. Without the lock, all of them
126
+ # would see an empty cache and all start their own 5-feed HTTP fetch
127
+ # simultaneously. That defeats the whole purpose. With the lock,
128
+ # only the FIRST caller fetches; the rest wait and then read from cache.
129
+ self._lock = asyncio.Lock()
130
+
131
+ # We borrow helpers from the existing RSSParser.
132
+ # We do NOT call parse_google_news() or parse_provider_rss() β€”
133
+ # those have category logic built in that would break our routing.
134
+ # We only use the helper methods: _extract_image_from_entry, _parse_date.
135
+ self._rss_helpers = RSSParser()
136
+
137
+ # ─────────────────────────────────────────────────────────────────────────
138
+ # MAIN ENTRY POINT β€” called by the aggregator
139
+ # ─────────────────────────────────────────────────────────────────────────
140
+
141
+ async def fetch_news(self, category: str, limit: int = 50) -> List[Article]:
142
+ """
143
+ Fetch articles from all premium tech RSS feeds concurrently.
144
+
145
+ Args:
146
+ category (str): The category string passed from the aggregator.
147
+ We tag every article with this so the pipeline
148
+ can route it to the correct Appwrite collection.
149
+ The keyword gate will filter out irrelevant articles.
150
+ limit (int): Not strictly enforced here β€” we let the per-feed
151
+ cap (MAX_ARTICLES_PER_FEED) control volume, and
152
+ the aggregator deduplication handles the rest.
153
+
154
+ Returns:
155
+ List[Article]: All articles collected across all 5 feeds.
156
+ Returns [] if network is down for all feeds.
157
+ """
158
+ # ── Phase 17: Cache check (OUTER) ─────────────────────────────────────
159
+ # 2700 seconds = 45 minutes. If we fetched the RSS feeds less than
160
+ # 45 minutes ago, return the stored articles immediately.
161
+ # No HTTP request. No XML parsing. Instant return.
162
+ #
163
+ # Why 45 minutes? Our freshness gate uses an hourly window. A 45-minute
164
+ # cache is safely inside that window, giving us fresh-enough content
165
+ # without hammering TechCrunch and Wired every minute.
166
+ CACHE_TTL_SECONDS = 2700 # 45 minutes
167
+
168
+ if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
169
+ logger.debug(
170
+ "[DirectRSS] Cache hit β€” returning %d cached articles for category='%s'. "
171
+ "No HTTP calls made.",
172
+ len(self._cached_articles), category
173
+ )
174
+ return self._cached_articles
175
+
176
+ # ── Cache stale or empty: acquire the lock and fetch ───────────────────
177
+ # Only one coroutine can be inside this block at a time.
178
+ # Any other coroutine that reaches this point will WAIT here until
179
+ # the first one has finished and released the lock.
180
+ async with self._lock:
181
+
182
+ # ── Cache check (INNER) β€” double-checked locking ──────────────
183
+ # While THIS coroutine was waiting for the lock, the coroutine that
184
+ # held the lock before us already fetched and filled the cache.
185
+ # We check again so we don't fetch a second time.
186
+ if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
187
+ logger.debug(
188
+ "[DirectRSS] Cache hit after lock (another task fetched it) β€” "
189
+ "returning %d cached articles.",
190
+ len(self._cached_articles)
191
+ )
192
+ return self._cached_articles
193
+
194
+ # Cache is genuinely stale β€” this coroutine won the race.
195
+ # Do the full HTTP fetch now.
196
+ logger.info("[DirectRSS] Cache stale/empty. Fetching all 5 RSS feeds...")
197
+
198
+ try:
199
+ async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
200
+
201
+ # Step 1: Build one fetch task per RSS feed URL.
202
+ # All tasks run at the same time β€” we do not wait for feed #1
203
+ # before starting feed #2. This keeps total time under 2 seconds.
204
+ fetch_tasks = [
205
+ self._fetch_and_parse_feed(client, url, source_name, category)
206
+ for url, source_name in TECH_RSS_FEEDS
207
+ ]
208
+
209
+ # Step 2: Launch all tasks simultaneously.
210
+ results = await asyncio.gather(*fetch_tasks, return_exceptions=True)
211
+
212
+ # Step 3: Combine all lists into one. Skip any that errored.
213
+ all_articles: List[Article] = []
214
+ for feed_url_source, result in zip(TECH_RSS_FEEDS, results):
215
+ source_name = feed_url_source[1]
216
+ if isinstance(result, Exception):
217
+ logger.warning(
218
+ f"[DirectRSS] [{source_name}] Feed fetch failed: {result}"
219
+ )
220
+ elif isinstance(result, list):
221
+ all_articles.extend(result)
222
+
223
+ logger.info(
224
+ "[DirectRSS] Fetched %d articles across %d feeds. "
225
+ "Caching for 45 minutes.",
226
+ len(all_articles), len(TECH_RSS_FEEDS)
227
+ )
228
+
229
+ # Save results and timestamp to the class-level cache.
230
+ self._cached_articles = all_articles
231
+ self._cache_time = time.time()
232
+ return all_articles
233
+
234
+ except Exception as e:
235
+ logger.error(f"[DirectRSS] Unexpected error: {e}", exc_info=True)
236
+ return []
237
+
238
+ # ─────────────────────────────────────────────────────────────────────────
239
+ # PRIVATE HELPERS
240
+ # ─────────────────────────────────────────────────────────────────────────
241
+
242
+ async def _fetch_and_parse_feed(
243
+ self,
244
+ client: httpx.AsyncClient,
245
+ url: str,
246
+ source_name: str,
247
+ category: str,
248
+ ) -> List[Article]:
249
+ """
250
+ Fetch one RSS feed URL and parse it into Article objects.
251
+
252
+ Args:
253
+ client (httpx.AsyncClient): Shared HTTP client from fetch_news().
254
+ url (str): The RSS feed URL (e.g., https://techcrunch.com/feed).
255
+ source_name (str): Human-readable name (e.g., "TechCrunch").
256
+ category (str): The category from the aggregator β€” stored on each article.
257
+
258
+ Returns:
259
+ List[Article]: Parsed articles from this feed. Returns [] on any failure.
260
+ """
261
+ try:
262
+ response = await client.get(
263
+ url,
264
+ # Politely identify ourselves. Some servers block unknown user agents.
265
+ headers={"User-Agent": "SegmentoPulse-RSS-Reader/1.0"},
266
+ follow_redirects=True,
267
+ )
268
+
269
+ if response.status_code != 200:
270
+ logger.warning(
271
+ f"[DirectRSS] [{source_name}] HTTP {response.status_code} β€” skipping."
272
+ )
273
+ return []
274
+
275
+ xml_text = response.text
276
+
277
+ except httpx.TimeoutException:
278
+ logger.warning(f"[DirectRSS] [{source_name}] Timed out β€” skipping.")
279
+ return []
280
+ except Exception as e:
281
+ logger.warning(f"[DirectRSS] [{source_name}] Fetch error: {e}")
282
+ return []
283
+
284
+ # Hand the raw XML to feedparser β€” it handles all RSS/Atom variants
285
+ # (RSS 2.0, Atom 1.0, etc.) automatically.
286
+ return self._parse_feed_xml(xml_text, source_name, category)
287
+
288
+ def _parse_feed_xml(
289
+ self,
290
+ xml_text: str,
291
+ source_name: str,
292
+ category: str,
293
+ ) -> List[Article]:
294
+ """
295
+ Parse raw XML text from a feed into a list of Article objects.
296
+
297
+ Uses feedparser to decode the XML, then maps each entry to our
298
+ Pydantic Article model. We reuse rss_parser's helper methods for
299
+ image extraction and date parsing so the logic is consistent
300
+ across all RSS sources in the system.
301
+
302
+ Args:
303
+ xml_text (str): Raw XML string from the HTTP response.
304
+ source_name (str): Name of the publication (e.g., "Wired").
305
+ category (str): Category to tag on every article.
306
+
307
+ Returns:
308
+ List[Article]: Parsed articles. May be [] if the feed is malformed.
309
+ """
310
+ try:
311
+ feed = feedparser.parse(xml_text)
312
+ except Exception as e:
313
+ logger.warning(f"[DirectRSS] [{source_name}] feedparser failed: {e}")
314
+ return []
315
+
316
+ articles: List[Article] = []
317
+
318
+ for entry in feed.entries[:MAX_ARTICLES_PER_FEED]:
319
+
320
+ # ── Title ────────────────────────────────────────────────────────
321
+ title = (entry.get("title") or "").strip()
322
+ if not title:
323
+ continue # Every article must have a title
324
+
325
+ # ── URL ──────────────────────────────────────────────────────────
326
+ url = (entry.get("link") or "").strip()
327
+ if not url or not url.startswith("http"):
328
+ continue # Every article must have a clickable link
329
+
330
+ # ── Description ──────────────────────────────────────────────────
331
+ # RSS feeds usually put a short summary in the 'summary' field.
332
+ # We strip any HTML tags, then cap it at 200 characters.
333
+ raw_desc = entry.get("summary", "") or ""
334
+ description = re.sub(r"<[^>]+>", "", raw_desc).strip()
335
+ if len(description) > 200:
336
+ description = description[:200] + "..."
337
+
338
+ # ── Image URL ────────────────────────────────────────────────────
339
+ # We reuse the existing _extract_image_from_entry helper from
340
+ # rss_parser.py. It checks media:content, media:thumbnail,
341
+ # enclosures, and <img> tags inside the description.
342
+ image_url = self._rss_helpers._extract_image_from_entry(entry)
343
+
344
+ # ── Published Date ───────────────────────────────────────────────
345
+ # We reuse the existing _parse_date helper from rss_parser.py.
346
+ # It handles RFC 2822, ISO 8601, and other common date formats.
347
+ raw_date = entry.get("published", "") or ""
348
+ published_at = self._rss_helpers._parse_date(raw_date)
349
+
350
+ # ── Build Article ────────────────────────────────────────────────
351
+ try:
352
+ article = Article(
353
+ title=title,
354
+ description=description,
355
+ url=url,
356
+ image_url=image_url,
357
+ published_at=published_at,
358
+ source=source_name,
359
+ # ── ROUTING RULE ──────────────────────────────────────
360
+ # We set the category that the aggregator passed in.
361
+ # The keyword gate will reject articles that don't
362
+ # actually match this category β€” that's completely fine.
363
+ # It is much safer than guessing a wrong category here.
364
+ category=category,
365
+ )
366
+ articles.append(article)
367
+
368
+ except Exception as e:
369
+ # One bad article should never cancel the rest of the feed
370
+ logger.debug(
371
+ f"[DirectRSS] [{source_name}] Skipped entry '{title[:50]}': {e}"
372
+ )
373
+ continue
374
+
375
+ logger.info(
376
+ f"[DirectRSS] [{source_name}] Parsed {len(articles)} articles."
377
+ )
378
+ return articles
app/services/providers/hackernews/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # providers/hackernews/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'hackernews' folder as a Python package.
4
+ # To use the Hacker News provider, import it like this:
5
+ #
6
+ # from app.services.providers.hackernews.client import HackerNewsProvider
7
+ #
8
+ # This provider is entirely self-contained in this folder.
9
+ # It does not touch news_providers.py, news_aggregator.py, or anything else.
app/services/providers/hackernews/client.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/hackernews/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The Hacker News Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches the top stories from Hacker News β€” a community-voted list of the
8
+ best tech articles on the internet. It is completely free to use and has
9
+ no rate limits or API key requirement.
10
+
11
+ How the Hacker News API works (Two-Step Process):
12
+ Step 1: Ask HN for a list of top story IDs (one big list)
13
+ Step 2: For each ID, ask HN for that story's actual details
14
+
15
+ We only take the top 30 IDs. If we tried 500 IDs (the full list),
16
+ it would take too long and put unnecessary load on their server.
17
+ 30 is a safe, polite number that still gives us great content.
18
+
19
+ What we do about missing data:
20
+ - No URL? β†’ Skip this story entirely (it's an "Ask HN" self-post).
21
+ Our database cannot link to a story without a URL.
22
+ - No image? β†’ Set image_url = "". The frontend will use the
23
+ Segmento Pulse banner image as the default.
24
+ - No summary? β†’ Set description = "". HN only provides the title
25
+ for external links, not a description.
26
+ - Unix time? β†’ Convert to ISO 8601 string (our standard date format).
27
+
28
+ Client-side constraint note (from our architecture plan):
29
+ Hacker News does NOT support any filtering. We cannot ask it for
30
+ "only today's articles" or "only AI news". It gives us what it gives us.
31
+ That is completely fine. Our data_validation pipeline (is_valid_article,
32
+ is_relevant_to_category) will filter out old or off-topic articles
33
+ automatically AFTER we fetch them. We just fetch and map here.
34
+ """
35
+
36
+ # ── Standard Library ──────────────────────────────────────────────────────────
37
+ import asyncio # Lets us run multiple HTTP calls at the same time
38
+ import logging
39
+ from datetime import datetime, timezone
40
+ from typing import List, Optional
41
+
42
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
43
+ import httpx # Async HTTP client
44
+
45
+ # ── Internal ──────────────────────────────────────────────────────────────────
46
+ # We import only from our new base β€” no dependency on legacy news_providers.py
47
+ from app.services.providers.base import NewsProvider, ProviderStatus
48
+ from app.models import Article
49
+ # Phase 12: Shared image enricher (extracts og:image from article pages)
50
+ from app.services.utils.image_enricher import extract_top_image
51
+
52
+ logger = logging.getLogger(__name__)
53
+
54
+ # ── Constants ─────────────────────────────────────────────────────────────────
55
+
56
+ # The top of this list = the most upvoted stories on Hacker News right now
57
+ HN_TOP_STORIES_URL = "https://hacker-news.firebaseio.com/v0/topstories.json"
58
+
59
+ # Template for fetching one story's full details by its ID
60
+ HN_ITEM_URL = "https://hacker-news.firebaseio.com/v0/item/{item_id}.json"
61
+
62
+ # How many top stories to fetch. Kept small to be polite to HN's servers.
63
+ # The full list has 500 stories β€” we only want the best 30.
64
+ TOP_STORIES_LIMIT = 30
65
+
66
+ # HTTP timeout in seconds. HN is fast, but we cap it to avoid hanging jobs.
67
+ HTTP_TIMEOUT_SECONDS = 10.0
68
+
69
+
70
class HackerNewsProvider(NewsProvider):
    """
    Fetches top stories from the Hacker News API.

    No API key needed. No rate limit. Completely free.

    Usage (once wired into the aggregator in Phase 3):
        provider = HackerNewsProvider()
        articles = await provider.fetch_news(category="magazines", limit=30)
    """

    def __init__(self):
        # Free provider — no API key needed, so we pass None to the base class.
        super().__init__(api_key=None)

        # daily_limit = 0 means "no limit". HN has no quota.
        self.daily_limit = 0

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT — the one method the aggregator calls
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
        """
        Fetch the top stories from Hacker News.

        Args:
            category (str): The category passed in by the aggregator. Stored on
                each article; HN cannot filter by it — the keyword gate in
                data_validation.py handles that downstream.
            limit (int): Maximum number of articles to return, capped at
                TOP_STORIES_LIMIT (30) regardless.

        Returns:
            List[Article]: Article objects from Hacker News.
                Returns [] if the network is down or HN is unreachable.
        """
        try:
            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:

                # ── STEP 1: Get the ranked list of top story IDs ──────────
                top_ids = await self._fetch_top_ids(client)
                if not top_ids:
                    logger.warning("[HackerNews] Could not retrieve top story IDs.")
                    return []

                # We only want the best N IDs.
                ids_to_fetch = top_ids[:min(limit, TOP_STORIES_LIMIT)]

                # ── STEP 2: Fetch all story details concurrently ──────────
                # Launching all requests at once takes ~1-2 seconds instead
                # of ~30 seconds sequential.
                fetch_tasks = [
                    self._fetch_single_item(client, story_id)
                    for story_id in ids_to_fetch
                ]
                raw_items = await asyncio.gather(*fetch_tasks, return_exceptions=True)

                # ── MAP: Convert raw HN items → Article objects ────────────
                articles = self._map_items_to_articles(raw_items, category)

                # ── ENRICH: Fetch images for articles that have none ───────
                # The mapper is sync and cannot await, so enrichment runs
                # here. All image fetches run concurrently — total extra wait
                # is bounded by the enricher's timeout, not 30x it.
                articles = await self._enrich_article_images(articles)

                logger.info(
                    f"[HackerNews] Fetched {len(raw_items)} items → "
                    f"{len(articles)} valid articles for category='{category}'"
                )
                return articles

        except httpx.TimeoutException:
            logger.warning("[HackerNews] Request timed out. Will retry next cycle.")
            return []
        except Exception as e:
            # Catch-all: never let a HN failure crash the aggregator job.
            logger.error(f"[HackerNews] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPERS — internal steps, not called by the aggregator
    # ─────────────────────────────────────────────────────────────────────────

    async def _fetch_top_ids(self, client: httpx.AsyncClient) -> List[int]:
        """
        Step 1: Ask Hacker News for the IDs of its top stories.

        Returns a list of integers like [39281947, 39281001, ...].
        Returns [] if HN is unreachable or returns an error.
        """
        try:
            response = await client.get(HN_TOP_STORIES_URL)

            if response.status_code != 200:
                logger.warning(
                    f"[HackerNews] Top stories endpoint returned HTTP {response.status_code}"
                )
                return []

            ids = response.json()

            # Sanity check — make sure we got a list, not garbage.
            if not isinstance(ids, list):
                logger.warning("[HackerNews] Unexpected response format for top IDs.")
                return []

            # Fix: drop any non-integer entries so HN_ITEM_URL.format()
            # downstream can never receive junk.
            return [item_id for item_id in ids if isinstance(item_id, int)]

        except Exception as e:
            logger.error(f"[HackerNews] Failed to fetch top IDs: {e}")
            return []

    async def _fetch_single_item(
        self, client: httpx.AsyncClient, item_id: int
    ) -> Optional[dict]:
        """
        Step 2 (single unit): Fetch the details for one Hacker News story.

        Args:
            client (httpx.AsyncClient): Shared client passed from fetch_news().
            item_id (int): The numeric ID of the story to fetch.

        Returns:
            dict of story details, or None if the request failed.
        """
        url = HN_ITEM_URL.format(item_id=item_id)
        try:
            response = await client.get(url)

            if response.status_code != 200:
                return None

            item = response.json()

            # HN can return null for deleted or dead items.
            if not item:
                return None

            return item

        except Exception:
            # A single story failing should not cancel the other 29 stories.
            return None

    def _map_items_to_articles(
        self, raw_items: list, category: str
    ) -> List[Article]:
        """
        Convert raw Hacker News JSON items into Segmento Pulse Article objects.

        Data transformation rules:
            - Unix timestamp → ISO 8601 string
            - Missing URL    → skip (self-posts cannot be stored)
            - deleted/dead   → skip (per the HN API these items are removed)
            - Missing image  → "" (frontend uses the Pulse banner)
            - Missing text   → "" (HN has no descriptions for external links)

        Args:
            raw_items (list): Results from asyncio.gather() — each is either
                a dict (success) or None/Exception (failure).
            category (str): The category string from the aggregator,
                passed through as-is.

        Returns:
            List[Article]: Clean, valid Article objects ready for the pipeline.
        """
        articles: List[Article] = []

        for item in raw_items:

            # Skip anything that errored or returned null from HN.
            if item is None or isinstance(item, Exception):
                continue

            # Fix: the HN API flags removed content with 'deleted' / 'dead'
            # booleans. Such items must never reach the pipeline.
            if item.get("deleted") or item.get("dead"):
                continue

            # HN also returns "job", "comment", "poll" types — we only want
            # "story", the actual articles.
            if item.get("type") != "story":
                continue

            # "Ask HN" / "Show HN" self-posts have no external 'url' key.
            # Our database cannot store a meaningful link for these.
            url = item.get("url", "")
            if not url or not url.startswith("http"):
                continue

            title = (item.get("title") or "").strip()
            if not title:
                continue

            # Unix timestamp (seconds since epoch) → ISO 8601 string.
            # Example: 1709432800 → "2024-03-03T04:46:40+00:00"
            unix_time = item.get("time")
            if unix_time:
                published_at = datetime.fromtimestamp(
                    unix_time, tz=timezone.utc
                ).isoformat()
            else:
                # No timestamp: fall back to now; the freshness gate in
                # data_validation.py will still evaluate it.
                published_at = datetime.now(tz=timezone.utc).isoformat()

            try:
                article = Article(
                    title=title,
                    description="",  # HN does not provide descriptions
                    url=url,
                    image_url="",    # HN does not provide images
                    published_at=published_at,
                    source="Hacker News",
                    # ROUTING RULE: pass through the aggregator's category.
                    # The keyword gate in data_validation rejects mismatches
                    # safely — no routing damage to the database.
                    category=category,
                )
                articles.append(article)

            except Exception as e:
                # One article failing Pydantic validation must never break
                # the whole batch.
                logger.debug(
                    f"[HackerNews] Skipped item id={item.get('id')}: {e}"
                )
                continue

        return articles

    # ─────────────────────────────────────────────────────────────────────────
    # PHASE 12: IMAGE ENRICHMENT — async post-processing step
    # ─────────────────────────────────────────────────────────────────────────

    async def _enrich_article_images(self, articles: List[Article]) -> List[Article]:
        """
        For every article that has an empty image_url, visit its URL and
        try to find the main image using the og:image HTML meta tag.

        A Semaphore(10) caps concurrent connections (Phase 14): at most 10
        website visits run at the same time, so a 30-article batch cannot
        exhaust socket handles on a shared container. Total added time is
        still bounded by the timeout inside extract_top_image, not by the
        semaphore.

        Args:
            articles (List[Article]): Articles from _map_items_to_articles().

        Returns:
            List[Article]: Same articles, with image_url filled in where possible.
        """
        if not articles:
            return articles

        # Created fresh per call so no state leaks between fetch_news() runs.
        sem = asyncio.Semaphore(10)

        async def _get_image(article: Article) -> str:
            if article.image_url and article.image_url.startswith("http"):
                return article.image_url  # Already has an image — skip
            # Acquire one of 10 available slots before hitting the network.
            async with sem:
                return await extract_top_image(article.url)

        image_tasks = [_get_image(a) for a in articles]
        fetched_images = await asyncio.gather(*image_tasks, return_exceptions=True)

        # Apply the fetched images back to the articles.
        enriched: List[Article] = []
        for article, image_result in zip(articles, fetched_images):
            if isinstance(image_result, str) and image_result:
                # Pydantic v2: model_copy() changes one field without mutating.
                article = article.model_copy(update={"image_url": image_result})
            enriched.append(article)

        return enriched
app/services/providers/inshorts/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/inshorts/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'inshorts' folder as a Python package.
4
+ # To use the Inshorts provider, import it like this:
5
+ #
6
+ # from app.services.providers.inshorts.client import InshortsProvider
7
+ #
8
+ # Inshorts is a FREE provider β€” no API key needed, no rate limits.
9
+ # It runs in the FREE_SOURCES list, behind the GENERAL_TECH_CATEGORIES
10
+ # guardrail (same as Hacker News), because its content is broad tech news
11
+ # rather than anything niche like cloud-alibaba or data-governance.
app/services/providers/inshorts/client.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/inshorts/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The Inshorts Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches 60-word tech news summaries from the Inshorts community API.
8
+ Inshorts takes long articles from the internet and rewrites them in
9
+ exactly 60 words. This gives our users very quick, scannable reads.
10
+
11
+ Free. No API key needed. No rate limits.
12
+
13
+ Where it sits in the pipeline:
14
+ FREE_SOURCES (always runs in parallel).
15
+ Gated behind GENERAL_TECH_CATEGORIES β€” same rule as Hacker News.
16
+ Inshorts "technology" news is broad. It does not know the difference
17
+ between "cloud-alibaba" and "cloud-gcp". We only ask it for wide,
18
+ general categories where its content is genuinely valuable.
19
+
20
+ The special data quirk (split date and time):
21
+ Inshorts returns the article timestamp as TWO separate strings:
22
+ "date": "Mon, 03 Mar 2026"
23
+ "time": "10:30 AM, IST"
24
+
25
+ Our Pydantic Article model needs a SINGLE published_at timestamp.
26
+ So we join them: "Mon, 03 Mar 2026 10:30 AM, IST"
27
+ Then we parse that combined string into a proper datetime object using
28
+ dateutil.parser (the same library our rss_parser.py already uses).
29
+
30
+ If parsing fails, we safely fall back to datetime.now() so the article
31
+ still enters the pipeline and the freshness gate makes the final call.
32
+
33
+ API note:
34
+ The endpoint used below is a well-known community-maintained mirror of
35
+ the Inshorts API. It may change URLs over time. The try/except in
36
+ fetch_news() wraps the entire fetch, so even if the endpoint goes down,
37
+ the aggregator just gets an empty list and moves on without crashing.
38
+ """
39
+
40
+ # ── Standard Library ──────────────────────────────────────────────────────────
41
+ import asyncio
42
+ import logging
43
+ import time
44
+ from datetime import datetime, timezone
45
+ from typing import List
46
+
47
+ # ── Third-party (already available β€” used by rss_parser.py line 209) ─────────
48
+ import httpx # Async HTTP client
49
+ from dateutil import parser as dateutil_parser # Flexible date string parser
50
+
51
+ # ── Internal ──────────────────────────────────────────────────────────────────
52
+ from app.services.providers.base import NewsProvider, ProviderStatus
53
+ from app.models import Article
54
+
55
+ logger = logging.getLogger(__name__)
56
+
57
+ # ── Constants ─────────────────────────────────────────────────────────────────
58
+
59
+ # Inshorts community API endpoint.
60
+ # The 'category=technology' filter is the closest match to our content needs.
61
+ # Other available categories: national, business, sports, entertainment, etc.
62
+ INSHORTS_URL = "https://inshorts.deta.dev/news?category=technology"
63
+
64
+ # Request timeout in seconds. Kept generous because this is a community server.
65
+ HTTP_TIMEOUT_SECONDS = 12.0
66
+
67
+ # Max articles to take from one response. Inshorts usually sends 10-25.
68
+ MAX_ARTICLES = 20
69
+
70
+
71
+ class InshortsProvider(NewsProvider):
72
+ """
73
+ Fetches 60-word technology summaries from the Inshorts community API.
74
+
75
+ Free. No API key. No daily limit.
76
+ Sits in FREE_SOURCES, gated by GENERAL_TECH_CATEGORIES.
77
+
78
+ Usage (wired in Phase 6):
79
+ provider = InshortsProvider()
80
+ articles = await provider.fetch_news(category="ai", limit=20)
81
+ """
82
+
83
+ def __init__(self):
84
+ # Free provider β€” no API key, no daily limit.
85
+ super().__init__(api_key=None)
86
+ self.daily_limit = 0
87
+
88
+ # Phase 17: Fetch-Once, Fan-Out cache
89
+ #
90
+ # Inshorts hits a community server β€” not a CDN like GitHub Pages.
91
+ # Without a cache, every category loop sends a request to that
92
+ # community server, increasing the chance of a 429 rate-limit block.
93
+ # With a cache: 22 category calls β†’ 1 real HTTP call per 45 minutes.
94
+ self._cached_articles: List[Article] = []
95
+ self._cache_time: float = 0.0
96
+
97
+ # Lock prevents the "thundering herd": multiple concurrent calls
98
+ # all seeing an empty cache and all fetching at the same time.
99
+ self._lock = asyncio.Lock()
100
+
101
+ # ─────────────────────────────────────────────────────────────────────────
102
+ # MAIN ENTRY POINT β€” called by the aggregator's FREE PARALLEL RUN
103
+ # ────────────────────────────────────────���────────────────────────────────
104
+
105
+ async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
106
+ """
107
+ Fetch technology articles from the Inshorts community API.
108
+
109
+ Args:
110
+ category (str): Our internal category string (e.g., "ai").
111
+ We tag every article with it. The keyword gate
112
+ filters out articles that don't actually match.
113
+ limit (int): Max articles to return. Capped at MAX_ARTICLES.
114
+
115
+ Returns:
116
+ List[Article]: Mapped Article objects. Returns [] on any failure.
117
+ """
118
+ # ── Phase 17: Cache check (OUTER) ─────────────────────────────────────
119
+ CACHE_TTL_SECONDS = 2700 # 45 minutes
120
+
121
+ if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
122
+ logger.debug(
123
+ "[Inshorts] Cache hit β€” returning %d cached articles for category='%s'. "
124
+ "No HTTP calls made.",
125
+ len(self._cached_articles), category
126
+ )
127
+ return self._cached_articles
128
+
129
+ # ── Cache stale or empty: acquire the lock and fetch ───────────────────
130
+ async with self._lock:
131
+
132
+ # ── Cache check (INNER) β€” double-checked locking ──────────────
133
+ if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
134
+ logger.debug(
135
+ "[Inshorts] Cache hit after lock β€” returning %d cached articles.",
136
+ len(self._cached_articles)
137
+ )
138
+ return self._cached_articles
139
+
140
+ logger.info(
141
+ "[Inshorts] Cache stale/empty. Fetching from community API for category='%s'...",
142
+ category
143
+ )
144
+
145
+ try:
146
+ async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
147
+
148
+ response = await client.get(
149
+ INSHORTS_URL,
150
+ headers={"User-Agent": "SegmentoPulse-Ingestion/1.0"},
151
+ follow_redirects=True,
152
+ )
153
+
154
+ # ── Handle rate limit ──────────────────────────────────────
155
+ if response.status_code == 429:
156
+ logger.warning("[Inshorts] Hit 429 rate limit.")
157
+ self.mark_rate_limited()
158
+ return []
159
+
160
+ # ── Handle non-200 responses ──────────────────────────────
161
+ if response.status_code != 200:
162
+ logger.warning(
163
+ "[Inshorts] Unexpected HTTP %d. "
164
+ "The community API endpoint may have changed.",
165
+ response.status_code
166
+ )
167
+ return []
168
+
169
+ data = response.json()
170
+
171
+ # Inshorts wraps the article list inside a 'data' key.
172
+ raw_articles = data.get("data", [])
173
+
174
+ if not isinstance(raw_articles, list) or not raw_articles:
175
+ logger.info("[Inshorts] No articles in response.")
176
+ return []
177
+
178
+ all_articles = self._map_articles(
179
+ raw_articles[:min(limit, MAX_ARTICLES)],
180
+ category
181
+ )
182
+
183
+ logger.info(
184
+ "[Inshorts] Fetched %d articles. Caching for 45 minutes.",
185
+ len(all_articles)
186
+ )
187
+
188
+ # Save to class-level cache.
189
+ self._cached_articles = all_articles
190
+ self._cache_time = time.time()
191
+ return all_articles
192
+
193
+ except httpx.TimeoutException:
194
+ logger.warning("[Inshorts] Request timed out β€” endpoint may be slow.")
195
+ return []
196
+ except Exception as e:
197
+ logger.error(f"[Inshorts] Unexpected error: {e}", exc_info=True)
198
+ return []
199
+
200
+ # ─────────────────────────────────────────────────────────────────────────
201
+ # PRIVATE HELPERS
202
+ # ─────────────────────────────────────────────────────────────────────────
203
+
204
+ def _parse_inshorts_date(self, date_str: str, time_str: str) -> str:
205
+ """
206
+ Solve the split date/time problem.
207
+
208
+ Inshorts gives us date and time as two separate strings.
209
+ Example:
210
+ date_str = "Mon, 03 Mar 2026"
211
+ time_str = "10:30 AM, IST"
212
+
213
+ Step 1: Join them β†’ "Mon, 03 Mar 2026 10:30 AM, IST"
214
+ Step 2: Parse with dateutil (handles many date formats automatically)
215
+ Step 3: Convert to UTC-aware ISO 8601 string
216
+
217
+ If parsing fails for any reason, we return the current time as a
218
+ safe fallback. The freshness gate downstream will evaluate it.
219
+
220
+ Args:
221
+ date_str (str): The date portion from the API (e.g., "Mon, 03 Mar 2026")
222
+ time_str (str): The time portion from the API (e.g., "10:30 AM, IST")
223
+
224
+ Returns:
225
+ str: ISO 8601 timestamp string (e.g., "2026-03-03T05:00:00+00:00")
226
+ """
227
+ # Clean up trailing ", IST" or "(IST)" markers β€” dateutil sometimes
228
+ # gets confused by non-standard timezone abbreviations like IST.
229
+ # We strip them and treat the time as IST = UTC+5:30 manually.
230
+ cleaned_time = (
231
+ time_str
232
+ .replace(", IST", "")
233
+ .replace("(IST)", "")
234
+ .strip()
235
+ )
236
+ combined = f"{date_str.strip()} {cleaned_time}"
237
+
238
+ try:
239
+ # dateutil.parser is very flexible β€” it handles formats like:
240
+ # "Mon, 03 Mar 2026 10:30 AM" without needing a strptime pattern.
241
+ parsed_dt = dateutil_parser.parse(combined)
242
+
243
+ # If the parsed datetime has no timezone info (which it won't after
244
+ # we stripped IST), we tell Python it was in IST (UTC+5:30).
245
+ if parsed_dt.tzinfo is None:
246
+ from datetime import timedelta
247
+ IST = timezone(timedelta(hours=5, minutes=30))
248
+ parsed_dt = parsed_dt.replace(tzinfo=IST)
249
+
250
+ # Convert to UTC for consistent storage across all providers.
251
+ utc_dt = parsed_dt.astimezone(timezone.utc)
252
+ return utc_dt.isoformat()
253
+
254
+ except Exception as e:
255
+ logger.debug(
256
+ f"[Inshorts] Date parse failed for '{combined}': {e} β€” using now()."
257
+ )
258
+ # Safe fallback: use current UTC time.
259
+ # The freshness gate will still check it and decide if it's valid.
260
+ return datetime.now(tz=timezone.utc).isoformat()
261
+
262
+ def _map_articles(self, raw_articles: list, category: str) -> List[Article]:
263
+ """
264
+ Convert raw Inshorts JSON items into Segmento Pulse Article objects.
265
+
266
+ Key field mappings:
267
+ Inshorts field β†’ Article field
268
+ ─────────────────────────────────────
269
+ title β†’ title
270
+ content β†’ description (the famous 60-word summary)
271
+ readMoreUrl β†’ url
272
+ imageUrl β†’ image_url
273
+ author β†’ source
274
+ date + time (joined) β†’ published_at
275
+
276
+ Args:
277
+ raw_articles (list): The list from the API's 'data' key.
278
+ category (str): The category from the aggregator.
279
+
280
+ Returns:
281
+ List[Article]: Clean, validated Article objects.
282
+ """
283
+ articles: List[Article] = []
284
+
285
+ for item in raw_articles:
286
+ if not isinstance(item, dict):
287
+ continue
288
+
289
+ # ── Title ────────────────────────────────────────────────────
290
+ title = (item.get("title") or "").strip()
291
+ if not title:
292
+ continue
293
+
294
+ # ── URL ──────────────────────────────────────────────────────
295
+ # Inshorts calls this 'readMoreUrl' β€” the link to the full article.
296
+ url = (item.get("readMoreUrl") or "").strip()
297
+ if not url or not url.startswith("http"):
298
+ continue # Skip if no valid link
299
+
300
+ # ── Description (the 60-word summary) ────────────────────────
301
+ # Inshorts calls the summary field 'content'.
302
+ description = (item.get("content") or "").strip()
303
+
304
+ # ── Image URL ─────────────────────────────────────────────────
305
+ # Inshorts calls this 'imageUrl' (camelCase).
306
+ image_url = (item.get("imageUrl") or "").strip()
307
+
308
+ # ── Source ───────────────────────────────────────────────────
309
+ # The 'author' field holds the original publication name
310
+ # (e.g., "TechCrunch", "NDTV Gadgets"). We use that as source.
311
+ # Fall back to "Inshorts" if author is missing.
312
+ source = (item.get("author") or "Inshorts").strip()
313
+ if not source:
314
+ source = "Inshorts"
315
+
316
+ # ── Date Fix: Combine split date + time ───────────────────────
317
+ # This is THE key transformation for this provider.
318
+ # See _parse_inshorts_date() above for the full explanation.
319
+ date_part = item.get("date") or ""
320
+ time_part = item.get("time") or ""
321
+ published_at = self._parse_inshorts_date(date_part, time_part)
322
+
323
+ # ── Build Article ─────────────────────────────────────────────
324
+ try:
325
+ article = Article(
326
+ title=title,
327
+ description=description,
328
+ url=url,
329
+ image_url=image_url,
330
+ published_at=published_at,
331
+ source=source,
332
+ # ── ROUTING RULE ──────────────────────────────────────
333
+ # We pass through the aggregator's category.
334
+ # The keyword gate will filter irrelevant articles.
335
+ # Unknown categories safely route to 'News Articles'.
336
+ category=category,
337
+ )
338
+ articles.append(article)
339
+
340
+ except Exception as e:
341
+ logger.debug(
342
+ f"[Inshorts] Skipped item '{title[:50]}': {e}"
343
+ )
344
+ continue
345
+
346
+ return articles
app/services/providers/openrss/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/openrss/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'openrss' folder as a Python package.
4
+ # To use this provider, import it like this:
5
+ #
6
+ # from app.services.providers.openrss.client import OpenRSSProvider
7
+ #
8
+ # OpenRSS is FREE β€” no API key needed. It generates XML feeds on-the-fly
9
+ # for any website, even sites that don't publish an RSS feed themselves.
10
+ #
11
+ # ── CRITICAL RULE: RESPECT THE COOLDOWN ──────────────────────────────────
12
+ # OpenRSS explicitly says "aggregator use is not officially supported".
13
+ # If you fetch too frequently, they WILL ban your server's IP address.
14
+ # The OpenRSSProvider enforces a strict 60-minute internal cooldown timer.
15
+ # DO NOT reduce COOLDOWN_SECONDS below 3600. Breaking this causes IP bans.
app/services/providers/openrss/client.py ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/openrss/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The OpenRSS Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches RSS feeds for websites that don't publish their own RSS feed,
8
+ by using OpenRSS.org as a free feed generation service.
9
+
10
+ Target blogs:
11
+ dev.to β†’ openrss.org/dev.to
12
+ hashnode.com β†’ openrss.org/hashnode.com
13
+ github.com/blog β†’ openrss.org/github.com/blog
14
+
15
+ Free. No API key. No daily limits. Just XML text.
16
+
17
+ ── THE IP BAN RISK AND HOW WE SOLVE IT ─────────────────────────────────────
18
+
19
+ OpenRSS.org says clearly in their documentation:
20
+ "Aggregator use is not officially supported."
21
+ "We will block IP addresses that ignore our Cache-Control headers."
22
+
23
+ A normal aggregator calls all its sources every hour.
24
+ If we did that with OpenRSS, we would get IP-banned within a day.
25
+
26
+ Our fix: A strict 60-minute (3600 second) internal cooldown timer.
27
+
28
+ How it works:
29
+ - When the provider is first created, self.last_fetched = 0
30
+ - When fetch_news() is called, it first checks:
31
+ time.time() - self.last_fetched < COOLDOWN_SECONDS?
32
+ - If YES β†’ return [] immediately, do not touch the network at all
33
+ - If NO β†’ update self.last_fetched, then fetch
34
+
35
+ This guarantees that OpenRSS sees at most ONE request per hour,
36
+ per URL, from our server β€” which respects their Cache-Control policy.
37
+
38
+ Because our scheduler runs many categories per hour, without this timer,
39
+ OpenRSS would get hit dozens of times per hour. With the timer, it gets
40
+ hit at most once every 60 minutes regardless of how many categories fire.
41
+
42
+ ── WHY WE DO NOT USE parse_provider_rss() ──────────────────────────────────
43
+
44
+ The user instruction suggests using parse_provider_rss() from rss_parser.py.
45
+ We discovered in Phase 4 (direct_rss provider) that this function hardcodes:
46
+
47
+ category = f'cloud-{provider}'
48
+
49
+ on EVERY article it creates. If we passed "dev.to" as the provider name,
50
+ every article from dev.to would get category='cloud-dev.to'. Appwrite
51
+ would not know this collection exists, silently dropping those articles.
52
+
53
+ Decision (consistent with Phase 4): We use feedparser directly and borrow
54
+ only the two STATELESS helper methods from rss_parser.py:
55
+ - _extract_image_from_entry() β†’ extracts images cleanly
56
+ - _parse_date() β†’ handles all date format variants
57
+
58
+ This is the same engineering decision made in Phase 4 for direct_rss,
59
+ and it was reviewed and approved by the lead architect.
60
+ """
61
+
62
+ # ── Standard Library ──────────────────────────────────────────────────────────
63
+ import asyncio
64
+ import logging
65
+ import re
66
+ import time
67
+ from typing import List
68
+
69
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
70
+ import feedparser # XML/RSS feed parser β€” already used by rss_parser.py
71
+ import httpx # Async HTTP client
72
+
73
+ # ── Internal ──────────────────────────────────────────────────────────────────
74
+ from app.services.providers.base import NewsProvider
75
+ from app.services.rss_parser import RSSParser # Borrowed for helper methods only
76
+ from app.models import Article
77
+ # Phase 15: Import the Redis-backed state utility so the cooldown
78
+ # timer survives Hugging Face Space restarts.
79
+ from app.services.utils.provider_state import (
80
+ get_provider_timestamp,
81
+ set_provider_timestamp,
82
+ )
83
+
84
+ logger = logging.getLogger(__name__)
85
+
86
+ # ── OpenRSS Feed Registry ──────────────────────────────────────────────────────
87
+ #
88
+ # Each entry is a tuple of (openrss_url, source_name).
89
+ # source_name appears in the Pulse UI next to each article headline.
90
+ #
91
+ # To add more feeds in the future, just add a new tuple here.
92
+ # The fetch loop picks it up automatically β€” no other code changes needed.
93
+ #
94
+ # ⚠️ IMPORTANT: Be conservative. Every URL here gets fetched once per cooldown
95
+ # window. Adding too many URLs consumes more of our cooldown budget.
96
+ #
97
# Each entry is (openrss_url, source_name). source_name is the label shown
# next to each article headline in the Pulse UI. New feeds added here are
# picked up by the fetch loop automatically — no other code changes needed.
# Be conservative: every URL costs one request per cooldown window.
OPENRSS_FEEDS: List[tuple] = [
    ("https://openrss.org/dev.to", "dev.to"),
    ("https://openrss.org/hashnode.com", "Hashnode"),
    ("https://openrss.org/github.com/blog", "GitHub Blog"),
]

# ── Cooldown Timer ─────────────────────────────────────────────────────────────
# 3600 seconds = 60 minutes — the minimum safe polling interval per OpenRSS's
# documentation. DO NOT reduce this value; polling faster risks an IP ban on
# Segmento Pulse's server.
COOLDOWN_SECONDS = 3600

# HTTP request timeout. OpenRSS is a third-party service; give it enough time.
HTTP_TIMEOUT_SECONDS = 15.0

# Max articles taken from each individual feed per cooldown window.
MAX_ARTICLES_PER_FEED = 10
114
+
115
+
116
class OpenRSSProvider(NewsProvider):
    """
    Fetches RSS feeds from dev.to, Hashnode, and GitHub Blog via OpenRSS.org.

    Free, no API key, and strictly self-rate-limited to one fetch per
    COOLDOWN_SECONDS (60 minutes) across ALL categories — the cooldown timer
    is the primary protection against an OpenRSS IP ban, so no per-category
    guardrail is needed.

    Usage:
        provider = OpenRSSProvider()
        articles = await provider.fetch_news(category="ai", limit=30)
    """

    def __init__(self):
        # Free provider — no API key, no daily quota.
        super().__init__(api_key=None)
        self.daily_limit = 0

        # Local fallback copy of the Redis-backed cooldown timestamp.
        # If Redis is unreachable at startup we fail-open with 0.0 (provider
        # allowed to run); every successful Redis read in fetch_news() keeps
        # this value in sync. It exists for logging/debugging only — the
        # cooldown decision is always made from the Redis value.
        self.last_fetched: float = 0.0

        # Borrow the STATELESS helper methods from the existing RSSParser
        # (_extract_image_from_entry / _parse_date). We deliberately do NOT
        # call parse_provider_rss(): it hardcodes category='cloud-{provider}',
        # which would mis-route every article from these feeds.
        self._rss_helpers = RSSParser()

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT — called by the aggregator's FREE PARALLEL RUN
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 30) -> List[Article]:
        """
        Fetch articles from all OpenRSS feeds — but only if COOLDOWN_SECONDS
        have passed since the last successful fetch.

        Args:
            category (str): Aggregator category — tagged on every article;
                the keyword gate filters irrelevant items downstream.
            limit (int): Soft cap on total articles.
                NOTE(review): `limit` is not applied in this method's visible
                body — per-feed volume is bounded by MAX_ARTICLES_PER_FEED.

        Returns:
            List[Article]: Combined articles from all feeds, or []
            immediately when the cooldown window is still active.
        """
        # Cooldown check reads the last-fetch timestamp from Redis (survives
        # restarts/redeploys/OOM kills), not from RAM. If Redis is down,
        # get_provider_timestamp returns 0.0 (fail-open): one extra OpenRSS
        # call is far safer than permanently blocking the provider.
        redis_last_fetched = await get_provider_timestamp("openrss")

        # Keep the RAM copy in sync for logging/debugging. This does NOT
        # affect the cooldown decision — only redis_last_fetched does.
        self.last_fetched = redis_last_fetched

        seconds_since_last_fetch = time.time() - redis_last_fetched
        if seconds_since_last_fetch < COOLDOWN_SECONDS:
            minutes_remaining = int(
                (COOLDOWN_SECONDS - seconds_since_last_fetch) / 60
            )
            logger.info(
                "[OpenRSS] Cooldown active — %d minute(s) remaining before next fetch. "
                "Skipping to protect against IP ban.",
                minutes_remaining
            )
            return []

        # Persist the new timestamp BEFORE any network call. If we saved it
        # AFTER and the fetch crashed halfway, the next cycle would see a
        # stale timestamp and immediately retry — hammering OpenRSS with the
        # exact rapid-retry pattern that triggers IP bans. Writing first means
        # any crash still waits the full cooldown. Missing one batch is
        # cheaper than a permanent ban.
        current_time = time.time()
        self.last_fetched = current_time  # Keep RAM copy in sync
        await set_provider_timestamp("openrss", current_time)

        logger.info(
            "[OpenRSS] Cooldown clear (Redis-backed). Starting fetch of %d feeds...",
            len(OPENRSS_FEEDS)
        )

        try:
            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:

                # One task per feed URL; all fire concurrently.
                fetch_tasks = [
                    self._fetch_and_parse_feed(client, url, source_name, category)
                    for url, source_name in OPENRSS_FEEDS
                ]

                # return_exceptions=True: one failing feed must not sink the rest.
                results = await asyncio.gather(*fetch_tasks, return_exceptions=True)

                # Flatten successful per-feed results; log the failures.
                all_articles: List[Article] = []
                for (_, source_name), result in zip(OPENRSS_FEEDS, results):
                    if isinstance(result, Exception):
                        logger.warning(
                            f"[OpenRSS] [{source_name}] Feed fetch failed: {result}"
                        )
                    elif isinstance(result, list):
                        all_articles.extend(result)

                logger.info(
                    f"[OpenRSS] Collected {len(all_articles)} articles "
                    f"from {len(OPENRSS_FEEDS)} feeds for category='{category}'"
                )
                return all_articles

        except Exception as e:
            logger.error(f"[OpenRSS] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPERS
    # ─────────────────────────────────────────────────────────────────────────

    async def _fetch_and_parse_feed(
        self,
        client: httpx.AsyncClient,
        url: str,
        source_name: str,
        category: str,
    ) -> List[Article]:
        """
        Fetch one OpenRSS feed URL and parse its XML into Article objects.

        Args:
            client (httpx.AsyncClient): Shared HTTP client from fetch_news().
            url (str): Full OpenRSS URL (e.g., openrss.org/dev.to).
            source_name (str): Human-readable label (e.g., "dev.to").
            category (str): Aggregator category — tagged on each article.

        Returns:
            List[Article]: Parsed articles; [] on any failure.
        """
        try:
            response = await client.get(
                url,
                headers={
                    "User-Agent": "SegmentoPulse-RSS-Reader/1.0",
                    # Deliberately no Cache-Control: no-cache — we rely on
                    # our own cooldown timer for freshness rather than
                    # forcing OpenRSS to bypass their cache.
                },
                follow_redirects=True,
            )

            if response.status_code == 429:
                # Rate-limited despite the cooldown. The timestamp was
                # already written at the top of fetch_news(), so the next
                # attempt naturally waits a full cooldown window — we just
                # skip this feed now and flag the configuration.
                logger.warning(
                    f"[OpenRSS] [{source_name}] HTTP 429 — rate-limited despite "
                    "cooldown. Consider increasing COOLDOWN_SECONDS."
                )
                return []

            if response.status_code != 200:
                logger.warning(
                    f"[OpenRSS] [{source_name}] HTTP {response.status_code} — skipping."
                )
                return []

            xml_text = response.text

        except httpx.TimeoutException:
            logger.warning(f"[OpenRSS] [{source_name}] Timed out — skipping.")
            return []
        except Exception as e:
            logger.warning(f"[OpenRSS] [{source_name}] Fetch error: {e}")
            return []

        return self._parse_feed_xml(xml_text, source_name, category)

    def _parse_feed_xml(
        self,
        xml_text: str,
        source_name: str,
        category: str,
    ) -> List[Article]:
        """
        Parse raw XML from an OpenRSS feed into Article objects.

        Uses feedparser directly — not parse_provider_rss() — because the
        latter hardcodes category='cloud-{provider}'. The image/date helpers
        are borrowed from RSSParser for consistency with other providers.

        Args:
            xml_text (str): Raw XML string from the HTTP response.
            source_name (str): The blog name (e.g., "dev.to").
            category (str): Aggregator category — tagged on every article.

        Returns:
            List[Article]: Parsed article objects.
        """
        try:
            feed = feedparser.parse(xml_text)
        except Exception as e:
            logger.warning(f"[OpenRSS] [{source_name}] feedparser failed: {e}")
            return []

        articles: List[Article] = []

        for entry in feed.entries[:MAX_ARTICLES_PER_FEED]:

            # Title and a valid http(s) link are mandatory.
            title = (entry.get("title") or "").strip()
            if not title:
                continue

            url = (entry.get("link") or "").strip()
            if not url or not url.startswith("http"):
                continue

            # Strip HTML tags from the summary and clamp to ~200 chars.
            raw_desc = entry.get("summary", "") or ""
            description = re.sub(r"<[^>]+>", "", raw_desc).strip()
            if len(description) > 200:
                description = description[:200] + "..."

            # Borrowed helper: checks media:content, enclosures, etc.
            image_url = self._rss_helpers._extract_image_from_entry(entry)

            # Borrowed helper: handles the common RSS date format variants.
            raw_date = entry.get("published", "") or ""
            published_at = self._rss_helpers._parse_date(raw_date)

            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source=source_name,
                    # Tag with the aggregator's category for routing; unknown
                    # categories fall back to the default collection.
                    category=category,
                )
                articles.append(article)

            except Exception as e:
                logger.debug(
                    f"[OpenRSS] [{source_name}] Skipped entry '{title[:50]}': {e}"
                )
                continue

        logger.info(f"[OpenRSS] [{source_name}] Parsed {len(articles)} articles.")
        return articles
app/services/providers/sauravkanchan/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/sauravkanchan/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'sauravkanchan' folder as a Python package.
4
+ # To use this provider, import it like this:
5
+ #
6
+ # from app.services.providers.sauravkanchan.client import SauravKanchanProvider
7
+ #
8
+ # This is a FREE, zero-rate-limit provider β€” it reads static JSON files
9
+ # hosted on GitHub Pages by developer Saurav Kanchan. No API key needed.
10
+ # It fetches tech headlines from both India (in.json) and the US (us.json)
11
+ # simultaneously, doubling volume with a single aggregator call.
12
+ # Gated behind GENERAL_TECH_CATEGORIES (same as Hacker News & Inshorts).
app/services/providers/sauravkanchan/client.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/sauravkanchan/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The SauravKanchan Static JSON Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Reads two static JSON files hosted on GitHub Pages by a developer named
8
+ Saurav Kanchan. These files are automatically updated by a GitHub Action
9
+ that scrapes the top tech headlines from NewsAPI.org and saves them as
10
+ plain JSON files anyone can read for free.
11
+
12
+ We fetch TWO files at the same time:
13
+ in.json β†’ Top tech headlines from India
14
+ us.json β†’ Top tech headlines from the United States
15
+
16
+ Fetching both simultaneously means we get double the volume and double
17
+ the geographic coverage in roughly the same time as fetching just one.
18
+
19
+ Why this is zero-cost and zero-rate-limit:
20
+ These are not API calls β€” they are just reading a text file from the
21
+ internet. GitHub Pages has no rate limit for public static file reads.
22
+ No API key. No signup. No credit card. Completely free forever.
23
+
24
+ Why the data is high quality:
25
+ The JSON structure is identical to the paid NewsAPI.org format, which
26
+ means we get proper titles, descriptions, image URLs, publication dates,
27
+ and source names β€” all cleanly pre-formatted for us.
28
+
29
+ Freshness note (important):
30
+ Saurav's GitHub Action runs on its own schedule β€” typically a few times
31
+ per day. This means some articles in the file may be several hours old
32
+ by the time we read them. That is perfectly fine. Our freshness gate in
33
+ data_validation.is_valid_article() will automatically reject anything
34
+ older than our midnight IST cutoff. We never need to pre-filter here.
35
+
36
+ Client-side constraint note:
37
+ These are static files β€” we cannot add query parameters. We get
38
+ whatever is in the file. The keyword gate handles topic filtering.
39
+ """
40
+
41
+ # ── Standard Library ──────────────────────────────────────────────────────────
42
+ import asyncio
43
+ import logging
44
+ import time
45
+ from typing import List, Optional
46
+
47
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
48
+ import httpx # Async HTTP client
49
+
50
+ # ── Internal ──────────────────────────────────────────────────────────────────
51
+ from app.services.providers.base import NewsProvider, ProviderStatus
52
+ from app.models import Article
53
+
54
+ logger = logging.getLogger(__name__)
55
+
56
+ # ── Static JSON URLs ───────────────────────────────────────────────────────────
57
+ #
58
+ # Both files are hosted on GitHub Pages and updated automatically by a
59
+ # GitHub Action. They follow the exact same JSON structure as NewsAPI.org.
60
+ #
61
+ # To change regions or add new ones (e.g., gb.json), just add a new entry here.
62
+ # The fetch loop picks it up automatically.
63
+ #
64
+ STATIC_FEED_URLS: List[tuple] = [
65
+ (
66
+ "https://saurav.tech/NewsAPI/top-headlines/category/technology/in.json",
67
+ "in", # Region code β€” used only in log messages
68
+ ),
69
+ (
70
+ "https://saurav.tech/NewsAPI/top-headlines/category/technology/us.json",
71
+ "us", # Region code β€” used only in log messages
72
+ ),
73
+ ]
74
+
75
+ # HTTP request timeout. Static files are fast, but we keep this generous
76
+ # because GitHub Pages occasionally has slow cold starts.
77
+ HTTP_TIMEOUT_SECONDS = 12.0
78
+
79
+ # Max articles to take from each regional file.
80
+ # 100 articles per file Γ— 2 files = up to 200 raw articles per call.
81
+ # The freshness gate will reject most of the older ones, leaving us
82
+ # with the freshest and most relevant subset.
83
+ MAX_ARTICLES_PER_REGION = 100
84
+
85
+
86
+ class SauravKanchanProvider(NewsProvider):
87
+ """
88
+ Reads top tech headlines from two static JSON files on GitHub Pages.
89
+
90
+ Covers India (in.json) and the United States (us.json) simultaneously.
91
+ Free. Zero rate limits. No API key required.
92
+ Gated behind GENERAL_TECH_CATEGORIES in the aggregator.
93
+
94
+ Usage (wired in Phase 7):
95
+ provider = SauravKanchanProvider()
96
+ articles = await provider.fetch_news(category="ai", limit=50)
97
+ """
98
+
99
+ def __init__(self):
100
+ # Free provider β€” no key, no daily limit.
101
+ super().__init__(api_key=None)
102
+ self.daily_limit = 0
103
+
104
+ # Phase 17: Fetch-Once, Fan-Out cache
105
+ #
106
+ # Saurav's JSON files contain a snapshot of top India + US tech headlines.
107
+ # The file contents are the same regardless of whether we ask for
108
+ # category "ai" or category "cloud-gcp" β€” the files don't change.
109
+ # Without a cache: the aggregator downloads IN + US files 22 separate
110
+ # times (once per category), wasting bandwidth and GitHub's servers.
111
+ # With a cache: downloaded once, stored here for 45 minutes.
112
+ #
113
+ # We store the FINAL Pydantic Article objects, not the raw JSON.
114
+ # This means zero re-parsing on cache hits β€” callers get typed objects.
115
+ self._cached_articles: List[Article] = []
116
+ self._cache_time: float = 0.0
117
+
118
+ # The lock prevents the "thundering herd" problem:
119
+ # If 5 categories hit this provider at the exact same millisecond
120
+ # (which asyncio.gather() will do), only the first one fetches.
121
+ # The other 4 wait patiently at the lock, then return from cache.
122
+ self._lock = asyncio.Lock()
123
+
124
+ # ─────────────────────────────────────────────────────────────────────────
125
+ # MAIN ENTRY POINT β€” called by the aggregator's FREE PARALLEL RUN
126
+ # ─────────────────────────────────────────────────────────────────────────
127
+
128
    async def fetch_news(self, category: str, limit: int = 50) -> List[Article]:
        """
        Fetch tech headlines from the India and US static JSON files.

        Both files are downloaded concurrently via asyncio.gather() and
        their article lists merged. Results are cached for 45 minutes and
        shared across all categories (the files do not vary by category).

        Args:
            category (str): Aggregator category string (e.g., "ai"). Every
                article is tagged with it; the keyword gate decides relevance.
            limit (int): Soft cap on total articles.
                NOTE(review): `limit` is not applied in this method's visible
                body — volume is controlled by MAX_ARTICLES_PER_REGION in the
                per-region fetch. Confirm whether callers rely on the cap.

        Returns:
            List[Article]: Combined IN + US articles; [] if both feeds fail.
        """
        CACHE_TTL_SECONDS = 2700  # 45 minutes

        # Outer cache check (lock-free fast path): a warm cache returns
        # immediately with zero HTTP calls.
        if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
            logger.debug(
                "[SauravKanchan] Cache hit — returning %d cached articles for category='%s'. "
                "No HTTP calls made.",
                len(self._cached_articles), category
            )
            return self._cached_articles

        # Cache stale or empty: serialize fetchers behind the lock.
        async with self._lock:

            # Inner cache check — classic double-checked locking. A caller
            # that waited on the lock while another fetched must re-test the
            # cache here, otherwise it would fetch again needlessly.
            if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
                logger.debug(
                    "[SauravKanchan] Cache hit after lock — returning %d cached articles.",
                    len(self._cached_articles)
                )
                return self._cached_articles

            logger.info("[SauravKanchan] Cache stale/empty. Fetching IN + US JSON files...")

            try:
                async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:

                    # One task per regional URL; both fire simultaneously.
                    fetch_tasks = [
                        self._fetch_single_region(client, url, region_code, category)
                        for url, region_code in STATIC_FEED_URLS
                    ]

                    # return_exceptions=True: one failed region must not
                    # discard the other region's articles.
                    results = await asyncio.gather(*fetch_tasks, return_exceptions=True)

                    # Flatten successful regional results; log the failures.
                    all_articles: List[Article] = []
                    for (_, region_code), result in zip(STATIC_FEED_URLS, results):
                        if isinstance(result, Exception):
                            logger.warning(
                                f"[SauravKanchan] [{region_code.upper()}] "
                                f"Fetch failed: {result}"
                            )
                        elif isinstance(result, list):
                            all_articles.extend(result)

                    logger.info(
                        "[SauravKanchan] Fetched %d articles from %d regions. "
                        "Caching for 45 minutes.",
                        len(all_articles), len(STATIC_FEED_URLS)
                    )

                    # Cache the fully-mapped Pydantic Article objects so later
                    # category calls get typed objects with zero re-parsing.
                    self._cached_articles = all_articles
                    self._cache_time = time.time()
                    return all_articles

            except Exception as e:
                logger.error(f"[SauravKanchan] Unexpected error: {e}", exc_info=True)
                return []
209
+
210
+ # ─────────────────────────────────────────────────────────────────────────
211
+ # PRIVATE HELPERS
212
+ # ─────────────────────────────────────────────────────────────────────────
213
+
214
+ async def _fetch_single_region(
215
+ self,
216
+ client: httpx.AsyncClient,
217
+ url: str,
218
+ region_code: str,
219
+ category: str,
220
+ ) -> List[Article]:
221
+ """
222
+ Download one regional JSON file and parse its articles.
223
+
224
+ Args:
225
+ client (httpx.AsyncClient): Shared HTTP client from fetch_news().
226
+ url (str): The full static JSON URL to fetch.
227
+ region_code (str): Short label for logging (e.g., "us", "in").
228
+ category (str): The aggregator's category β€” tagged on articles.
229
+
230
+ Returns:
231
+ List[Article]: Parsed articles from this region. Returns [] on failure.
232
+ """
233
+ try:
234
+ response = await client.get(
235
+ url,
236
+ headers={"User-Agent": "SegmentoPulse-Ingestion/1.0"},
237
+ follow_redirects=True,
238
+ )
239
+
240
+ if response.status_code != 200:
241
+ logger.warning(
242
+ f"[SauravKanchan] [{region_code.upper()}] "
243
+ f"HTTP {response.status_code} β€” skipping."
244
+ )
245
+ return []
246
+
247
+ data = response.json()
248
+
249
+ except httpx.TimeoutException:
250
+ logger.warning(
251
+ f"[SauravKanchan] [{region_code.upper()}] Timed out β€” skipping."
252
+ )
253
+ return []
254
+ except Exception as e:
255
+ logger.warning(
256
+ f"[SauravKanchan] [{region_code.upper()}] Fetch error: {e}"
257
+ )
258
+ return []
259
+
260
+ # The JSON has the same shape as NewsAPI.org:
261
+ # { "status": "ok", "totalResults": 20, "articles": [ ... ] }
262
+ raw_articles = data.get("articles", [])
263
+
264
+ if not isinstance(raw_articles, list) or not raw_articles:
265
+ logger.info(
266
+ f"[SauravKanchan] [{region_code.upper()}] "
267
+ "No articles found in response."
268
+ )
269
+ return []
270
+
271
+ articles = self._map_articles(
272
+ raw_articles[:MAX_ARTICLES_PER_REGION],
273
+ region_code,
274
+ category,
275
+ )
276
+ logger.info(
277
+ f"[SauravKanchan] [{region_code.upper()}] "
278
+ f"Parsed {len(articles)} articles."
279
+ )
280
+ return articles
281
+
282
+ def _map_articles(
283
+ self,
284
+ raw_articles: list,
285
+ region_code: str,
286
+ category: str,
287
+ ) -> List[Article]:
288
+ """
289
+ Convert raw NewsAPI-format JSON items into Segmento Pulse Article objects.
290
+
291
+ The field names in this JSON are camelCase (like JavaScript), so:
292
+ urlToImage β†’ image_url
293
+ publishedAt β†’ published_at
294
+ source.name β†’ source
295
+
296
+ Everything else maps directly.
297
+
298
+ Args:
299
+ raw_articles (list): The 'articles' array from the JSON response.
300
+ region_code (str): "in" or "us" β€” appended to the source name
301
+ so we know where the article came from.
302
+ category (str): The aggregator's category string.
303
+
304
+ Returns:
305
+ List[Article]: Clean Article objects for the pipeline.
306
+ """
307
+ articles: List[Article] = []
308
+
309
+ for item in raw_articles:
310
+ if not isinstance(item, dict):
311
+ continue
312
+
313
+ # ── Title ────────────────────────────────────────────────────
314
+ title = (item.get("title") or "").strip()
315
+ # NewsAPI sometimes puts "[Removed]" as a title for deleted articles
316
+ if not title or title == "[Removed]":
317
+ continue
318
+
319
+ # ── URL ──────────────────────────────────────────────────────
320
+ url = (item.get("url") or "").strip()
321
+ if not url or not url.startswith("http"):
322
+ continue
323
+
324
+ # ── Description ──────────────────────────────────────────────��
325
+ description = (item.get("description") or "").strip()
326
+ # Skip "[Removed]" placeholder descriptions too
327
+ if description == "[Removed]":
328
+ description = ""
329
+
330
+ # ── Image URL (camelCase: urlToImage) ─────────────────────────
331
+ image_url = (item.get("urlToImage") or "").strip()
332
+
333
+ # ── Published Date (camelCase: publishedAt) ───────────────────
334
+ # NewsAPI format is already ISO 8601 (e.g., "2026-03-03T06:00:00Z").
335
+ # Our Pydantic Article model accepts this directly β€” no conversion.
336
+ published_at = item.get("publishedAt") or ""
337
+
338
+ # ── Source Name (nested object) ───────────────────────────────
339
+ # NewsAPI wraps the source as { "id": "...", "name": "..." }.
340
+ # We only want the 'name' string.
341
+ source_obj = item.get("source") or {}
342
+ raw_source_name = (source_obj.get("name") or "").strip()
343
+
344
+ # Append the region code so it's clear in the UI where
345
+ # this article came from, e.g., "The Verge (IN)" or "Wired (US)".
346
+ if raw_source_name:
347
+ source = f"{raw_source_name} ({region_code.upper()})"
348
+ else:
349
+ source = f"SauravKanchan ({region_code.upper()})"
350
+
351
+ # ── Build Article ─────────────────────────────────────────────
352
+ try:
353
+ article = Article(
354
+ title=title,
355
+ description=description,
356
+ url=url,
357
+ image_url=image_url,
358
+ published_at=published_at,
359
+ source=source,
360
+ # ── ROUTING RULE ──────────────────────────────────────
361
+ # Pass through the aggregator's category.
362
+ # The keyword gate filters out off-topic articles.
363
+ # Unknown or empty categories safely route to
364
+ # the default 'News Articles' collection.
365
+ category=category,
366
+ )
367
+ articles.append(article)
368
+
369
+ except Exception as e:
370
+ logger.debug(
371
+ f"[SauravKanchan] Skipped item '{title[:50]}...': {e}"
372
+ )
373
+ continue
374
+
375
+ return articles
app/services/providers/thenewsapi/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/thenewsapi/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'thenewsapi' folder as a Python package.
4
+ # To use TheNewsAPI provider, import it like this:
5
+ #
6
+ # from app.services.providers.thenewsapi.client import TheNewsAPIProvider
7
+ #
8
+ # This is a PAID provider β€” it requires the THENEWSAPI_API_KEY environment
9
# variable to be set. Its effective daily_limit is 3 requests/day (the
# Community free tier's real cap β€” see the Phase 16 audit note in client.py).
10
+ # It lives in the PAID_CHAIN, meaning it only fires if all providers above
11
+ # it in the chain (GNews, NewsAPI, NewsData) have already failed.
app/services/providers/thenewsapi/client.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/thenewsapi/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ TheNewsAPI.com Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches fresh technology news articles from TheNewsAPI.com.
8
+ This is a paid API but has the cleanest JSON structure of all paid
9
+ providers β€” most of its field names even match our Pydantic Article model.
10
+
11
+ Free Tier Limits:
12
    - 3 requests per day in practice on the Community free tier
      (marketed as 100/day; resets midnight UTC)
13
+ - Requires an API key (THENEWSAPI_API_KEY in your .env file)
14
+
15
+ Where it sits in the pipeline:
16
+ PAID_CHAIN position 4 (after GNews β†’ NewsAPI β†’ NewsData).
17
+ Only fires if all three above it have already failed or hit their limits.
18
+ Once it returns articles, the paid chain stops β€” credits protected.
19
+
20
+ The special data quirk (categories array):
21
+ TheNewsAPI returns a 'categories' field as a LIST, not a single string.
22
+ Example: { "categories": ["tech", "science"] }
23
+
24
+ We grab only the FIRST item from that list.
25
+ Example: "tech"
26
+
27
+ This raw value ("tech") is then passed through our pipeline.
28
+ The keyword gate in data_validation.is_relevant_to_category() handles
29
+ whether the article truly belongs in our system.
30
+
31
+ We do NOT try to translate "tech" β†’ "magazines" ourselves here.
32
+ That mapping belongs in the validation/data layer, not the fetcher layer.
33
+ Keep the fetcher dumb β€” let the pipeline be smart.
34
+
35
+ Client-side constraint note:
36
+ TheNewsAPI supports date filters (published_after, published_before) and
37
+ language filters (language=en). We use language=en to avoid non-English
38
+ articles. We do NOT apply date filters because the freshness gate in
39
+ data_validation.is_valid_article() handles that more accurately in IST.
40
+ """
41
+
42
+ # ── Standard Library ──────────────────────────────────────────────────────────
43
+ import logging
44
+ from datetime import datetime, timezone
45
+ from typing import List, Optional
46
+
47
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
48
+ import httpx # Async HTTP client
49
+
50
+ # ── Internal ──────────────────────────────────────────────────────────────────
51
+ from app.services.providers.base import NewsProvider, ProviderStatus
52
+ from app.models import Article
53
+ from app.config import settings # Single source of truth for all keys
54
+ # Phase 16: Import the Redis counter utility to make the daily budget
55
+ # restart-proof. TheNewsAPI only allows 3 real calls per day on the free tier.
56
+ # Without Redis, a server restart resets request_count to 0 and lets us
57
+ # make 3 more calls β€” potentially 9+ calls on a restart-heavy day.
58
+ from app.services.utils.provider_state import (
59
+ get_provider_counter,
60
+ increment_provider_counter,
61
+ )
62
+
63
+ logger = logging.getLogger(__name__)
64
+
65
# ── Constants ─────────────────────────────────────────────────────────────────

# Endpoint for TheNewsAPI's "all news" search β€” the only endpoint we use.
THENEWSAPI_BASE_URL = "https://api.thenewsapi.com/v1/news/all"

# Seconds to wait for a response before aborting the HTTP request.
HTTP_TIMEOUT_SECONDS = 10.0

# Page size per call; 25 is the provider's recommended maximum.
ARTICLES_PER_REQUEST = 25
75
+
76
+
77
class TheNewsAPIProvider(NewsProvider):
    """
    Fetches technology news from TheNewsAPI.com.

    Paid provider β€” needs THENEWSAPI_API_KEY in your .env file.
    Sits at position 4 in the PAID_CHAIN (last paid fallback).

    Budget reality (Phase 16 audit): although the free tier is marketed as
    "100 requests/day", the Community tier is hard-capped at 3 real calls
    per day. self.daily_limit is therefore 3, enforced by a restart-proof
    Redis counter inside fetch_news().

    Usage (wired into the aggregator in Phase 5):
        provider = TheNewsAPIProvider(api_key="your_key_here")
        articles = await provider.fetch_news(category="ai", limit=25)
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Args:
            api_key (Optional[str]): TheNewsAPI token. Without it the
                provider is effectively disabled (fetch_news returns []).
        """
        super().__init__(api_key=api_key)

        # Phase 16 Audit Fix: Corrected from 100 β†’ 3.
        #
        # The free tier documentation lists "100 requests/day" but in practice
        # the Community (free) tier is hard-capped at 3 requests per day.
        # With daily_limit=100 the old code kept expecting 100 slots, burned
        # all 3 real calls immediately, and then received 402s for the rest
        # of the day.
        #
        # With daily_limit=3 + Redis persistence we use at most 3 calls/day
        # even across multiple server restarts: the Redis guard at the top of
        # fetch_news() blocks the 4th and later attempts for the current UTC day.
        self.daily_limit = 3

        # Category mapping: translate our internal category names into the
        # categories that TheNewsAPI actually understands.
        # TheNewsAPI uses these: tech, science, sports, business, health,
        # entertainment, general. We map our fine-grained categories to the
        # closest match.
        self.category_map = {
            'ai': 'tech',
            'data-security': 'tech',
            'data-governance': 'tech',
            'data-privacy': 'tech',
            'data-engineering': 'tech',
            'data-management': 'tech',
            'business-intelligence': 'business',
            'business-analytics': 'business',
            'customer-data-platform': 'business',
            'data-centers': 'tech',
            'cloud-computing': 'tech',
            'magazines': 'tech',
            'data-laws': 'tech',
            # Cloud sub-categories β†’ all map to 'tech' in TheNewsAPI's world
            'cloud-aws': 'tech',
            'cloud-azure': 'tech',
            'cloud-gcp': 'tech',
            'cloud-oracle': 'tech',
            'cloud-ibm': 'tech',
            'cloud-alibaba': 'tech',
            'cloud-digitalocean': 'tech',
            'cloud-huawei': 'tech',
            'cloud-cloudflare': 'tech',
        }

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT β€” called by the aggregator's PAID WATERFALL
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
        """
        Fetch technology articles from TheNewsAPI.com.

        Args:
            category (str): Our internal category (e.g., "ai", "cloud-aws").
                            Looked up in self.category_map to get the matching
                            TheNewsAPI category keyword (default: 'tech').
            limit (int): Maximum number of articles to return, capped at
                         ARTICLES_PER_REQUEST per call.

        Returns:
            List[Article]: Mapped Article objects. Returns [] on failure,
                           missing API key, or exhausted daily budget.
        """
        # No API key means this provider cannot run. The aggregator already
        # checks is_available(), but we double-check here for safety.
        if not self.api_key:
            logger.debug("[TheNewsAPI] No API key configured β€” skipping.")
            return []

        # ── PHASE 16: Redis-backed daily budget guard ────────────────────────
        # Real free-tier limit: 3 calls/day. We check Redis FIRST, before
        # building any params or making any HTTP call.
        #
        # Why inside fetch_news and not inside is_available()?
        # is_available() is a synchronous function on the base class, while
        # Redis access is async (`await`). Mixing them crashes at runtime,
        # so the async check lives at the very top of this async method.
        today_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        current_calls = await get_provider_counter("thenewsapi", today_str)

        if current_calls >= self.daily_limit:
            logger.warning(
                "[TheNewsAPI] Daily Redis budget exhausted β€” %d/%d calls used today. "
                "Skipping to protect the 3-call daily quota.",
                current_calls, self.daily_limit
            )
            self.mark_rate_limited()
            return []

        try:
            # Translate our internal category to TheNewsAPI's keyword.
            # Unknown categories default to 'tech'.
            api_category = self.category_map.get(category, "tech")

            params = {
                "api_token": self.api_key,
                "language": "en",            # English articles only
                "categories": api_category,  # TheNewsAPI category keyword
                "limit": min(limit, ARTICLES_PER_REQUEST),
                # NOTE: We deliberately do NOT add 'published_after' or
                # 'published_before' date filters. TheNewsAPI supports them,
                # but our freshness gate (is_valid_article in
                # data_validation.py) already enforces the correct IST-based
                # date boundary β€” letting the gate handle it avoids
                # duplicating timezone logic here.
            }

            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
                logger.info("[TheNewsAPI] Fetching '%s' (api_category='%s')...", category, api_category)
                response = await client.get(THENEWSAPI_BASE_URL, params=params)

                # ── Handle rate limit ─────────────────────────────────────
                if response.status_code == 429:
                    logger.warning("[TheNewsAPI] Hit 429 rate limit. Marking as rate-limited.")
                    self.mark_rate_limited()
                    return []

                # ── Handle authentication failure ─────────────────────────
                if response.status_code == 401:
                    logger.error("[TheNewsAPI] 401 Unauthorized β€” API key is invalid or expired.")
                    self.status = ProviderStatus.ERROR
                    return []

                # ── Handle quota exhaustion ───────────────────────────────
                if response.status_code == 402:
                    logger.warning("[TheNewsAPI] 402 Payment Required β€” daily quota exhausted.")
                    self.mark_rate_limited()
                    return []

                # ── Handle other non-200 responses ────────────────────────
                if response.status_code != 200:
                    logger.warning(f"[TheNewsAPI] Unexpected HTTP {response.status_code}.")
                    return []

                # ── PHASE 16: charge the budget for EVERY 200 response ────
                # Audit fix: the counter used to be incremented only when the
                # response contained articles, so an empty 200 consumed a real
                # API call without being counted β€” allowing the 3-call cap to
                # be overshot. Failed calls (401/402/429/timeout) still do
                # not consume a slot.
                self.request_count += 1  # RAM shadow, kept in sync for debugging
                await increment_provider_counter("thenewsapi", today_str)

                data = response.json()

                # TheNewsAPI wraps articles in a 'data' key at the top level.
                raw_articles = data.get("data", [])

                if not raw_articles:
                    logger.info(f"[TheNewsAPI] No articles returned for category='{category}'.")
                    return []

                articles = self._map_articles(raw_articles, category)

                logger.info("[TheNewsAPI] Got %d articles for '%s'.", len(articles), category)
                return articles

        except httpx.TimeoutException:
            logger.warning("[TheNewsAPI] Request timed out.")
            return []
        except Exception as e:
            logger.error(f"[TheNewsAPI] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPER β€” maps raw JSON items to Article objects
    # ─────────────────────────────────────────────────────────────────────────

    def _map_articles(self, raw_articles: list, category: str) -> List[Article]:
        """
        Convert TheNewsAPI JSON items into Segmento Pulse Article objects.

        The mapping is almost 1-to-1 with our Pydantic model, which is why
        this is the easiest of all paid providers to integrate.

        One special case: 'categories' is a list, not a string.
        We take [0] (the first item) as the article's category value.

        Args:
            raw_articles (list): The 'data' array from TheNewsAPI's response.
            category (str): Our internal category (from the aggregator).

        Returns:
            List[Article]: Clean Article objects for the pipeline.
        """
        articles: List[Article] = []

        for item in raw_articles:

            # ── Title ─────────────────────────────────────────────────────
            title = (item.get("title") or "").strip()
            if not title:
                continue

            # ── URL ───────────────────────────────────────────────────────
            url = (item.get("url") or "").strip()
            if not url or not url.startswith("http"):
                continue

            # ── Description ───────────────────────────────────────────────
            # TheNewsAPI provides real summaries β€” a huge advantage over HN.
            description = (item.get("description") or "").strip()

            # ── Image URL ─────────────────────────────────────────────────
            # The field is ALREADY called 'image_url' in their API β€” the
            # cleanest mapping of any provider we have integrated.
            image_url = (item.get("image_url") or "").strip()

            # ── Published Date ────────────────────────────────────────────
            # ISO 8601 (e.g., "2024-03-03T06:00:00.000000Z"); the Pydantic
            # Article model's published_at validator handles it directly.
            published_at = item.get("published_at") or ""

            # ── Source Name ───────────────────────────────────────────────
            # TheNewsAPI's live response returns `source` as a plain string
            # (the publisher domain, e.g. "techcrunch.com"), NOT as a nested
            # dict like NewsAPI.org does. We handle both shapes defensively.
            raw_source = item.get("source") or ""
            if isinstance(raw_source, dict):
                # Nested object shape: {"name": "TechCrunch", "url": "..."}
                source = (raw_source.get("name") or "TheNewsAPI").strip()
            else:
                # Plain string shape: "techcrunch.com" β€” use it as-is.
                source = str(raw_source).strip() or "TheNewsAPI"

            # ── Category ──────────────────────────────────────────────────
            # TheNewsAPI returns categories as a LIST, e.g., ["tech", "science"].
            # We take only the first item; the keyword gate verifies relevance.
            # ROUTING RULE: if the list is empty, fall back to our internal
            # category name. Both "" and category safely route to the default
            # 'News Articles' collection if unrecognised.
            raw_categories = item.get("categories") or []
            if raw_categories and isinstance(raw_categories, list):
                article_category = raw_categories[0]
            else:
                article_category = category  # Fallback to aggregator's category

            # ── Build Article ─────────────────────────────────────────────
            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source=source,
                    category=article_category,
                )
                articles.append(article)

            except Exception as e:
                logger.debug(
                    f"[TheNewsAPI] Skipped item url='{url[:60]}': {e}"
                )
                continue

        return articles
app/services/providers/webz/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/webz/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'webz' folder as a Python package.
4
+ # To use this provider, import it like this:
5
+ #
6
+ # from app.services.providers.webz.client import WebzProvider
7
+ #
8
+ # This is a PAID provider β€” requires WEBZ_API_KEY in your .env file.
9
+ # Position 6 in the PAID_CHAIN (deepest paid failover).
10
+ #
11
+ # ── CRITICAL BUDGET WARNING ───────────────────────────────────────────────
12
+ # Webz.io free tier: 1,000 calls per MONTH (not per day).
13
+ # daily_limit is set to 30 inside WebzProvider to pace usage to ~900/month.
14
+ # DO NOT increase daily_limit above 33 β€” doing so will exhaust the
15
+ # monthly budget before the month ends.
app/services/providers/webz/client.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/webz/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The Webz.io Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches enterprise-grade news articles from Webz.io's News API Lite.
8
+ Webz crawls 3.5 million articles per day from across the open web,
9
+ making it one of the richest news sources we have available.
10
+
11
+ Paid provider β€” needs WEBZ_API_KEY in your .env file.
12
+ Position 6 in the PAID_CHAIN (absolute final paid failover).
13
+
14
+ ── THE MONTHLY BUDGET PROBLEM AND HOW WE SOLVE IT ──────────────────────────
15
+
16
+ Webz free tier gives us 1,000 calls per MONTH β€” not per day.
17
+ Our scheduler runs many categories every hour. Without a limit, we would
18
+ exhaust the entire 1,000-call monthly budget in less than 48 hours.
19
+
20
+ Our fix: daily_limit = 30 inside this class.
21
+ The quota tracker caps us at 30 calls per calendar day.
22
+ 30 calls/day Γ— 30 days = 900 calls/month β€” safely under 1,000.
23
+ This paces the budget across the whole month as an even, predictable cost.
24
+
25
+ Math visible to future engineers:
26
+ 1,000 calls Γ· 30 days = 33.3 calls/day max to exactly hit the limit.
27
+ We use 30 to leave a 10% safety margin for edge cases (month resets,
28
+ server restarts that lose the quota counter's in-memory state, etc.).
29
+
30
+ ── THE NESTED IMAGE PROBLEM AND HOW WE SOLVE IT ─────────────────────────────
31
+
32
+ Webz does not put images at the top level of each article object.
33
+ Instead, the image is buried inside a nested 'thread' object like this:
34
+
35
+ {
36
+ "title": "Article Title",
37
+ "url": "https://...",
38
+ "thread": {
39
+ "site_full": "techcrunch.com", ← source name is here too
40
+ "main_image": "https://..." ← image is here
41
+ },
42
+ "text": "Full article body (thousands of words)..."
43
+ }
44
+
45
+ Our fix: We safely "drill down" using chained .get() calls.
46
+ thread = item.get("thread") or {}
47
+ image_url = thread.get("main_image") or ""
48
+
49
+ If 'thread' is missing β†’ {} (empty dict, no crash)
50
+ If 'main_image' is missing β†’ "" (empty string, no crash)
51
+ Either way, the pipeline gets a clean empty string for the fallback image.
52
+
53
+ ── THE FULL TEXT BODY PROBLEM AND HOW WE SOLVE IT ──────────────────────────
54
+
55
+ Webz provides the COMPLETE article body in the 'text' field β€” this can be
56
+ thousands of words. Storing that in our database is too large and risks
57
+ reproducing copyright-protected content.
58
+
59
+ Our fix: Truncate to the first 200 characters (same approach as Phase 8).
60
+ 200 characters is enough for a preview. Our newsletter system uses the
61
+ description field but also has its own 160-char cap, so anything beyond
62
+ 200 already has no use downstream.
63
+ """
64
+
65
+ # ── Standard Library ──────────────────────────────────────────────────────────
66
+ import logging
67
+ from datetime import datetime, timezone
68
+ from typing import List, Optional
69
+
70
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
71
+ import httpx # Async HTTP client
72
+
73
+ # ── Internal ──────────────────────────────────────────────────────────────────
74
+ from app.services.providers.base import NewsProvider, ProviderStatus
75
+ from app.models import Article
76
+ from app.config import settings
77
+ # Phase 16: Import the Redis counter utility for dual-layer budget protection.
78
+ # Webz has the strictest budget of all three paid providers β€” 1,000 calls per
79
+ # MONTH. Without restart-proof counters, a restart-heavy day can exhaust the
80
+ # entire monthly budget in a few hours. Two Redis keys protect us:
81
+ # 1. Daily key ("webz", today_str) β€” caps us at 30/day
82
+ # 2. Monthly key ("webz_month", month_str) β€” caps us at 900/month total
83
+ from app.services.utils.provider_state import (
84
+ get_provider_counter,
85
+ increment_provider_counter,
86
+ )
87
+
88
+ logger = logging.getLogger(__name__)
89
+
90
+ # ── Constants ─────────────────────────────────────────────────────────────────
91
+
92
+ # Webz.io News API Lite endpoint
93
+ WEBZ_API_URL = "https://api.webz.io/newsApiLite"
94
+
95
+ # Request timeout in seconds. Enterprise APIs are usually fast.
96
+ HTTP_TIMEOUT_SECONDS = 12.0
97
+
98
+ # Articles to request per call. Keeping this modest saves the budget
99
+ # because Webz deducts from quota based on results returned, not just calls.
100
+ ARTICLES_PER_REQUEST = 10
101
+
102
+ # Maximum characters to keep from the article body for the description field.
103
+ # Matches Phase 8's WorldNewsAI approach for consistency.
104
+ DESCRIPTION_MAX_CHARS = 200
105
+
106
+ # Category β†’ search query translation.
107
+ # Webz uses free-text query strings (like Google search), so we convert
108
+ # our internal category slugs into descriptive keyword phrases that maximise
109
+ # the quality of results from Webz's index.
110
+ CATEGORY_QUERY_MAP = {
111
+ 'ai': 'artificial intelligence machine learning',
112
+ 'data-security': 'data security cybersecurity breach hacking',
113
+ 'data-governance': 'data governance compliance policy',
114
+ 'data-privacy': 'data privacy GDPR regulation',
115
+ 'data-engineering': 'data engineering pipeline ETL spark',
116
+ 'data-management': 'data management master data catalog',
117
+ 'business-intelligence': 'business intelligence analytics BI tools',
118
+ 'business-analytics': 'business analytics data-driven decisions',
119
+ 'customer-data-platform': 'customer data platform CDP personalization',
120
+ 'data-centers': 'data center infrastructure hyperscaler',
121
+ 'cloud-computing': 'cloud computing technology platform',
122
+ 'magazines': 'technology news innovation',
123
+ 'data-laws': 'AI regulation data law privacy act',
124
+ 'cloud-aws': 'Amazon AWS cloud services',
125
+ 'cloud-azure': 'Microsoft Azure cloud platform',
126
+ 'cloud-gcp': 'Google Cloud Platform GCP services',
127
+ 'cloud-oracle': 'Oracle Cloud OCI database',
128
+ 'cloud-ibm': 'IBM Cloud Red Hat OpenShift',
129
+ 'cloud-alibaba': 'Alibaba Cloud Aliyun technology',
130
+ 'cloud-digitalocean': 'DigitalOcean cloud developer platform',
131
+ 'cloud-huawei': 'Huawei Cloud services technology',
132
+ 'cloud-cloudflare': 'Cloudflare CDN security network',
133
+ }
134
+
135
+
136
class WebzProvider(NewsProvider):
    """
    Fetches enterprise-grade news articles from Webz.io News API Lite.

    Paid provider β€” 1,000 calls/month free tier, paced to 30/day.
    Position 6 in the PAID_CHAIN (deepest paid failover).
    Only fires when all 5 providers above it have failed or hit limits.
    Requires WEBZ_API_KEY in the .env file.

    Usage (wired in Phase 10):
        provider = WebzProvider(api_key="your_key_here")
        articles = await provider.fetch_news(category="ai", limit=10)
    """

    # Hard monthly ceiling: 900 calls β€” leaves a 100-call safety buffer
    # under Webz's 1,000 calls/month free-tier cap. Class-level constant
    # so the budget policy is visible without reading fetch_news().
    MONTHLY_HARD_LIMIT = 900

    def __init__(self, api_key: Optional[str] = None):
        super().__init__(api_key=api_key)

        # 30 calls/day Γ— 30 days = 900/month β€” safely under the 1,000 cap.
        # The quota tracker enforces this limit before each call.
        # 10% safety margin included for server restart edge cases.
        self.daily_limit = 30

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT β€” called by the aggregator's PAID WATERFALL
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 10) -> List[Article]:
        """
        Fetch news articles from Webz.io for the given category.

        Args:
            category (str): Our internal category slug (e.g., "ai").
                            Translated to a keyword query via CATEGORY_QUERY_MAP.
            limit (int):    Max articles to return. Kept at 10 to conserve
                            the monthly call budget (Webz charges per result).

        Returns:
            List[Article]: Mapped Article objects. Returns [] on any failure.
        """
        if not self.api_key:
            logger.debug("[Webz] No API key configured β€” skipping.")
            return []

        # ── PHASE 16: Dual-layer Redis budget guard ────────────────────────
        #
        # Webz is the most budget-constrained provider we have: 1,000 calls/MONTH.
        # We protect it with TWO independent Redis counters running in parallel.
        #
        # Gate 1 β€” DAILY:   Stops at 30 calls/day to pace spending evenly.
        #   Redis key: "provider:state:webz:calls:<YYYY-MM-DD>" (TTL: 24h)
        #
        # Gate 2 β€” MONTHLY: Stops at 900 calls/month (10% margin on 1,000).
        #   Redis key: "provider:state:webz_month:calls:<YYYY-MM>" (TTL: 30 days)
        #   The key name embeds the month string, so when a new month starts
        #   the key changes automatically β€” the old key expires via TTL,
        #   no manual cleanup needed.
        #
        # Either gate being exhausted blocks the call completely.
        # Fail-safe design: if Redis is down, both return 999999 β€” call is skipped.
        #
        # BUGFIX: read the clock exactly once. Two separate datetime.now()
        # calls could straddle midnight (or a month boundary) and produce a
        # mismatched day/month key pair.
        now_utc = datetime.now(timezone.utc)
        today_str = now_utc.strftime("%Y-%m-%d")
        month_str = now_utc.strftime("%Y-%m")

        daily_calls = await get_provider_counter("webz", today_str)
        monthly_calls = await get_provider_counter("webz_month", month_str)

        if daily_calls >= self.daily_limit:
            logger.warning(
                "[Webz] Daily Redis budget exhausted β€” %d/%d calls used today. "
                "Skipping to protect the monthly quota.",
                daily_calls, self.daily_limit
            )
            self.mark_rate_limited()
            return []

        if monthly_calls >= self.MONTHLY_HARD_LIMIT:
            logger.warning(
                "[Webz] Monthly Redis budget exhausted β€” %d/%d calls used this month. "
                "No more Webz calls until next month to protect the 1,000-call limit.",
                monthly_calls, self.MONTHLY_HARD_LIMIT
            )
            self.mark_rate_limited()
            return []

        # Translate our internal category slug into a Webz-friendly search phrase.
        search_query = CATEGORY_QUERY_MAP.get(category, f"technology {category}")

        params = {
            "token": self.api_key,
            "q": search_query,
            "language": "english",
            "size": min(limit, ARTICLES_PER_REQUEST),
            # NOTE: No date filters applied here intentionally.
            # Our freshness gate in data_validation.is_valid_article()
            # handles date boundaries accurately using IST windows.
            # Adding date filters here would add timezone conversion risk.
        }

        try:
            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
                # CONSISTENCY FIX: use the module logger with lazy %-args
                # instead of a bare print(), matching every other log line
                # in this provider (and keeping output in the log stream).
                logger.info(
                    "[Webz] Fetching '%s' (query='%s...')...",
                    category, search_query[:40]
                )
                response = await client.get(WEBZ_API_URL, params=params)

                # ── HTTP 402: Monthly budget exhausted ────────────────────
                # Webz uses 402 to mean "you have no more credits this month".
                # We mark as rate-limited so the circuit breaker respects it.
                if response.status_code == 402:
                    logger.warning(
                        "[Webz] HTTP 402 β€” monthly call budget exhausted. "
                        "No more calls until quota resets at month end."
                    )
                    self.mark_rate_limited()
                    return []

                # ── HTTP 401: Bad API key ─────────────────────────────────
                if response.status_code == 401:
                    logger.error(
                        "[Webz] HTTP 401 β€” API key is invalid or expired. "
                        "Check WEBZ_API_KEY in your .env file."
                    )
                    self.status = ProviderStatus.ERROR
                    return []

                # ── HTTP 429: Too many requests (short-term rate limit) ───
                if response.status_code == 429:
                    logger.warning("[Webz] HTTP 429 β€” request rate exceeded.")
                    self.mark_rate_limited()
                    return []

                # ── Any other non-200 ─────────────────────────────────────
                if response.status_code != 200:
                    logger.warning(f"[Webz] Unexpected HTTP {response.status_code}.")
                    return []

                # ── Parse the response ────────────────────────────────────
                self.request_count += 1  # Keep RAM shadow in sync for debugging
                data = response.json()

                # Webz wraps the article list in a 'posts' key at the top level.
                raw_posts = data.get("posts", [])

                if not raw_posts:
                    logger.info(f"[Webz] No articles returned for '{category}'.")
                    return []

                articles = self._map_articles(raw_posts, category)

                # ── PHASE 16: Increment BOTH Redis counters after a successful call ──
                # The monthly counter uses a 30-day TTL (2592000 seconds) β€”
                # long enough to outlive any calendar month. The key name
                # changes with each month, so stale keys fade on their own.
                await increment_provider_counter("webz", today_str, expire_seconds=86400)
                await increment_provider_counter("webz_month", month_str, expire_seconds=2592000)

                logger.info("[Webz] Got %d articles for '%s'.", len(articles), category)
                return articles

        except httpx.TimeoutException:
            logger.warning("[Webz] Request timed out.")
            return []
        except Exception as e:
            logger.error(f"[Webz] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPER β€” maps raw JSON posts to Article objects
    # ─────────────────────────────────────────────────────────────────────────

    def _map_articles(self, raw_posts: list, category: str) -> List[Article]:
        """
        Convert Webz.io JSON 'posts' items into Segmento Pulse Article objects.

        Key challenges handled here:
          1. Nested image   β€” lives inside posts[].thread.main_image
          2. Nested source  β€” lives inside posts[].thread.site_full
          3. Full text body β€” truncated to DESCRIPTION_MAX_CHARS
          4. Published date β€” Webz uses ISO 8601, our model accepts it directly

        Webz field           β†’ Article field
        ─────────────────────────────────────────
        title                β†’ title
        url                  β†’ url
        thread.site_full     β†’ source     (nested β€” safe .get() chain)
        thread.main_image    β†’ image_url  (nested β€” safe .get() chain)
        published            β†’ published_at
        text (truncated)     β†’ description

        Args:
            raw_posts (list): The 'posts' array from the API response.
            category (str):   The aggregator's category for routing.

        Returns:
            List[Article]: Clean Article objects ready for the pipeline.
        """
        articles: List[Article] = []

        for item in raw_posts:
            if not isinstance(item, dict):
                continue

            # ── Title ────────────────────────────────────────────────────
            title = (item.get("title") or "").strip()
            if not title:
                continue

            # ── URL ──────────────────────────────────────────────────────
            url = (item.get("url") or "").strip()
            if not url or not url.startswith("http"):
                continue

            # ── Published Date ────────────────────────────────────────────
            # Webz returns ISO 8601 (e.g., "2026-03-03T06:00:00.000+0000").
            # Our Article model's published_at validator handles this directly.
            published_at = item.get("published") or ""

            # ── Nested: Source and Image ──────────────────────────────────
            # The 'thread' field is a nested dictionary containing both.
            # If 'thread' is missing, fall back to {} so the chained .get()
            # calls below don't crash.
            thread = item.get("thread") or {}

            # Source: the full domain name of the publishing site,
            # e.g. "techcrunch.com". Falls back to "Webz" when absent.
            source = (thread.get("site_full") or "Webz").strip()
            if not source:
                source = "Webz"

            # Image: the main article image from the thread context.
            image_url = (thread.get("main_image") or "").strip()

            # ── Description (TRUNCATED full article body) ─────────────────
            # 'text' contains the complete article body β€” potentially
            # thousands of words. Keep only a short preview to protect us
            # from database bloat and copyright issues.
            raw_text = (item.get("text") or "").strip()
            if len(raw_text) > DESCRIPTION_MAX_CHARS:
                description = raw_text[:DESCRIPTION_MAX_CHARS] + "..."
            else:
                description = raw_text

            # ── Build Article ─────────────────────────────────────────────
            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source=source,
                    # ── ROUTING RULE ──────────────────────────────────────
                    # Pass through the aggregator's category.
                    # Unknown/empty categories route to 'News Articles'.
                    category=category,
                )
                articles.append(article)

            except Exception as e:
                logger.debug(f"[Webz] Skipped post '{title[:50]}': {e}")
                continue

        return articles
app/services/providers/wikinews/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/wikinews/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'wikinews' folder as a Python package.
4
+ # To use this provider, import it like this:
5
+ #
6
+ # from app.services.providers.wikinews.client import WikinewsProvider
7
+ #
8
+ # Wikinews is 100% FREE β€” no API key, no rate limits, no registration.
9
+ # It is run by the Wikimedia Foundation (same people who run Wikipedia).
10
+ #
11
+ # All content is published under Public Domain or Creative Commons licenses.
12
+ # This makes it the only copyright-bulletproof news source in our pipeline.
13
+ #
14
+ # Gated behind GENERAL_TECH_CATEGORIES (same as HN, Inshorts, SauravKanchan)
15
+ # because Wikinews tech categories cover broad technology topics only.
app/services/providers/wikinews/client.py ADDED
@@ -0,0 +1,435 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/wikinews/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The Wikinews Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches technology news articles from Wikinews (en.wikinews.org).
8
+ Wikinews is run by the Wikimedia Foundation β€” the same organization
9
+ behind Wikipedia and Wiktionary.
10
+
11
+ Free. No API key. No rate limits. No copyright concerns.
12
+
13
+ Why Wikinews is unique:
14
+ Every article on Wikinews is published under Public Domain or extremely
15
+ open Creative Commons licenses. This means we can freely display their
16
+ content without any legal risk. It is the only fully copyright-bulletproof
17
+ news source in our entire pipeline.
18
+
19
+ We search TWO Wikinews categories concurrently for maximum coverage:
20
+ - "Computing" β†’ software, hardware, AI, security news
21
+ - "Internet" β†’ web tech, data, social media policy news
22
+
23
+ Gated behind GENERAL_TECH_CATEGORIES in the aggregator because Wikinews
24
+ tech content is broad β€” it does not know about "cloud-alibaba" or
25
+ "data-governance" as separate topics.
26
+
27
+ ── THE HTML SNIPPET PROBLEM AND HOW WE FIX IT ───────────────────────────────
28
+
29
+ The MediaWiki search API highlights your search terms inside the description
30
+ snippet by wrapping them in HTML tags like this:
31
+
32
+ "The latest advances in <span class=\"searchmatch\">computing</span> have..."
33
+
34
+ If we stored that raw, our database would get cluttered with raw HTML tags
35
+ that would then appear in the Pulse UI as literal text.
36
+
37
+ Fix: We use a simple regex pattern to strip ALL HTML tags from the snippet.
38
+
39
+ re.sub(r'<[^>]+>', '', raw_snippet).strip()
40
+
41
+ <[^>]+> means: any '<', followed by one or more characters that are
42
+ NOT '>', followed by '>'. This matches every HTML tag universally,
43
+ not just MediaWiki's specific span tags β€” making it bulletproof for
44
+ any future format changes on their end.
45
+
46
+ ── URL CONSTRUCTION FROM pageid ─────────────────────────────────────────────
47
+
48
+ MediaWiki search results give us a 'pageid' integer, NOT a direct URL.
49
+ We construct a permanent, stable URL using the curid URL format:
50
+
51
+ f"https://en.wikinews.org/?curid={pageid}"
52
+
53
+ Example: pageid = 4684321 β†’ https://en.wikinews.org/?curid=4684321
54
+
55
+ This URL format is guaranteed stable by Wikimedia β€” it never changes
56
+ even if the article is moved or renamed.
57
+ """
58
+
59
+ # ── Standard Library ──────────────────────────────────────────────────────────
60
+ import asyncio
61
+ import logging
62
+ import re
63
+ from typing import List
64
+
65
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
66
+ import httpx # Async HTTP client
67
+
68
+ # ── Internal ──────────────────────────────────────────────────────────────────
69
+ from app.services.providers.base import NewsProvider
70
+ from app.models import Article
71
+ # Phase 12: Shared image enricher (extracts og:image from article pages)
72
+ from app.services.utils.image_enricher import extract_top_image
73
+
74
+ logger = logging.getLogger(__name__)
75
+
76
# ── Wikinews API Configuration ────────────────────────────────────────────────

# English Wikinews MediaWiki Action API endpoint.
WIKINEWS_API_URL = "https://en.wikinews.org/w/api.php"

# Two Wikinews categories are searched for broader tech coverage:
# 'Computing' (software, AI, hardware) and 'Internet' (web, data, social policy).
WIKINEWS_CATEGORIES = ["Computing", "Internet"]

# Per-category result cap: 10 Γ— 2 categories = at most 20 articles per call.
MAX_ARTICLES_PER_CATEGORY = 10

# Wikimedia servers are reliable but occasionally slow β€” allow 12 seconds.
HTTP_TIMEOUT_SECONDS = 12.0

# Pre-compiled regex that removes every HTML tag from a MediaWiki snippet.
# Search snippets arrive with <span class="searchmatch">...</span> highlight
# markup; stripping ALL tags (not just spans) also survives future changes.
HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
101
class WikinewsProvider(NewsProvider):
    """
    Fetches technology news from Wikinews using the MediaWiki search API.

    Free. No API key. Copyright-bulletproof (Public Domain / CC).
    Queries 'Computing' and 'Internet' categories concurrently.
    Gated behind GENERAL_TECH_CATEGORIES in the aggregator.

    Usage (wired in Phase 11):
        provider = WikinewsProvider()
        articles = await provider.fetch_news(category="ai", limit=20)
    """

    def __init__(self):
        # Free provider β€” no API key, no daily limit.
        super().__init__(api_key=None)
        self.daily_limit = 0

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT β€” called by the aggregator's FREE PARALLEL RUN
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
        """
        Fetch tech articles from Wikinews's Computing and Internet categories.

        Both category queries run at the same time using asyncio.gather().
        Their results are combined into one flat list and returned.

        Args:
            category (str): Our internal category slug (e.g., "ai").
                            Tagged on every article. The keyword gate filters
                            irrelevant articles downstream.
            limit (int):    Soft cap on total articles to return.

        Returns:
            List[Article]: Combined articles from both Wikinews categories.
                           Returns [] if both queries fail.
        """
        try:
            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:

                # Fire queries for both categories simultaneously.
                fetch_tasks = [
                    self._query_category(client, wiki_cat, category)
                    for wiki_cat in WIKINEWS_CATEGORIES
                ]

                results = await asyncio.gather(*fetch_tasks, return_exceptions=True)

                # Combine results from both categories.
                all_articles: List[Article] = []
                for wiki_cat, result in zip(WIKINEWS_CATEGORIES, results):
                    if isinstance(result, Exception):
                        logger.warning(
                            f"[Wikinews] [{wiki_cat}] Query failed: {result}"
                        )
                    elif isinstance(result, list):
                        all_articles.extend(result)

                logger.info(
                    f"[Wikinews] Collected {len(all_articles)} articles from "
                    f"{len(WIKINEWS_CATEGORIES)} categories for '{category}'"
                )
                return all_articles

        except Exception as e:
            logger.error(f"[Wikinews] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPERS
    # ─────────────────────────────────────────────────────────────────────────

    async def _query_category(
        self,
        client: httpx.AsyncClient,
        wiki_category: str,
        pulse_category: str,
    ) -> List[Article]:
        """
        Run one MediaWiki search query for articles in a given Wikinews category.

        Args:
            client (httpx.AsyncClient): Shared HTTP client from fetch_news().
            wiki_category (str):  The Wikinews category to search within
                                  (e.g., "Computing", "Internet").
            pulse_category (str): Our internal Pulse category β€” tagged on articles.

        Returns:
            List[Article]: Parsed articles. Returns [] on any failure.
        """
        # BUGFIX (review): the previous version of this dict literal carried
        # THREE duplicate "srprop" keys from the Phase 14 experiments; only
        # the last one took effect. Collapsed to the single effective value.
        #
        # Phase 14 note: MediaWiki does NOT expose canonicalurl through
        # srprop, and 'prop=info&inprop=url' would require switching from
        # 'list=search' to 'generator=search' β€” a larger refactor that would
        # change the response shape and break _map_search_hits(). Instead we
        # keep the minimal srprop and construct the canonical URL from the
        # article title (always stable on Wikinews) in _map_search_hits().
        params = {
            "action": "query",
            "list": "search",
            # incategory: restricts results to articles in that Wikinews category.
            "srsearch": f"incategory:{wiki_category}",
            "srlimit": MAX_ARTICLES_PER_CATEGORY,
            "srprop": "snippet|timestamp",  # Only fetch what we actually need
            "format": "json",
            "formatversion": "2",  # Cleaner JSON output format
        }

        try:
            response = await client.get(
                WIKINEWS_API_URL,
                params=params,
                headers={
                    "User-Agent": "SegmentoPulse-Ingestion/1.0 (https://segmento.in)"
                    # Wikimedia's API rules require a descriptive User-Agent.
                },
            )

            if response.status_code == 429:
                logger.warning(f"[Wikinews] [{wiki_category}] HTTP 429 rate limit.")
                self.mark_rate_limited()
                return []

            if response.status_code != 200:
                logger.warning(
                    f"[Wikinews] [{wiki_category}] HTTP {response.status_code} β€” skipping."
                )
                return []

            data = response.json()

        except httpx.TimeoutException:
            logger.warning(f"[Wikinews] [{wiki_category}] Request timed out.")
            return []
        except Exception as e:
            logger.warning(f"[Wikinews] [{wiki_category}] Fetch error: {e}")
            return []

        # Drill into the MediaWiki response structure.
        # Shape: { "query": { "search": [ {...}, {...} ] } }
        query_block = data.get("query") or {}
        search_hits = query_block.get("search") or []

        if not search_hits:
            logger.info(f"[Wikinews] [{wiki_category}] No results returned.")
            return []

        articles = self._map_search_hits(search_hits, wiki_category, pulse_category)

        # ── ENRICH: Fetch images for articles that have none ──────────────
        # _map_search_hits is sync β€” enrichment happens here in the async caller.
        # Wikinews article pages do have og:image tags.
        articles = await self._enrich_article_images(wiki_category, articles)

        logger.info(
            f"[Wikinews] [{wiki_category}] Parsed {len(articles)} articles."
        )
        return articles

    def _map_search_hits(
        self,
        search_hits: list,
        wiki_category: str,
        pulse_category: str,
    ) -> List[Article]:
        """
        Convert MediaWiki search result items into Segmento Pulse Article objects.

        Key transformations:
            title      β†’ title        (direct)
            title      β†’ url          (canonical /wiki/<Title> URL; pageid is a
                                       sanity check β€” hits without one are skipped)
            timestamp  β†’ published_at (already ISO 8601)
            snippet    β†’ description  (HTML tags stripped via regex)
            (none)     β†’ image_url = "" (no images in search results β€” Phase 12 fix)
            (hardcoded)β†’ source = "Wikinews"

        Args:
            search_hits (list):   The 'query.search' array from the API response.
            wiki_category (str):  Which Wikinews category these came from.
            pulse_category (str): Our internal category β€” tagged on each article.

        Returns:
            List[Article]: Clean Article objects.
        """
        # PERF FIX (review): this import previously sat inside the per-hit
        # loop body β€” hoisted so it runs once per call, not once per article.
        import urllib.parse

        articles: List[Article] = []

        for hit in search_hits:
            if not isinstance(hit, dict):
                continue

            # ── Title ────────────────────────────────────────────────────
            title = (hit.get("title") or "").strip()
            if not title:
                continue

            # ── URL β€” canonical title URL with curid fallback ──────────────
            # Phase 14 fix: Construct the canonical URL from the article title.
            # Wikinews titles map directly to stable URLs under /wiki/.
            #   Example: title = "AI chip shortage hits 2026"
            #   β†’ https://en.wikinews.org/wiki/AI_chip_shortage_hits_2026
            # This URL is permanent (Wikimedia guarantees title-based URLs).
            # The image enricher can visit it directly without following a
            # 301 redirect from the curid format β€” saving one HTTP round-trip
            # per article during image enrichment.
            #
            # We still require pageid as a sanity check. If it is missing we
            # skip the article entirely (no pageid = no reliable identity).
            pageid = hit.get("pageid")
            if not pageid:
                continue

            # Build the canonical URL from a URL-safe title.
            # quote() would percent-encode spaces as %20, but Wikimedia URLs
            # use underscores β€” so we swap spaces for underscores first, then
            # quote only the characters that genuinely need escaping.
            title_for_url = title.replace(" ", "_")
            canonical_url = (
                "https://en.wikinews.org/wiki/"
                + urllib.parse.quote(title_for_url, safe="/:@!$&'()*+,;=")
            )

            # A curid URL (https://en.wikinews.org/?curid=<pageid>) reaches
            # the same page and remains a valid fallback, but canonical_url
            # is primary because it has no redirect hop.
            url = canonical_url

            # ── Published Date ────────────────────────────────────────────
            # MediaWiki returns ISO 8601 already, e.g., "2026-03-03T06:00:00Z".
            # Our Article model's published_at validator accepts this directly.
            published_at = hit.get("timestamp") or ""

            # ── Description (HTML-stripped snippet) ───────────────────────
            # MediaWiki injects HTML like <span class="searchmatch">term</span>
            # into snippets to highlight search terms. Strip ALL HTML tags
            # using the pre-compiled module-level regex.
            raw_snippet = hit.get("snippet") or ""
            description = HTML_TAG_PATTERN.sub("", raw_snippet).strip()

            # ── Image URL ─────────────────────────────────────────────────
            # MediaWiki search results do not include images; the enrichment
            # step (_enrich_article_images) fills these in afterwards. An
            # empty string routes to the Segmento Pulse banner fallback.
            image_url = ""

            # ── Build Article ─────────────────────────────────────────────
            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source="Wikinews",
                    # ── ROUTING RULE ──────────────────────────────────────
                    # Tag with pulse_category from the aggregator.
                    # Unknown categories safely route to 'News Articles'.
                    category=pulse_category,
                )
                articles.append(article)

            except Exception as e:
                logger.debug(
                    f"[Wikinews] [{wiki_category}] Skipped '{title[:50]}': {e}"
                )
                continue

        return articles

    # ─────────────────────────────────────────────────────────────────────────
    # PHASE 12: IMAGE ENRICHMENT β€” async post-processing step
    # ─────────────────────────────────────────────────────────────────────────

    async def _enrich_article_images(
        self, wiki_category: str, articles: List[Article]
    ) -> List[Article]:
        """
        For every article that has an empty image_url, visit its canonical
        Wikinews URL and try to find the main image via the og:image meta tag.

        (Doc fix: since Phase 14 the articles carry canonical /wiki/ title
        URLs, not curid URLs β€” the enricher hits the page directly with no
        redirect hop.)

        Wikinews article pages DO include og:image tags β€” they are set by
        the MediaWiki software for every published article, so this call is
        likely to succeed for most articles.

        All image fetches run concurrently. With the outer 4-second timeout
        per call, the entire batch takes ~4 seconds maximum, not N x 4.

        Args:
            wiki_category (str):      Category label used for logging only.
            articles (List[Article]): Output from _map_search_hits().

        Returns:
            List[Article]: Same articles, with image_url filled in where possible.
        """
        if not articles:
            return articles

        # Phase 14 fix: asyncio.Semaphore(10) caps concurrent connections.
        # Before: 10 articles Γ— 2 categories = up to 20 simultaneous HTTP
        # requests to Wikinews article pages with no limit.
        # After: at most 10 page visits run at once; the rest queue safely.
        sem = asyncio.Semaphore(10)

        async def _get_image(article: Article) -> str:
            if article.image_url and article.image_url.startswith("http"):
                return article.image_url  # Already has an image β€” skip
            # Acquire one of 10 available lanes before fetching the page.
            async with sem:
                return await extract_top_image(article.url)

        image_tasks = [_get_image(a) for a in articles]
        fetched_images = await asyncio.gather(*image_tasks, return_exceptions=True)

        enriched: List[Article] = []
        for article, image_result in zip(articles, fetched_images):
            if isinstance(image_result, str) and image_result:
                article = article.model_copy(update={"image_url": image_result})
            enriched.append(article)

        logger.info(
            f"[Wikinews] [{wiki_category}] Image enrichment complete β€” "
            f"{sum(1 for a in enriched if a.image_url)}/{len(enriched)} articles have images."
        )
        return enriched
app/services/providers/worldnewsai/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/worldnewsai/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'worldnewsai' folder as a Python package.
4
+ # To use this provider, import it like this:
5
+ #
6
+ # from app.services.providers.worldnewsai.client import WorldNewsAIProvider
7
+ #
8
+ # This is a PAID provider (point-based quota) β€” it requires the
9
+ # WORLDNEWS_API_KEY environment variable to be set.
10
+ #
11
+ # It sits at position 5 in the PAID_CHAIN β€” the last line of defence
12
+ # before the paid chain gives up. Only fires after GNews, NewsAPI,
13
+ # NewsData, and TheNewsAPI have all failed or exhausted their budgets.
14
+ #
15
+ # ── CRITICAL QUOTA WARNING ────────────────────────────────────────────────
16
+ # WorldNewsAI uses a point system, NOT a simple request counter.
17
+ # Each API call costs points + each returned article costs additional points.
18
+ # The client has a conservative daily_limit = 50 calls to protect the budget.
19
+ # If you see HTTP 402, the daily point budget is fully exhausted.
app/services/providers/worldnewsai/client.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/worldnewsai/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The WorldNewsAI Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches technology news from WorldNewsAI.com β€” a global news crawler
8
+ that indexes tens of thousands of sources worldwide, including many
9
+ non-English and non-US-centric publications.
10
+
11
+ Paid provider β€” needs WORLDNEWS_API_KEY in your .env file.
12
+ Position 5 in the PAID_CHAIN (last paid failover).
13
+
14
+ ── THE CRITICAL QUOTA PROBLEM AND HOW WE SOLVE IT ──────────────────────────
15
+
16
+ WorldNewsAI does NOT use a simple "100 requests per day" model.
17
+ It uses a POINT system:
18
+ - Each search call costs points
19
+ - Each article returned in the response costs additional points
20
+ - If you run out of points, the API returns HTTP 402 (not 429)
21
+
22
+ If we called this for all 22 categories every hour, we would exhaust our
23
+ free-tier point budget before lunchtime.
24
+
25
+ Our two-layer protection:
26
+ 1. Position 5 in PAID_CHAIN: Only fires as the last fallback after
27
+ GNews, NewsAPI, NewsData, and TheNewsAPI have all failed.
28
+ In a healthy system, it will rarely be called at all.
29
+ 2. daily_limit = 50: The quota tracker caps total calls per day.
30
+ Once 50 calls are used, the circuit breaker prevents further calls.
31
+
32
+ ── THE CONTENT SAFETY PROBLEM AND HOW WE SOLVE IT ──────────────────────────
33
+
34
+ WorldNewsAI returns the FULL article body in the 'text' field.
35
+ A typical article body is 500-3,000 words β€” far too large to store in
36
+ our database for each article, and potentially a copyright issue.
37
+
38
+ Fix: We take only the first 200 characters from the 'text' field
39
+ and use that as the article's description. This is the same "snippet"
40
+ approach used by Google News, Bing News, and other aggregators.
41
+ 200 characters is enough to show a preview without reproducing the article.
42
+ """
43
+
44
+ # ── Standard Library ──────────────────────────────────────────────────────
45
+ import logging
46
+ from datetime import datetime, timezone
47
+ from typing import List, Optional
48
+
49
+ # ── Third-party (already in requirements.txt) ──────────────────────────────────
50
+ import httpx # Async HTTP client
51
+
52
+ # ── Internal ─────────────────────────────────────────────────────────────────
53
+ from app.services.providers.base import NewsProvider, ProviderStatus
54
+ from app.models import Article
55
+ from app.config import settings
56
+ # Phase 16: Import the Redis counter utility to make the daily budget
57
+ # restart-proof. Without this, self.request_count lives in RAM and resets
58
+ # to 0 on every Hugging Face Space restart, letting us overspend the quota.
59
+ from app.services.utils.provider_state import (
60
+ get_provider_counter,
61
+ increment_provider_counter,
62
+ )
63
+
64
+ logger = logging.getLogger(__name__)
65
+
66
# ── Constants ─────────────────────────────────────────────────────────────────

# WorldNewsAI search endpoint (v1)
WORLDNEWSAI_SEARCH_URL = "https://api.worldnewsapi.com/search-news"

# Request timeout in seconds for the whole HTTP round-trip.
HTTP_TIMEOUT_SECONDS = 12.0

# Articles per call. Keep it modest to save points per request
# (WorldNewsAI charges points per returned article, not just per call).
ARTICLES_PER_REQUEST = 10

# How many characters of article body text to keep as the description.
# Enough for a readable summary, small enough to avoid copyright concerns
# and database bloat. Matches the 200-char limit used by our RSS parser.
DESCRIPTION_MAX_CHARS = 200

# Category β†’ search text mapping.
# WorldNewsAI takes free-text search queries, not categories.
# We translate our internal category slug into a descriptive keyword phrase.
# Slugs not listed here fall back to the generic "technology news" query
# (see fetch_news), so unknown categories degrade gracefully.
CATEGORY_QUERY_MAP = {
    'ai': 'artificial intelligence machine learning',
    'data-security': 'data security cybersecurity breach',
    'data-governance': 'data governance compliance regulation',
    'data-privacy': 'data privacy GDPR CCPA',
    'data-engineering': 'data engineering pipeline ETL',
    'data-management': 'data management master data catalog',
    'business-intelligence': 'business intelligence analytics BI',
    'business-analytics': 'business analytics reporting dashboards',
    'customer-data-platform': 'customer data platform CDP',
    'data-centers': 'data center infrastructure colocation',
    'cloud-computing': 'cloud computing technology',
    'magazines': 'technology news',
    'data-laws': 'data privacy law regulation AI act',
    'cloud-aws': 'Amazon Web Services AWS cloud',
    'cloud-azure': 'Microsoft Azure cloud',
    'cloud-gcp': 'Google Cloud Platform GCP',
    'cloud-oracle': 'Oracle Cloud OCI',
    'cloud-ibm': 'IBM Cloud Red Hat',
    'cloud-alibaba': 'Alibaba Cloud technology',
    'cloud-digitalocean': 'DigitalOcean cloud platform',
    'cloud-huawei': 'Huawei Cloud technology',
    'cloud-cloudflare': 'Cloudflare network security',
}
109
+
110
+
111
class WorldNewsAIProvider(NewsProvider):
    """
    Fetches global technology news from WorldNewsAI.com.

    Paid provider (point-based quota) β€” position 5 in the PAID_CHAIN.
    Only fires when GNews, NewsAPI, NewsData, and TheNewsAPI have all failed.
    Requires WORLDNEWS_API_KEY in the .env file.

    Usage (wired in Phase 8):
        provider = WorldNewsAIProvider(api_key="your_key_here")
        articles = await provider.fetch_news(category="ai", limit=10)
    """

    def __init__(self, api_key: Optional[str] = None):
        super().__init__(api_key=api_key)

        # Phase 16: This value is the CEILING checked in Redis, not just
        # a RAM counter. Even if the server restarts mid-day, Redis remembers
        # exactly how many calls we have already made today.
        self.daily_limit = 50

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT β€” called by the aggregator's PAID WATERFALL
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 10) -> List[Article]:
        """
        Fetch global technology news from WorldNewsAI.

        Args:
            category (str): Our internal category slug (e.g., "ai").
                            We look it up in CATEGORY_QUERY_MAP to get
                            the search text for the API call.
            limit (int): Max articles to return. Kept at 10 by default
                         to conserve the point budget per call.

        Returns:
            List[Article]: Mapped Article objects. Returns [] on any failure.
        """
        if not self.api_key:
            logger.debug("[WorldNewsAI] No API key configured β€” skipping.")
            return []

        # ── PHASE 16: Redis-backed daily budget guard ────────────────────────
        # Check how many times we have already called WorldNewsAI TODAY using
        # the Redis counter (not self.request_count, which lives in RAM).
        #
        # Today's date string (UTC) is part of the Redis key so the counter
        # automatically resets at midnight UTC without any manual work.
        # Example key: "provider:state:worldnewsai:calls:2026-03-03"
        #
        # If Redis is unreachable: get_provider_counter returns a huge
        # fail-safe value, so we skip the call rather than risk overspending.
        today_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        current_calls = await get_provider_counter("worldnewsai", today_str)

        if current_calls >= self.daily_limit:
            logger.warning(
                "[WorldNewsAI] Daily Redis budget exhausted β€” %d/%d calls used today. "
                "Skipping to protect the API quota.",
                current_calls, self.daily_limit
            )
            self.mark_rate_limited()
            return []

        search_text = CATEGORY_QUERY_MAP.get(category, "technology news")

        params = {
            "text": search_text,
            "language": "en",
            "number": min(limit, ARTICLES_PER_REQUEST),
            "api-key": self.api_key,
            # NOTE: No date filters applied here intentionally.
            # WorldNewsAI supports 'earliest-publish-date' and
            # 'latest-publish-date', but our freshness gate handles
            # date filtering more accurately using IST boundaries.
        }

        try:
            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
                # FIX: was print() β€” which bypasses the logging configuration
                # (levels, handlers, formatting) that every other message in
                # this provider uses. Routed through the module logger with
                # lazy %-style args.
                logger.info(
                    "[WorldNewsAI] Fetching '%s' (query='%s...')...",
                    category, search_text[:40]
                )
                response = await client.get(WORLDNEWSAI_SEARCH_URL, params=params)

                # ── HTTP 402: Point quota fully exhausted ─────────────────
                # 402 means we are out of points for today β€” not just rate
                # limited, but completely blocked until tomorrow's reset.
                # We mark the provider as RATE_LIMITED (not ERROR) so it can
                # recover after the scheduler's daily quota reset cycle.
                if response.status_code == 402:
                    logger.warning(
                        "[WorldNewsAI] HTTP 402 β€” point quota exhausted. "
                        "No more calls until tomorrow's reset."
                    )
                    self.mark_rate_limited()
                    return []

                # ── HTTP 401: Invalid or expired API key ──────────────────
                if response.status_code == 401:
                    logger.error(
                        "[WorldNewsAI] HTTP 401 β€” API key is invalid or expired. "
                        "Check WORLDNEWS_API_KEY in your .env file."
                    )
                    self.status = ProviderStatus.ERROR
                    return []

                # ── HTTP 429: Too many requests (short-term rate limit) ───
                if response.status_code == 429:
                    logger.warning("[WorldNewsAI] HTTP 429 β€” request rate exceeded.")
                    self.mark_rate_limited()
                    return []

                # ── Any other non-200 ─────────────────────────────────────
                if response.status_code != 200:
                    logger.warning(
                        f"[WorldNewsAI] Unexpected HTTP {response.status_code}."
                    )
                    return []

                # ── Parse the response ─────────────────────────────────────────
                self.request_count += 1  # Keep RAM shadow in sync for debugging
                data = response.json()

                # WorldNewsAI wraps articles in a top-level 'news' key
                raw_articles = data.get("news", [])

                if not raw_articles:
                    logger.info(
                        f"[WorldNewsAI] No articles returned for '{category}'."
                    )
                    return []

                articles = self._map_articles(raw_articles, category)

                # ── PHASE 16: Increment the Redis counter after a successful call ──
                # We only count successful 200 responses, not failures.
                # A failed call that returns [] should NOT burn our daily budget.
                await increment_provider_counter("worldnewsai", today_str)

                logger.info("[WorldNewsAI] Got %d articles for '%s'.", len(articles), category)
                return articles

        except httpx.TimeoutException:
            logger.warning("[WorldNewsAI] Request timed out.")
            return []
        except Exception as e:
            logger.error(f"[WorldNewsAI] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPER β€” maps raw JSON items to Article objects
    # ─────────────────────────────────────────────────────────────────────────

    def _map_articles(self, raw_articles: list, category: str) -> List[Article]:
        """
        Convert WorldNewsAI JSON items into Segmento Pulse Article objects.

        Key transformations:
          - 'text' field is truncated to DESCRIPTION_MAX_CHARS (body is too long)
          - 'authors' is a list β€” we join it with ", " into one string
          - 'image' maps directly to image_url

        WorldNewsAI field      β†’ Article field
        ──────────────────────────────────────
        title                  β†’ title
        url                    β†’ url
        image                  β†’ image_url
        publish_date           β†’ published_at
        authors (list)         β†’ source (joined)
        text (truncated)       β†’ description

        Args:
            raw_articles (list): The 'news' array from the API response.
            category (str): The aggregator's category for routing.

        Returns:
            List[Article]: Clean Article objects ready for the pipeline.
        """
        articles: List[Article] = []

        for item in raw_articles:
            if not isinstance(item, dict):
                continue

            # ── Title ────────────────────────────────────────────────────
            title = (item.get("title") or "").strip()
            if not title:
                continue

            # ── URL ──────────────────────────────────────────────────────
            url = (item.get("url") or "").strip()
            if not url or not url.startswith("http"):
                continue

            # ── Image URL ─────────────────────────────────────────────────
            image_url = (item.get("image") or "").strip()

            # ── Published Date ────────────────────────────────────────────
            # WorldNewsAI returns ISO 8601 format (e.g., "2026-03-03 06:00:00")
            # Our Article model's published_at validator can handle this.
            published_at = item.get("publish_date") or ""

            # ── Source (from authors list) ────────────────────────────────
            # 'authors' is a list of names, e.g., ["Jane Doe", "John Smith"]
            # We join them into a comma-separated string for the source field.
            # FIX: guard against non-string entries (null/objects in the API
            # payload) which would crash .strip() on the whole batch.
            authors = item.get("authors") or []
            if isinstance(authors, list) and authors:
                clean_authors = [
                    a.strip() for a in authors
                    if isinstance(a, str) and a.strip()
                ]
                source = ", ".join(clean_authors) if clean_authors else "WorldNewsAI"
            else:
                source = "WorldNewsAI"

            # ── Description (TRUNCATED body text) ─────────────────────────
            # WorldNewsAI returns the FULL article body in 'text'.
            # This is thousands of words β€” we MUST truncate it.
            # 200 characters gives a readable preview without storing
            # copyright-protected full content in our database.
            raw_text = (item.get("text") or item.get("summary") or "").strip()
            if len(raw_text) > DESCRIPTION_MAX_CHARS:
                description = raw_text[:DESCRIPTION_MAX_CHARS] + "..."
            else:
                description = raw_text

            # ── Build Article ─────────────────────────────────────────────
            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source=source,
                    # ── ROUTING RULE ──────────────────────────────────────
                    # Pass through the aggregator's category.
                    # Unknown categories safely route to 'News Articles'.
                    category=category,
                )
                articles.append(article)

            except Exception as e:
                logger.debug(
                    f"[WorldNewsAI] Skipped item '{title[:50]}': {e}"
                )
                continue

        return articles
app/services/scheduler.py CHANGED
@@ -17,6 +17,8 @@ from app.services.upstash_cache import get_upstash_cache # Needed to bust stal
17
  from app.services.adaptive_scheduler import get_adaptive_scheduler, AdaptiveScheduler
18
  from app.services.research_aggregator import ResearchAggregator
19
  from app.config import settings
 
 
20
 
21
  # Setup logging
22
  logging.basicConfig(level=logging.INFO)
@@ -377,6 +379,149 @@ async def fetch_daily_research():
377
  logger.info("═" * 80)
378
 
379
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  async def fetch_and_validate_category(category: str, aggregator) -> tuple:
381
  """
382
  Fetch and validate articles for a single category.
@@ -393,6 +538,7 @@ async def fetch_and_validate_category(category: str, aggregator) -> tuple:
393
  from app.utils.date_parser import normalize_article_date
394
  from app.utils.url_canonicalization import canonicalize_url
395
  from app.utils.redis_dedup import is_url_seen_or_mark
 
396
 
397
  try:
398
  logger.info("πŸ“Œ Fetching %s...", category.upper())
@@ -477,12 +623,41 @@ async def fetch_and_validate_category(category: str, aggregator) -> tuple:
477
  continue
478
 
479
  # Step 4: Normalize date to UTC ISO-8601.
480
- article = normalize_article_date(article)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
481
 
482
- # Step 5: Sanitize and clean the article fields.
483
- clean_article = sanitize_article(article)
484
- valid_articles.append(clean_article)
485
-
486
  logger.info("βœ“ %s: %d valid, %d invalid, %d irrelevant",
487
  category.upper(), len(valid_articles), invalid_count, irrelevant_count)
488
  return (category, valid_articles, invalid_count, irrelevant_count, relevant_count)
 
17
  from app.services.adaptive_scheduler import get_adaptive_scheduler, AdaptiveScheduler
18
  from app.services.research_aggregator import ResearchAggregator
19
  from app.config import settings
20
+ # Phase 13: Global image enrichment β€” fills missing og:image across ALL providers
21
+ from app.services.utils.image_enricher import extract_top_image
22
 
23
  # Setup logging
24
  logging.basicConfig(level=logging.INFO)
 
379
  logger.info("═" * 80)
380
 
381
 
382
+ # ──────────────────────────────────────────────────────────────────────────────
383
+ # PHASE 13: GLOBAL IMAGE ENRICHMENT SAFETY NET
384
+ # ──────────────────────────────────────────────────────────────────────────────
385
+ #
386
+ # What this does:
387
+ # After all validation and deduplication gates have passed, some articles
388
+ # still arrive with an empty or missing image_url. This happens most often
389
+ # with providers like OpenRSS (blog feeds without media tags), Webz.io
390
+ # (small sites without a thread.main_image), and SauravKanchan (NewsAPI
391
+ # null urlToImage). This function visits the article's URL and tries to
392
+ # extract the og:image meta tag β€” the standard way websites declare their
393
+ # main thumbnail image.
394
+ #
395
+ # Why AFTER deduplication?
396
+ # We only enrich articles that actually passed every gate and are about to
397
+ # be saved. We never spend HTTP calls on articles that will be thrown away.
398
+ #
399
+ # Safety guards:
400
+ # 1. MAX_ENRICH_PER_RUN = 20 β€” Hard cap. If 50 no-image articles arrive,
401
+ # we only enrich the first 20, leave the rest as "", and the Pulse banner
402
+ # shows on the frontend. This stops a rogue provider from bottlenecking
403
+ # the cron job.
404
+ # 2. asyncio.Semaphore(10) β€” At most 10 web-page fetches happen at the
405
+ # same time. This prevents memory spikes and avoids hammering websites.
406
+ # 3. Individual 4-second timeout (inside extract_top_image) β€” A broken URL
407
+ # is cancelled in 4 seconds. With Semaphore(10) and MAX 20 articles:
408
+ # worst-case total overhead = (20 / 10) Γ— 4 = 8 seconds per category run.
409
+ # 4. Zero side-effects β€” A failed enrichment returns the article unchanged.
410
+ # The enricher NEVER removes an article from the pipeline.
411
+ #
412
async def enrich_missing_images_in_batch(articles: list) -> list:
    """
    Fill in missing image_url values on a batch of fully-vetted articles.

    Up to MAX_ENRICH_PER_RUN articles lacking a valid image are enriched by
    visiting their URL and reading the og:image meta tag. Articles that
    already carry an image pass through untouched, at zero network cost.

    Args:
        articles (list): Final, deduplicated, validated Article objects.

    Returns:
        list: The same articles, with image_url filled where possible.
              Never raises. Never removes an article.
    """
    if not articles:
        return articles

    # Hard cap on network fetches per run. Anything beyond the cap keeps an
    # empty image_url and falls back to the Pulse banner on the frontend,
    # so one rogue provider cannot bottleneck the cron job.
    MAX_ENRICH_PER_RUN = 20

    # At most 10 page fetches in flight at once β€” like 10 checkout lanes:
    # extra work queues up instead of overwhelming memory or the target sites.
    lane_guard = asyncio.Semaphore(10)

    # Which articles actually need an image?
    missing = [
        art for art in articles
        if not art.image_url or not art.image_url.startswith("http")
    ]
    attempt_count = min(len(missing), MAX_ENRICH_PER_RUN)

    if attempt_count == 0:
        # Every article already has a valid image β€” nothing to do.
        return articles

    logger.info(
        "πŸ–ΌοΈ [IMAGE ENRICHER] %d article(s) missing images β€” enriching up to %d...",
        len(missing), attempt_count
    )

    # Only the capped subset of URLs is eligible for a network fetch.
    eligible_urls = {str(art.url) for art in missing[:MAX_ENRICH_PER_RUN]}

    async def _worker(art) -> object:
        """Fetch an image for one eligible article; pass all others through."""
        link = str(art.url) if art.url else ""

        # Already has an image, or falls outside the cap β€” no network call.
        if link not in eligible_urls:
            return art

        async with lane_guard:
            # extract_top_image enforces its own 4-second ceiling, so this
            # lane frees up quickly no matter what the target site does.
            found = await extract_top_image(link)

        if found and found.startswith("http"):
            # model_copy() is the correct Pydantic v2 way to update an
            # immutable model instance.
            return art.model_copy(update={"image_url": found})

        # Nothing usable found β€” hand the article back unchanged.
        return art

    # Fan out every article at once; the semaphore alone throttles how many
    # actually touch the network simultaneously (max 10).
    try:
        outcomes = await asyncio.gather(
            *[_worker(art) for art in articles],
            return_exceptions=True
        )

        # A crashed worker must never drop an article β€” substitute the original.
        repaired = []
        for fallback, outcome in zip(articles, outcomes):
            if isinstance(outcome, Exception):
                logger.debug(
                    "[IMAGE ENRICHER] Worker exception for %s: %s",
                    str(fallback.url)[:60], outcome
                )
                repaired.append(fallback)
            else:
                repaired.append(outcome)

        with_images = sum(
            1 for art in repaired if art.image_url and art.image_url.startswith("http")
        )
        logger.info(
            "βœ… [IMAGE ENRICHER] Done β€” %d/%d articles now have images.",
            with_images, len(repaired)
        )
        return repaired

    except Exception as e:
        # If the entire gather somehow fails, return the list untouched.
        logger.error("[IMAGE ENRICHER] Gather failed: %s β€” returning articles unchanged.", e)
        return articles
523
+
524
+
525
  async def fetch_and_validate_category(category: str, aggregator) -> tuple:
526
  """
527
  Fetch and validate articles for a single category.
 
538
  from app.utils.date_parser import normalize_article_date
539
  from app.utils.url_canonicalization import canonicalize_url
540
  from app.utils.redis_dedup import is_url_seen_or_mark
541
+ from app.models import Article # Needed to reconstruct Pydantic model after date normalization
542
 
543
  try:
544
  logger.info("πŸ“Œ Fetching %s...", category.upper())
 
623
  continue
624
 
625
  # Step 4: Normalize date to UTC ISO-8601.
626
+ # IMPORTANT: normalize_article_date() always returns a plain dict
627
+ # (it calls model_dump() internally). We reconstruct the Pydantic
628
+ # Article right after so that enrich_missing_images_in_batch()
629
+ # (Phase 13, below) gets the .image_url attribute it needs.
630
+ normalized_dict = normalize_article_date(article)
631
+ try:
632
+ article = Article(**normalized_dict)
633
+ except Exception:
634
+ # If reconstruction fails for any reason, skip this article.
635
+ # The dict is malformed β€” better to drop it than crash.
636
+ invalid_count += 1
637
+ continue
638
+
639
+ # Step 5: Article is now a clean Pydantic object with a normalized date.
640
+ # We intentionally do NOT call sanitize_article() yet β€” that step
641
+ # runs AFTER image enrichment below.
642
+ valid_articles.append(article)
643
+
644
+ # ── PHASE 13: GLOBAL IMAGE ENRICHMENT ─────────────────────────────────
645
+ # This is the bottom of the funnel. Every article here has already:
646
+ # βœ“ Passed basic validation (title, URL, date exist)
647
+ # βœ“ Passed category relevance check
648
+ # βœ“ Passed Redis 48-hour deduplication (it is a NEW article)
649
+ # βœ“ Been date-normalized
650
+ # Articles are still Pydantic objects here β€” enrichment needs .image_url.
651
+ if valid_articles:
652
+ valid_articles = await enrich_missing_images_in_batch(valid_articles)
653
+
654
+ # ── SANITIZE (after enrichment) ────────────────────────────────────────
655
+ # Now that images are filled, convert each Pydantic Article to a clean
656
+ # dict for Appwrite storage. sanitize_article() strips unsafe chars,
657
+ # trims lengths, and returns the final dict payload.
658
+ valid_articles = [sanitize_article(a) for a in valid_articles]
659
+ # ──────────────────────────────────────────────────────────────────────
660
 
 
 
 
 
661
  logger.info("βœ“ %s: %d valid, %d invalid, %d irrelevant",
662
  category.upper(), len(valid_articles), invalid_count, irrelevant_count)
663
  return (category, valid_articles, invalid_count, irrelevant_count, relevant_count)
app/services/utils/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # app/services/utils/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This folder contains shared helper utilities that are used by multiple
4
+ # providers. They are NOT providers themselves β€” they are small tools that
5
+ # providers can import to do common jobs.
6
+ #
7
+ # Current utilities:
8
+ # image_enricher.py β€” Extracts the main image from any article URL
app/services/utils/image_enricher.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app/services/utils/image_enricher.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ Shared Image Enrichment Utility for Segmento Pulse.
5
+
6
+ What this does:
7
+ Given any article URL, this tool visits the page and tries to find the
8
+ main (top) image that the website publisher chose for that article.
9
+
10
+ It does this by reading two standard HTML meta tags:
11
+ 1. og:image β€” Open Graph (used by Facebook, LinkedIn, Twitter)
12
+ 2. twitter:image β€” Twitter Card image
13
+
14
+ Almost every modern news website, blog, and tech publication sets at least
15
+ one of these tags. They are the industry-standard way to declare "this is
16
+ my article's main image".
17
+
18
+ ── WHY WE USE bs4 + httpx INSTEAD OF newspaper4k ────────────────────────────
19
+
20
+ The user directive requested newspaper4k (a modern async fork of newspaper3k).
21
+ However, newspaper4k is not in our requirements.txt and would add a heavy new
22
+ dependency with many sub-packages (including lxml, Pillow, and others).
23
+
24
+ Our current stack already has everything we need:
25
+ βœ“ httpx β€” async HTTP client (already in requirements.txt)
26
+ βœ“ beautifulsoup4 β€” HTML parser (already in requirements.txt)
27
+ βœ“ lxml β€” fast XML/HTML parser (already in requirements.txt)
28
+
29
+ The og:image meta tag approach is exactly what newspaper4k uses internally
30
+ for its top_image property. We get the same result without a new dependency.
31
+
32
+ This decision follows our Version First-Scan Protocol: never add a library
33
+ when an existing installed library can do the same job.
34
+
35
+ ── HOW THE TIMEOUT PROTECTION WORKS ─────────────────────────────────────────
36
+
37
+ Some websites are slow, broken, or behind Cloudflare protection pages.
38
+ If we waited forever for them, our entire ingestion pipeline would freeze.
39
+
40
+ Two layers of protection:
41
+ 1. httpx timeout: 3 seconds max to receive any response at all.
42
+ If the server doesn't respond in 3 seconds, httpx raises TimeoutException.
43
+
44
+ 2. asyncio.wait_for: 4 seconds total ceiling for the entire function.
45
+ Even if httpx somehow hangs (rare), this outer guard kills it.
46
+
47
+ 3. Universal try/except: Catches EVERYTHING. A bad image URL will NEVER
48
+ crash a provider. The worst it can do is return "".
49
+
50
+ The function signature is intentionally similar to newspaper4k's approach
51
+ so that future migration is a one-line change if newspaper4k is later added.
52
+ """
53
+
54
+ # ── Standard Library ──────────────────────────────────────────────────────────
55
+ import asyncio
56
+ import logging
57
+ from typing import Optional
58
+
59
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
60
+ import httpx
61
+ from bs4 import BeautifulSoup
62
+
63
+ logger = logging.getLogger(__name__)
64
+
65
# ── Timing constants ──────────────────────────────────────────────────────────

# How long to wait for the target website to respond (seconds).
# 3 seconds is generous enough for normal websites, short enough to not
# freeze our pipeline if a URL is broken or behind Cloudflare.
HTTP_FETCH_TIMEOUT = 3.0

# Hard outer ceiling for the entire extract_top_image() call.
# Even if httpx somehow hangs past its own timeout, asyncio.wait_for
# will forcibly cancel the task at this point.
# NOTE: must stay strictly greater than HTTP_FETCH_TIMEOUT, otherwise the
# outer guard would fire before the HTTP client's own timeout can.
OUTER_TIMEOUT_SECONDS = 4.0
76
+
77
+
78
async def extract_top_image(url: str) -> str:
    """
    Visit an article URL and return its main (top) image URL.

    Reads the standard meta tags in priority order:
        1. <meta property="og:image" content="...">
        2. <meta name="twitter:image" content="...">

    Args:
        url (str): Full article URL (must start with "http").

    Returns:
        str: The image URL if found and valid, otherwise "".

    Never raises: any timeout, network failure, or parse problem simply
    yields "" and the pipeline shows the Pulse banner instead.
    """
    # Reject empty / non-HTTP inputs up front β€” zero network cost.
    if not url or not url.startswith("http"):
        return ""

    try:
        # Hard outer ceiling: even if the inner fetch somehow hangs past its
        # own httpx timeout, wait_for cancels it at OUTER_TIMEOUT_SECONDS and
        # control falls into the except blocks below.
        return await asyncio.wait_for(
            _fetch_and_extract(url),
            timeout=OUTER_TIMEOUT_SECONDS,
        )

    except asyncio.TimeoutError:
        logger.debug(f"[ImageEnricher] Outer timeout for: {url[:60]}")
        return ""
    except Exception as e:
        logger.debug(f"[ImageEnricher] Failed for '{url[:60]}': {e}")
        return ""
115
+
116
+
117
async def _fetch_and_extract(url: str) -> str:
    """
    Download the page HTML and pull the main image out of its meta tags.

    Kept separate from extract_top_image() so asyncio.wait_for() has a
    clean coroutine it can cancel on timeout.

    Args:
        url (str): Full article URL.

    Returns:
        str: Image URL from og:image / twitter:image, or "" if none found.
    """
    # ── Download ─────────────────────────────────────────────────────────────
    request_headers = {
        # Some sites block clients without a browser User-Agent; identify as
        # a polite bot with a browser-like signature.
        "User-Agent": (
            "Mozilla/5.0 (compatible; SegmentoPulse-ImageBot/1.0; "
            "+https://segmento.in)"
        ),
        # Hint that we only need HTML β€” not all servers honour it, but it
        # costs nothing to ask.
        "Accept": "text/html",
    }

    try:
        async with httpx.AsyncClient(timeout=HTTP_FETCH_TIMEOUT) as client:
            response = await client.get(
                url,
                headers=request_headers,
                follow_redirects=True,
            )
            if response.status_code != 200:
                return ""
            page_html = response.text
    except Exception:
        # Network error, timeout, SSL failure, etc. β€” treat as "no image".
        return ""

    # ── Parse ────────────────────────────────────────────────────────────────
    # og:image lives in <head>, always near the top of the document, so the
    # first 10,000 characters are plenty and keep parsing fast on huge pages.
    head_slice = page_html[:10_000]

    # Prefer the fast lxml backend; fall back to the built-in parser if the
    # HTML is malformed enough to trip lxml.
    soup = None
    for parser_name in ("lxml", "html.parser"):
        try:
            soup = BeautifulSoup(head_slice, parser_name)
            break
        except Exception:
            continue
    if soup is None:
        return ""

    def _usable_content(tag) -> str:
        """Return the tag's content attribute if it is a usable http URL."""
        if not tag:
            return ""
        candidate = (tag.get("content") or "").strip()
        return candidate if candidate.startswith("http") else ""

    # ── Priority 1: Open Graph image (most reliable) ─────────────────────────
    og_image = _usable_content(soup.find("meta", property="og:image"))
    if og_image:
        logger.debug(f"[ImageEnricher] og:image found for {url[:50]}")
        return og_image

    # ── Priority 2: Twitter Card image (common fallback) ─────────────────────
    tw_image = _usable_content(soup.find("meta", attrs={"name": "twitter:image"}))
    if tw_image:
        logger.debug(f"[ImageEnricher] twitter:image found for {url[:50]}")
        return tw_image

    # No image tag found β€” return empty, let the banner fallback handle it.
    logger.debug(f"[ImageEnricher] No meta image tag found for: {url[:60]}")
    return ""
app/services/utils/provider_state.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app/services/utils/provider_state.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ Phase 15: Unified Redis State Architecture
5
+
6
+ What this does:
7
+ Saves and restores provider "state" β€” things like "when did we last call
8
+ OpenRSS?" and "how many times have we called Webz today?" β€” to our
9
+ Upstash Redis instance.
10
+
11
+ Why we need this:
12
+ Our backend runs on Hugging Face Spaces, which can restart at any time.
13
+ When a restart happens, all Python RAM is wiped. Without this utility:
14
+ - OpenRSS's 60-minute cooldown resets to 0, so we hammer them on
15
+ every restart and eventually get an IP ban.
16
+ - Webz's monthly budget counter resets, so we can burn our entire
17
+ month's calls in a single bad restart day.
18
+
19
+ With this utility:
20
+ - Even if the server restarts 10 times in an hour, Redis remembers
21
+ the exact timestamp of the last OpenRSS call and the exact number
22
+ of Webz calls made today. Provider quotas are now restart-proof.
23
+
24
+ How it works:
25
+ Two pairs of async functions:
26
+ 1. Timestamps (for cooldown timers like OpenRSS):
27
+ get_provider_timestamp("openrss") β†’ float (Unix timestamp)
28
+ set_provider_timestamp("openrss", time.time())
29
+
30
+ 2. Counters (for daily/monthly budgets like Webz, WorldNewsAI):
31
+ get_provider_counter("webz", "2026-03-03") β†’ int
32
+ increment_provider_counter("webz", "2026-03-03")
33
+
34
+ Redis key format:
35
+ Timestamps: provider:state:{provider_name}:last_fetch
36
+ Counters: provider:state:{provider_name}:calls:{date_key}
37
+
38
+ Mirrored directly from circuit_breaker.py's approach:
39
+ - Same get_upstash_cache() import
40
+ - Same _execute_command([...]) API
41
+ - Same fail-safe try/except pattern
42
+
43
+ Fail-open vs Fail-safe design:
44
+ - get_provider_timestamp: returns 0.0 on Redis failure
45
+ β†’ Provider assumes "never fetched before" β†’ allowed to run
46
+ β†’ This is CORRECT for free providers (OpenRSS). Missing one cooldown
47
+ check is less dangerous than permanently blocking the provider.
48
+
49
+ - get_provider_counter: returns 999999 on Redis failure
50
+ β†’ Provider assumes "budget exhausted" β†’ safely skips the run
51
+ β†’ This is CORRECT for paid providers (Webz, WorldNewsAI). We would
52
+ rather miss one run than accidentally overspend our API budget.
53
+
54
+ Thread safety:
55
+ asyncio is single-threaded. All functions below use `await`. Only one
56
+ coroutine runs at a time, so there are no race conditions to worry about
57
+ within a single Python process. No locks needed.
58
+ """
59
+
60
+ import logging
61
+ from typing import Optional
62
+
63
+ logger = logging.getLogger(__name__)
64
+
65
+
66
+ # ── Key Builders ─────────────────────────────────────────────────────────────
67
+ # Centralizing the key format here means if we ever need to change it,
68
+ # we change it in one place and every provider picks up the fix automatically.
69
+
70
+ def _timestamp_key(provider_name: str) -> str:
71
+ """
72
+ Build the Redis key string for a provider's last-fetch timestamp.
73
+
74
+ Example:
75
+ provider_name = "openrss"
76
+ β†’ "provider:state:openrss:last_fetch"
77
+ """
78
+ return f"provider:state:{provider_name}:last_fetch"
79
+
80
+
81
+ def _counter_key(provider_name: str, date_key: str) -> str:
82
+ """
83
+ Build the Redis key string for a provider's daily call counter.
84
+
85
+ date_key is normally a date string like "2026-03-03" so the key
86
+ automatically changes every day without needing a manual reset.
87
+
88
+ Example:
89
+ provider_name = "webz", date_key = "2026-03-03"
90
+ β†’ "provider:state:webz:calls:2026-03-03"
91
+ """
92
+ return f"provider:state:{provider_name}:calls:{date_key}"
93
+
94
+
95
+ # ── Timestamp Functions (for cooldown timers) ─────────────────────────────────
96
+
97
async def get_provider_timestamp(provider_name: str) -> float:
    """
    Fetch a provider's last-fetch Unix timestamp from Redis.

    Fail-open contract: any problem at all (Redis down, key missing, value
    unparseable) yields 0.0, which the caller reads as "never fetched" and
    therefore lets the provider run. That is the right default for free
    providers with cooldown timers (OpenRSS) — one extra call is far
    cheaper than a permanently blocked provider.

    Args:
        provider_name (str): Short name like "openrss".

    Returns:
        float: Unix timestamp of the last fetch, or 0.0 when unknown.
    """
    try:
        from app.services.upstash_cache import get_upstash_cache

        redis = get_upstash_cache()
        stored = await redis._execute_command(["GET", _timestamp_key(provider_name)])
        # Redis GET hands back a string like "1740000000.123", or None
        # when the key has never been written (or has already expired).
        return 0.0 if stored is None else float(stored)

    except Exception as exc:
        # Redis unreachable or the stored value was garbage — fail open so
        # the provider is still allowed to run on this scheduler cycle.
        logger.warning(
            "[provider_state] get_provider_timestamp('%s') failed (%s) "
            "— returning 0.0 (fail-open: provider will be allowed to run).",
            provider_name, exc
        )
        return 0.0
140
+
141
+
142
async def set_provider_timestamp(
    provider_name: str,
    timestamp: float,
    expire_seconds: int = 7200,  # Default TTL: 2 hours
) -> None:
    """
    Save a provider's last-fetch timestamp to Redis.

    Always call this BEFORE you start the actual network request, not after.
    If you save it AFTER and the request crashes halfway through, the provider
    will think "I was never blocked" and fire again immediately on the next
    scheduler cycle — the exact opposite of what the cooldown is supposed to do.

    The TTL (expire_seconds) is a safety net. If the key is never explicitly
    deleted, Redis removes it automatically after 2 hours so it doesn't sit
    in memory forever. 2 hours is safely above the 60-minute cooldown.

    Args:
        provider_name (str): Short name like "openrss".
        timestamp (float): Unix timestamp (use time.time() for the current one).
        expire_seconds (int): How long to keep this key in Redis. Default: 7200s (2h).
    """
    try:
        from app.services.upstash_cache import get_upstash_cache
        cache = get_upstash_cache()

        key = _timestamp_key(provider_name)
        # Store the float as a string — Redis stores all values as strings anyway.
        # "SET key value EX seconds" sets both the value and the TTL in one call.
        # FIX: expire_seconds is stringified like every other numeric argument
        # sent through _execute_command in this module (see str(amount) and
        # str(expire_seconds) in increment_provider_counter), so the Upstash
        # REST command array is always a uniform list of strings.
        await cache._execute_command(["SET", key, str(timestamp), "EX", str(expire_seconds)])

        logger.debug(
            "[provider_state] Saved last_fetch timestamp for '%s' to Redis (TTL=%ds).",
            provider_name, expire_seconds
        )

    except Exception as e:
        # Redis write failed. This is recoverable — the cooldown will just
        # fall back to RAM-based tracking for this run. Log it and move on.
        logger.warning(
            "[provider_state] set_provider_timestamp('%s') failed (%s) "
            "— cooldown state will not survive a server restart for this run.",
            provider_name, e
        )
186
+
187
+
188
+ # ── Counter Functions (for daily/monthly API budgets) ─────────────────────────
189
+
190
async def get_provider_counter(provider_name: str, date_key: str) -> int:
    """
    Read how many API calls a provider has made on `date_key` from Redis.

    Fail-safe contract: if Redis can't be reached, return 999999 so the
    caller believes the budget is already spent and skips the run. For the
    paid providers this guards (Webz, WorldNewsAI) a skipped run costs
    nothing, while an overspent budget could cost real money.

    Args:
        provider_name (str): Short name like "webz" or "worldnewsai".
        date_key (str): Date string like "2026-03-03" (use UTC date).
            Keying by date makes the counter roll over automatically
            each morning — yesterday's key simply expires on its own.

    Returns:
        int: Calls made on that date, or 999999 when Redis is unavailable.
    """
    try:
        from app.services.upstash_cache import get_upstash_cache

        redis = get_upstash_cache()
        stored = await redis._execute_command(
            ["GET", _counter_key(provider_name, date_key)]
        )
        # A missing key means no calls have been recorded for this date yet.
        return 0 if stored is None else int(stored)

    except Exception as exc:
        # Fail SAFE: pretend the budget is exhausted rather than risk
        # burning paid API credits while Redis is down.
        logger.warning(
            "[provider_state] get_provider_counter('%s', '%s') failed (%s) "
            "— returning 999999 (fail-safe: provider will be skipped this run).",
            provider_name, date_key, exc
        )
        return 999999
232
+
233
+
234
async def increment_provider_counter(
    provider_name: str,
    date_key: str,
    amount: int = 1,
    expire_seconds: int = 86400,  # Default TTL: 24 hours (one full day)
) -> None:
    """
    Add `amount` to a provider's daily call counter in Redis.

    INCRBY is atomic on the Redis side, so concurrent writers cannot lose
    updates — mostly a nicety here, since this backend is single-process
    asyncio. After the increment we always re-arm the key's TTL, so even a
    key created hours earlier gets a fresh 24-hour lifetime from the moment
    of the latest update and cannot vanish mid-day.

    Args:
        provider_name (str): Short name like "webz" or "worldnewsai".
        date_key (str): Date string like "2026-03-03" (use UTC date).
        amount (int): How much to add to the counter. Default: 1.
        expire_seconds (int): Key TTL in seconds. Default: 86400 (24 hours).
    """
    try:
        from app.services.upstash_cache import get_upstash_cache

        redis = get_upstash_cache()
        counter_key = _counter_key(provider_name, date_key)

        # Atomically add `amount`; Redis creates the key at 0 when it's new.
        await redis._execute_command(["INCRBY", counter_key, str(amount)])

        # Refresh the expiry countdown so the key survives the rest of the day.
        await redis._execute_command(["EXPIRE", counter_key, str(expire_seconds)])

        logger.debug(
            "[provider_state] Incremented call counter for '%s' on '%s' by %d.",
            provider_name, date_key, amount
        )

    except Exception as exc:
        # The Redis write failed, so this call isn't reflected in Redis —
        # the in-memory tracking (request_count) still enforces the limit.
        logger.warning(
            "[provider_state] increment_provider_counter('%s', '%s') failed (%s) "
            "— this call will not be counted in Redis. In-memory limit still applies.",
            provider_name, date_key, exc
        )
app/utils/data_validation.py CHANGED
@@ -268,158 +268,272 @@ def calculate_quality_score(article: Dict) -> int:
268
  return min(max(score, 0), 100)
269
 
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  def is_relevant_to_category(article: Union[Dict, 'Article'], category: str) -> bool:
272
  """
273
- Validate that article is relevant to the specified category
274
-
275
- HOTFIX: Now handles both Pydantic Article objects and dicts
276
-
277
- Prevents category pollution (e.g., "Apple pie" in Tech)
278
-
279
- Returns True only if article contains category-specific keywords
 
 
 
 
 
 
 
 
 
280
  """
281
- # HOTFIX: Convert to dict if needed
282
  if hasattr(article, 'model_dump'):
283
  article_dict = article.model_dump()
284
  elif hasattr(article, 'dict'):
285
  article_dict = article.dict()
286
  else:
287
  article_dict = article
288
-
289
- # Category keyword dictionaries
290
- # Each category has a list of words we scan for in the article's title,
291
- # description, AND URL path. If at least one word matches, the article passes.
292
- CATEGORY_KEYWORDS = {
293
- 'ai': [
294
- 'ai', 'artificial intelligence', 'machine learning', 'deep learning',
295
- 'neural network', 'gpt', 'llm', 'chatgpt', 'generative ai',
296
- 'computer vision', 'nlp', 'natural language', 'transformer'
297
- ],
298
- 'data-security': [
299
- 'security', 'cybersecurity', 'data breach', 'hacking', 'vulnerability',
300
- 'encryption', 'malware', 'ransomware', 'firewall', 'threat'
301
- ],
302
- 'data-governance': [
303
- 'governance', 'compliance', 'regulation', 'audit', 'policy',
304
- 'data quality', 'metadata', 'lineage', 'stewardship'
305
- ],
306
- 'data-privacy': [
307
- 'privacy', 'gdpr', 'ccpa', 'consent', 'personal data',
308
- 'pii', 'anonymization', 'data protection', 'privacy law'
309
- ],
310
- 'data-engineering': [
311
- 'data engineering', 'pipeline', 'etl', 'big data', 'spark',
312
- 'hadoop', 'kafka', 'airflow', 'data warehouse', 'snowflake'
313
- ],
314
- 'data-management': [
315
- 'data management', 'master data', 'mdm', 'data catalog',
316
- 'data quality', 'data lineage', 'data stewardship',
317
- 'data governance', 'data integration', 'reference data'
318
- ],
319
- 'business-intelligence': [
320
- 'business intelligence', 'bi', 'analytics', 'dashboard',
321
- 'tableau', 'power bi', 'looker', 'reporting', 'kpi'
322
- ],
323
- 'business-analytics': [
324
- 'analytics', 'analysis', 'insights', 'metrics', 'data-driven',
325
- 'business analytics', 'predictive', 'forecasting'
326
- ],
327
- 'customer-data-platform': [
328
- 'cdp', 'customer data', 'customer platform', 'crm',
329
- 'customer experience', 'personalization', 'segmentation'
330
- ],
331
- 'data-centers': [
332
- 'data center', 'data centre', 'datacenter', 'server', 'infrastructure',
333
- 'colocation', 'edge computing', 'hyperscale'
334
- ],
335
- 'cloud-computing': [
336
- 'cloud', 'aws', 'azure', 'google cloud', 'gcp', 'salesforce',
337
- 'alibaba cloud', 'tencent cloud', 'huawei cloud', 'cloudflare',
338
- 'saas', 'paas', 'iaas', 'serverless', 'kubernetes'
339
- ],
340
- # ── Cloud sub-categories (each maps to a specific provider) ──────────
341
- 'cloud-aws': [
342
- 'aws', 'amazon web services', 's3', 'ec2', 'lambda',
343
- 'cloudfront', 'sagemaker', 'dynamodb', 'amazon'
344
- ],
345
- 'cloud-azure': [
346
- 'azure', 'microsoft azure', 'azure devops', 'azure ml',
347
- 'azure openai', 'microsoft cloud'
348
- ],
349
- 'cloud-gcp': [
350
- 'gcp', 'google cloud', 'bigquery', 'vertex ai',
351
- 'cloud run', 'dataflow', 'google cloud platform'
352
- ],
353
- 'cloud-oracle': [
354
- 'oracle cloud', 'oci', 'oracle database', 'oracle fusion',
355
- 'oracle cloud infrastructure'
356
- ],
357
- 'cloud-ibm': [
358
- 'ibm cloud', 'ibm watson', 'red hat', 'openshift', 'ibm z'
359
- ],
360
- 'cloud-alibaba': [
361
- 'alibaba cloud', 'aliyun', 'alicloud'
362
- ],
363
- 'cloud-digitalocean': [
364
- 'digitalocean', 'droplet', 'app platform'
365
- ],
366
- 'cloud-huawei': [
367
- 'huawei cloud', 'huaweicloud'
368
- ],
369
- 'cloud-cloudflare': [
370
- 'cloudflare', 'cloudflare workers', 'cloudflare r2',
371
- 'cloudflare pages', 'zero trust'
372
- ],
373
- # ── Content / publishing categories ───────────────────────────────────
374
- 'medium-article': [
375
- 'medium', 'article', 'blog', 'writing', 'publishing',
376
- 'content', 'story', 'author', 'blogging'
377
- ],
378
- 'magazines': [
379
- 'technology', 'tech', 'innovation', 'digital', 'startup',
380
- 'software', 'hardware', 'gadget'
381
- ]
382
- }
383
-
384
- # Get keywords for this category
385
- keywords = CATEGORY_KEYWORDS.get(category, [])
386
-
387
- if not keywords:
388
- # Unknown category - allow (don't reject)
389
  return True
390
-
391
- # Build the text we will search for keywords.
392
- # We use title + description as the primary source.
393
- # We also append the article's URL path because RSS feeds (especially Google News)
394
- # often return empty descriptions. The URL itself usually tells you what the
395
- # article is about β€” e.g. "/aws-launches-new-s3-feature" clearly contains 'aws' and 's3'.
396
- # Hyphens and slashes are replaced with spaces so words can be matched individually.
397
- title = (article_dict.get('title') or '').lower()
 
398
  description = (article_dict.get('description') or '').lower()
399
 
400
- # Extract the URL path safely.
401
  raw_url = article_dict.get('url') or ''
402
  url_str = str(raw_url).lower()
403
  try:
404
  parsed_url = urlparse(url_str)
405
- # Replace hyphens and slashes with spaces so
406
- # "/aws-new-s3-launch" becomes "aws new s3 launch".
407
  url_words = parsed_url.path.replace('-', ' ').replace('/', ' ')
408
  except Exception:
409
  url_words = ''
410
 
411
- text = f"{title} {description} {url_words}"
412
-
413
- # Count keyword matches
414
- matches = sum(1 for keyword in keywords if keyword.lower() in text)
415
-
416
- # Require at least 1 keyword match (lenient for now)
417
- # Can increase to 2+ for stricter filtering
418
- if matches >= 1:
419
  return True
420
-
421
- # Log rejection for monitoring
422
- print(f"🚫 Rejected '{article_dict.get('title', 'Unknown')[:50]}' from {category} (0 keyword matches)")
 
 
 
423
  return False
424
 
425
 
 
268
  return min(max(score, 0), 100)
269
 
270
 
271
+ # ==============================================================================
272
+ # MASTER CATEGORY TAXONOMY (Phase 19 β€” Expanded Entity-Based Keywords)
273
+ # ==============================================================================
274
+ #
275
+ # This dictionary is the SINGLE SOURCE OF TRUTH for category routing.
276
+ # Every category has a rich list of keywords covering:
277
+ # β€’ The topic itself (e.g., "machine learning")
278
+ # β€’ Major companies (e.g., "openai", "anthropic")
279
+ # β€’ Flagship products (e.g., "chatgpt", "sagemaker")
280
+ # β€’ Industry acronyms (e.g., "llm", "etl", "gcp")
281
+ #
282
+ # ⚠️ IMPORTANT β€” word-boundary safety:
283
+ # Short acronyms like "ai", "bi", "aws" MUST live here β€” we protect them
284
+ # with \b regex word boundaries in COMPILED_CATEGORY_REGEX below.
285
+ # Do NOT add single-letter keywords; they can never be safe.
286
+ #
287
+ # NOTE: 'cloud-computing' is kept here because it is an active category in
288
+ # config.py, news_aggregator.py, and several providers. Removing it would
289
+ # break article routing for all generic cloud news. β€” Phase 19
290
+ # ==============================================================================
291
+ CATEGORY_KEYWORDS = {
292
+
293
+ # ── Artificial Intelligence ────────────────────────────────────────────────
294
+ 'ai': [
295
+ 'artificial intelligence', 'machine learning', 'deep learning',
296
+ 'neural network', 'gpt', 'llm', 'chatgpt', 'generative ai',
297
+ 'computer vision', 'nlp', 'natural language processing', 'transformer',
298
+ 'openai', 'anthropic', 'sam altman', 'claude', 'gemini', 'mistral',
299
+ 'llama', 'copilot', 'midjourney', 'stable diffusion', 'hugging face',
300
+ 'rag', 'vector database', 'prompt engineering', 'agi', 'agentic ai',
301
+ ],
302
+
303
+ # ── Cloud β€” generic umbrella category (must stay: used in config.py) ──────
304
+ 'cloud-computing': [
305
+ 'cloud computing', 'cloud services', 'aws', 'azure', 'google cloud',
306
+ 'gcp', 'salesforce', 'alibaba cloud', 'tencent cloud', 'huawei cloud',
307
+ 'cloudflare', 'saas', 'paas', 'iaas', 'serverless', 'kubernetes',
308
+ 'multi-cloud', 'hybrid cloud',
309
+ ],
310
+
311
+ # ── Cloud sub-categories (provider-specific) ───────────────────────────────
312
+ 'cloud-aws': [
313
+ 'aws', 'amazon web services', 's3', 'ec2', 'lambda', 'cloudfront',
314
+ 'sagemaker', 'dynamodb', 'amazon bedrock', 'aws reinvent',
315
+ 'fargate', 'aws graviton', 'elastic beanstalk',
316
+ ],
317
+ 'cloud-azure': [
318
+ 'azure', 'microsoft azure', 'azure devops', 'azure ml',
319
+ 'azure openai', 'microsoft cloud', 'azure synapse', 'cosmos db',
320
+ 'azure arc', 'microsoft entra',
321
+ ],
322
+ 'cloud-gcp': [
323
+ 'gcp', 'google cloud', 'bigquery', 'vertex ai', 'cloud run',
324
+ 'dataflow', 'google kubernetes engine', 'gke', 'google spanner',
325
+ 'anthos', 'cloud sql', 'gemini for google cloud',
326
+ ],
327
+ 'cloud-alibaba': [
328
+ 'alibaba cloud', 'aliyun', 'alicloud', 'polar db', 'maxcompute',
329
+ 'elastic compute service', 'tongyi qianwen', 'qwen',
330
+ ],
331
+ 'cloud-huawei': [
332
+ 'huawei cloud', 'huaweicloud', 'pangu model',
333
+ 'harmonyos', 'kunpeng', 'ascend ai',
334
+ ],
335
+ 'cloud-digitalocean': [
336
+ 'digitalocean', 'digital ocean', 'do droplet', 'digitalocean spaces',
337
+ 'digitalocean app platform', 'managed kubernetes', 'cloudways',
338
+ ],
339
+ 'cloud-oracle': [
340
+ 'oracle cloud', 'oci', 'oracle database', 'oracle fusion',
341
+ 'oracle cloud infrastructure', 'mysql heatwave', 'oracle apex',
342
+ ],
343
+ 'cloud-ibm': [
344
+ 'ibm cloud', 'ibm watson', 'red hat', 'openshift',
345
+ 'ibm z', 'watsonx', 'ibm mainframe',
346
+ ],
347
+ 'cloud-cloudflare': [
348
+ 'cloudflare', 'cloudflare workers', 'cloudflare r2',
349
+ 'cloudflare pages', 'zero trust',
350
+ ],
351
+
352
+ # ── Data Engineering ───────────────────────────────────────────────────────
353
+ 'data-engineering': [
354
+ 'data engineering', 'data pipeline', 'etl', 'elt', 'big data',
355
+ 'apache spark', 'hadoop', 'kafka', 'airflow', 'data warehouse',
356
+ 'snowflake', 'databricks', 'dbt', 'fivetran', 'apache iceberg',
357
+ 'delta lake', 'data lakehouse',
358
+ ],
359
+
360
+ # ── Data Security ─────────────────────────────────────────────────────────
361
+ 'data-security': [
362
+ 'security', 'cybersecurity', 'data breach', 'hacking', 'vulnerability',
363
+ 'encryption', 'malware', 'ransomware', 'firewall', 'zero trust',
364
+ 'phishing', 'soc2', 'infosec', 'penetration testing',
365
+ ],
366
+
367
+ # ── Data Governance ───────────────────────────────────────────────────────
368
+ 'data-governance': [
369
+ 'data governance', 'compliance', 'regulation', 'audit', 'data policy',
370
+ 'metadata management', 'data lineage', 'data stewardship',
371
+ 'regulatory compliance',
372
+ ],
373
+
374
+ # ── Data Privacy ──────────────────────────────────────────────────────────
375
+ 'data-privacy': [
376
+ 'data privacy', 'gdpr', 'ccpa', 'user consent', 'personal data',
377
+ 'pii', 'anonymization', 'data protection', 'privacy law',
378
+ 'hipaa', 'cookie tracking',
379
+ ],
380
+
381
+ # ── Data Management ───────────────────────────────────────────────────────
382
+ 'data-management': [
383
+ 'data management', 'master data', 'mdm', 'data catalog',
384
+ 'data quality', 'reference data', 'data lifecycle', 'data architecture',
385
+ ],
386
+
387
+ # ── Business Intelligence ─────────────────────────────────────────────────
388
+ 'business-intelligence': [
389
+ 'business intelligence', 'bi', 'analytics dashboard', 'tableau',
390
+ 'power bi', 'looker', 'data reporting', 'kpi', 'quicksight', 'qlik',
391
+ ],
392
+
393
+ # ── Business Analytics ────────────────────────────────────────────────────
394
+ 'business-analytics': [
395
+ 'data analytics', 'data analysis', 'business insights', 'business metrics',
396
+ 'data-driven', 'business analytics', 'predictive analytics', 'forecasting',
397
+ ],
398
+
399
+ # ── Customer Data Platform ────────────────────────────────────────────────
400
+ 'customer-data-platform': [
401
+ 'cdp', 'customer data platform', 'crm', 'customer experience',
402
+ 'personalization engine', 'audience segmentation',
403
+ 'segment.com', 'salesforce data cloud',
404
+ ],
405
+
406
+ # ── Data Centers ──────────────────────────────────────────────────────────
407
+ 'data-centers': [
408
+ 'data center', 'data centre', 'datacenter', 'server rack', 'colocation',
409
+ 'edge computing', 'hyperscale', 'hpc', 'liquid cooling',
410
+ 'data center cooling',
411
+ ],
412
+
413
+ # ── Publishing categories ─────────────────────────────────────────────────
414
+ 'medium-article': [
415
+ 'medium', 'article', 'blog', 'writing', 'publishing',
416
+ 'content', 'story', 'author', 'blogging',
417
+ ],
418
+ 'magazines': [
419
+ 'technology', 'tech', 'innovation', 'digital', 'startup',
420
+ 'software', 'hardware', 'gadget',
421
+ ],
422
+ }
423
+
424
+
425
+ # ==============================================================================
426
+ # PRE-COMPILED REGEX ENGINE (Phase 19 β€” Word-Boundary Patterns)
427
+ # ==============================================================================
428
+ #
429
+ # Problem this solves:
430
+ # Old code: "ai" in text β†’ matches "tr[ai]n", "ava[i]lable" β€” garbage hits.
431
+ # New code: \bai\b in text β†’ only "AI" as a standalone word β€” clean hits.
432
+ #
433
+ # Why pre-compile?
434
+ # Building a regex from scratch takes CPU time. If we do it inside the
435
+ # validation function, it runs once per article Γ— 22 categories = thousands of
436
+ # compilations per scheduler cycle. By compiling ONCE at import time and
437
+ # storing the result, all subsequent lookups are instant memory reads.
438
+ #
439
+ # How each pattern is built:
440
+ # For every keyword in a category we do:
441
+ # re.escape(keyword) β†’ safely escapes dots, plus signs, brackets etc.
442
+ # \b ... \b β†’ word boundaries so "aws" won't match "kawasaki"
443
+ # All keywords in one category are joined with | (OR), so a single
444
+ # re.search() call checks every keyword at once β€” maximum speed.
445
+ #
446
+ # Example β€” 'ai' category compiles to:
447
+ # \bartificial intelligence\b|\bmachine learning\b|\bgpt\b|\bllm\b|...
448
+ # ==============================================================================
449
+ def _build_category_regex(keywords: list) -> 're.Pattern':
450
+ """
451
+ Turn a list of keywords into one pre-compiled word-boundary OR pattern.
452
+
453
+ Example:
454
+ ['gpt', 'llm', 'openai']
455
+ β†’ re.compile(r'\\bgpt\\b|\\bllm\\b|\\bopenai\\b', re.IGNORECASE)
456
+ """
457
+ parts = [r'\b' + re.escape(kw) + r'\b' for kw in keywords]
458
+ return re.compile('|'.join(parts), re.IGNORECASE)
459
+
460
+
461
# Built exactly once, at import time.
# Maps each category slug (e.g. 'ai', 'cloud-aws') to its compiled
# word-boundary pattern, so the per-article relevance check is just a
# dictionary read plus a single re.search() — no regex compilation ever
# happens on the hot path.
COMPILED_CATEGORY_REGEX: dict = {
    slug: _build_category_regex(words)
    for slug, words in CATEGORY_KEYWORDS.items()
}
468
+
469
+
470
def is_relevant_to_category(article: Union[Dict, 'Article'], category: str) -> bool:
    """
    Decide whether an article belongs to the given category.

    Matching uses the pre-compiled word-boundary patterns in
    COMPILED_CATEGORY_REGEX (built once at server start), which means:
      • Short acronyms like "bi" or "aws" only hit as standalone words —
        "trail" no longer matches an 'ai'-style substring and "kawasaki"
        cannot match 'aws'.
      • Multi-word phrases ("amazon web services") are matched exactly.
      • Categories missing from the taxonomy pass automatically (True)
        rather than silently dropping articles routed to them.

    Scanned text: article title + description + URL path. The URL path is
    included because some feeds (e.g. Google RSS) ship empty descriptions,
    while the path itself ("/aws-launches-sagemaker-feature") still names
    the topic; hyphens and slashes become spaces so path words are
    individually matchable.

    Returns:
        True  — at least one category keyword matched.
        False — nothing matched; the article is rejected for this category.
    """
    # Normalize the input: Pydantic v2 model, Pydantic v1 model, or plain dict.
    if hasattr(article, 'model_dump'):
        article_dict = article.model_dump()
    elif hasattr(article, 'dict'):
        article_dict = article.dict()
    else:
        article_dict = article

    # Look up the pattern compiled for this category at import time.
    pattern = COMPILED_CATEGORY_REGEX.get(category)
    if pattern is None:
        # Unmapped category — allow rather than silently reject.
        return True

    # Assemble the searchable text: headline, summary, then URL path tokens.
    pieces = [
        (article_dict.get('title') or '').lower(),
        (article_dict.get('description') or '').lower(),
    ]

    url_str = str(article_dict.get('url') or '').lower()
    try:
        parsed_url = urlparse(url_str)
        # Hyphens/slashes -> spaces so the word-boundary regex can see
        # each path segment word on its own.
        pieces.append(parsed_url.path.replace('-', ' ').replace('/', ' '))
    except Exception:
        pieces.append('')

    search_text = ' '.join(pieces)

    # One regex pass checks every keyword at once; IGNORECASE is already
    # compiled into the pattern, so no further lowercasing is required.
    if pattern.search(search_text):
        return True

    # Nothing matched — log the rejection so category pollution stays visible.
    print(
        f"🚫 Rejected '{article_dict.get('title', 'Unknown')[:50]}' "
        f"from {category} (0 keyword matches)"
    )
    return False
538
 
539