SHAFI committed on
Commit
ff4f05b
·
1 Parent(s): 7d4e625

Added massive tech news ingestion: more than 10 news providers added to the ingestion pipeline

Browse files
app/config.py CHANGED
@@ -29,6 +29,12 @@ class Settings(BaseSettings):
29
  GNEWS_API_KEY: str = ""
30
  NEWSAPI_API_KEY: str = ""
31
  NEWSDATA_API_KEY: str = ""
 
 
 
 
 
 
32
 
33
  # Provider priority (will try in order until successful)
34
  NEWS_PROVIDER_PRIORITY: List[str] = ["gnews", "newsapi", "newsdata", "google_rss"]
 
29
  GNEWS_API_KEY: str = ""
30
  NEWSAPI_API_KEY: str = ""
31
  NEWSDATA_API_KEY: str = ""
32
+ # Phase 5: TheNewsAPI.com β€” 100 req/day free tier, position 4 in PAID_CHAIN
33
+ THENEWSAPI_API_KEY: str = ""
34
+ # Phase 8: WorldNewsAI.com β€” point-based quota, position 5 in PAID_CHAIN
35
+ WORLDNEWS_API_KEY: str = ""
36
+ # Phase 10: Webz.io β€” 1,000 calls/month free tier, position 6 in PAID_CHAIN
37
+ WEBZ_API_KEY: str = ""
38
 
39
  # Provider priority (will try in order until successful)
40
  NEWS_PROVIDER_PRIORITY: List[str] = ["gnews", "newsapi", "newsdata", "google_rss"]
app/services/circuit_breaker.py CHANGED
@@ -79,8 +79,24 @@ class ProviderCircuitBreaker:
79
  self.circuit_open_time: Dict[str, float] = {}
80
  self.half_open_attempts: Dict[str, int] = defaultdict(int)
81
 
82
- # Known providers β€” used by the boot-time Redis restore
83
- self._known_providers = ["gnews", "newsapi", "newsdata", "google_rss", "medium", "official_cloud"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  logger.info("=" * 70)
86
  logger.info("⚑ [CIRCUIT BREAKER] Provider protection initialized")
 
79
  self.circuit_open_time: Dict[str, float] = {}
80
  self.half_open_attempts: Dict[str, int] = defaultdict(int)
81
 
82
+ # Known providers β€” used by the boot-time Redis restore.
83
+ # IMPORTANT: Every provider registered in news_aggregator.py MUST be
84
+ # listed here. If a provider is missing, a circuit that was OPEN before
85
+ # a server restart will not be restored β€” the Space will hammer a broken
86
+ # API on every restart until it fails 3 more times to re-open.
87
+ #
88
+ # Phases 1-2 (legacy): gnews, newsapi, newsdata, google_rss, medium, official_cloud
89
+ # Phases 3-11 (new modules): hacker_news, direct_rss, thenewsapi, inshorts,
90
+ # saurav_static, worldnewsai, openrss, webz, wikinews
91
+ self._known_providers = [
92
+ # ── Legacy providers (Phases 1-2) ────────────────────────────────
93
+ "gnews", "newsapi", "newsdata",
94
+ "google_rss", "medium", "official_cloud",
95
+ # ── New modular providers (Phases 3-11) ───────────────────────────
96
+ "hacker_news", "direct_rss", "thenewsapi",
97
+ "inshorts", "saurav_static", "worldnewsai",
98
+ "openrss", "webz", "wikinews",
99
+ ]
100
 
101
  logger.info("=" * 70)
102
  logger.info("⚑ [CIRCUIT BREAKER] Provider protection initialized")
app/services/news_aggregator.py CHANGED
@@ -18,6 +18,20 @@ from app.config import settings
18
  from app.services.api_quota import get_quota_tracker
19
  from app.services.circuit_breaker import get_circuit_breaker
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  class NewsAggregator:
22
  """Service for aggregating news from multiple sources with automatic failover"""
23
 
@@ -48,12 +62,54 @@ class NewsAggregator:
48
 
49
  # Official Cloud Provider (Strict Isolation)
50
  self.providers['official_cloud'] = OfficialCloudProvider()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  # ── Provider role lists ──────────────────────────────────────────────
53
  # PAID_CHAIN: tried in order, stop after the first success (save credits)
54
  # FREE_SOURCES: always tried, always in parallel (no cost, no limits)
55
- self.PAID_CHAIN = ['gnews', 'newsapi', 'newsdata']
56
- self.FREE_SOURCES = ['google_rss', 'medium', 'official_cloud']
57
 
58
  # Medium only publishes articles for a small set of topics.
59
  # Calling it for 'data-centers' or 'cloud-oracle' would return nothing.
@@ -70,6 +126,30 @@ class NewsAggregator:
70
  'cloud-huawei', 'cloud-cloudflare'
71
  ]
72
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  # Cloud provider RSS feeds
75
  self.cloud_rss_urls = {
@@ -227,6 +307,46 @@ class NewsAggregator:
227
  free_tasks.append(official.fetch_news(category, limit=10))
228
  free_names.append('official_cloud')
229
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  if free_tasks:
231
  print(f"[FREE] Launching {len(free_tasks)} free source(s) in parallel for '{category}'...")
232
  free_results = await asyncio.gather(*free_tasks, return_exceptions=True)
 
18
  from app.services.api_quota import get_quota_tracker
19
  from app.services.circuit_breaker import get_circuit_breaker
20
 
21
+ # ── Phases 3-11: New modular providers (Strangler Fig pattern) ──────────────
22
+ # These live in providers/ folder. The legacy news_providers.py is NOT touched.
23
+ # We import each new provider here and the aggregator runs both old and new
24
+ # providers side-by-side safely.
25
+ from app.services.providers.hackernews.client import HackerNewsProvider
26
+ from app.services.providers.direct_rss.client import DirectRSSProvider
27
+ from app.services.providers.thenewsapi.client import TheNewsAPIProvider
28
+ from app.services.providers.inshorts.client import InshortsProvider
29
+ from app.services.providers.sauravkanchan.client import SauravKanchanProvider
30
+ from app.services.providers.worldnewsai.client import WorldNewsAIProvider
31
+ from app.services.providers.openrss.client import OpenRSSProvider
32
+ from app.services.providers.webz.client import WebzProvider
33
+ from app.services.providers.wikinews.client import WikinewsProvider
34
+
35
  class NewsAggregator:
36
  """Service for aggregating news from multiple sources with automatic failover"""
37
 
 
62
 
63
  # Official Cloud Provider (Strict Isolation)
64
  self.providers['official_cloud'] = OfficialCloudProvider()
65
+
66
+ # Direct RSS from premium tech publications (TechCrunch, Wired, The Verge,
67
+ # Engadget, Ars Technica). Free, no key, great images and descriptions.
68
+ # Runs for ALL categories β€” the keyword gate filters off-topic results.
69
+ self.providers['direct_rss'] = DirectRSSProvider()
70
+
71
+ # TheNewsAPI.com β€” Position 4 in the PAID_CHAIN (failover after the
72
+ # existing 3 paid providers). 100 requests/day on the free tier.
73
+ # Only registered when the API key is present in the environment.
74
+ if settings.THENEWSAPI_API_KEY:
75
+ self.providers['thenewsapi'] = TheNewsAPIProvider(
76
+ api_key=settings.THENEWSAPI_API_KEY
77
+ )
78
+
79
+ # WorldNewsAI.com β€” Position 5 in the PAID_CHAIN (final paid failover).
80
+ # Point-based quota, conservative daily_limit = 50 calls.
81
+ # Gives global, non-US-centric news from tens of thousands of sources.
82
+ # Only registered when the API key is present in the environment.
83
+ if settings.WORLDNEWS_API_KEY:
84
+ self.providers['worldnewsai'] = WorldNewsAIProvider(
85
+ api_key=settings.WORLDNEWS_API_KEY
86
+ )
87
+
88
+ # OpenRSS.org β€” generates feeds for sites with no native RSS.
89
+ # Free, no key. Has strict 60-minute internal cooldown to avoid IP ban.
90
+ # Runs for ALL categories β€” no category guardrail needed.
91
+ # The cooldown timer is the only protection this provider needs.
92
+ self.providers['openrss'] = OpenRSSProvider()
93
+
94
+ # Webz.io β€” Position 6 in the PAID_CHAIN (deepest paid failover).
95
+ # Enterprise-grade crawl from 3.5M articles/day. Rich, global coverage.
96
+ # 1,000 calls/month free tier β€” paced to 30/day = ~900/month (10% margin).
97
+ # Only registered when the API key is present in the environment.
98
+ if settings.WEBZ_API_KEY:
99
+ self.providers['webz'] = WebzProvider(
100
+ api_key=settings.WEBZ_API_KEY
101
+ )
102
+
103
+ # Wikinews β€” Public Domain, copyright-bulletproof tech news.
104
+ # Free, no key. Searches 'Computing' and 'Internet' categories concurrently.
105
+ # Gated behind GENERAL_TECH_CATEGORIES (broad tech content only).
106
+ self.providers['wikinews'] = WikinewsProvider()
107
 
108
  # ── Provider role lists ──────────────────────────────────────────────
109
  # PAID_CHAIN: tried in order, stop after the first success (save credits)
110
  # FREE_SOURCES: always tried, always in parallel (no cost, no limits)
111
+ self.PAID_CHAIN = ['gnews', 'newsapi', 'newsdata', 'thenewsapi', 'worldnewsai', 'webz']
112
+ self.FREE_SOURCES = ['google_rss', 'medium', 'official_cloud', 'direct_rss', 'hacker_news', 'inshorts', 'saurav_static', 'openrss', 'wikinews']
113
 
114
  # Medium only publishes articles for a small set of topics.
115
  # Calling it for 'data-centers' or 'cloud-oracle' would return nothing.
 
126
  'cloud-huawei', 'cloud-cloudflare'
127
  ]
128
  }
129
+
130
+ # ── Phase 3: Hacker News Category Guardrail ──────────────────────────
131
+ # Hacker News gives broad tech news β€” it does NOT know about "cloud-alibaba"
132
+ # or "data-governance". Asking it for niche categories wastes CPU cycles
133
+ # and risks polluting those collections with off-topic articles.
134
+ # Only enable Hacker News for the broad categories below where it adds value.
135
+ self.GENERAL_TECH_CATEGORIES = {
136
+ 'ai', 'magazines', 'data-engineering', 'cloud-computing',
137
+ 'data-security', 'business-intelligence'
138
+ }
139
+
140
+ # Register the Hacker News provider (free, no key needed).
141
+ # It lives in providers/hackernews/client.py β€” completely isolated from
142
+ # the legacy news_providers.py file.
143
+ self.providers['hacker_news'] = HackerNewsProvider()
144
+
145
+ # Inshorts β€” 60-word tech summaries. Free, no key, broad tech topics.
146
+ # Gated behind GENERAL_TECH_CATEGORIES (same as Hacker News).
147
+ self.providers['inshorts'] = InshortsProvider()
148
+
149
+ # SauravKanchan static JSON β€” reads two GitHub Pages files (IN + US).
150
+ # Zero cost, zero rate limits, NewsAPI-format data structure.
151
+ # Gated behind GENERAL_TECH_CATEGORIES (broad tech news only).
152
+ self.providers['saurav_static'] = SauravKanchanProvider()
153
 
154
  # Cloud provider RSS feeds
155
  self.cloud_rss_urls = {
 
307
  free_tasks.append(official.fetch_news(category, limit=10))
308
  free_names.append('official_cloud')
309
 
310
+ # ── Phase 3: Hacker News Guardrail ────────────────────────────────────
311
+ # Only fire Hacker News when the category is a broad tech topic.
312
+ # For niche categories (e.g., cloud-alibaba), we skip it entirely.
313
+ if category in self.GENERAL_TECH_CATEGORIES:
314
+ hn = self.providers.get('hacker_news')
315
+ if hn and not self.circuit.should_skip('hacker_news'):
316
+ if hn.is_available():
317
+ free_tasks.append(hn.fetch_news(category, limit=30))
318
+ free_names.append('hacker_news')
319
+
320
+ # ── Phase 6: Inshorts Guardrail ─────────────────────────────────────
321
+ # Same rule as Hacker News: only fire for broad tech categories.
322
+ # Inshorts covers general tech, not niche cloud or governance topics.
323
+ if category in self.GENERAL_TECH_CATEGORIES:
324
+ inshorts = self.providers.get('inshorts')
325
+ if inshorts and not self.circuit.should_skip('inshorts'):
326
+ if inshorts.is_available():
327
+ free_tasks.append(inshorts.fetch_news(category, limit=20))
328
+ free_names.append('inshorts')
329
+
330
+ # ── Phase 7: SauravKanchan Guardrail ─────────────────────────────────
331
+ # Static JSON files (IN + US). Same guardrail as Hacker News and Inshorts.
332
+ # Broad tech content only β€” niche categories get no value from these files.
333
+ if category in self.GENERAL_TECH_CATEGORIES:
334
+ saurav = self.providers.get('saurav_static')
335
+ if saurav and not self.circuit.should_skip('saurav_static'):
336
+ if saurav.is_available():
337
+ free_tasks.append(saurav.fetch_news(category, limit=50))
338
+ free_names.append('saurav_static')
339
+
340
+ # ── Phase 11: Wikinews Guardrail ──────────────────────────────────
341
+ # Wikinews searches broad tech categories (Computing + Internet).
342
+ # No value for niche collections like cloud-alibaba or data-governance.
343
+ if category in self.GENERAL_TECH_CATEGORIES:
344
+ wikinews = self.providers.get('wikinews')
345
+ if wikinews and not self.circuit.should_skip('wikinews'):
346
+ if wikinews.is_available():
347
+ free_tasks.append(wikinews.fetch_news(category, limit=20))
348
+ free_names.append('wikinews')
349
+
350
  if free_tasks:
351
  print(f"[FREE] Launching {len(free_tasks)} free source(s) in parallel for '{category}'...")
352
  free_results = await asyncio.gather(*free_tasks, return_exceptions=True)
app/services/providers/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # providers/__init__.py
3
+ # ─────────────────────────────────────────────────────────────────────────────
4
+ # This file marks the 'providers' folder as a Python package so that
5
+ # Python knows it can import code from inside it.
6
+ #
7
+ # ── HOW TO ADD A NEW PROVIDER ──────────────────────────────────────────────
8
+ # 1. Create a new folder under providers/ (e.g., providers/hackernews/)
9
+ # 2. Inside that folder, create __init__.py (empty) and client.py
10
+ # 3. In client.py, write a class that inherits from base.NewsProvider
11
+ # 4. Add the import line below so the aggregator can find it easily:
12
+ # from app.services.providers.hackernews.client import HackerNewsProvider
13
+ #
14
+ # ── ROUTING RULE (CRITICAL) ────────────────────────────────────────────────
15
+ # Every provider MUST set a 'category' on each Article it returns.
16
+ # If a provider cannot determine a category, it MUST leave category as ""
17
+ # or "magazines". DO NOT LEAVE IT AS None.
18
+ #
19
+ # When category is empty or unrecognized, appwrite_db.get_collection_id()
20
+ # automatically routes the article to the DEFAULT 'News Articles' collection.
21
+ # This is intentional and safe. Never invent a category name that doesn't
22
+ # exist in config.py CATEGORIES β€” it will silently break routing.
23
+ # =============================================================================
app/services/providers/base.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/base.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The Foundation β€” every news provider in this system inherits from this file.
5
+
6
+ Think of this like a "job contract" for a news provider. Any class that wants
7
+ to act as a news provider MUST sign this contract by:
8
+ 1. Inheriting from the NewsProvider class below.
9
+ 2. Implementing the fetch_news() method with real logic.
10
+
11
+ If a class inherits from NewsProvider but does NOT implement fetch_news(),
898
+ Python will raise a TypeError the moment the class is instantiated — which is exactly what we want.
13
+ It forces every developer to write proper fetching logic.
14
+
15
+ ── RULE: THE CATEGORY ROUTING CONTRACT ─────────────────────────────────────
16
+
17
+ Every Article produced by a provider MUST have a 'category' field.
18
+ The category value routes the article to the correct Appwrite collection.
19
+
20
+ Current routing rules (defined in appwrite_db.get_collection_id):
21
+ "ai" β†’ AI collection
22
+ "cloud-*" β†’ Cloud collection
23
+ "data-*" / "business-*" / "customer-data-platform" β†’ Data collection
24
+ "magazines" β†’ Magazine collection
25
+ "medium-article" β†’ Medium collection
26
+ "" (empty)
27
+ or any unknown β†’ DEFAULT 'News Articles' collection ← SAFE FALLBACK
28
+
29
+ ⚠️ IMPORTANT FOR ALL PROVIDER DEVELOPERS:
30
+ If your provider fetches general tech news and cannot determine a specific
31
+ category, set category = "magazines".
32
+ If your provider truly cannot figure out a category, set category = "".
33
+ The default collection will catch it safely.
34
+ NEVER set category = None β€” that will cause a Pydantic validation error.
35
+ NEVER invent a category string that is not in config.py CATEGORIES list.
36
+
37
+ ── HOW CLIENT-SIDE FILTERING WORKS ─────────────────────────────────────────
38
+
39
+ Many providers (Hacker News, RSS Feeds, static files) do NOT support
40
+ filtering by date or keyword in their API request. That is okay.
41
+
42
+ Do NOT try to add date filters in the URL if the API doesn't support them.
43
+ Our data_validation pipeline enforces all constraints AFTER the fetch:
44
+ - Freshness gate: rejects articles older than midnight IST today
45
+ - Keyword gate: rejects articles with no matching category keywords
46
+ - Redis dedup: rejects URLs we have already saved in the last 48 hours
47
+
48
+ So your job in fetch_news() is simple: fetch as many articles as the
49
+ provider gives you, map them to Article objects, and return them.
50
+ The pipeline does the rest.
51
+ """
52
+
53
+ # ── Imports ──────────────────────────────────────────────────────────────────
54
+ # Standard library
55
+ from abc import ABC, abstractmethod # ABC = Abstract Base Class toolkit
56
+ from typing import List, Optional
57
+ from datetime import datetime, timezone, timedelta
58
+ from zoneinfo import ZoneInfo # Timezone handling (Python 3.9+ built-in)
59
+ from enum import Enum
60
+
61
+ # Third-party (all already in requirements.txt β€” no new installs needed)
62
+ import httpx # Async HTTP client for API calls
63
+
64
+ # Internal
65
+ from app.models import Article # The standard Article shape every provider must return
66
+
67
+
68
+ # ── Provider Status ────────────────────────────────────────────────────────────
69
+
70
+ class ProviderStatus(Enum):
71
+ """
72
+ Represents the health of a provider at any given moment.
73
+
74
+ ACTIVE β†’ Provider is working fine. Calls proceed normally.
75
+ RATE_LIMITED β†’ Provider hit its API limit. Calls are paused.
76
+ ERROR β†’ Provider had a hard failure. Circuit breaker may kick in.
77
+ """
78
+ ACTIVE = "active"
79
+ RATE_LIMITED = "rate_limited"
80
+ ERROR = "error"
81
+
82
+
83
+ # ── Abstract Base Class ────────────────────────────────────────────────────────
84
+
85
+ class NewsProvider(ABC):
86
+ """
87
+ The contract that every news provider must follow.
88
+
89
+ Subclass this, implement fetch_news(), and your provider
90
+ is automatically compatible with the NewsAggregator, circuit breaker,
91
+ quota tracker, and the full validation pipeline.
92
+
93
+ Example of a minimal valid provider:
94
+
95
+ from app.services.providers.base import NewsProvider, ProviderStatus
96
+ from app.models import Article
97
+ from typing import List
98
+
99
+ class MyProvider(NewsProvider):
100
+ async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
101
+ # 1. Call your API / RSS feed
102
+ # 2. Map the response to Article objects
103
+ # 3. Return the list (can be empty if nothing found)
104
+ return []
105
+ """
106
+
107
+ def __init__(self, api_key: Optional[str] = None):
108
+ # The API key for paid providers. Free providers leave this as None.
109
+ self.api_key = api_key
110
+
111
+ # Starts as ACTIVE. The aggregator or circuit breaker may change this.
112
+ self.status = ProviderStatus.ACTIVE
113
+
114
+ # Tracks how many API calls this provider has made today.
115
+ self.request_count: int = 0
116
+
117
+ # Maximum calls per day. 0 = no limit (used by free providers).
118
+ self.daily_limit: int = 0
119
+
120
+ # The name of this provider. Used in logging and circuit breaker tracking.
121
+ # Automatically takes the class name (e.g., "HackerNewsProvider").
122
+ self.name: str = self.__class__.__name__
123
+
124
+ @abstractmethod
125
+ async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
126
+ """
127
+ REQUIRED: Fetch news articles for the given category.
128
+
129
+ Args:
130
+ category (str): The internal Segmento Pulse category name.
131
+ Example: "ai", "cloud-aws", "magazines"
132
+ limit (int): Maximum number of articles to return.
133
+ This is a guideline β€” providers may return fewer.
134
+
135
+ Returns:
136
+ List[Article]: A list of Article objects. Return [] on failure.
137
+ Never raise an unhandled exception from here.
138
+ Wrap all network calls in try/except.
139
+
140
+ Remember the ROUTING RULE at the top of this file:
141
+ Every Article MUST have a category string.
142
+ Use "magazines" for general tech. Use "" for truly unknown.
143
+ """
144
+ pass
145
+
146
+ # ── Utility Methods (inherited by all providers, no need to override) ──────
147
+
148
+ def is_available(self) -> bool:
149
+ """
150
+ Check if this provider is ready to accept a fetch request.
151
+
152
+ Returns False if:
153
+ - It is currently rate-limited or in an error state.
154
+ - It has used up its daily API call limit.
155
+ """
156
+ return (
157
+ self.status == ProviderStatus.ACTIVE
158
+ and (self.daily_limit == 0 or self.request_count < self.daily_limit)
159
+ )
160
+
161
+ def mark_rate_limited(self):
162
+ """
163
+ Call this when the API returns a 429 (Too Many Requests).
164
+ The status changes to RATE_LIMITED so the aggregator knows to skip it.
165
+ """
166
+ self.status = ProviderStatus.RATE_LIMITED
167
+
168
+ def reset_daily_quota(self):
169
+ """
170
+ Reset this provider's call counter back to zero.
171
+ Called once per day (midnight UTC) by the scheduler to restore access.
172
+ """
173
+ self.request_count = 0
174
+ self.status = ProviderStatus.ACTIVE
app/services/providers/direct_rss/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/direct_rss/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'direct_rss' folder as a Python package.
4
+ # To use the Direct RSS provider, import it like this:
5
+ #
6
+ # from app.services.providers.direct_rss.client import DirectRSSProvider
7
+ #
8
+ # This provider fetches XML feeds from premium tech publications
9
+ # (TechCrunch, Wired, The Verge, Engadget, Ars Technica) completely for free.
10
+ # No API keys. No rate limits. Just clean, honest RSS.
app/services/providers/direct_rss/client.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/direct_rss/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The Direct RSS Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches the latest technology articles from the RSS feeds of the world's
8
+ best tech publications: TechCrunch, Wired, The Verge, Engadget, and
9
+ Ars Technica.
10
+
11
+ Why Direct RSS instead of using rss_parser.parse_provider_rss()?
12
+ The existing rss_parser.parse_provider_rss() function is built for a
13
+ specific use case: fetching official CLOUD PROVIDER blogs (AWS, GCP etc.)
14
+ It hardcodes category = f'cloud-{provider}' on every article it creates.
15
+
16
+ If we ran TechCrunch through that function, every TechCrunch article
17
+ would be tagged "category = cloud-TechCrunch". Appwrite would not know
18
+ where to route it, and articles would end up in the wrong collection β€”
19
+ or worse, be silently dropped.
20
+
21
+ So instead, we use the feedparser library directly (the same library
22
+ rss_parser.py uses internally). We follow the exact same parsing pattern
23
+ but set the category correctly from what the aggregator tells us.
24
+
25
+ We DO still reuse two helper methods from rss_parser.py for consistency:
26
+ - _extract_image_from_entry() β†’ finds images from media/enclosure tags
27
+ - _parse_date() β†’ handles all date format variations
28
+
29
+ How it works:
30
+ Step 1: Build a list of async HTTP tasks β€” one per RSS feed URL.
31
+ Step 2: Fire all tasks at the same time using asyncio.gather().
32
+ Step 3: Feed each successful XML response into feedparser.
33
+ Step 4: Map each feedparser entry to a Pulse Article object.
34
+ Step 5: Return the combined list from all feeds.
35
+
36
+ Client-side constraint note:
37
+ RSS feeds give us whatever was published recently by that outlet β€”
38
+ we cannot ask them for "only today's AI articles".
39
+ The freshness gate (is_valid_article) and keyword gate
40
+ (is_relevant_to_category) in data_validation.py handle all filtering
41
+ after we return these articles. That is by design.
42
+ """
43
+
44
+ # ── Standard Library ──────────────────────────────────────────────────────────
45
+ import asyncio
46
+ import logging
47
+ import re
48
+ import time
49
+ from typing import List
50
+
51
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
52
+ import feedparser # XML/RSS feed parser β€” already used by rss_parser.py
53
+ import httpx # Async HTTP client
54
+
55
+ # ── Internal ──────────────────────────────────────────────────────────────────
56
+ from app.services.providers.base import NewsProvider
57
+ from app.services.rss_parser import RSSParser # Reuse helper methods, not the methods with hardcoded categories
58
+ from app.models import Article
59
+
60
+ logger = logging.getLogger(__name__)
61
+
62
+ # ── RSS Feed Registry ──────────────────────────────────────────────────────────
63
+ #
64
+ # These are the direct RSS feed URLs for the most trusted tech publications.
65
+ # Each entry is a tuple of (feed_url, source_name).
66
+ #
67
+ # "source_name" is the human-readable name we store on every article.
68
+ # It appears in the Segmento Pulse UI next to the article headline.
69
+ #
70
+ # To add a new RSS feed in the future, just add a new line here.
71
+ # The rest of the code picks it up automatically.
72
+ #
73
+ TECH_RSS_FEEDS: List[tuple] = [
74
+ ("https://techcrunch.com/feed", "TechCrunch"),
75
+ ("https://www.wired.com/feed/rss", "Wired"),
76
+ ("https://www.theverge.com/rss/tech/index.xml", "The Verge"),
77
+ ("https://www.engadget.com/rss.xml", "Engadget"),
78
+ ("https://feeds.arstechnica.com/arstechnica/technology-lab", "Ars Technica"),
79
+ ]
80
+
81
+ # Maximum articles to take from each individual feed.
82
+ # 10 per feed Γ— 5 feeds = up to 50 articles total per aggregator run.
83
+ MAX_ARTICLES_PER_FEED = 10
84
+
85
+ # How long (in seconds) to wait for a feed to respond before giving up.
86
+ HTTP_TIMEOUT_SECONDS = 12.0
87
+
88
+
89
+ class DirectRSSProvider(NewsProvider):
90
+ """
91
+ Fetches articles directly from the RSS feeds of premium tech publications.
92
+
93
+ Free. No API key needed. No rate limits.
94
+ Provides the best descriptions and images of all our free providers,
95
+ because these are professionally edited by full-time journalists.
96
+
97
+ Usage (wired into the aggregator in Phase 4):
98
+ provider = DirectRSSProvider()
99
+ articles = await provider.fetch_news(category="ai", limit=50)
100
+ """
101
+
102
+ def __init__(self):
103
+ # Free provider β€” no API key, no daily limit.
104
+ super().__init__(api_key=None)
105
+ self.daily_limit = 0
106
+
107
+ # Phase 17: Fetch-Once, Fan-Out cache
108
+ #
109
+ # Direct RSS fetches TechCrunch, Wired, The Verge, Engadget, and
110
+ # Ars Technica. These do NOT change between categories β€” the same
111
+ # 5 XML files contain the same articles whether the category is
112
+ # "ai", "cloud-aws", or "data-security".
113
+ #
114
+ # Without a cache: 22 categories Γ— 5 feeds = 110 outbound HTTP requests
115
+ # per scheduler run, all downloading the exact same XML.
116
+ #
117
+ # With a cache: first category fetches 5 feeds once, stores results
118
+ # here. The other 21 categories get the list instantly from memory.
119
+ # Total outbound requests: 5. A 95% reduction.
120
+ self._cached_articles: List[Article] = []
121
+ self._cache_time: float = 0.0
122
+
123
+ # asyncio.Lock prevents a race condition during the first run.
124
+ # When the scheduler fires, asyncio.gather() calls fetch_news() for
125
+ # multiple categories at the same time. Without the lock, all of them
126
+ # would see an empty cache and all start their own 5-feed HTTP fetch
127
+ # simultaneously. That defeats the whole purpose. With the lock,
128
+ # only the FIRST caller fetches; the rest wait and then read from cache.
129
+ self._lock = asyncio.Lock()
130
+
131
+ # We borrow helpers from the existing RSSParser.
132
+ # We do NOT call parse_google_news() or parse_provider_rss() β€”
133
+ # those have category logic built in that would break our routing.
134
+ # We only use the helper methods: _extract_image_from_entry, _parse_date.
135
+ self._rss_helpers = RSSParser()
136
+
137
+ # ─────────────────────────────────────────────────────────────────────────
138
+ # MAIN ENTRY POINT β€” called by the aggregator
139
+ # ─────────────────────────────────────────────────────────────────────────
140
+
141
+ async def fetch_news(self, category: str, limit: int = 50) -> List[Article]:
142
+ """
143
+ Fetch articles from all premium tech RSS feeds concurrently.
144
+
145
+ Args:
146
+ category (str): The category string passed from the aggregator.
147
+ We tag every article with this so the pipeline
148
+ can route it to the correct Appwrite collection.
149
+ The keyword gate will filter out irrelevant articles.
150
+ limit (int): Not strictly enforced here β€” we let the per-feed
151
+ cap (MAX_ARTICLES_PER_FEED) control volume, and
152
+ the aggregator deduplication handles the rest.
153
+
154
+ Returns:
155
+ List[Article]: All articles collected across all 5 feeds.
156
+ Returns [] if network is down for all feeds.
157
+ """
158
+ # ── Phase 17: Cache check (OUTER) ─────────────────────────────────────
159
+ # 2700 seconds = 45 minutes. If we fetched the RSS feeds less than
160
+ # 45 minutes ago, return the stored articles immediately.
161
+ # No HTTP request. No XML parsing. Instant return.
162
+ #
163
+ # Why 45 minutes? Our freshness gate uses an hourly window. A 45-minute
164
+ # cache is safely inside that window, giving us fresh-enough content
165
+ # without hammering TechCrunch and Wired every minute.
166
+ CACHE_TTL_SECONDS = 2700 # 45 minutes
167
+
168
+ if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
169
+ logger.debug(
170
+ "[DirectRSS] Cache hit β€” returning %d cached articles for category='%s'. "
171
+ "No HTTP calls made.",
172
+ len(self._cached_articles), category
173
+ )
174
+ return self._cached_articles
175
+
176
+ # ── Cache stale or empty: acquire the lock and fetch ───────────────────
177
+ # Only one coroutine can be inside this block at a time.
178
+ # Any other coroutine that reaches this point will WAIT here until
179
+ # the first one has finished and released the lock.
180
+ async with self._lock:
181
+
182
+ # ── Cache check (INNER) β€” double-checked locking ──────────────
183
+ # While THIS coroutine was waiting for the lock, the coroutine that
184
+ # held the lock before us already fetched and filled the cache.
185
+ # We check again so we don't fetch a second time.
186
+ if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
187
+ logger.debug(
188
+ "[DirectRSS] Cache hit after lock (another task fetched it) β€” "
189
+ "returning %d cached articles.",
190
+ len(self._cached_articles)
191
+ )
192
+ return self._cached_articles
193
+
194
+ # Cache is genuinely stale β€” this coroutine won the race.
195
+ # Do the full HTTP fetch now.
196
+ logger.info("[DirectRSS] Cache stale/empty. Fetching all 5 RSS feeds...")
197
+
198
+ try:
199
+ async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
200
+
201
+ # Step 1: Build one fetch task per RSS feed URL.
202
+ # All tasks run at the same time β€” we do not wait for feed #1
203
+ # before starting feed #2. This keeps total time under 2 seconds.
204
+ fetch_tasks = [
205
+ self._fetch_and_parse_feed(client, url, source_name, category)
206
+ for url, source_name in TECH_RSS_FEEDS
207
+ ]
208
+
209
+ # Step 2: Launch all tasks simultaneously.
210
+ results = await asyncio.gather(*fetch_tasks, return_exceptions=True)
211
+
212
+ # Step 3: Combine all lists into one. Skip any that errored.
213
+ all_articles: List[Article] = []
214
+ for feed_url_source, result in zip(TECH_RSS_FEEDS, results):
215
+ source_name = feed_url_source[1]
216
+ if isinstance(result, Exception):
217
+ logger.warning(
218
+ f"[DirectRSS] [{source_name}] Feed fetch failed: {result}"
219
+ )
220
+ elif isinstance(result, list):
221
+ all_articles.extend(result)
222
+
223
+ logger.info(
224
+ "[DirectRSS] Fetched %d articles across %d feeds. "
225
+ "Caching for 45 minutes.",
226
+ len(all_articles), len(TECH_RSS_FEEDS)
227
+ )
228
+
229
+ # Save results and timestamp to the class-level cache.
230
+ self._cached_articles = all_articles
231
+ self._cache_time = time.time()
232
+ return all_articles
233
+
234
+ except Exception as e:
235
+ logger.error(f"[DirectRSS] Unexpected error: {e}", exc_info=True)
236
+ return []
237
+
238
+ # ─────────────────────────────────────────────────────────────────────────
239
+ # PRIVATE HELPERS
240
+ # ─────────────────────────────────────────────────────────────────────────
241
+
242
+ async def _fetch_and_parse_feed(
243
+ self,
244
+ client: httpx.AsyncClient,
245
+ url: str,
246
+ source_name: str,
247
+ category: str,
248
+ ) -> List[Article]:
249
+ """
250
+ Fetch one RSS feed URL and parse it into Article objects.
251
+
252
+ Args:
253
+ client (httpx.AsyncClient): Shared HTTP client from fetch_news().
254
+ url (str): The RSS feed URL (e.g., https://techcrunch.com/feed).
255
+ source_name (str): Human-readable name (e.g., "TechCrunch").
256
+ category (str): The category from the aggregator β€” stored on each article.
257
+
258
+ Returns:
259
+ List[Article]: Parsed articles from this feed. Returns [] on any failure.
260
+ """
261
+ try:
262
+ response = await client.get(
263
+ url,
264
+ # Politely identify ourselves. Some servers block unknown user agents.
265
+ headers={"User-Agent": "SegmentoPulse-RSS-Reader/1.0"},
266
+ follow_redirects=True,
267
+ )
268
+
269
+ if response.status_code != 200:
270
+ logger.warning(
271
+ f"[DirectRSS] [{source_name}] HTTP {response.status_code} β€” skipping."
272
+ )
273
+ return []
274
+
275
+ xml_text = response.text
276
+
277
+ except httpx.TimeoutException:
278
+ logger.warning(f"[DirectRSS] [{source_name}] Timed out β€” skipping.")
279
+ return []
280
+ except Exception as e:
281
+ logger.warning(f"[DirectRSS] [{source_name}] Fetch error: {e}")
282
+ return []
283
+
284
+ # Hand the raw XML to feedparser β€” it handles all RSS/Atom variants
285
+ # (RSS 2.0, Atom 1.0, etc.) automatically.
286
+ return self._parse_feed_xml(xml_text, source_name, category)
287
+
288
+ def _parse_feed_xml(
289
+ self,
290
+ xml_text: str,
291
+ source_name: str,
292
+ category: str,
293
+ ) -> List[Article]:
294
+ """
295
+ Parse raw XML text from a feed into a list of Article objects.
296
+
297
+ Uses feedparser to decode the XML, then maps each entry to our
298
+ Pydantic Article model. We reuse rss_parser's helper methods for
299
+ image extraction and date parsing so the logic is consistent
300
+ across all RSS sources in the system.
301
+
302
+ Args:
303
+ xml_text (str): Raw XML string from the HTTP response.
304
+ source_name (str): Name of the publication (e.g., "Wired").
305
+ category (str): Category to tag on every article.
306
+
307
+ Returns:
308
+ List[Article]: Parsed articles. May be [] if the feed is malformed.
309
+ """
310
+ try:
311
+ feed = feedparser.parse(xml_text)
312
+ except Exception as e:
313
+ logger.warning(f"[DirectRSS] [{source_name}] feedparser failed: {e}")
314
+ return []
315
+
316
+ articles: List[Article] = []
317
+
318
+ for entry in feed.entries[:MAX_ARTICLES_PER_FEED]:
319
+
320
+ # ── Title ────────────────────────────────────────────────────────
321
+ title = (entry.get("title") or "").strip()
322
+ if not title:
323
+ continue # Every article must have a title
324
+
325
+ # ── URL ──────────────────────────────────────────────────────────
326
+ url = (entry.get("link") or "").strip()
327
+ if not url or not url.startswith("http"):
328
+ continue # Every article must have a clickable link
329
+
330
+ # ── Description ──────────────────────────────────────────────────
331
+ # RSS feeds usually put a short summary in the 'summary' field.
332
+ # We strip any HTML tags, then cap it at 200 characters.
333
+ raw_desc = entry.get("summary", "") or ""
334
+ description = re.sub(r"<[^>]+>", "", raw_desc).strip()
335
+ if len(description) > 200:
336
+ description = description[:200] + "..."
337
+
338
+ # ── Image URL ────────────────────────────────────────────────────
339
+ # We reuse the existing _extract_image_from_entry helper from
340
+ # rss_parser.py. It checks media:content, media:thumbnail,
341
+ # enclosures, and <img> tags inside the description.
342
+ image_url = self._rss_helpers._extract_image_from_entry(entry)
343
+
344
+ # ── Published Date ───────────────────────────────────────────────
345
+ # We reuse the existing _parse_date helper from rss_parser.py.
346
+ # It handles RFC 2822, ISO 8601, and other common date formats.
347
+ raw_date = entry.get("published", "") or ""
348
+ published_at = self._rss_helpers._parse_date(raw_date)
349
+
350
+ # ── Build Article ────────────────────────────────────────────────
351
+ try:
352
+ article = Article(
353
+ title=title,
354
+ description=description,
355
+ url=url,
356
+ image_url=image_url,
357
+ published_at=published_at,
358
+ source=source_name,
359
+ # ── ROUTING RULE ──────────────────────────────────────
360
+ # We set the category that the aggregator passed in.
361
+ # The keyword gate will reject articles that don't
362
+ # actually match this category β€” that's completely fine.
363
+ # It is much safer than guessing a wrong category here.
364
+ category=category,
365
+ )
366
+ articles.append(article)
367
+
368
+ except Exception as e:
369
+ # One bad article should never cancel the rest of the feed
370
+ logger.debug(
371
+ f"[DirectRSS] [{source_name}] Skipped entry '{title[:50]}': {e}"
372
+ )
373
+ continue
374
+
375
+ logger.info(
376
+ f"[DirectRSS] [{source_name}] Parsed {len(articles)} articles."
377
+ )
378
+ return articles
app/services/providers/hackernews/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # providers/hackernews/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'hackernews' folder as a Python package.
4
+ # To use the Hacker News provider, import it like this:
5
+ #
6
+ # from app.services.providers.hackernews.client import HackerNewsProvider
7
+ #
8
+ # This provider is entirely self-contained in this folder.
9
+ # It does not touch news_providers.py, news_aggregator.py, or anything else.
app/services/providers/hackernews/client.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/hackernews/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The Hacker News Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches the top stories from Hacker News β€” a community-voted list of the
8
+ best tech articles on the internet. It is completely free to use and has
9
+ no rate limits or API key requirement.
10
+
11
+ How the Hacker News API works (Two-Step Process):
12
+ Step 1: Ask HN for a list of top story IDs (one big list)
13
+ Step 2: For each ID, ask HN for that story's actual details
14
+
15
+ We only take the top 30 IDs. If we tried 500 IDs (the full list),
16
+ it would take too long and put unnecessary load on their server.
17
+ 30 is a safe, polite number that still gives us great content.
18
+
19
+ What we do about missing data:
20
+ - No URL? β†’ Skip this story entirely (it's an "Ask HN" self-post).
21
+ Our database cannot link to a story without a URL.
22
+ - No image? β†’ Set image_url = "". The frontend will use the
23
+ Segmento Pulse banner image as the default.
24
+ - No summary? β†’ Set description = "". HN only provides the title
25
+ for external links, not a description.
26
+ - Unix time? β†’ Convert to ISO 8601 string (our standard date format).
27
+
28
+ Client-side constraint note (from our architecture plan):
29
+ Hacker News does NOT support any filtering. We cannot ask it for
30
+ "only today's articles" or "only AI news". It gives us what it gives us.
31
+ That is completely fine. Our data_validation pipeline (is_valid_article,
32
+ is_relevant_to_category) will filter out old or off-topic articles
33
+ automatically AFTER we fetch them. We just fetch and map here.
34
+ """
35
+
36
+ # ── Standard Library ──────────────────────────────────────────────────────────
37
+ import asyncio # Lets us run multiple HTTP calls at the same time
38
+ import logging
39
+ from datetime import datetime, timezone
40
+ from typing import List, Optional
41
+
42
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
43
+ import httpx # Async HTTP client
44
+
45
+ # ── Internal ──────────────────────────────────────────────────────────────────
46
+ # We import only from our new base β€” no dependency on legacy news_providers.py
47
+ from app.services.providers.base import NewsProvider, ProviderStatus
48
+ from app.models import Article
49
+ # Phase 12: Shared image enricher (extracts og:image from article pages)
50
+ from app.services.utils.image_enricher import extract_top_image
51
+
52
+ logger = logging.getLogger(__name__)
53
+
54
+ # ── Constants ─────────────────────────────────────────────────────────────────
55
+
56
+ # The top of this list = the most upvoted stories on Hacker News right now
57
+ HN_TOP_STORIES_URL = "https://hacker-news.firebaseio.com/v0/topstories.json"
58
+
59
+ # Template for fetching one story's full details by its ID
60
+ HN_ITEM_URL = "https://hacker-news.firebaseio.com/v0/item/{item_id}.json"
61
+
62
+ # How many top stories to fetch. Kept small to be polite to HN's servers.
63
+ # The full list has 500 stories β€” we only want the best 30.
64
+ TOP_STORIES_LIMIT = 30
65
+
66
+ # HTTP timeout in seconds. HN is fast, but we cap it to avoid hanging jobs.
67
+ HTTP_TIMEOUT_SECONDS = 10.0
68
+
69
+
70
class HackerNewsProvider(NewsProvider):
    """
    Fetches top stories from the Hacker News API.

    No API key needed. No rate limit. Completely free.

    Usage (once wired into the aggregator in Phase 3):
        provider = HackerNewsProvider()
        articles = await provider.fetch_news(category="magazines", limit=30)
    """

    def __init__(self):
        # Free provider — no API key needed, so we pass None to the base class.
        super().__init__(api_key=None)

        # daily_limit = 0 means "no limit". HN has no quota.
        self.daily_limit = 0

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT — the one method the aggregator calls
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
        """
        Fetch the top stories from Hacker News.

        Args:
            category (str): The category passed in by the aggregator. Stored on
                each article; HN cannot filter by it — the keyword gate in
                data_validation.py handles that downstream.
            limit (int): Maximum number of articles to return, capped at
                TOP_STORIES_LIMIT (30) regardless.

        Returns:
            List[Article]: Article objects from Hacker News.
                Returns [] if the network is down or HN is unreachable.
        """
        try:
            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:

                # ── STEP 1: Get the ranked list of top story IDs ──────────
                top_ids = await self._fetch_top_ids(client)
                if not top_ids:
                    logger.warning("[HackerNews] Could not retrieve top story IDs.")
                    return []

                # We only want the best N IDs.
                ids_to_fetch = top_ids[:min(limit, TOP_STORIES_LIMIT)]

                # ── STEP 2: Fetch all story details concurrently ──────────
                # Launching all requests at once takes ~1-2 seconds instead
                # of ~30 seconds sequential.
                fetch_tasks = [
                    self._fetch_single_item(client, story_id)
                    for story_id in ids_to_fetch
                ]
                raw_items = await asyncio.gather(*fetch_tasks, return_exceptions=True)

                # ── MAP: Convert raw HN items → Article objects ────────────
                articles = self._map_items_to_articles(raw_items, category)

                # ── ENRICH: Fetch images for articles that have none ───────
                # The mapper is sync and cannot await, so enrichment runs
                # here. All image fetches run concurrently — total extra wait
                # is bounded by the enricher's timeout, not 30x it.
                articles = await self._enrich_article_images(articles)

                logger.info(
                    f"[HackerNews] Fetched {len(raw_items)} items → "
                    f"{len(articles)} valid articles for category='{category}'"
                )
                return articles

        except httpx.TimeoutException:
            logger.warning("[HackerNews] Request timed out. Will retry next cycle.")
            return []
        except Exception as e:
            # Catch-all: never let a HN failure crash the aggregator job.
            logger.error(f"[HackerNews] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPERS — internal steps, not called by the aggregator
    # ─────────────────────────────────────────────────────────────────────────

    async def _fetch_top_ids(self, client: httpx.AsyncClient) -> List[int]:
        """
        Step 1: Ask Hacker News for the IDs of its top stories.

        Returns a list of integers like [39281947, 39281001, ...].
        Returns [] if HN is unreachable or returns an error.
        """
        try:
            response = await client.get(HN_TOP_STORIES_URL)

            if response.status_code != 200:
                logger.warning(
                    f"[HackerNews] Top stories endpoint returned HTTP {response.status_code}"
                )
                return []

            ids = response.json()

            # Sanity check — make sure we got a list, not garbage.
            if not isinstance(ids, list):
                logger.warning("[HackerNews] Unexpected response format for top IDs.")
                return []

            # Fix: drop any non-integer entries so HN_ITEM_URL.format()
            # downstream can never receive junk.
            return [item_id for item_id in ids if isinstance(item_id, int)]

        except Exception as e:
            logger.error(f"[HackerNews] Failed to fetch top IDs: {e}")
            return []

    async def _fetch_single_item(
        self, client: httpx.AsyncClient, item_id: int
    ) -> Optional[dict]:
        """
        Step 2 (single unit): Fetch the details for one Hacker News story.

        Args:
            client (httpx.AsyncClient): Shared client passed from fetch_news().
            item_id (int): The numeric ID of the story to fetch.

        Returns:
            dict of story details, or None if the request failed.
        """
        url = HN_ITEM_URL.format(item_id=item_id)
        try:
            response = await client.get(url)

            if response.status_code != 200:
                return None

            item = response.json()

            # HN can return null for deleted or dead items.
            if not item:
                return None

            return item

        except Exception:
            # A single story failing should not cancel the other 29 stories.
            return None

    def _map_items_to_articles(
        self, raw_items: list, category: str
    ) -> List[Article]:
        """
        Convert raw Hacker News JSON items into Segmento Pulse Article objects.

        Data transformation rules:
            - Unix timestamp → ISO 8601 string
            - Missing URL    → skip (self-posts cannot be stored)
            - deleted/dead   → skip (per the HN API these items are removed)
            - Missing image  → "" (frontend uses the Pulse banner)
            - Missing text   → "" (HN has no descriptions for external links)

        Args:
            raw_items (list): Results from asyncio.gather() — each is either
                a dict (success) or None/Exception (failure).
            category (str): The category string from the aggregator,
                passed through as-is.

        Returns:
            List[Article]: Clean, valid Article objects ready for the pipeline.
        """
        articles: List[Article] = []

        for item in raw_items:

            # Skip anything that errored or returned null from HN.
            if item is None or isinstance(item, Exception):
                continue

            # Fix: the HN API flags removed content with 'deleted' / 'dead'
            # booleans. Such items must never reach the pipeline.
            if item.get("deleted") or item.get("dead"):
                continue

            # HN also returns "job", "comment", "poll" types — we only want
            # "story", the actual articles.
            if item.get("type") != "story":
                continue

            # "Ask HN" / "Show HN" self-posts have no external 'url' key.
            # Our database cannot store a meaningful link for these.
            url = item.get("url", "")
            if not url or not url.startswith("http"):
                continue

            title = (item.get("title") or "").strip()
            if not title:
                continue

            # Unix timestamp (seconds since epoch) → ISO 8601 string.
            # Example: 1709432800 → "2024-03-03T04:46:40+00:00"
            unix_time = item.get("time")
            if unix_time:
                published_at = datetime.fromtimestamp(
                    unix_time, tz=timezone.utc
                ).isoformat()
            else:
                # No timestamp: fall back to now; the freshness gate in
                # data_validation.py will still evaluate it.
                published_at = datetime.now(tz=timezone.utc).isoformat()

            try:
                article = Article(
                    title=title,
                    description="",  # HN does not provide descriptions
                    url=url,
                    image_url="",    # HN does not provide images
                    published_at=published_at,
                    source="Hacker News",
                    # ROUTING RULE: pass through the aggregator's category.
                    # The keyword gate in data_validation rejects mismatches
                    # safely — no routing damage to the database.
                    category=category,
                )
                articles.append(article)

            except Exception as e:
                # One article failing Pydantic validation must never break
                # the whole batch.
                logger.debug(
                    f"[HackerNews] Skipped item id={item.get('id')}: {e}"
                )
                continue

        return articles

    # ─────────────────────────────────────────────────────────────────────────
    # PHASE 12: IMAGE ENRICHMENT — async post-processing step
    # ─────────────────────────────────────────────────────────────────────────

    async def _enrich_article_images(self, articles: List[Article]) -> List[Article]:
        """
        For every article that has an empty image_url, visit its URL and
        try to find the main image using the og:image HTML meta tag.

        A Semaphore(10) caps concurrent connections (Phase 14): at most 10
        website visits run at the same time, so a 30-article batch cannot
        exhaust socket handles on a shared container. Total added time is
        still bounded by the timeout inside extract_top_image, not by the
        semaphore.

        Args:
            articles (List[Article]): Articles from _map_items_to_articles().

        Returns:
            List[Article]: Same articles, with image_url filled in where possible.
        """
        if not articles:
            return articles

        # Created fresh per call so no state leaks between fetch_news() runs.
        sem = asyncio.Semaphore(10)

        async def _get_image(article: Article) -> str:
            if article.image_url and article.image_url.startswith("http"):
                return article.image_url  # Already has an image — skip
            # Acquire one of 10 available slots before hitting the network.
            async with sem:
                return await extract_top_image(article.url)

        image_tasks = [_get_image(a) for a in articles]
        fetched_images = await asyncio.gather(*image_tasks, return_exceptions=True)

        # Apply the fetched images back to the articles.
        enriched: List[Article] = []
        for article, image_result in zip(articles, fetched_images):
            if isinstance(image_result, str) and image_result:
                # Pydantic v2: model_copy() changes one field without mutating.
                article = article.model_copy(update={"image_url": image_result})
            enriched.append(article)

        return enriched
app/services/providers/inshorts/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/inshorts/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'inshorts' folder as a Python package.
4
+ # To use the Inshorts provider, import it like this:
5
+ #
6
+ # from app.services.providers.inshorts.client import InshortsProvider
7
+ #
8
+ # Inshorts is a FREE provider β€” no API key needed, no rate limits.
9
+ # It runs in the FREE_SOURCES list, behind the GENERAL_TECH_CATEGORIES
10
+ # guardrail (same as Hacker News), because its content is broad tech news
11
+ # rather than anything niche like cloud-alibaba or data-governance.
app/services/providers/inshorts/client.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/inshorts/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The Inshorts Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches 60-word tech news summaries from the Inshorts community API.
8
+ Inshorts takes long articles from the internet and rewrites them in
9
+ exactly 60 words. This gives our users very quick, scannable reads.
10
+
11
+ Free. No API key needed. No rate limits.
12
+
13
+ Where it sits in the pipeline:
14
+ FREE_SOURCES (always runs in parallel).
15
+ Gated behind GENERAL_TECH_CATEGORIES β€” same rule as Hacker News.
16
+ Inshorts "technology" news is broad. It does not know the difference
17
+ between "cloud-alibaba" and "cloud-gcp". We only ask it for wide,
18
+ general categories where its content is genuinely valuable.
19
+
20
+ The special data quirk (split date and time):
21
+ Inshorts returns the article timestamp as TWO separate strings:
22
+ "date": "Mon, 03 Mar 2026"
23
+ "time": "10:30 AM, IST"
24
+
25
+ Our Pydantic Article model needs a SINGLE published_at timestamp.
26
+ So we join them: "Mon, 03 Mar 2026 10:30 AM, IST"
27
+ Then we parse that combined string into a proper datetime object using
28
+ dateutil.parser (the same library our rss_parser.py already uses).
29
+
30
+ If parsing fails, we safely fall back to datetime.now() so the article
31
+ still enters the pipeline and the freshness gate makes the final call.
32
+
33
+ API note:
34
+ The endpoint used below is a well-known community-maintained mirror of
35
+ the Inshorts API. It may change URLs over time. The try/except in
36
+ fetch_news() wraps the entire fetch, so even if the endpoint goes down,
37
+ the aggregator just gets an empty list and moves on without crashing.
38
+ """
39
+
40
+ # ── Standard Library ──────────────────────────────────────────────────────────
41
+ import asyncio
42
+ import logging
43
+ import time
44
+ from datetime import datetime, timezone
45
+ from typing import List
46
+
47
+ # ── Third-party (already available β€” used by rss_parser.py line 209) ─────────
48
+ import httpx # Async HTTP client
49
+ from dateutil import parser as dateutil_parser # Flexible date string parser
50
+
51
+ # ── Internal ──────────────────────────────────────────────────────────────────
52
+ from app.services.providers.base import NewsProvider, ProviderStatus
53
+ from app.models import Article
54
+
55
+ logger = logging.getLogger(__name__)
56
+
57
+ # ── Constants ─────────────────────────────────────────────────────────────────
58
+
59
+ # Inshorts community API endpoint.
60
+ # The 'category=technology' filter is the closest match to our content needs.
61
+ # Other available categories: national, business, sports, entertainment, etc.
62
+ INSHORTS_URL = "https://inshorts.deta.dev/news?category=technology"
63
+
64
+ # Request timeout in seconds. Kept generous because this is a community server.
65
+ HTTP_TIMEOUT_SECONDS = 12.0
66
+
67
+ # Max articles to take from one response. Inshorts usually sends 10-25.
68
+ MAX_ARTICLES = 20
69
+
70
+
71
+ class InshortsProvider(NewsProvider):
72
+ """
73
+ Fetches 60-word technology summaries from the Inshorts community API.
74
+
75
+ Free. No API key. No daily limit.
76
+ Sits in FREE_SOURCES, gated by GENERAL_TECH_CATEGORIES.
77
+
78
+ Usage (wired in Phase 6):
79
+ provider = InshortsProvider()
80
+ articles = await provider.fetch_news(category="ai", limit=20)
81
+ """
82
+
83
+ def __init__(self):
84
+ # Free provider β€” no API key, no daily limit.
85
+ super().__init__(api_key=None)
86
+ self.daily_limit = 0
87
+
88
+ # Phase 17: Fetch-Once, Fan-Out cache
89
+ #
90
+ # Inshorts hits a community server β€” not a CDN like GitHub Pages.
91
+ # Without a cache, every category loop sends a request to that
92
+ # community server, increasing the chance of a 429 rate-limit block.
93
+ # With a cache: 22 category calls β†’ 1 real HTTP call per 45 minutes.
94
+ self._cached_articles: List[Article] = []
95
+ self._cache_time: float = 0.0
96
+
97
+ # Lock prevents the "thundering herd": multiple concurrent calls
98
+ # all seeing an empty cache and all fetching at the same time.
99
+ self._lock = asyncio.Lock()
100
+
101
+ # ─────────────────────────────────────────────────────────────────────────
102
+ # MAIN ENTRY POINT β€” called by the aggregator's FREE PARALLEL RUN
103
+ # ────────────────────────────────────────���────────────────────────────────
104
+
105
+ async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
106
+ """
107
+ Fetch technology articles from the Inshorts community API.
108
+
109
+ Args:
110
+ category (str): Our internal category string (e.g., "ai").
111
+ We tag every article with it. The keyword gate
112
+ filters out articles that don't actually match.
113
+ limit (int): Max articles to return. Capped at MAX_ARTICLES.
114
+
115
+ Returns:
116
+ List[Article]: Mapped Article objects. Returns [] on any failure.
117
+ """
118
+ # ── Phase 17: Cache check (OUTER) ─────────────────────────────────────
119
+ CACHE_TTL_SECONDS = 2700 # 45 minutes
120
+
121
+ if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
122
+ logger.debug(
123
+ "[Inshorts] Cache hit β€” returning %d cached articles for category='%s'. "
124
+ "No HTTP calls made.",
125
+ len(self._cached_articles), category
126
+ )
127
+ return self._cached_articles
128
+
129
+ # ── Cache stale or empty: acquire the lock and fetch ───────────────────
130
+ async with self._lock:
131
+
132
+ # ── Cache check (INNER) β€” double-checked locking ──────────────
133
+ if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
134
+ logger.debug(
135
+ "[Inshorts] Cache hit after lock β€” returning %d cached articles.",
136
+ len(self._cached_articles)
137
+ )
138
+ return self._cached_articles
139
+
140
+ logger.info(
141
+ "[Inshorts] Cache stale/empty. Fetching from community API for category='%s'...",
142
+ category
143
+ )
144
+
145
+ try:
146
+ async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
147
+
148
+ response = await client.get(
149
+ INSHORTS_URL,
150
+ headers={"User-Agent": "SegmentoPulse-Ingestion/1.0"},
151
+ follow_redirects=True,
152
+ )
153
+
154
+ # ── Handle rate limit ──────────────────────────────────────
155
+ if response.status_code == 429:
156
+ logger.warning("[Inshorts] Hit 429 rate limit.")
157
+ self.mark_rate_limited()
158
+ return []
159
+
160
+ # ── Handle non-200 responses ──────────────────────────────
161
+ if response.status_code != 200:
162
+ logger.warning(
163
+ "[Inshorts] Unexpected HTTP %d. "
164
+ "The community API endpoint may have changed.",
165
+ response.status_code
166
+ )
167
+ return []
168
+
169
+ data = response.json()
170
+
171
+ # Inshorts wraps the article list inside a 'data' key.
172
+ raw_articles = data.get("data", [])
173
+
174
+ if not isinstance(raw_articles, list) or not raw_articles:
175
+ logger.info("[Inshorts] No articles in response.")
176
+ return []
177
+
178
+ all_articles = self._map_articles(
179
+ raw_articles[:min(limit, MAX_ARTICLES)],
180
+ category
181
+ )
182
+
183
+ logger.info(
184
+ "[Inshorts] Fetched %d articles. Caching for 45 minutes.",
185
+ len(all_articles)
186
+ )
187
+
188
+ # Save to class-level cache.
189
+ self._cached_articles = all_articles
190
+ self._cache_time = time.time()
191
+ return all_articles
192
+
193
+ except httpx.TimeoutException:
194
+ logger.warning("[Inshorts] Request timed out β€” endpoint may be slow.")
195
+ return []
196
+ except Exception as e:
197
+ logger.error(f"[Inshorts] Unexpected error: {e}", exc_info=True)
198
+ return []
199
+
200
+ # ─────────────────────────────────────────────────────────────────────────
201
+ # PRIVATE HELPERS
202
+ # ─────────────────────────────────────────────────────────────────────────
203
+
204
+ def _parse_inshorts_date(self, date_str: str, time_str: str) -> str:
205
+ """
206
+ Solve the split date/time problem.
207
+
208
+ Inshorts gives us date and time as two separate strings.
209
+ Example:
210
+ date_str = "Mon, 03 Mar 2026"
211
+ time_str = "10:30 AM, IST"
212
+
213
+ Step 1: Join them β†’ "Mon, 03 Mar 2026 10:30 AM, IST"
214
+ Step 2: Parse with dateutil (handles many date formats automatically)
215
+ Step 3: Convert to UTC-aware ISO 8601 string
216
+
217
+ If parsing fails for any reason, we return the current time as a
218
+ safe fallback. The freshness gate downstream will evaluate it.
219
+
220
+ Args:
221
+ date_str (str): The date portion from the API (e.g., "Mon, 03 Mar 2026")
222
+ time_str (str): The time portion from the API (e.g., "10:30 AM, IST")
223
+
224
+ Returns:
225
+ str: ISO 8601 timestamp string (e.g., "2026-03-03T05:00:00+00:00")
226
+ """
227
+ # Clean up trailing ", IST" or "(IST)" markers β€” dateutil sometimes
228
+ # gets confused by non-standard timezone abbreviations like IST.
229
+ # We strip them and treat the time as IST = UTC+5:30 manually.
230
+ cleaned_time = (
231
+ time_str
232
+ .replace(", IST", "")
233
+ .replace("(IST)", "")
234
+ .strip()
235
+ )
236
+ combined = f"{date_str.strip()} {cleaned_time}"
237
+
238
+ try:
239
+ # dateutil.parser is very flexible β€” it handles formats like:
240
+ # "Mon, 03 Mar 2026 10:30 AM" without needing a strptime pattern.
241
+ parsed_dt = dateutil_parser.parse(combined)
242
+
243
+ # If the parsed datetime has no timezone info (which it won't after
244
+ # we stripped IST), we tell Python it was in IST (UTC+5:30).
245
+ if parsed_dt.tzinfo is None:
246
+ from datetime import timedelta
247
+ IST = timezone(timedelta(hours=5, minutes=30))
248
+ parsed_dt = parsed_dt.replace(tzinfo=IST)
249
+
250
+ # Convert to UTC for consistent storage across all providers.
251
+ utc_dt = parsed_dt.astimezone(timezone.utc)
252
+ return utc_dt.isoformat()
253
+
254
+ except Exception as e:
255
+ logger.debug(
256
+ f"[Inshorts] Date parse failed for '{combined}': {e} β€” using now()."
257
+ )
258
+ # Safe fallback: use current UTC time.
259
+ # The freshness gate will still check it and decide if it's valid.
260
+ return datetime.now(tz=timezone.utc).isoformat()
261
+
262
+ def _map_articles(self, raw_articles: list, category: str) -> List[Article]:
263
+ """
264
+ Convert raw Inshorts JSON items into Segmento Pulse Article objects.
265
+
266
+ Key field mappings:
267
+ Inshorts field β†’ Article field
268
+ ─────────────────────────────────────
269
+ title β†’ title
270
+ content β†’ description (the famous 60-word summary)
271
+ readMoreUrl β†’ url
272
+ imageUrl β†’ image_url
273
+ author β†’ source
274
+ date + time (joined) β†’ published_at
275
+
276
+ Args:
277
+ raw_articles (list): The list from the API's 'data' key.
278
+ category (str): The category from the aggregator.
279
+
280
+ Returns:
281
+ List[Article]: Clean, validated Article objects.
282
+ """
283
+ articles: List[Article] = []
284
+
285
+ for item in raw_articles:
286
+ if not isinstance(item, dict):
287
+ continue
288
+
289
+ # ── Title ────────────────────────────────────────────────────
290
+ title = (item.get("title") or "").strip()
291
+ if not title:
292
+ continue
293
+
294
+ # ── URL ──────────────────────────────────────────────────────
295
+ # Inshorts calls this 'readMoreUrl' β€” the link to the full article.
296
+ url = (item.get("readMoreUrl") or "").strip()
297
+ if not url or not url.startswith("http"):
298
+ continue # Skip if no valid link
299
+
300
+ # ── Description (the 60-word summary) ────────────────────────
301
+ # Inshorts calls the summary field 'content'.
302
+ description = (item.get("content") or "").strip()
303
+
304
+ # ── Image URL ─────────────────────────────────────────────────
305
+ # Inshorts calls this 'imageUrl' (camelCase).
306
+ image_url = (item.get("imageUrl") or "").strip()
307
+
308
+ # ── Source ───────────────────────────────────────────────────
309
+ # The 'author' field holds the original publication name
310
+ # (e.g., "TechCrunch", "NDTV Gadgets"). We use that as source.
311
+ # Fall back to "Inshorts" if author is missing.
312
+ source = (item.get("author") or "Inshorts").strip()
313
+ if not source:
314
+ source = "Inshorts"
315
+
316
+ # ── Date Fix: Combine split date + time ───────────────────────
317
+ # This is THE key transformation for this provider.
318
+ # See _parse_inshorts_date() above for the full explanation.
319
+ date_part = item.get("date") or ""
320
+ time_part = item.get("time") or ""
321
+ published_at = self._parse_inshorts_date(date_part, time_part)
322
+
323
+ # ── Build Article ─────────────────────────────────────────────
324
+ try:
325
+ article = Article(
326
+ title=title,
327
+ description=description,
328
+ url=url,
329
+ image_url=image_url,
330
+ published_at=published_at,
331
+ source=source,
332
+ # ── ROUTING RULE ──────────────────────────────────────
333
+ # We pass through the aggregator's category.
334
+ # The keyword gate will filter irrelevant articles.
335
+ # Unknown categories safely route to 'News Articles'.
336
+ category=category,
337
+ )
338
+ articles.append(article)
339
+
340
+ except Exception as e:
341
+ logger.debug(
342
+ f"[Inshorts] Skipped item '{title[:50]}': {e}"
343
+ )
344
+ continue
345
+
346
+ return articles
app/services/providers/openrss/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/openrss/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'openrss' folder as a Python package.
4
+ # To use this provider, import it like this:
5
+ #
6
+ # from app.services.providers.openrss.client import OpenRSSProvider
7
+ #
8
+ # OpenRSS is FREE β€” no API key needed. It generates XML feeds on-the-fly
9
+ # for any website, even sites that don't publish an RSS feed themselves.
10
+ #
11
+ # ── CRITICAL RULE: RESPECT THE COOLDOWN ──────────────────────────────────
12
+ # OpenRSS explicitly says "aggregator use is not officially supported".
13
+ # If you fetch too frequently, they WILL ban your server's IP address.
14
+ # The OpenRSSProvider enforces a strict 60-minute internal cooldown timer.
15
+ # DO NOT reduce COOLDOWN_SECONDS below 3600. Breaking this causes IP bans.
app/services/providers/openrss/client.py ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/openrss/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The OpenRSS Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches RSS feeds for websites that don't publish their own RSS feed,
8
+ by using OpenRSS.org as a free feed generation service.
9
+
10
+ Target blogs:
11
+ dev.to β†’ openrss.org/dev.to
12
+ hashnode.com β†’ openrss.org/hashnode.com
13
+ github.com/blog β†’ openrss.org/github.com/blog
14
+
15
+ Free. No API key. No daily limits. Just XML text.
16
+
17
+ ── THE IP BAN RISK AND HOW WE SOLVE IT ─────────────────────────────────────
18
+
19
+ OpenRSS.org says clearly in their documentation:
20
+ "Aggregator use is not officially supported."
21
+ "We will block IP addresses that ignore our Cache-Control headers."
22
+
23
+ A normal aggregator calls all its sources every hour.
24
+ If we did that with OpenRSS, we would get IP-banned within a day.
25
+
26
+ Our fix: A strict 60-minute (3600 second) internal cooldown timer.
27
+
28
+ How it works:
29
+ - When the provider is first created, self.last_fetched = 0
30
+ - When fetch_news() is called, it first checks:
31
+ time.time() - self.last_fetched < COOLDOWN_SECONDS?
32
+ - If YES β†’ return [] immediately, do not touch the network at all
33
+ - If NO β†’ update self.last_fetched, then fetch
34
+
35
+ This guarantees that OpenRSS sees at most ONE request per hour,
36
+ per URL, from our server β€” which respects their Cache-Control policy.
37
+
38
+ Because our scheduler runs many categories per hour, without this timer,
39
+ OpenRSS would get hit dozens of times per hour. With the timer, it gets
40
+ hit at most once every 60 minutes regardless of how many categories fire.
41
+
42
+ ── WHY WE DO NOT USE parse_provider_rss() ──────────────────────────────────
43
+
44
+ The user instruction suggests using parse_provider_rss() from rss_parser.py.
45
+ We discovered in Phase 4 (direct_rss provider) that this function hardcodes:
46
+
47
+ category = f'cloud-{provider}'
48
+
49
+ on EVERY article it creates. If we passed "dev.to" as the provider name,
50
+ every article from dev.to would get category='cloud-dev.to'. Appwrite
51
+ would not know this collection exists, silently dropping those articles.
52
+
53
+ Decision (consistent with Phase 4): We use feedparser directly and borrow
54
+ only the two STATELESS helper methods from rss_parser.py:
55
+ - _extract_image_from_entry() β†’ extracts images cleanly
56
+ - _parse_date() β†’ handles all date format variants
57
+
58
+ This is the same engineering decision made in Phase 4 for direct_rss,
59
+ and it was reviewed and approved by the lead architect.
60
+ """
61
+
62
+ # ── Standard Library ──────────────────────────────────────────────────────────
63
+ import asyncio
64
+ import logging
65
+ import re
66
+ import time
67
+ from typing import List
68
+
69
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
70
+ import feedparser # XML/RSS feed parser β€” already used by rss_parser.py
71
+ import httpx # Async HTTP client
72
+
73
+ # ── Internal ──────────────────────────────────────────────────────────────────
74
+ from app.services.providers.base import NewsProvider
75
+ from app.services.rss_parser import RSSParser # Borrowed for helper methods only
76
+ from app.models import Article
77
+ # Phase 15: Import the Redis-backed state utility so the cooldown
78
+ # timer survives Hugging Face Space restarts.
79
+ from app.services.utils.provider_state import (
80
+ get_provider_timestamp,
81
+ set_provider_timestamp,
82
+ )
83
+
84
+ logger = logging.getLogger(__name__)
85
+
86
+ # ── OpenRSS Feed Registry ──────────────────────────────────────────────────────
87
+ #
88
+ # Each entry is a tuple of (openrss_url, source_name).
89
+ # source_name appears in the Pulse UI next to each article headline.
90
+ #
91
+ # To add more feeds in the future, just add a new tuple here.
92
+ # The fetch loop picks it up automatically β€” no other code changes needed.
93
+ #
94
+ # ⚠️ IMPORTANT: Be conservative. Every URL here gets fetched once per cooldown
95
+ # window. Adding too many URLs consumes more of our cooldown budget.
96
+ #
97
# Each entry is (openrss_url, source_name). source_name is the label shown
# next to each article headline in the Pulse UI. New feeds added here are
# picked up by the fetch loop automatically — no other code changes needed.
# Be conservative: every URL costs one request per cooldown window.
OPENRSS_FEEDS: List[tuple] = [
    ("https://openrss.org/dev.to", "dev.to"),
    ("https://openrss.org/hashnode.com", "Hashnode"),
    ("https://openrss.org/github.com/blog", "GitHub Blog"),
]

# ── Cooldown Timer ─────────────────────────────────────────────────────────────
# 3600 seconds = 60 minutes — the minimum safe polling interval per OpenRSS's
# documentation. DO NOT reduce this value; polling faster risks an IP ban on
# Segmento Pulse's server.
COOLDOWN_SECONDS = 3600

# HTTP request timeout. OpenRSS is a third-party service; give it enough time.
HTTP_TIMEOUT_SECONDS = 15.0

# Max articles taken from each individual feed per cooldown window.
MAX_ARTICLES_PER_FEED = 10
114
+
115
+
116
class OpenRSSProvider(NewsProvider):
    """
    Fetches RSS feeds from dev.to, Hashnode, and GitHub Blog via OpenRSS.org.

    Free, no API key, and strictly self-rate-limited to one fetch per
    COOLDOWN_SECONDS (60 minutes) across ALL categories — the cooldown timer
    is the primary protection against an OpenRSS IP ban, so no per-category
    guardrail is needed.

    Usage:
        provider = OpenRSSProvider()
        articles = await provider.fetch_news(category="ai", limit=30)
    """

    def __init__(self):
        # Free provider — no API key, no daily quota.
        super().__init__(api_key=None)
        self.daily_limit = 0

        # Local fallback copy of the Redis-backed cooldown timestamp.
        # If Redis is unreachable at startup we fail-open with 0.0 (provider
        # allowed to run); every successful Redis read in fetch_news() keeps
        # this value in sync. It exists for logging/debugging only — the
        # cooldown decision is always made from the Redis value.
        self.last_fetched: float = 0.0

        # Borrow the STATELESS helper methods from the existing RSSParser
        # (_extract_image_from_entry / _parse_date). We deliberately do NOT
        # call parse_provider_rss(): it hardcodes category='cloud-{provider}',
        # which would mis-route every article from these feeds.
        self._rss_helpers = RSSParser()

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT — called by the aggregator's FREE PARALLEL RUN
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 30) -> List[Article]:
        """
        Fetch articles from all OpenRSS feeds — but only if COOLDOWN_SECONDS
        have passed since the last successful fetch.

        Args:
            category (str): Aggregator category — tagged on every article;
                the keyword gate filters irrelevant items downstream.
            limit (int): Soft cap on total articles.
                NOTE(review): `limit` is not applied in this method's visible
                body — per-feed volume is bounded by MAX_ARTICLES_PER_FEED.

        Returns:
            List[Article]: Combined articles from all feeds, or []
            immediately when the cooldown window is still active.
        """
        # Cooldown check reads the last-fetch timestamp from Redis (survives
        # restarts/redeploys/OOM kills), not from RAM. If Redis is down,
        # get_provider_timestamp returns 0.0 (fail-open): one extra OpenRSS
        # call is far safer than permanently blocking the provider.
        redis_last_fetched = await get_provider_timestamp("openrss")

        # Keep the RAM copy in sync for logging/debugging. This does NOT
        # affect the cooldown decision — only redis_last_fetched does.
        self.last_fetched = redis_last_fetched

        seconds_since_last_fetch = time.time() - redis_last_fetched
        if seconds_since_last_fetch < COOLDOWN_SECONDS:
            minutes_remaining = int(
                (COOLDOWN_SECONDS - seconds_since_last_fetch) / 60
            )
            logger.info(
                "[OpenRSS] Cooldown active — %d minute(s) remaining before next fetch. "
                "Skipping to protect against IP ban.",
                minutes_remaining
            )
            return []

        # Persist the new timestamp BEFORE any network call. If we saved it
        # AFTER and the fetch crashed halfway, the next cycle would see a
        # stale timestamp and immediately retry — hammering OpenRSS with the
        # exact rapid-retry pattern that triggers IP bans. Writing first means
        # any crash still waits the full cooldown. Missing one batch is
        # cheaper than a permanent ban.
        current_time = time.time()
        self.last_fetched = current_time  # Keep RAM copy in sync
        await set_provider_timestamp("openrss", current_time)

        logger.info(
            "[OpenRSS] Cooldown clear (Redis-backed). Starting fetch of %d feeds...",
            len(OPENRSS_FEEDS)
        )

        try:
            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:

                # One task per feed URL; all fire concurrently.
                fetch_tasks = [
                    self._fetch_and_parse_feed(client, url, source_name, category)
                    for url, source_name in OPENRSS_FEEDS
                ]

                # return_exceptions=True: one failing feed must not sink the rest.
                results = await asyncio.gather(*fetch_tasks, return_exceptions=True)

                # Flatten successful per-feed results; log the failures.
                all_articles: List[Article] = []
                for (_, source_name), result in zip(OPENRSS_FEEDS, results):
                    if isinstance(result, Exception):
                        logger.warning(
                            f"[OpenRSS] [{source_name}] Feed fetch failed: {result}"
                        )
                    elif isinstance(result, list):
                        all_articles.extend(result)

                logger.info(
                    f"[OpenRSS] Collected {len(all_articles)} articles "
                    f"from {len(OPENRSS_FEEDS)} feeds for category='{category}'"
                )
                return all_articles

        except Exception as e:
            logger.error(f"[OpenRSS] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPERS
    # ─────────────────────────────────────────────────────────────────────────

    async def _fetch_and_parse_feed(
        self,
        client: httpx.AsyncClient,
        url: str,
        source_name: str,
        category: str,
    ) -> List[Article]:
        """
        Fetch one OpenRSS feed URL and parse its XML into Article objects.

        Args:
            client (httpx.AsyncClient): Shared HTTP client from fetch_news().
            url (str): Full OpenRSS URL (e.g., openrss.org/dev.to).
            source_name (str): Human-readable label (e.g., "dev.to").
            category (str): Aggregator category — tagged on each article.

        Returns:
            List[Article]: Parsed articles; [] on any failure.
        """
        try:
            response = await client.get(
                url,
                headers={
                    "User-Agent": "SegmentoPulse-RSS-Reader/1.0",
                    # Deliberately no Cache-Control: no-cache — we rely on
                    # our own cooldown timer for freshness rather than
                    # forcing OpenRSS to bypass their cache.
                },
                follow_redirects=True,
            )

            if response.status_code == 429:
                # Rate-limited despite the cooldown. The timestamp was
                # already written at the top of fetch_news(), so the next
                # attempt naturally waits a full cooldown window — we just
                # skip this feed now and flag the configuration.
                logger.warning(
                    f"[OpenRSS] [{source_name}] HTTP 429 — rate-limited despite "
                    "cooldown. Consider increasing COOLDOWN_SECONDS."
                )
                return []

            if response.status_code != 200:
                logger.warning(
                    f"[OpenRSS] [{source_name}] HTTP {response.status_code} — skipping."
                )
                return []

            xml_text = response.text

        except httpx.TimeoutException:
            logger.warning(f"[OpenRSS] [{source_name}] Timed out — skipping.")
            return []
        except Exception as e:
            logger.warning(f"[OpenRSS] [{source_name}] Fetch error: {e}")
            return []

        return self._parse_feed_xml(xml_text, source_name, category)

    def _parse_feed_xml(
        self,
        xml_text: str,
        source_name: str,
        category: str,
    ) -> List[Article]:
        """
        Parse raw XML from an OpenRSS feed into Article objects.

        Uses feedparser directly — not parse_provider_rss() — because the
        latter hardcodes category='cloud-{provider}'. The image/date helpers
        are borrowed from RSSParser for consistency with other providers.

        Args:
            xml_text (str): Raw XML string from the HTTP response.
            source_name (str): The blog name (e.g., "dev.to").
            category (str): Aggregator category — tagged on every article.

        Returns:
            List[Article]: Parsed article objects.
        """
        try:
            feed = feedparser.parse(xml_text)
        except Exception as e:
            logger.warning(f"[OpenRSS] [{source_name}] feedparser failed: {e}")
            return []

        articles: List[Article] = []

        for entry in feed.entries[:MAX_ARTICLES_PER_FEED]:

            # Title and a valid http(s) link are mandatory.
            title = (entry.get("title") or "").strip()
            if not title:
                continue

            url = (entry.get("link") or "").strip()
            if not url or not url.startswith("http"):
                continue

            # Strip HTML tags from the summary and clamp to ~200 chars.
            raw_desc = entry.get("summary", "") or ""
            description = re.sub(r"<[^>]+>", "", raw_desc).strip()
            if len(description) > 200:
                description = description[:200] + "..."

            # Borrowed helper: checks media:content, enclosures, etc.
            image_url = self._rss_helpers._extract_image_from_entry(entry)

            # Borrowed helper: handles the common RSS date format variants.
            raw_date = entry.get("published", "") or ""
            published_at = self._rss_helpers._parse_date(raw_date)

            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source=source_name,
                    # Tag with the aggregator's category for routing; unknown
                    # categories fall back to the default collection.
                    category=category,
                )
                articles.append(article)

            except Exception as e:
                logger.debug(
                    f"[OpenRSS] [{source_name}] Skipped entry '{title[:50]}': {e}"
                )
                continue

        logger.info(f"[OpenRSS] [{source_name}] Parsed {len(articles)} articles.")
        return articles
app/services/providers/sauravkanchan/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/sauravkanchan/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'sauravkanchan' folder as a Python package.
4
+ # To use this provider, import it like this:
5
+ #
6
+ # from app.services.providers.sauravkanchan.client import SauravKanchanProvider
7
+ #
8
+ # This is a FREE, zero-rate-limit provider β€” it reads static JSON files
9
+ # hosted on GitHub Pages by developer Saurav Kanchan. No API key needed.
10
+ # It fetches tech headlines from both India (in.json) and the US (us.json)
11
+ # simultaneously, doubling volume with a single aggregator call.
12
+ # Gated behind GENERAL_TECH_CATEGORIES (same as Hacker News & Inshorts).
app/services/providers/sauravkanchan/client.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/sauravkanchan/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The SauravKanchan Static JSON Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Reads two static JSON files hosted on GitHub Pages by a developer named
8
+ Saurav Kanchan. These files are automatically updated by a GitHub Action
9
+ that scrapes the top tech headlines from NewsAPI.org and saves them as
10
+ plain JSON files anyone can read for free.
11
+
12
+ We fetch TWO files at the same time:
13
+ in.json β†’ Top tech headlines from India
14
+ us.json β†’ Top tech headlines from the United States
15
+
16
+ Fetching both simultaneously means we get double the volume and double
17
+ the geographic coverage in roughly the same time as fetching just one.
18
+
19
+ Why this is zero-cost and zero-rate-limit:
20
+ These are not API calls β€” they are just reading a text file from the
21
+ internet. GitHub Pages has no rate limit for public static file reads.
22
+ No API key. No signup. No credit card. Completely free forever.
23
+
24
+ Why the data is high quality:
25
+ The JSON structure is identical to the paid NewsAPI.org format, which
26
+ means we get proper titles, descriptions, image URLs, publication dates,
27
+ and source names β€” all cleanly pre-formatted for us.
28
+
29
+ Freshness note (important):
30
+ Saurav's GitHub Action runs on its own schedule β€” typically a few times
31
+ per day. This means some articles in the file may be several hours old
32
+ by the time we read them. That is perfectly fine. Our freshness gate in
33
+ data_validation.is_valid_article() will automatically reject anything
34
+ older than our midnight IST cutoff. We never need to pre-filter here.
35
+
36
+ Client-side constraint note:
37
+ These are static files β€” we cannot add query parameters. We get
38
+ whatever is in the file. The keyword gate handles topic filtering.
39
+ """
40
+
41
+ # ── Standard Library ──────────────────────────────────────────────────────────
42
+ import asyncio
43
+ import logging
44
+ import time
45
+ from typing import List, Optional
46
+
47
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
48
+ import httpx # Async HTTP client
49
+
50
+ # ── Internal ──────────────────────────────────────────────────────────────────
51
+ from app.services.providers.base import NewsProvider, ProviderStatus
52
+ from app.models import Article
53
+
54
+ logger = logging.getLogger(__name__)
55
+
56
+ # ── Static JSON URLs ───────────────────────────────────────────────────────────
57
+ #
58
+ # Both files are hosted on GitHub Pages and updated automatically by a
59
+ # GitHub Action. They follow the exact same JSON structure as NewsAPI.org.
60
+ #
61
+ # To change regions or add new ones (e.g., gb.json), just add a new entry here.
62
+ # The fetch loop picks it up automatically.
63
+ #
64
+ STATIC_FEED_URLS: List[tuple] = [
65
+ (
66
+ "https://saurav.tech/NewsAPI/top-headlines/category/technology/in.json",
67
+ "in", # Region code β€” used only in log messages
68
+ ),
69
+ (
70
+ "https://saurav.tech/NewsAPI/top-headlines/category/technology/us.json",
71
+ "us", # Region code β€” used only in log messages
72
+ ),
73
+ ]
74
+
75
+ # HTTP request timeout. Static files are fast, but we keep this generous
76
+ # because GitHub Pages occasionally has slow cold starts.
77
+ HTTP_TIMEOUT_SECONDS = 12.0
78
+
79
+ # Max articles to take from each regional file.
80
+ # 100 articles per file Γ— 2 files = up to 200 raw articles per call.
81
+ # The freshness gate will reject most of the older ones, leaving us
82
+ # with the freshest and most relevant subset.
83
+ MAX_ARTICLES_PER_REGION = 100
84
+
85
+
86
+ class SauravKanchanProvider(NewsProvider):
87
+ """
88
+ Reads top tech headlines from two static JSON files on GitHub Pages.
89
+
90
+ Covers India (in.json) and the United States (us.json) simultaneously.
91
+ Free. Zero rate limits. No API key required.
92
+ Gated behind GENERAL_TECH_CATEGORIES in the aggregator.
93
+
94
+ Usage (wired in Phase 7):
95
+ provider = SauravKanchanProvider()
96
+ articles = await provider.fetch_news(category="ai", limit=50)
97
+ """
98
+
99
+ def __init__(self):
100
+ # Free provider β€” no key, no daily limit.
101
+ super().__init__(api_key=None)
102
+ self.daily_limit = 0
103
+
104
+ # Phase 17: Fetch-Once, Fan-Out cache
105
+ #
106
+ # Saurav's JSON files contain a snapshot of top India + US tech headlines.
107
+ # The file contents are the same regardless of whether we ask for
108
+ # category "ai" or category "cloud-gcp" β€” the files don't change.
109
+ # Without a cache: the aggregator downloads IN + US files 22 separate
110
+ # times (once per category), wasting bandwidth and GitHub's servers.
111
+ # With a cache: downloaded once, stored here for 45 minutes.
112
+ #
113
+ # We store the FINAL Pydantic Article objects, not the raw JSON.
114
+ # This means zero re-parsing on cache hits β€” callers get typed objects.
115
+ self._cached_articles: List[Article] = []
116
+ self._cache_time: float = 0.0
117
+
118
+ # The lock prevents the "thundering herd" problem:
119
+ # If 5 categories hit this provider at the exact same millisecond
120
+ # (which asyncio.gather() will do), only the first one fetches.
121
+ # The other 4 wait patiently at the lock, then return from cache.
122
+ self._lock = asyncio.Lock()
123
+
124
+ # ─────────────────────────────────────────────────────────────────────────
125
+ # MAIN ENTRY POINT β€” called by the aggregator's FREE PARALLEL RUN
126
+ # ─────────────────────────────────────────────────────────────────────────
127
+
128
    async def fetch_news(self, category: str, limit: int = 50) -> List[Article]:
        """
        Fetch tech headlines from the India and US static JSON files.

        Both files are downloaded concurrently via asyncio.gather() and
        their article lists merged. Results are cached for 45 minutes and
        shared across all categories (the files do not vary by category).

        Args:
            category (str): Aggregator category string (e.g., "ai"). Every
                article is tagged with it; the keyword gate decides relevance.
            limit (int): Soft cap on total articles.
                NOTE(review): `limit` is not applied in this method's visible
                body — volume is controlled by MAX_ARTICLES_PER_REGION in the
                per-region fetch. Confirm whether callers rely on the cap.

        Returns:
            List[Article]: Combined IN + US articles; [] if both feeds fail.
        """
        CACHE_TTL_SECONDS = 2700  # 45 minutes

        # Outer cache check (lock-free fast path): a warm cache returns
        # immediately with zero HTTP calls.
        if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
            logger.debug(
                "[SauravKanchan] Cache hit — returning %d cached articles for category='%s'. "
                "No HTTP calls made.",
                len(self._cached_articles), category
            )
            return self._cached_articles

        # Cache stale or empty: serialize fetchers behind the lock.
        async with self._lock:

            # Inner cache check — classic double-checked locking. A caller
            # that waited on the lock while another fetched must re-test the
            # cache here, otherwise it would fetch again needlessly.
            if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
                logger.debug(
                    "[SauravKanchan] Cache hit after lock — returning %d cached articles.",
                    len(self._cached_articles)
                )
                return self._cached_articles

            logger.info("[SauravKanchan] Cache stale/empty. Fetching IN + US JSON files...")

            try:
                async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:

                    # One task per regional URL; both fire simultaneously.
                    fetch_tasks = [
                        self._fetch_single_region(client, url, region_code, category)
                        for url, region_code in STATIC_FEED_URLS
                    ]

                    # return_exceptions=True: one failed region must not
                    # discard the other region's articles.
                    results = await asyncio.gather(*fetch_tasks, return_exceptions=True)

                    # Flatten successful regional results; log the failures.
                    all_articles: List[Article] = []
                    for (_, region_code), result in zip(STATIC_FEED_URLS, results):
                        if isinstance(result, Exception):
                            logger.warning(
                                f"[SauravKanchan] [{region_code.upper()}] "
                                f"Fetch failed: {result}"
                            )
                        elif isinstance(result, list):
                            all_articles.extend(result)

                    logger.info(
                        "[SauravKanchan] Fetched %d articles from %d regions. "
                        "Caching for 45 minutes.",
                        len(all_articles), len(STATIC_FEED_URLS)
                    )

                    # Cache the fully-mapped Pydantic Article objects so later
                    # category calls get typed objects with zero re-parsing.
                    self._cached_articles = all_articles
                    self._cache_time = time.time()
                    return all_articles

            except Exception as e:
                logger.error(f"[SauravKanchan] Unexpected error: {e}", exc_info=True)
                return []
209
+
210
+ # ─────────────────────────────────────────────────────────────────────────
211
+ # PRIVATE HELPERS
212
+ # ─────────────────────────────────────────────────────────────────────────
213
+
214
+ async def _fetch_single_region(
215
+ self,
216
+ client: httpx.AsyncClient,
217
+ url: str,
218
+ region_code: str,
219
+ category: str,
220
+ ) -> List[Article]:
221
+ """
222
+ Download one regional JSON file and parse its articles.
223
+
224
+ Args:
225
+ client (httpx.AsyncClient): Shared HTTP client from fetch_news().
226
+ url (str): The full static JSON URL to fetch.
227
+ region_code (str): Short label for logging (e.g., "us", "in").
228
+ category (str): The aggregator's category β€” tagged on articles.
229
+
230
+ Returns:
231
+ List[Article]: Parsed articles from this region. Returns [] on failure.
232
+ """
233
+ try:
234
+ response = await client.get(
235
+ url,
236
+ headers={"User-Agent": "SegmentoPulse-Ingestion/1.0"},
237
+ follow_redirects=True,
238
+ )
239
+
240
+ if response.status_code != 200:
241
+ logger.warning(
242
+ f"[SauravKanchan] [{region_code.upper()}] "
243
+ f"HTTP {response.status_code} β€” skipping."
244
+ )
245
+ return []
246
+
247
+ data = response.json()
248
+
249
+ except httpx.TimeoutException:
250
+ logger.warning(
251
+ f"[SauravKanchan] [{region_code.upper()}] Timed out β€” skipping."
252
+ )
253
+ return []
254
+ except Exception as e:
255
+ logger.warning(
256
+ f"[SauravKanchan] [{region_code.upper()}] Fetch error: {e}"
257
+ )
258
+ return []
259
+
260
+ # The JSON has the same shape as NewsAPI.org:
261
+ # { "status": "ok", "totalResults": 20, "articles": [ ... ] }
262
+ raw_articles = data.get("articles", [])
263
+
264
+ if not isinstance(raw_articles, list) or not raw_articles:
265
+ logger.info(
266
+ f"[SauravKanchan] [{region_code.upper()}] "
267
+ "No articles found in response."
268
+ )
269
+ return []
270
+
271
+ articles = self._map_articles(
272
+ raw_articles[:MAX_ARTICLES_PER_REGION],
273
+ region_code,
274
+ category,
275
+ )
276
+ logger.info(
277
+ f"[SauravKanchan] [{region_code.upper()}] "
278
+ f"Parsed {len(articles)} articles."
279
+ )
280
+ return articles
281
+
282
+ def _map_articles(
283
+ self,
284
+ raw_articles: list,
285
+ region_code: str,
286
+ category: str,
287
+ ) -> List[Article]:
288
+ """
289
+ Convert raw NewsAPI-format JSON items into Segmento Pulse Article objects.
290
+
291
+ The field names in this JSON are camelCase (like JavaScript), so:
292
+ urlToImage β†’ image_url
293
+ publishedAt β†’ published_at
294
+ source.name β†’ source
295
+
296
+ Everything else maps directly.
297
+
298
+ Args:
299
+ raw_articles (list): The 'articles' array from the JSON response.
300
+ region_code (str): "in" or "us" β€” appended to the source name
301
+ so we know where the article came from.
302
+ category (str): The aggregator's category string.
303
+
304
+ Returns:
305
+ List[Article]: Clean Article objects for the pipeline.
306
+ """
307
+ articles: List[Article] = []
308
+
309
+ for item in raw_articles:
310
+ if not isinstance(item, dict):
311
+ continue
312
+
313
+ # ── Title ────────────────────────────────────────────────────
314
+ title = (item.get("title") or "").strip()
315
+ # NewsAPI sometimes puts "[Removed]" as a title for deleted articles
316
+ if not title or title == "[Removed]":
317
+ continue
318
+
319
+ # ── URL ──────────────────────────────────────────────────────
320
+ url = (item.get("url") or "").strip()
321
+ if not url or not url.startswith("http"):
322
+ continue
323
+
324
+ # ── Description ──────────────────────────────────────────────��
325
+ description = (item.get("description") or "").strip()
326
+ # Skip "[Removed]" placeholder descriptions too
327
+ if description == "[Removed]":
328
+ description = ""
329
+
330
+ # ── Image URL (camelCase: urlToImage) ─────────────────────────
331
+ image_url = (item.get("urlToImage") or "").strip()
332
+
333
+ # ── Published Date (camelCase: publishedAt) ───────────────────
334
+ # NewsAPI format is already ISO 8601 (e.g., "2026-03-03T06:00:00Z").
335
+ # Our Pydantic Article model accepts this directly β€” no conversion.
336
+ published_at = item.get("publishedAt") or ""
337
+
338
+ # ── Source Name (nested object) ───────────────────────────────
339
+ # NewsAPI wraps the source as { "id": "...", "name": "..." }.
340
+ # We only want the 'name' string.
341
+ source_obj = item.get("source") or {}
342
+ raw_source_name = (source_obj.get("name") or "").strip()
343
+
344
+ # Append the region code so it's clear in the UI where
345
+ # this article came from, e.g., "The Verge (IN)" or "Wired (US)".
346
+ if raw_source_name:
347
+ source = f"{raw_source_name} ({region_code.upper()})"
348
+ else:
349
+ source = f"SauravKanchan ({region_code.upper()})"
350
+
351
+ # ── Build Article ─────────────────────────────────────────────
352
+ try:
353
+ article = Article(
354
+ title=title,
355
+ description=description,
356
+ url=url,
357
+ image_url=image_url,
358
+ published_at=published_at,
359
+ source=source,
360
+ # ── ROUTING RULE ──────────────────────────────────────
361
+ # Pass through the aggregator's category.
362
+ # The keyword gate filters out off-topic articles.
363
+ # Unknown or empty categories safely route to
364
+ # the default 'News Articles' collection.
365
+ category=category,
366
+ )
367
+ articles.append(article)
368
+
369
+ except Exception as e:
370
+ logger.debug(
371
+ f"[SauravKanchan] Skipped item '{title[:50]}...': {e}"
372
+ )
373
+ continue
374
+
375
+ return articles
app/services/providers/thenewsapi/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/thenewsapi/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'thenewsapi' folder as a Python package.
4
+ # To use TheNewsAPI provider, import it like this:
5
+ #
6
+ # from app.services.providers.thenewsapi.client import TheNewsAPIProvider
7
+ #
8
+ # This is a PAID provider β€” it requires the THENEWSAPI_API_KEY environment
9
# variable to be set. Its effective daily_limit is 3 requests/day (the
# Community free tier's real cap β€” see the Phase 16 audit note in client.py).
10
+ # It lives in the PAID_CHAIN, meaning it only fires if all providers above
11
+ # it in the chain (GNews, NewsAPI, NewsData) have already failed.
app/services/providers/thenewsapi/client.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/thenewsapi/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ TheNewsAPI.com Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches fresh technology news articles from TheNewsAPI.com.
8
+ This is a paid API but has the cleanest JSON structure of all paid
9
+ providers β€” most of its field names even match our Pydantic Article model.
10
+
11
+ Free Tier Limits:
12
    - 3 requests per day in practice on the Community free tier
      (marketed as 100/day; resets midnight UTC)
13
+ - Requires an API key (THENEWSAPI_API_KEY in your .env file)
14
+
15
+ Where it sits in the pipeline:
16
+ PAID_CHAIN position 4 (after GNews β†’ NewsAPI β†’ NewsData).
17
+ Only fires if all three above it have already failed or hit their limits.
18
+ Once it returns articles, the paid chain stops β€” credits protected.
19
+
20
+ The special data quirk (categories array):
21
+ TheNewsAPI returns a 'categories' field as a LIST, not a single string.
22
+ Example: { "categories": ["tech", "science"] }
23
+
24
+ We grab only the FIRST item from that list.
25
+ Example: "tech"
26
+
27
+ This raw value ("tech") is then passed through our pipeline.
28
+ The keyword gate in data_validation.is_relevant_to_category() handles
29
+ whether the article truly belongs in our system.
30
+
31
+ We do NOT try to translate "tech" β†’ "magazines" ourselves here.
32
+ That mapping belongs in the validation/data layer, not the fetcher layer.
33
+ Keep the fetcher dumb β€” let the pipeline be smart.
34
+
35
+ Client-side constraint note:
36
+ TheNewsAPI supports date filters (published_after, published_before) and
37
+ language filters (language=en). We use language=en to avoid non-English
38
+ articles. We do NOT apply date filters because the freshness gate in
39
+ data_validation.is_valid_article() handles that more accurately in IST.
40
+ """
41
+
42
+ # ── Standard Library ──────────────────────────────────────────────────────────
43
+ import logging
44
+ from datetime import datetime, timezone
45
+ from typing import List, Optional
46
+
47
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
48
+ import httpx # Async HTTP client
49
+
50
+ # ── Internal ──────────────────────────────────────────────────────────────────
51
+ from app.services.providers.base import NewsProvider, ProviderStatus
52
+ from app.models import Article
53
+ from app.config import settings # Single source of truth for all keys
54
+ # Phase 16: Import the Redis counter utility to make the daily budget
55
+ # restart-proof. TheNewsAPI only allows 3 real calls per day on the free tier.
56
+ # Without Redis, a server restart resets request_count to 0 and lets us
57
+ # make 3 more calls β€” potentially 9+ calls on a restart-heavy day.
58
+ from app.services.utils.provider_state import (
59
+ get_provider_counter,
60
+ increment_provider_counter,
61
+ )
62
+
63
+ logger = logging.getLogger(__name__)
64
+
65
# ── Constants ─────────────────────────────────────────────────────────────────

# Endpoint for TheNewsAPI's "all news" search β€” the only endpoint we use.
THENEWSAPI_BASE_URL = "https://api.thenewsapi.com/v1/news/all"

# Seconds to wait for a response before aborting the HTTP request.
HTTP_TIMEOUT_SECONDS = 10.0

# Page size per call; 25 is the provider's recommended maximum.
ARTICLES_PER_REQUEST = 25
75
+
76
+
77
class TheNewsAPIProvider(NewsProvider):
    """
    Fetches technology news from TheNewsAPI.com.

    Paid provider β€” needs THENEWSAPI_API_KEY in your .env file.
    Sits at position 4 in the PAID_CHAIN (last paid fallback).

    Budget reality (Phase 16 audit): although the free tier is marketed as
    "100 requests/day", the Community tier is hard-capped at 3 real calls
    per day. self.daily_limit is therefore 3, enforced by a restart-proof
    Redis counter inside fetch_news().

    Usage (wired into the aggregator in Phase 5):
        provider = TheNewsAPIProvider(api_key="your_key_here")
        articles = await provider.fetch_news(category="ai", limit=25)
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Args:
            api_key (Optional[str]): TheNewsAPI token. Without it the
                provider is effectively disabled (fetch_news returns []).
        """
        super().__init__(api_key=api_key)

        # Phase 16 Audit Fix: Corrected from 100 β†’ 3.
        #
        # The free tier documentation lists "100 requests/day" but in practice
        # the Community (free) tier is hard-capped at 3 requests per day.
        # With daily_limit=100 the old code kept expecting 100 slots, burned
        # all 3 real calls immediately, and then received 402s for the rest
        # of the day.
        #
        # With daily_limit=3 + Redis persistence we use at most 3 calls/day
        # even across multiple server restarts: the Redis guard at the top of
        # fetch_news() blocks the 4th and later attempts for the current UTC day.
        self.daily_limit = 3

        # Category mapping: translate our internal category names into the
        # categories that TheNewsAPI actually understands.
        # TheNewsAPI uses these: tech, science, sports, business, health,
        # entertainment, general. We map our fine-grained categories to the
        # closest match.
        self.category_map = {
            'ai': 'tech',
            'data-security': 'tech',
            'data-governance': 'tech',
            'data-privacy': 'tech',
            'data-engineering': 'tech',
            'data-management': 'tech',
            'business-intelligence': 'business',
            'business-analytics': 'business',
            'customer-data-platform': 'business',
            'data-centers': 'tech',
            'cloud-computing': 'tech',
            'magazines': 'tech',
            'data-laws': 'tech',
            # Cloud sub-categories β†’ all map to 'tech' in TheNewsAPI's world
            'cloud-aws': 'tech',
            'cloud-azure': 'tech',
            'cloud-gcp': 'tech',
            'cloud-oracle': 'tech',
            'cloud-ibm': 'tech',
            'cloud-alibaba': 'tech',
            'cloud-digitalocean': 'tech',
            'cloud-huawei': 'tech',
            'cloud-cloudflare': 'tech',
        }

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT β€” called by the aggregator's PAID WATERFALL
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
        """
        Fetch technology articles from TheNewsAPI.com.

        Args:
            category (str): Our internal category (e.g., "ai", "cloud-aws").
                            Looked up in self.category_map to get the matching
                            TheNewsAPI category keyword (default: 'tech').
            limit (int): Maximum number of articles to return, capped at
                         ARTICLES_PER_REQUEST per call.

        Returns:
            List[Article]: Mapped Article objects. Returns [] on failure,
                           missing API key, or exhausted daily budget.
        """
        # No API key means this provider cannot run. The aggregator already
        # checks is_available(), but we double-check here for safety.
        if not self.api_key:
            logger.debug("[TheNewsAPI] No API key configured β€” skipping.")
            return []

        # ── PHASE 16: Redis-backed daily budget guard ────────────────────────
        # Real free-tier limit: 3 calls/day. We check Redis FIRST, before
        # building any params or making any HTTP call.
        #
        # Why inside fetch_news and not inside is_available()?
        # is_available() is a synchronous function on the base class, while
        # Redis access is async (`await`). Mixing them crashes at runtime,
        # so the async check lives at the very top of this async method.
        today_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        current_calls = await get_provider_counter("thenewsapi", today_str)

        if current_calls >= self.daily_limit:
            logger.warning(
                "[TheNewsAPI] Daily Redis budget exhausted β€” %d/%d calls used today. "
                "Skipping to protect the 3-call daily quota.",
                current_calls, self.daily_limit
            )
            self.mark_rate_limited()
            return []

        try:
            # Translate our internal category to TheNewsAPI's keyword.
            # Unknown categories default to 'tech'.
            api_category = self.category_map.get(category, "tech")

            params = {
                "api_token": self.api_key,
                "language": "en",            # English articles only
                "categories": api_category,  # TheNewsAPI category keyword
                "limit": min(limit, ARTICLES_PER_REQUEST),
                # NOTE: We deliberately do NOT add 'published_after' or
                # 'published_before' date filters. TheNewsAPI supports them,
                # but our freshness gate (is_valid_article in
                # data_validation.py) already enforces the correct IST-based
                # date boundary β€” letting the gate handle it avoids
                # duplicating timezone logic here.
            }

            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
                logger.info("[TheNewsAPI] Fetching '%s' (api_category='%s')...", category, api_category)
                response = await client.get(THENEWSAPI_BASE_URL, params=params)

                # ── Handle rate limit ─────────────────────────────────────
                if response.status_code == 429:
                    logger.warning("[TheNewsAPI] Hit 429 rate limit. Marking as rate-limited.")
                    self.mark_rate_limited()
                    return []

                # ── Handle authentication failure ─────────────────────────
                if response.status_code == 401:
                    logger.error("[TheNewsAPI] 401 Unauthorized β€” API key is invalid or expired.")
                    self.status = ProviderStatus.ERROR
                    return []

                # ── Handle quota exhaustion ───────────────────────────────
                if response.status_code == 402:
                    logger.warning("[TheNewsAPI] 402 Payment Required β€” daily quota exhausted.")
                    self.mark_rate_limited()
                    return []

                # ── Handle other non-200 responses ────────────────────────
                if response.status_code != 200:
                    logger.warning(f"[TheNewsAPI] Unexpected HTTP {response.status_code}.")
                    return []

                # ── PHASE 16: charge the budget for EVERY 200 response ────
                # Audit fix: the counter used to be incremented only when the
                # response contained articles, so an empty 200 consumed a real
                # API call without being counted β€” allowing the 3-call cap to
                # be overshot. Failed calls (401/402/429/timeout) still do
                # not consume a slot.
                self.request_count += 1  # RAM shadow, kept in sync for debugging
                await increment_provider_counter("thenewsapi", today_str)

                data = response.json()

                # TheNewsAPI wraps articles in a 'data' key at the top level.
                raw_articles = data.get("data", [])

                if not raw_articles:
                    logger.info(f"[TheNewsAPI] No articles returned for category='{category}'.")
                    return []

                articles = self._map_articles(raw_articles, category)

                logger.info("[TheNewsAPI] Got %d articles for '%s'.", len(articles), category)
                return articles

        except httpx.TimeoutException:
            logger.warning("[TheNewsAPI] Request timed out.")
            return []
        except Exception as e:
            logger.error(f"[TheNewsAPI] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPER β€” maps raw JSON items to Article objects
    # ─────────────────────────────────────────────────────────────────────────

    def _map_articles(self, raw_articles: list, category: str) -> List[Article]:
        """
        Convert TheNewsAPI JSON items into Segmento Pulse Article objects.

        The mapping is almost 1-to-1 with our Pydantic model, which is why
        this is the easiest of all paid providers to integrate.

        One special case: 'categories' is a list, not a string.
        We take [0] (the first item) as the article's category value.

        Args:
            raw_articles (list): The 'data' array from TheNewsAPI's response.
            category (str): Our internal category (from the aggregator).

        Returns:
            List[Article]: Clean Article objects for the pipeline.
        """
        articles: List[Article] = []

        for item in raw_articles:

            # ── Title ─────────────────────────────────────────────────────
            title = (item.get("title") or "").strip()
            if not title:
                continue

            # ── URL ───────────────────────────────────────────────────────
            url = (item.get("url") or "").strip()
            if not url or not url.startswith("http"):
                continue

            # ── Description ───────────────────────────────────────────────
            # TheNewsAPI provides real summaries β€” a huge advantage over HN.
            description = (item.get("description") or "").strip()

            # ── Image URL ─────────────────────────────────────────────────
            # The field is ALREADY called 'image_url' in their API β€” the
            # cleanest mapping of any provider we have integrated.
            image_url = (item.get("image_url") or "").strip()

            # ── Published Date ────────────────────────────────────────────
            # ISO 8601 (e.g., "2024-03-03T06:00:00.000000Z"); the Pydantic
            # Article model's published_at validator handles it directly.
            published_at = item.get("published_at") or ""

            # ── Source Name ───────────────────────────────────────────────
            # TheNewsAPI's live response returns `source` as a plain string
            # (the publisher domain, e.g. "techcrunch.com"), NOT as a nested
            # dict like NewsAPI.org does. We handle both shapes defensively.
            raw_source = item.get("source") or ""
            if isinstance(raw_source, dict):
                # Nested object shape: {"name": "TechCrunch", "url": "..."}
                source = (raw_source.get("name") or "TheNewsAPI").strip()
            else:
                # Plain string shape: "techcrunch.com" β€” use it as-is.
                source = str(raw_source).strip() or "TheNewsAPI"

            # ── Category ──────────────────────────────────────────────────
            # TheNewsAPI returns categories as a LIST, e.g., ["tech", "science"].
            # We take only the first item; the keyword gate verifies relevance.
            # ROUTING RULE: if the list is empty, fall back to our internal
            # category name. Both "" and category safely route to the default
            # 'News Articles' collection if unrecognised.
            raw_categories = item.get("categories") or []
            if raw_categories and isinstance(raw_categories, list):
                article_category = raw_categories[0]
            else:
                article_category = category  # Fallback to aggregator's category

            # ── Build Article ─────────────────────────────────────────────
            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source=source,
                    category=article_category,
                )
                articles.append(article)

            except Exception as e:
                logger.debug(
                    f"[TheNewsAPI] Skipped item url='{url[:60]}': {e}"
                )
                continue

        return articles
app/services/providers/webz/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/webz/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'webz' folder as a Python package.
4
+ # To use this provider, import it like this:
5
+ #
6
+ # from app.services.providers.webz.client import WebzProvider
7
+ #
8
+ # This is a PAID provider β€” requires WEBZ_API_KEY in your .env file.
9
+ # Position 6 in the PAID_CHAIN (deepest paid failover).
10
+ #
11
+ # ── CRITICAL BUDGET WARNING ───────────────────────────────────────────────
12
+ # Webz.io free tier: 1,000 calls per MONTH (not per day).
13
+ # daily_limit is set to 30 inside WebzProvider to pace usage to ~900/month.
14
+ # DO NOT increase daily_limit above 33 β€” doing so will exhaust the
15
+ # monthly budget before the month ends.
app/services/providers/webz/client.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/webz/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The Webz.io Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches enterprise-grade news articles from Webz.io's News API Lite.
8
+ Webz crawls 3.5 million articles per day from across the open web,
9
+ making it one of the richest news sources we have available.
10
+
11
+ Paid provider β€” needs WEBZ_API_KEY in your .env file.
12
+ Position 6 in the PAID_CHAIN (absolute final paid failover).
13
+
14
+ ── THE MONTHLY BUDGET PROBLEM AND HOW WE SOLVE IT ──────────────────────────
15
+
16
+ Webz free tier gives us 1,000 calls per MONTH β€” not per day.
17
+ Our scheduler runs many categories every hour. Without a limit, we would
18
+ exhaust the entire 1,000-call monthly budget in less than 48 hours.
19
+
20
+ Our fix: daily_limit = 30 inside this class.
21
+ The quota tracker caps us at 30 calls per calendar day.
22
+ 30 calls/day Γ— 30 days = 900 calls/month β€” safely under 1,000.
23
+ This paces the budget across the whole month as an even, predictable cost.
24
+
25
+ Math visible to future engineers:
26
+ 1,000 calls Γ· 30 days = 33.3 calls/day max to exactly hit the limit.
27
+ We use 30 to leave a 10% safety margin for edge cases (month resets,
28
+ server restarts that lose the quota counter's in-memory state, etc.).
29
+
30
+ ── THE NESTED IMAGE PROBLEM AND HOW WE SOLVE IT ─────────────────────────────
31
+
32
+ Webz does not put images at the top level of each article object.
33
+ Instead, the image is buried inside a nested 'thread' object like this:
34
+
35
+ {
36
+ "title": "Article Title",
37
+ "url": "https://...",
38
+ "thread": {
39
+ "site_full": "techcrunch.com", ← source name is here too
40
+ "main_image": "https://..." ← image is here
41
+ },
42
+ "text": "Full article body (thousands of words)..."
43
+ }
44
+
45
+ Our fix: We safely "drill down" using chained .get() calls.
46
+ thread = item.get("thread") or {}
47
+ image_url = thread.get("main_image") or ""
48
+
49
+ If 'thread' is missing β†’ {} (empty dict, no crash)
50
+ If 'main_image' is missing β†’ "" (empty string, no crash)
51
+ Either way, the pipeline gets a clean empty string for the fallback image.
52
+
53
+ ── THE FULL TEXT BODY PROBLEM AND HOW WE SOLVE IT ──────────────────────────
54
+
55
+ Webz provides the COMPLETE article body in the 'text' field β€” this can be
56
+ thousands of words. Storing that in our database is too large and risks
57
+ reproducing copyright-protected content.
58
+
59
+ Our fix: Truncate to the first 200 characters (same approach as Phase 8).
60
+ 200 characters is enough for a preview. Our newsletter system uses the
61
+ description field but also has its own 160-char cap, so anything beyond
62
+ 200 already has no use downstream.
63
+ """
64
+
65
+ # ── Standard Library ──────────────────────────────────────────────────────────
66
+ import logging
67
+ from datetime import datetime, timezone
68
+ from typing import List, Optional
69
+
70
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
71
+ import httpx # Async HTTP client
72
+
73
+ # ── Internal ──────────────────────────────────────────────────────────────────
74
+ from app.services.providers.base import NewsProvider, ProviderStatus
75
+ from app.models import Article
76
+ from app.config import settings
77
+ # Phase 16: Import the Redis counter utility for dual-layer budget protection.
78
+ # Webz has the strictest budget of all three paid providers β€” 1,000 calls per
79
+ # MONTH. Without restart-proof counters, a restart-heavy day can exhaust the
80
+ # entire monthly budget in a few hours. Two Redis keys protect us:
81
+ # 1. Daily key ("webz", today_str) β€” caps us at 30/day
82
+ # 2. Monthly key ("webz_month", month_str) β€” caps us at 900/month total
83
+ from app.services.utils.provider_state import (
84
+ get_provider_counter,
85
+ increment_provider_counter,
86
+ )
87
+
88
+ logger = logging.getLogger(__name__)
89
+
90
+ # ── Constants ─────────────────────────────────────────────────────────────────
91
+
92
+ # Webz.io News API Lite endpoint
93
+ WEBZ_API_URL = "https://api.webz.io/newsApiLite"
94
+
95
+ # Request timeout in seconds. Enterprise APIs are usually fast.
96
+ HTTP_TIMEOUT_SECONDS = 12.0
97
+
98
+ # Articles to request per call. Keeping this modest saves the budget
99
+ # because Webz deducts from quota based on results returned, not just calls.
100
+ ARTICLES_PER_REQUEST = 10
101
+
102
+ # Maximum characters to keep from the article body for the description field.
103
+ # Matches Phase 8's WorldNewsAI approach for consistency.
104
+ DESCRIPTION_MAX_CHARS = 200
105
+
106
+ # Category β†’ search query translation.
107
+ # Webz uses free-text query strings (like Google search), so we convert
108
+ # our internal category slugs into descriptive keyword phrases that maximise
109
+ # the quality of results from Webz's index.
110
+ CATEGORY_QUERY_MAP = {
111
+ 'ai': 'artificial intelligence machine learning',
112
+ 'data-security': 'data security cybersecurity breach hacking',
113
+ 'data-governance': 'data governance compliance policy',
114
+ 'data-privacy': 'data privacy GDPR regulation',
115
+ 'data-engineering': 'data engineering pipeline ETL spark',
116
+ 'data-management': 'data management master data catalog',
117
+ 'business-intelligence': 'business intelligence analytics BI tools',
118
+ 'business-analytics': 'business analytics data-driven decisions',
119
+ 'customer-data-platform': 'customer data platform CDP personalization',
120
+ 'data-centers': 'data center infrastructure hyperscaler',
121
+ 'cloud-computing': 'cloud computing technology platform',
122
+ 'magazines': 'technology news innovation',
123
+ 'data-laws': 'AI regulation data law privacy act',
124
+ 'cloud-aws': 'Amazon AWS cloud services',
125
+ 'cloud-azure': 'Microsoft Azure cloud platform',
126
+ 'cloud-gcp': 'Google Cloud Platform GCP services',
127
+ 'cloud-oracle': 'Oracle Cloud OCI database',
128
+ 'cloud-ibm': 'IBM Cloud Red Hat OpenShift',
129
+ 'cloud-alibaba': 'Alibaba Cloud Aliyun technology',
130
+ 'cloud-digitalocean': 'DigitalOcean cloud developer platform',
131
+ 'cloud-huawei': 'Huawei Cloud services technology',
132
+ 'cloud-cloudflare': 'Cloudflare CDN security network',
133
+ }
134
+
135
+
136
class WebzProvider(NewsProvider):
    """
    Fetches enterprise-grade news articles from Webz.io News API Lite.

    Paid provider β€” 1,000 calls/month free tier, paced to 30/day.
    Position 6 in the PAID_CHAIN (deepest paid failover).
    Only fires when all 5 providers above it have failed or hit limits.
    Requires WEBZ_API_KEY in the .env file.

    Usage (wired in Phase 10):
        provider = WebzProvider(api_key="your_key_here")
        articles = await provider.fetch_news(category="ai", limit=10)
    """

    # Hard monthly ceiling: 900 calls β€” leaves a 100-call safety buffer
    # under Webz's 1,000 calls/month free-tier cap. Class-level constant
    # so the budget policy is visible without reading fetch_news().
    MONTHLY_HARD_LIMIT = 900

    def __init__(self, api_key: Optional[str] = None):
        super().__init__(api_key=api_key)

        # 30 calls/day Γ— 30 days = 900/month β€” safely under the 1,000 cap.
        # The quota tracker enforces this limit before each call.
        # 10% safety margin included for server restart edge cases.
        self.daily_limit = 30

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT β€” called by the aggregator's PAID WATERFALL
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 10) -> List[Article]:
        """
        Fetch news articles from Webz.io for the given category.

        Args:
            category (str): Our internal category slug (e.g., "ai").
                            Translated to a keyword query via CATEGORY_QUERY_MAP.
            limit (int):    Max articles to return. Kept at 10 to conserve
                            the monthly call budget (Webz charges per result).

        Returns:
            List[Article]: Mapped Article objects. Returns [] on any failure.
        """
        if not self.api_key:
            logger.debug("[Webz] No API key configured β€” skipping.")
            return []

        # ── PHASE 16: Dual-layer Redis budget guard ────────────────────────
        #
        # Webz is the most budget-constrained provider we have: 1,000 calls/MONTH.
        # We protect it with TWO independent Redis counters running in parallel.
        #
        # Gate 1 β€” DAILY:   Stops at 30 calls/day to pace spending evenly.
        #   Redis key: "provider:state:webz:calls:<YYYY-MM-DD>" (TTL: 24h)
        #
        # Gate 2 β€” MONTHLY: Stops at 900 calls/month (10% margin on 1,000).
        #   Redis key: "provider:state:webz_month:calls:<YYYY-MM>" (TTL: 30 days)
        #   The key name embeds the month string, so when a new month starts
        #   the key changes automatically β€” the old key expires via TTL,
        #   no manual cleanup needed.
        #
        # Either gate being exhausted blocks the call completely.
        # Fail-safe design: if Redis is down, both return 999999 β€” call is skipped.
        #
        # BUGFIX: read the clock exactly once. Two separate datetime.now()
        # calls could straddle midnight (or a month boundary) and produce a
        # mismatched day/month key pair.
        now_utc = datetime.now(timezone.utc)
        today_str = now_utc.strftime("%Y-%m-%d")
        month_str = now_utc.strftime("%Y-%m")

        daily_calls = await get_provider_counter("webz", today_str)
        monthly_calls = await get_provider_counter("webz_month", month_str)

        if daily_calls >= self.daily_limit:
            logger.warning(
                "[Webz] Daily Redis budget exhausted β€” %d/%d calls used today. "
                "Skipping to protect the monthly quota.",
                daily_calls, self.daily_limit
            )
            self.mark_rate_limited()
            return []

        if monthly_calls >= self.MONTHLY_HARD_LIMIT:
            logger.warning(
                "[Webz] Monthly Redis budget exhausted β€” %d/%d calls used this month. "
                "No more Webz calls until next month to protect the 1,000-call limit.",
                monthly_calls, self.MONTHLY_HARD_LIMIT
            )
            self.mark_rate_limited()
            return []

        # Translate our internal category slug into a Webz-friendly search phrase.
        search_query = CATEGORY_QUERY_MAP.get(category, f"technology {category}")

        params = {
            "token": self.api_key,
            "q": search_query,
            "language": "english",
            "size": min(limit, ARTICLES_PER_REQUEST),
            # NOTE: No date filters applied here intentionally.
            # Our freshness gate in data_validation.is_valid_article()
            # handles date boundaries accurately using IST windows.
            # Adding date filters here would add timezone conversion risk.
        }

        try:
            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
                # CONSISTENCY FIX: use the module logger with lazy %-args
                # instead of a bare print(), matching every other log line
                # in this provider (and keeping output in the log stream).
                logger.info(
                    "[Webz] Fetching '%s' (query='%s...')...",
                    category, search_query[:40]
                )
                response = await client.get(WEBZ_API_URL, params=params)

                # ── HTTP 402: Monthly budget exhausted ────────────────────
                # Webz uses 402 to mean "you have no more credits this month".
                # We mark as rate-limited so the circuit breaker respects it.
                if response.status_code == 402:
                    logger.warning(
                        "[Webz] HTTP 402 β€” monthly call budget exhausted. "
                        "No more calls until quota resets at month end."
                    )
                    self.mark_rate_limited()
                    return []

                # ── HTTP 401: Bad API key ─────────────────────────────────
                if response.status_code == 401:
                    logger.error(
                        "[Webz] HTTP 401 β€” API key is invalid or expired. "
                        "Check WEBZ_API_KEY in your .env file."
                    )
                    self.status = ProviderStatus.ERROR
                    return []

                # ── HTTP 429: Too many requests (short-term rate limit) ───
                if response.status_code == 429:
                    logger.warning("[Webz] HTTP 429 β€” request rate exceeded.")
                    self.mark_rate_limited()
                    return []

                # ── Any other non-200 ─────────────────────────────────────
                if response.status_code != 200:
                    logger.warning(f"[Webz] Unexpected HTTP {response.status_code}.")
                    return []

                # ── Parse the response ────────────────────────────────────
                self.request_count += 1  # Keep RAM shadow in sync for debugging
                data = response.json()

                # Webz wraps the article list in a 'posts' key at the top level.
                raw_posts = data.get("posts", [])

                if not raw_posts:
                    logger.info(f"[Webz] No articles returned for '{category}'.")
                    return []

                articles = self._map_articles(raw_posts, category)

                # ── PHASE 16: Increment BOTH Redis counters after a successful call ──
                # The monthly counter uses a 30-day TTL (2592000 seconds) β€”
                # long enough to outlive any calendar month. The key name
                # changes with each month, so stale keys fade on their own.
                await increment_provider_counter("webz", today_str, expire_seconds=86400)
                await increment_provider_counter("webz_month", month_str, expire_seconds=2592000)

                logger.info("[Webz] Got %d articles for '%s'.", len(articles), category)
                return articles

        except httpx.TimeoutException:
            logger.warning("[Webz] Request timed out.")
            return []
        except Exception as e:
            logger.error(f"[Webz] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPER β€” maps raw JSON posts to Article objects
    # ─────────────────────────────────────────────────────────────────────────

    def _map_articles(self, raw_posts: list, category: str) -> List[Article]:
        """
        Convert Webz.io JSON 'posts' items into Segmento Pulse Article objects.

        Key challenges handled here:
          1. Nested image   β€” lives inside posts[].thread.main_image
          2. Nested source  β€” lives inside posts[].thread.site_full
          3. Full text body β€” truncated to DESCRIPTION_MAX_CHARS
          4. Published date β€” Webz uses ISO 8601, our model accepts it directly

        Webz field           β†’ Article field
        ─────────────────────────────────────────
        title                β†’ title
        url                  β†’ url
        thread.site_full     β†’ source     (nested β€” safe .get() chain)
        thread.main_image    β†’ image_url  (nested β€” safe .get() chain)
        published            β†’ published_at
        text (truncated)     β†’ description

        Args:
            raw_posts (list): The 'posts' array from the API response.
            category (str):   The aggregator's category for routing.

        Returns:
            List[Article]: Clean Article objects ready for the pipeline.
        """
        articles: List[Article] = []

        for item in raw_posts:
            if not isinstance(item, dict):
                continue

            # ── Title ────────────────────────────────────────────────────
            title = (item.get("title") or "").strip()
            if not title:
                continue

            # ── URL ──────────────────────────────────────────────────────
            url = (item.get("url") or "").strip()
            if not url or not url.startswith("http"):
                continue

            # ── Published Date ────────────────────────────────────────────
            # Webz returns ISO 8601 (e.g., "2026-03-03T06:00:00.000+0000").
            # Our Article model's published_at validator handles this directly.
            published_at = item.get("published") or ""

            # ── Nested: Source and Image ──────────────────────────────────
            # The 'thread' field is a nested dictionary containing both.
            # If 'thread' is missing, fall back to {} so the chained .get()
            # calls below don't crash.
            thread = item.get("thread") or {}

            # Source: the full domain name of the publishing site,
            # e.g. "techcrunch.com". Falls back to "Webz" when absent.
            source = (thread.get("site_full") or "Webz").strip()
            if not source:
                source = "Webz"

            # Image: the main article image from the thread context.
            image_url = (thread.get("main_image") or "").strip()

            # ── Description (TRUNCATED full article body) ─────────────────
            # 'text' contains the complete article body β€” potentially
            # thousands of words. Keep only a short preview to protect us
            # from database bloat and copyright issues.
            raw_text = (item.get("text") or "").strip()
            if len(raw_text) > DESCRIPTION_MAX_CHARS:
                description = raw_text[:DESCRIPTION_MAX_CHARS] + "..."
            else:
                description = raw_text

            # ── Build Article ─────────────────────────────────────────────
            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source=source,
                    # ── ROUTING RULE ──────────────────────────────────────
                    # Pass through the aggregator's category.
                    # Unknown/empty categories route to 'News Articles'.
                    category=category,
                )
                articles.append(article)

            except Exception as e:
                logger.debug(f"[Webz] Skipped post '{title[:50]}': {e}")
                continue

        return articles
app/services/providers/wikinews/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/wikinews/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'wikinews' folder as a Python package.
4
+ # To use this provider, import it like this:
5
+ #
6
+ # from app.services.providers.wikinews.client import WikinewsProvider
7
+ #
8
+ # Wikinews is 100% FREE β€” no API key, no rate limits, no registration.
9
+ # It is run by the Wikimedia Foundation (same people who run Wikipedia).
10
+ #
11
+ # All content is published under Public Domain or Creative Commons licenses.
12
+ # This makes it the only copyright-bulletproof news source in our pipeline.
13
+ #
14
+ # Gated behind GENERAL_TECH_CATEGORIES (same as HN, Inshorts, SauravKanchan)
15
+ # because Wikinews tech categories cover broad technology topics only.
app/services/providers/wikinews/client.py ADDED
@@ -0,0 +1,435 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/wikinews/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The Wikinews Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches technology news articles from Wikinews (en.wikinews.org).
8
+ Wikinews is run by the Wikimedia Foundation β€” the same organization
9
+ behind Wikipedia and Wiktionary.
10
+
11
+ Free. No API key. No rate limits. No copyright concerns.
12
+
13
+ Why Wikinews is unique:
14
+ Every article on Wikinews is published under Public Domain or extremely
15
+ open Creative Commons licenses. This means we can freely display their
16
+ content without any legal risk. It is the only fully copyright-bulletproof
17
+ news source in our entire pipeline.
18
+
19
+ We search TWO Wikinews categories concurrently for maximum coverage:
20
+ - "Computing" β†’ software, hardware, AI, security news
21
+ - "Internet" β†’ web tech, data, social media policy news
22
+
23
+ Gated behind GENERAL_TECH_CATEGORIES in the aggregator because Wikinews
24
+ tech content is broad β€” it does not know about "cloud-alibaba" or
25
+ "data-governance" as separate topics.
26
+
27
+ ── THE HTML SNIPPET PROBLEM AND HOW WE FIX IT ───────────────────────────────
28
+
29
+ The MediaWiki search API highlights your search terms inside the description
30
+ snippet by wrapping them in HTML tags like this:
31
+
32
+ "The latest advances in <span class=\"searchmatch\">computing</span> have..."
33
+
34
+ If we stored that raw, our database would get cluttered with raw HTML tags
35
+ that would then appear in the Pulse UI as literal text.
36
+
37
+ Fix: We use a simple regex pattern to strip ALL HTML tags from the snippet.
38
+
39
+ re.sub(r'<[^>]+>', '', raw_snippet).strip()
40
+
41
+ <[^>]+> means: any '<', followed by one or more characters that are
42
+ NOT '>', followed by '>'. This matches every HTML tag universally,
43
+ not just MediaWiki's specific span tags β€” making it bulletproof for
44
+ any future format changes on their end.
45
+
46
+ ── URL CONSTRUCTION FROM pageid ─────────────────────────────────────────────
47
+
48
+ MediaWiki search results give us a 'pageid' integer, NOT a direct URL.
49
+ We construct a permanent, stable URL using the curid URL format:
50
+
51
+ f"https://en.wikinews.org/?curid={pageid}"
52
+
53
+ Example: pageid = 4684321 β†’ https://en.wikinews.org/?curid=4684321
54
+
55
+ This URL format is guaranteed stable by Wikimedia β€” it never changes
56
+ even if the article is moved or renamed.
57
+ """
58
+
59
+ # ── Standard Library ──────────────────────────────────────────────────────────
60
+ import asyncio
61
+ import logging
62
+ import re
63
+ from typing import List
64
+
65
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
66
+ import httpx # Async HTTP client
67
+
68
+ # ── Internal ──────────────────────────────────────────────────────────────────
69
+ from app.services.providers.base import NewsProvider
70
+ from app.models import Article
71
+ # Phase 12: Shared image enricher (extracts og:image from article pages)
72
+ from app.services.utils.image_enricher import extract_top_image
73
+
74
+ logger = logging.getLogger(__name__)
75
+
76
# ── Wikinews API Configuration ────────────────────────────────────────────────

# English Wikinews MediaWiki Action API endpoint.
WIKINEWS_API_URL = "https://en.wikinews.org/w/api.php"

# Two Wikinews categories are searched for broader tech coverage:
# 'Computing' (software, AI, hardware) and 'Internet' (web, data, social policy).
WIKINEWS_CATEGORIES = ["Computing", "Internet"]

# Per-category result cap: 10 Γ— 2 categories = at most 20 articles per call.
MAX_ARTICLES_PER_CATEGORY = 10

# Wikimedia servers are reliable but occasionally slow β€” allow 12 seconds.
HTTP_TIMEOUT_SECONDS = 12.0

# Pre-compiled regex that removes every HTML tag from a MediaWiki snippet.
# Search snippets arrive with <span class="searchmatch">...</span> highlight
# markup; stripping ALL tags (not just spans) also survives future changes.
HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
101
class WikinewsProvider(NewsProvider):
    """
    Fetches technology news from Wikinews using the MediaWiki search API.

    Free. No API key. Copyright-bulletproof (Public Domain / CC).
    Queries 'Computing' and 'Internet' categories concurrently.
    Gated behind GENERAL_TECH_CATEGORIES in the aggregator.

    Usage (wired in Phase 11):
        provider = WikinewsProvider()
        articles = await provider.fetch_news(category="ai", limit=20)
    """

    def __init__(self):
        # Free provider β€” no API key, no daily limit.
        super().__init__(api_key=None)
        self.daily_limit = 0

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT β€” called by the aggregator's FREE PARALLEL RUN
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
        """
        Fetch tech articles from Wikinews's Computing and Internet categories.

        Both category queries run at the same time using asyncio.gather().
        Their results are combined into one flat list and returned.

        Args:
            category (str): Our internal category slug (e.g., "ai").
                            Tagged on every article. The keyword gate filters
                            irrelevant articles downstream.
            limit (int):    Soft cap on total articles to return.

        Returns:
            List[Article]: Combined articles from both Wikinews categories.
                           Returns [] if both queries fail.
        """
        try:
            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:

                # Fire queries for both categories simultaneously.
                fetch_tasks = [
                    self._query_category(client, wiki_cat, category)
                    for wiki_cat in WIKINEWS_CATEGORIES
                ]

                results = await asyncio.gather(*fetch_tasks, return_exceptions=True)

                # Combine results from both categories.
                all_articles: List[Article] = []
                for wiki_cat, result in zip(WIKINEWS_CATEGORIES, results):
                    if isinstance(result, Exception):
                        logger.warning(
                            f"[Wikinews] [{wiki_cat}] Query failed: {result}"
                        )
                    elif isinstance(result, list):
                        all_articles.extend(result)

                logger.info(
                    f"[Wikinews] Collected {len(all_articles)} articles from "
                    f"{len(WIKINEWS_CATEGORIES)} categories for '{category}'"
                )
                return all_articles

        except Exception as e:
            logger.error(f"[Wikinews] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPERS
    # ─────────────────────────────────────────────────────────────────────────

    async def _query_category(
        self,
        client: httpx.AsyncClient,
        wiki_category: str,
        pulse_category: str,
    ) -> List[Article]:
        """
        Run one MediaWiki search query for articles in a given Wikinews category.

        Args:
            client (httpx.AsyncClient): Shared HTTP client from fetch_news().
            wiki_category (str):  The Wikinews category to search within
                                  (e.g., "Computing", "Internet").
            pulse_category (str): Our internal Pulse category β€” tagged on articles.

        Returns:
            List[Article]: Parsed articles. Returns [] on any failure.
        """
        # BUGFIX (review): the previous version of this dict literal carried
        # THREE duplicate "srprop" keys from the Phase 14 experiments; only
        # the last one took effect. Collapsed to the single effective value.
        #
        # Phase 14 note: MediaWiki does NOT expose canonicalurl through
        # srprop, and 'prop=info&inprop=url' would require switching from
        # 'list=search' to 'generator=search' β€” a larger refactor that would
        # change the response shape and break _map_search_hits(). Instead we
        # keep the minimal srprop and construct the canonical URL from the
        # article title (always stable on Wikinews) in _map_search_hits().
        params = {
            "action": "query",
            "list": "search",
            # incategory: restricts results to articles in that Wikinews category.
            "srsearch": f"incategory:{wiki_category}",
            "srlimit": MAX_ARTICLES_PER_CATEGORY,
            "srprop": "snippet|timestamp",  # Only fetch what we actually need
            "format": "json",
            "formatversion": "2",  # Cleaner JSON output format
        }

        try:
            response = await client.get(
                WIKINEWS_API_URL,
                params=params,
                headers={
                    "User-Agent": "SegmentoPulse-Ingestion/1.0 (https://segmento.in)"
                    # Wikimedia's API rules require a descriptive User-Agent.
                },
            )

            if response.status_code == 429:
                logger.warning(f"[Wikinews] [{wiki_category}] HTTP 429 rate limit.")
                self.mark_rate_limited()
                return []

            if response.status_code != 200:
                logger.warning(
                    f"[Wikinews] [{wiki_category}] HTTP {response.status_code} β€” skipping."
                )
                return []

            data = response.json()

        except httpx.TimeoutException:
            logger.warning(f"[Wikinews] [{wiki_category}] Request timed out.")
            return []
        except Exception as e:
            logger.warning(f"[Wikinews] [{wiki_category}] Fetch error: {e}")
            return []

        # Drill into the MediaWiki response structure.
        # Shape: { "query": { "search": [ {...}, {...} ] } }
        query_block = data.get("query") or {}
        search_hits = query_block.get("search") or []

        if not search_hits:
            logger.info(f"[Wikinews] [{wiki_category}] No results returned.")
            return []

        articles = self._map_search_hits(search_hits, wiki_category, pulse_category)

        # ── ENRICH: Fetch images for articles that have none ──────────────
        # _map_search_hits is sync β€” enrichment happens here in the async caller.
        # Wikinews article pages do have og:image tags.
        articles = await self._enrich_article_images(wiki_category, articles)

        logger.info(
            f"[Wikinews] [{wiki_category}] Parsed {len(articles)} articles."
        )
        return articles

    def _map_search_hits(
        self,
        search_hits: list,
        wiki_category: str,
        pulse_category: str,
    ) -> List[Article]:
        """
        Convert MediaWiki search result items into Segmento Pulse Article objects.

        Key transformations:
            title      β†’ title        (direct)
            title      β†’ url          (canonical /wiki/<Title> URL; pageid is a
                                       sanity check β€” hits without one are skipped)
            timestamp  β†’ published_at (already ISO 8601)
            snippet    β†’ description  (HTML tags stripped via regex)
            (none)     β†’ image_url = "" (no images in search results β€” Phase 12 fix)
            (hardcoded)β†’ source = "Wikinews"

        Args:
            search_hits (list):   The 'query.search' array from the API response.
            wiki_category (str):  Which Wikinews category these came from.
            pulse_category (str): Our internal category β€” tagged on each article.

        Returns:
            List[Article]: Clean Article objects.
        """
        # PERF FIX (review): this import previously sat inside the per-hit
        # loop body β€” hoisted so it runs once per call, not once per article.
        import urllib.parse

        articles: List[Article] = []

        for hit in search_hits:
            if not isinstance(hit, dict):
                continue

            # ── Title ────────────────────────────────────────────────────
            title = (hit.get("title") or "").strip()
            if not title:
                continue

            # ── URL β€” canonical title URL with curid fallback ──────────────
            # Phase 14 fix: Construct the canonical URL from the article title.
            # Wikinews titles map directly to stable URLs under /wiki/.
            #   Example: title = "AI chip shortage hits 2026"
            #   β†’ https://en.wikinews.org/wiki/AI_chip_shortage_hits_2026
            # This URL is permanent (Wikimedia guarantees title-based URLs).
            # The image enricher can visit it directly without following a
            # 301 redirect from the curid format β€” saving one HTTP round-trip
            # per article during image enrichment.
            #
            # We still require pageid as a sanity check. If it is missing we
            # skip the article entirely (no pageid = no reliable identity).
            pageid = hit.get("pageid")
            if not pageid:
                continue

            # Build the canonical URL from a URL-safe title.
            # quote() would percent-encode spaces as %20, but Wikimedia URLs
            # use underscores β€” so we swap spaces for underscores first, then
            # quote only the characters that genuinely need escaping.
            title_for_url = title.replace(" ", "_")
            canonical_url = (
                "https://en.wikinews.org/wiki/"
                + urllib.parse.quote(title_for_url, safe="/:@!$&'()*+,;=")
            )

            # A curid URL (https://en.wikinews.org/?curid=<pageid>) reaches
            # the same page and remains a valid fallback, but canonical_url
            # is primary because it has no redirect hop.
            url = canonical_url

            # ── Published Date ────────────────────────────────────────────
            # MediaWiki returns ISO 8601 already, e.g., "2026-03-03T06:00:00Z".
            # Our Article model's published_at validator accepts this directly.
            published_at = hit.get("timestamp") or ""

            # ── Description (HTML-stripped snippet) ───────────────────────
            # MediaWiki injects HTML like <span class="searchmatch">term</span>
            # into snippets to highlight search terms. Strip ALL HTML tags
            # using the pre-compiled module-level regex.
            raw_snippet = hit.get("snippet") or ""
            description = HTML_TAG_PATTERN.sub("", raw_snippet).strip()

            # ── Image URL ─────────────────────────────────────────────────
            # MediaWiki search results do not include images; the enrichment
            # step (_enrich_article_images) fills these in afterwards. An
            # empty string routes to the Segmento Pulse banner fallback.
            image_url = ""

            # ── Build Article ─────────────────────────────────────────────
            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source="Wikinews",
                    # ── ROUTING RULE ──────────────────────────────────────
                    # Tag with pulse_category from the aggregator.
                    # Unknown categories safely route to 'News Articles'.
                    category=pulse_category,
                )
                articles.append(article)

            except Exception as e:
                logger.debug(
                    f"[Wikinews] [{wiki_category}] Skipped '{title[:50]}': {e}"
                )
                continue

        return articles

    # ─────────────────────────────────────────────────────────────────────────
    # PHASE 12: IMAGE ENRICHMENT β€” async post-processing step
    # ─────────────────────────────────────────────────────────────────────────

    async def _enrich_article_images(
        self, wiki_category: str, articles: List[Article]
    ) -> List[Article]:
        """
        For every article that has an empty image_url, visit its canonical
        Wikinews URL and try to find the main image via the og:image meta tag.

        (Doc fix: since Phase 14 the articles carry canonical /wiki/ title
        URLs, not curid URLs β€” the enricher hits the page directly with no
        redirect hop.)

        Wikinews article pages DO include og:image tags β€” they are set by
        the MediaWiki software for every published article, so this call is
        likely to succeed for most articles.

        All image fetches run concurrently. With the outer 4-second timeout
        per call, the entire batch takes ~4 seconds maximum, not N x 4.

        Args:
            wiki_category (str):      Category label used for logging only.
            articles (List[Article]): Output from _map_search_hits().

        Returns:
            List[Article]: Same articles, with image_url filled in where possible.
        """
        if not articles:
            return articles

        # Phase 14 fix: asyncio.Semaphore(10) caps concurrent connections.
        # Before: 10 articles Γ— 2 categories = up to 20 simultaneous HTTP
        # requests to Wikinews article pages with no limit.
        # After: at most 10 page visits run at once; the rest queue safely.
        sem = asyncio.Semaphore(10)

        async def _get_image(article: Article) -> str:
            if article.image_url and article.image_url.startswith("http"):
                return article.image_url  # Already has an image β€” skip
            # Acquire one of 10 available lanes before fetching the page.
            async with sem:
                return await extract_top_image(article.url)

        image_tasks = [_get_image(a) for a in articles]
        fetched_images = await asyncio.gather(*image_tasks, return_exceptions=True)

        enriched: List[Article] = []
        for article, image_result in zip(articles, fetched_images):
            if isinstance(image_result, str) and image_result:
                article = article.model_copy(update={"image_url": image_result})
            enriched.append(article)

        logger.info(
            f"[Wikinews] [{wiki_category}] Image enrichment complete β€” "
            f"{sum(1 for a in enriched if a.image_url)}/{len(enriched)} articles have images."
        )
        return enriched
app/services/providers/worldnewsai/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # providers/worldnewsai/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This file marks the 'worldnewsai' folder as a Python package.
4
+ # To use this provider, import it like this:
5
+ #
6
+ # from app.services.providers.worldnewsai.client import WorldNewsAIProvider
7
+ #
8
+ # This is a PAID provider (point-based quota) β€” it requires the
9
+ # WORLDNEWS_API_KEY environment variable to be set.
10
+ #
11
+ # It sits at position 5 in the PAID_CHAIN β€” the last line of defence
12
+ # before the paid chain gives up. Only fires after GNews, NewsAPI,
13
+ # NewsData, and TheNewsAPI have all failed or exhausted their budgets.
14
+ #
15
+ # ── CRITICAL QUOTA WARNING ────────────────────────────────────────────────
16
+ # WorldNewsAI uses a point system, NOT a simple request counter.
17
+ # Each API call costs points + each returned article costs additional points.
18
+ # The client has a conservative daily_limit = 50 calls to protect the budget.
19
+ # If you see HTTP 402, the daily point budget is fully exhausted.
app/services/providers/worldnewsai/client.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ providers/worldnewsai/client.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ The WorldNewsAI Provider for Segmento Pulse.
5
+
6
+ What this does:
7
+ Fetches technology news from WorldNewsAI.com β€” a global news crawler
8
+ that indexes tens of thousands of sources worldwide, including many
9
+ non-English and non-US-centric publications.
10
+
11
+ Paid provider β€” needs WORLDNEWS_API_KEY in your .env file.
12
+ Position 5 in the PAID_CHAIN (last paid failover).
13
+
14
+ ── THE CRITICAL QUOTA PROBLEM AND HOW WE SOLVE IT ──────────────────────────
15
+
16
+ WorldNewsAI does NOT use a simple "100 requests per day" model.
17
+ It uses a POINT system:
18
+ - Each search call costs points
19
+ - Each article returned in the response costs additional points
20
+ - If you run out of points, the API returns HTTP 402 (not 429)
21
+
22
+ If we called this for all 22 categories every hour, we would exhaust our
23
+ free-tier point budget before lunchtime.
24
+
25
+ Our two-layer protection:
26
+ 1. Position 5 in PAID_CHAIN: Only fires as the last fallback after
27
+ GNews, NewsAPI, NewsData, and TheNewsAPI have all failed.
28
+ In a healthy system, it will rarely be called at all.
29
+ 2. daily_limit = 50: The quota tracker caps total calls per day.
30
+ Once 50 calls are used, the circuit breaker prevents further calls.
31
+
32
+ ── THE CONTENT SAFETY PROBLEM AND HOW WE SOLVE IT ──────────────────────────
33
+
34
+ WorldNewsAI returns the FULL article body in the 'text' field.
35
+ A typical article body is 500-3,000 words β€” far too large to store in
36
+ our database for each article, and potentially a copyright issue.
37
+
38
+ Fix: We take only the first 200 characters from the 'text' field
39
+ and use that as the article's description. This is the same "snippet"
40
+ approach used by Google News, Bing News, and other aggregators.
41
+ 200 characters is enough to show a preview without reproducing the article.
42
+ """
43
+
44
+ # ── Standard Library ──────────────────────────────────────────────────────
45
+ import logging
46
+ from datetime import datetime, timezone
47
+ from typing import List, Optional
48
+
49
+ # ── Third-party (already in requirements.txt) ──────────────────────────────────
50
+ import httpx # Async HTTP client
51
+
52
+ # ── Internal ─────────────────────────────────────────────────────────────────
53
+ from app.services.providers.base import NewsProvider, ProviderStatus
54
+ from app.models import Article
55
+ from app.config import settings
56
+ # Phase 16: Import the Redis counter utility to make the daily budget
57
+ # restart-proof. Without this, self.request_count lives in RAM and resets
58
+ # to 0 on every Hugging Face Space restart, letting us overspend the quota.
59
+ from app.services.utils.provider_state import (
60
+ get_provider_counter,
61
+ increment_provider_counter,
62
+ )
63
+
64
+ logger = logging.getLogger(__name__)
65
+
66
# ── Constants ─────────────────────────────────────────────────────────────────

# WorldNewsAI search endpoint (v1)
WORLDNEWSAI_SEARCH_URL = "https://api.worldnewsapi.com/search-news"

# Request timeout in seconds for the whole HTTP round-trip.
HTTP_TIMEOUT_SECONDS = 12.0

# Articles per call. Keep it modest to save points per request
# (WorldNewsAI charges points per returned article, not just per call).
ARTICLES_PER_REQUEST = 10

# How many characters of article body text to keep as the description.
# Enough for a readable summary, small enough to avoid copyright concerns
# and database bloat. Matches the 200-char limit used by our RSS parser.
DESCRIPTION_MAX_CHARS = 200

# Category β†’ search text mapping.
# WorldNewsAI takes free-text search queries, not categories.
# We translate our internal category slug into a descriptive keyword phrase.
# Slugs not listed here fall back to the generic "technology news" query
# (see fetch_news), so unknown categories degrade gracefully.
CATEGORY_QUERY_MAP = {
    'ai': 'artificial intelligence machine learning',
    'data-security': 'data security cybersecurity breach',
    'data-governance': 'data governance compliance regulation',
    'data-privacy': 'data privacy GDPR CCPA',
    'data-engineering': 'data engineering pipeline ETL',
    'data-management': 'data management master data catalog',
    'business-intelligence': 'business intelligence analytics BI',
    'business-analytics': 'business analytics reporting dashboards',
    'customer-data-platform': 'customer data platform CDP',
    'data-centers': 'data center infrastructure colocation',
    'cloud-computing': 'cloud computing technology',
    'magazines': 'technology news',
    'data-laws': 'data privacy law regulation AI act',
    'cloud-aws': 'Amazon Web Services AWS cloud',
    'cloud-azure': 'Microsoft Azure cloud',
    'cloud-gcp': 'Google Cloud Platform GCP',
    'cloud-oracle': 'Oracle Cloud OCI',
    'cloud-ibm': 'IBM Cloud Red Hat',
    'cloud-alibaba': 'Alibaba Cloud technology',
    'cloud-digitalocean': 'DigitalOcean cloud platform',
    'cloud-huawei': 'Huawei Cloud technology',
    'cloud-cloudflare': 'Cloudflare network security',
}
109
+
110
+
111
class WorldNewsAIProvider(NewsProvider):
    """
    Fetches global technology news from WorldNewsAI.com.

    Paid provider (point-based quota) β€” position 5 in the PAID_CHAIN.
    Only fires when GNews, NewsAPI, NewsData, and TheNewsAPI have all failed.
    Requires WORLDNEWS_API_KEY in the .env file.

    Usage (wired in Phase 8):
        provider = WorldNewsAIProvider(api_key="your_key_here")
        articles = await provider.fetch_news(category="ai", limit=10)
    """

    def __init__(self, api_key: Optional[str] = None):
        super().__init__(api_key=api_key)

        # Phase 16: This value is the CEILING checked in Redis, not just
        # a RAM counter. Even if the server restarts mid-day, Redis remembers
        # exactly how many calls we have already made today.
        self.daily_limit = 50

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT β€” called by the aggregator's PAID WATERFALL
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 10) -> List[Article]:
        """
        Fetch global technology news from WorldNewsAI.

        Args:
            category (str): Our internal category slug (e.g., "ai").
                            We look it up in CATEGORY_QUERY_MAP to get
                            the search text for the API call.
            limit (int): Max articles to return. Kept at 10 by default
                         to conserve the point budget per call.

        Returns:
            List[Article]: Mapped Article objects. Returns [] on any failure.
        """
        if not self.api_key:
            logger.debug("[WorldNewsAI] No API key configured β€” skipping.")
            return []

        # ── PHASE 16: Redis-backed daily budget guard ────────────────────────
        # Check how many times we have already called WorldNewsAI TODAY using
        # the Redis counter (not self.request_count, which lives in RAM).
        #
        # Today's date string (UTC) is part of the Redis key so the counter
        # automatically resets at midnight UTC without any manual work.
        # Example key: "provider:state:worldnewsai:calls:2026-03-03"
        #
        # If Redis is unreachable: get_provider_counter returns a huge
        # fail-safe value, so we skip the call rather than risk overspending.
        today_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        current_calls = await get_provider_counter("worldnewsai", today_str)

        if current_calls >= self.daily_limit:
            logger.warning(
                "[WorldNewsAI] Daily Redis budget exhausted β€” %d/%d calls used today. "
                "Skipping to protect the API quota.",
                current_calls, self.daily_limit
            )
            self.mark_rate_limited()
            return []

        search_text = CATEGORY_QUERY_MAP.get(category, "technology news")

        params = {
            "text": search_text,
            "language": "en",
            "number": min(limit, ARTICLES_PER_REQUEST),
            "api-key": self.api_key,
            # NOTE: No date filters applied here intentionally.
            # WorldNewsAI supports 'earliest-publish-date' and
            # 'latest-publish-date', but our freshness gate handles
            # date filtering more accurately using IST boundaries.
        }

        try:
            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
                # FIX: was print() β€” which bypasses the logging configuration
                # (levels, handlers, formatting) that every other message in
                # this provider uses. Routed through the module logger with
                # lazy %-style args.
                logger.info(
                    "[WorldNewsAI] Fetching '%s' (query='%s...')...",
                    category, search_text[:40]
                )
                response = await client.get(WORLDNEWSAI_SEARCH_URL, params=params)

                # ── HTTP 402: Point quota fully exhausted ─────────────────
                # 402 means we are out of points for today β€” not just rate
                # limited, but completely blocked until tomorrow's reset.
                # We mark the provider as RATE_LIMITED (not ERROR) so it can
                # recover after the scheduler's daily quota reset cycle.
                if response.status_code == 402:
                    logger.warning(
                        "[WorldNewsAI] HTTP 402 β€” point quota exhausted. "
                        "No more calls until tomorrow's reset."
                    )
                    self.mark_rate_limited()
                    return []

                # ── HTTP 401: Invalid or expired API key ──────────────────
                if response.status_code == 401:
                    logger.error(
                        "[WorldNewsAI] HTTP 401 β€” API key is invalid or expired. "
                        "Check WORLDNEWS_API_KEY in your .env file."
                    )
                    self.status = ProviderStatus.ERROR
                    return []

                # ── HTTP 429: Too many requests (short-term rate limit) ───
                if response.status_code == 429:
                    logger.warning("[WorldNewsAI] HTTP 429 β€” request rate exceeded.")
                    self.mark_rate_limited()
                    return []

                # ── Any other non-200 ─────────────────────────────────────
                if response.status_code != 200:
                    logger.warning(
                        f"[WorldNewsAI] Unexpected HTTP {response.status_code}."
                    )
                    return []

                # ── Parse the response ─────────────────────────────────────────
                self.request_count += 1  # Keep RAM shadow in sync for debugging
                data = response.json()

                # WorldNewsAI wraps articles in a top-level 'news' key
                raw_articles = data.get("news", [])

                if not raw_articles:
                    logger.info(
                        f"[WorldNewsAI] No articles returned for '{category}'."
                    )
                    return []

                articles = self._map_articles(raw_articles, category)

                # ── PHASE 16: Increment the Redis counter after a successful call ──
                # We only count successful 200 responses, not failures.
                # A failed call that returns [] should NOT burn our daily budget.
                await increment_provider_counter("worldnewsai", today_str)

                logger.info("[WorldNewsAI] Got %d articles for '%s'.", len(articles), category)
                return articles

        except httpx.TimeoutException:
            logger.warning("[WorldNewsAI] Request timed out.")
            return []
        except Exception as e:
            logger.error(f"[WorldNewsAI] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPER β€” maps raw JSON items to Article objects
    # ─────────────────────────────────────────────────────────────────────────

    def _map_articles(self, raw_articles: list, category: str) -> List[Article]:
        """
        Convert WorldNewsAI JSON items into Segmento Pulse Article objects.

        Key transformations:
          - 'text' field is truncated to DESCRIPTION_MAX_CHARS (body is too long)
          - 'authors' is a list β€” we join it with ", " into one string
          - 'image' maps directly to image_url

        WorldNewsAI field      β†’ Article field
        ──────────────────────────────────────
        title                  β†’ title
        url                    β†’ url
        image                  β†’ image_url
        publish_date           β†’ published_at
        authors (list)         β†’ source (joined)
        text (truncated)       β†’ description

        Args:
            raw_articles (list): The 'news' array from the API response.
            category (str): The aggregator's category for routing.

        Returns:
            List[Article]: Clean Article objects ready for the pipeline.
        """
        articles: List[Article] = []

        for item in raw_articles:
            if not isinstance(item, dict):
                continue

            # ── Title ────────────────────────────────────────────────────
            title = (item.get("title") or "").strip()
            if not title:
                continue

            # ── URL ──────────────────────────────────────────────────────
            url = (item.get("url") or "").strip()
            if not url or not url.startswith("http"):
                continue

            # ── Image URL ─────────────────────────────────────────────────
            image_url = (item.get("image") or "").strip()

            # ── Published Date ────────────────────────────────────────────
            # WorldNewsAI returns ISO 8601 format (e.g., "2026-03-03 06:00:00")
            # Our Article model's published_at validator can handle this.
            published_at = item.get("publish_date") or ""

            # ── Source (from authors list) ────────────────────────────────
            # 'authors' is a list of names, e.g., ["Jane Doe", "John Smith"]
            # We join them into a comma-separated string for the source field.
            # FIX: guard against non-string entries (null/objects in the API
            # payload) which would crash .strip() on the whole batch.
            authors = item.get("authors") or []
            if isinstance(authors, list) and authors:
                clean_authors = [
                    a.strip() for a in authors
                    if isinstance(a, str) and a.strip()
                ]
                source = ", ".join(clean_authors) if clean_authors else "WorldNewsAI"
            else:
                source = "WorldNewsAI"

            # ── Description (TRUNCATED body text) ─────────────────────────
            # WorldNewsAI returns the FULL article body in 'text'.
            # This is thousands of words β€” we MUST truncate it.
            # 200 characters gives a readable preview without storing
            # copyright-protected full content in our database.
            raw_text = (item.get("text") or item.get("summary") or "").strip()
            if len(raw_text) > DESCRIPTION_MAX_CHARS:
                description = raw_text[:DESCRIPTION_MAX_CHARS] + "..."
            else:
                description = raw_text

            # ── Build Article ─────────────────────────────────────────────
            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source=source,
                    # ── ROUTING RULE ──────────────────────────────────────
                    # Pass through the aggregator's category.
                    # Unknown categories safely route to 'News Articles'.
                    category=category,
                )
                articles.append(article)

            except Exception as e:
                logger.debug(
                    f"[WorldNewsAI] Skipped item '{title[:50]}': {e}"
                )
                continue

        return articles
app/services/scheduler.py CHANGED
@@ -17,6 +17,8 @@ from app.services.upstash_cache import get_upstash_cache # Needed to bust stal
17
  from app.services.adaptive_scheduler import get_adaptive_scheduler, AdaptiveScheduler
18
  from app.services.research_aggregator import ResearchAggregator
19
  from app.config import settings
 
 
20
 
21
  # Setup logging
22
  logging.basicConfig(level=logging.INFO)
@@ -377,6 +379,149 @@ async def fetch_daily_research():
377
  logger.info("═" * 80)
378
 
379
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  async def fetch_and_validate_category(category: str, aggregator) -> tuple:
381
  """
382
  Fetch and validate articles for a single category.
@@ -393,6 +538,7 @@ async def fetch_and_validate_category(category: str, aggregator) -> tuple:
393
  from app.utils.date_parser import normalize_article_date
394
  from app.utils.url_canonicalization import canonicalize_url
395
  from app.utils.redis_dedup import is_url_seen_or_mark
 
396
 
397
  try:
398
  logger.info("πŸ“Œ Fetching %s...", category.upper())
@@ -477,12 +623,41 @@ async def fetch_and_validate_category(category: str, aggregator) -> tuple:
477
  continue
478
 
479
  # Step 4: Normalize date to UTC ISO-8601.
480
- article = normalize_article_date(article)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
481
 
482
- # Step 5: Sanitize and clean the article fields.
483
- clean_article = sanitize_article(article)
484
- valid_articles.append(clean_article)
485
-
486
  logger.info("βœ“ %s: %d valid, %d invalid, %d irrelevant",
487
  category.upper(), len(valid_articles), invalid_count, irrelevant_count)
488
  return (category, valid_articles, invalid_count, irrelevant_count, relevant_count)
 
17
  from app.services.adaptive_scheduler import get_adaptive_scheduler, AdaptiveScheduler
18
  from app.services.research_aggregator import ResearchAggregator
19
  from app.config import settings
20
+ # Phase 13: Global image enrichment β€” fills missing og:image across ALL providers
21
+ from app.services.utils.image_enricher import extract_top_image
22
 
23
  # Setup logging
24
  logging.basicConfig(level=logging.INFO)
 
379
  logger.info("═" * 80)
380
 
381
 
382
+ # ──────────────────────────────────────────────────────────────────────────────
383
+ # PHASE 13: GLOBAL IMAGE ENRICHMENT SAFETY NET
384
+ # ──────────────────────────────────────────────────────────────────────────────
385
+ #
386
+ # What this does:
387
+ # After all validation and deduplication gates have passed, some articles
388
+ # still arrive with an empty or missing image_url. This happens most often
389
+ # with providers like OpenRSS (blog feeds without media tags), Webz.io
390
+ # (small sites without a thread.main_image), and SauravKanchan (NewsAPI
391
+ # null urlToImage). This function visits the article's URL and tries to
392
+ # extract the og:image meta tag β€” the standard way websites declare their
393
+ # main thumbnail image.
394
+ #
395
+ # Why AFTER deduplication?
396
+ # We only enrich articles that actually passed every gate and are about to
397
+ # be saved. We never spend HTTP calls on articles that will be thrown away.
398
+ #
399
+ # Safety guards:
400
+ # 1. MAX_ENRICH_PER_RUN = 20 β€” Hard cap. If 50 no-image articles arrive,
401
+ # we only enrich the first 20, leave the rest as "", and the Pulse banner
402
+ # shows on the frontend. This stops a rogue provider from bottlenecking
403
+ # the cron job.
404
+ # 2. asyncio.Semaphore(10) β€” At most 10 web-page fetches happen at the
405
+ # same time. This prevents memory spikes and avoids hammering websites.
406
+ # 3. Individual 4-second timeout (inside extract_top_image) β€” A broken URL
407
+ # is cancelled in 4 seconds. With Semaphore(10) and MAX 20 articles:
408
+ # worst-case total overhead = (20 / 10) Γ— 4 = 8 seconds per category run.
409
+ # 4. Zero side-effects β€” A failed enrichment returns the article unchanged.
410
+ # The enricher NEVER removes an article from the pipeline.
411
+ #
412
async def enrich_missing_images_in_batch(articles: list) -> list:
    """
    Fill in missing image_url values on a batch of fully-vetted articles.

    Up to MAX_ENRICH_PER_RUN articles lacking a valid image are enriched by
    visiting their URL and reading the og:image meta tag. Articles that
    already carry an image pass through untouched, at zero network cost.

    Args:
        articles (list): Final, deduplicated, validated Article objects.

    Returns:
        list: The same articles, with image_url filled where possible.
              Never raises. Never removes an article.
    """
    if not articles:
        return articles

    # Hard cap on network fetches per run. Anything beyond the cap keeps an
    # empty image_url and falls back to the Pulse banner on the frontend,
    # so one rogue provider cannot bottleneck the cron job.
    MAX_ENRICH_PER_RUN = 20

    # At most 10 page fetches in flight at once β€” like 10 checkout lanes:
    # extra work queues up instead of overwhelming memory or the target sites.
    lane_guard = asyncio.Semaphore(10)

    # Which articles actually need an image?
    missing = [
        art for art in articles
        if not art.image_url or not art.image_url.startswith("http")
    ]
    attempt_count = min(len(missing), MAX_ENRICH_PER_RUN)

    if attempt_count == 0:
        # Every article already has a valid image β€” nothing to do.
        return articles

    logger.info(
        "πŸ–ΌοΈ [IMAGE ENRICHER] %d article(s) missing images β€” enriching up to %d...",
        len(missing), attempt_count
    )

    # Only the capped subset of URLs is eligible for a network fetch.
    eligible_urls = {str(art.url) for art in missing[:MAX_ENRICH_PER_RUN]}

    async def _worker(art) -> object:
        """Fetch an image for one eligible article; pass all others through."""
        link = str(art.url) if art.url else ""

        # Already has an image, or falls outside the cap β€” no network call.
        if link not in eligible_urls:
            return art

        async with lane_guard:
            # extract_top_image enforces its own 4-second ceiling, so this
            # lane frees up quickly no matter what the target site does.
            found = await extract_top_image(link)

        if found and found.startswith("http"):
            # model_copy() is the correct Pydantic v2 way to update an
            # immutable model instance.
            return art.model_copy(update={"image_url": found})

        # Nothing usable found β€” hand the article back unchanged.
        return art

    # Fan out every article at once; the semaphore alone throttles how many
    # actually touch the network simultaneously (max 10).
    try:
        outcomes = await asyncio.gather(
            *[_worker(art) for art in articles],
            return_exceptions=True
        )

        # A crashed worker must never drop an article β€” substitute the original.
        repaired = []
        for fallback, outcome in zip(articles, outcomes):
            if isinstance(outcome, Exception):
                logger.debug(
                    "[IMAGE ENRICHER] Worker exception for %s: %s",
                    str(fallback.url)[:60], outcome
                )
                repaired.append(fallback)
            else:
                repaired.append(outcome)

        with_images = sum(
            1 for art in repaired if art.image_url and art.image_url.startswith("http")
        )
        logger.info(
            "βœ… [IMAGE ENRICHER] Done β€” %d/%d articles now have images.",
            with_images, len(repaired)
        )
        return repaired

    except Exception as e:
        # If the entire gather somehow fails, return the list untouched.
        logger.error("[IMAGE ENRICHER] Gather failed: %s β€” returning articles unchanged.", e)
        return articles
523
+
524
+
525
  async def fetch_and_validate_category(category: str, aggregator) -> tuple:
526
  """
527
  Fetch and validate articles for a single category.
 
538
  from app.utils.date_parser import normalize_article_date
539
  from app.utils.url_canonicalization import canonicalize_url
540
  from app.utils.redis_dedup import is_url_seen_or_mark
541
+ from app.models import Article # Needed to reconstruct Pydantic model after date normalization
542
 
543
  try:
544
  logger.info("πŸ“Œ Fetching %s...", category.upper())
 
623
  continue
624
 
625
  # Step 4: Normalize date to UTC ISO-8601.
626
+ # IMPORTANT: normalize_article_date() always returns a plain dict
627
+ # (it calls model_dump() internally). We reconstruct the Pydantic
628
+ # Article right after so that enrich_missing_images_in_batch()
629
+ # (Phase 13, below) gets the .image_url attribute it needs.
630
+ normalized_dict = normalize_article_date(article)
631
+ try:
632
+ article = Article(**normalized_dict)
633
+ except Exception:
634
+ # If reconstruction fails for any reason, skip this article.
635
+ # The dict is malformed β€” better to drop it than crash.
636
+ invalid_count += 1
637
+ continue
638
+
639
+ # Step 5: Article is now a clean Pydantic object with a normalized date.
640
+ # We intentionally do NOT call sanitize_article() yet β€” that step
641
+ # runs AFTER image enrichment below.
642
+ valid_articles.append(article)
643
+
644
+ # ── PHASE 13: GLOBAL IMAGE ENRICHMENT ─────────────────────────────────
645
+ # This is the bottom of the funnel. Every article here has already:
646
+ # βœ“ Passed basic validation (title, URL, date exist)
647
+ # βœ“ Passed category relevance check
648
+ # βœ“ Passed Redis 48-hour deduplication (it is a NEW article)
649
+ # βœ“ Been date-normalized
650
+ # Articles are still Pydantic objects here β€” enrichment needs .image_url.
651
+ if valid_articles:
652
+ valid_articles = await enrich_missing_images_in_batch(valid_articles)
653
+
654
+ # ── SANITIZE (after enrichment) ────────────────────────────────────────
655
+ # Now that images are filled, convert each Pydantic Article to a clean
656
+ # dict for Appwrite storage. sanitize_article() strips unsafe chars,
657
+ # trims lengths, and returns the final dict payload.
658
+ valid_articles = [sanitize_article(a) for a in valid_articles]
659
+ # ──────────────────────────────────────────────────────────────────────
660
 
 
 
 
 
661
  logger.info("βœ“ %s: %d valid, %d invalid, %d irrelevant",
662
  category.upper(), len(valid_articles), invalid_count, irrelevant_count)
663
  return (category, valid_articles, invalid_count, irrelevant_count, relevant_count)
app/services/utils/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # app/services/utils/__init__.py
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ # This folder contains shared helper utilities that are used by multiple
4
+ # providers. They are NOT providers themselves β€” they are small tools that
5
+ # providers can import to do common jobs.
6
+ #
7
+ # Current utilities:
8
+ # image_enricher.py β€” Extracts the main image from any article URL
app/services/utils/image_enricher.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app/services/utils/image_enricher.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ Shared Image Enrichment Utility for Segmento Pulse.
5
+
6
+ What this does:
7
+ Given any article URL, this tool visits the page and tries to find the
8
+ main (top) image that the website publisher chose for that article.
9
+
10
+ It does this by reading two standard HTML meta tags:
11
+ 1. og:image β€” Open Graph (used by Facebook, LinkedIn, Twitter)
12
+ 2. twitter:image β€” Twitter Card image
13
+
14
+ Almost every modern news website, blog, and tech publication sets at least
15
+ one of these tags. They are the industry-standard way to declare "this is
16
+ my article's main image".
17
+
18
+ ── WHY WE USE bs4 + httpx INSTEAD OF newspaper4k ────────────────────────────
19
+
20
+ The user directive requested newspaper4k (a modern async fork of newspaper3k).
21
+ However, newspaper4k is not in our requirements.txt and would add a heavy new
22
+ dependency with many sub-packages (including lxml, Pillow, and others).
23
+
24
+ Our current stack already has everything we need:
25
+ βœ“ httpx β€” async HTTP client (already in requirements.txt)
26
+ βœ“ beautifulsoup4 β€” HTML parser (already in requirements.txt)
27
+ βœ“ lxml β€” fast XML/HTML parser (already in requirements.txt)
28
+
29
+ The og:image meta tag approach is exactly what newspaper4k uses internally
30
+ for its top_image property. We get the same result without a new dependency.
31
+
32
+ This decision follows our Version First-Scan Protocol: never add a library
33
+ when an existing installed library can do the same job.
34
+
35
+ ── HOW THE TIMEOUT PROTECTION WORKS ─────────────────────────────────────────
36
+
37
+ Some websites are slow, broken, or behind Cloudflare protection pages.
38
+ If we waited forever for them, our entire ingestion pipeline would freeze.
39
+
40
+ Two layers of protection:
41
+ 1. httpx timeout: 3 seconds max to receive any response at all.
42
+ If the server doesn't respond in 3 seconds, httpx raises TimeoutException.
43
+
44
+ 2. asyncio.wait_for: 4 seconds total ceiling for the entire function.
45
+ Even if httpx somehow hangs (rare), this outer guard kills it.
46
+
47
+ 3. Universal try/except: Catches EVERYTHING. A bad image URL will NEVER
48
+ crash a provider. The worst it can do is return "".
49
+
50
+ The function signature is intentionally similar to newspaper4k's approach
51
+ so that future migration is a one-line change if newspaper4k is later added.
52
+ """
53
+
54
+ # ── Standard Library ──────────────────────────────────────────────────────────
55
+ import asyncio
56
+ import logging
57
+ from typing import Optional
58
+
59
+ # ── Third-party (already in requirements.txt) ─────────────────────────────────
60
+ import httpx
61
+ from bs4 import BeautifulSoup
62
+
63
+ logger = logging.getLogger(__name__)
64
+
65
# ── Timing constants ──────────────────────────────────────────────────────────

# How long to wait for the target website to respond (seconds).
# 3 seconds is generous enough for normal websites, short enough to not
# freeze our pipeline if a URL is broken or behind Cloudflare.
HTTP_FETCH_TIMEOUT = 3.0

# Hard outer ceiling for the entire extract_top_image() call.
# Even if httpx somehow hangs past its own timeout, asyncio.wait_for
# will forcibly cancel the task at this point.
# NOTE: must stay strictly greater than HTTP_FETCH_TIMEOUT, otherwise the
# outer guard would fire before the HTTP client's own timeout can.
OUTER_TIMEOUT_SECONDS = 4.0
76
+
77
+
78
async def extract_top_image(url: str) -> str:
    """
    Visit an article URL and return its main (top) image URL.

    Reads the standard meta tags in priority order:
        1. <meta property="og:image" content="...">
        2. <meta name="twitter:image" content="...">

    Args:
        url (str): Full article URL (must start with "http").

    Returns:
        str: The image URL if found and valid, otherwise "".

    Never raises: any timeout, network failure, or parse problem simply
    yields "" and the pipeline shows the Pulse banner instead.
    """
    # Reject empty / non-HTTP inputs up front β€” zero network cost.
    if not url or not url.startswith("http"):
        return ""

    try:
        # Hard outer ceiling: even if the inner fetch somehow hangs past its
        # own httpx timeout, wait_for cancels it at OUTER_TIMEOUT_SECONDS and
        # control falls into the except blocks below.
        return await asyncio.wait_for(
            _fetch_and_extract(url),
            timeout=OUTER_TIMEOUT_SECONDS,
        )

    except asyncio.TimeoutError:
        logger.debug(f"[ImageEnricher] Outer timeout for: {url[:60]}")
        return ""
    except Exception as e:
        logger.debug(f"[ImageEnricher] Failed for '{url[:60]}': {e}")
        return ""
115
+
116
+
117
async def _fetch_and_extract(url: str) -> str:
    """
    Download the page HTML and pull the main image out of its meta tags.

    Kept separate from extract_top_image() so asyncio.wait_for() has a
    clean coroutine it can cancel on timeout.

    Args:
        url (str): Full article URL.

    Returns:
        str: Image URL from og:image / twitter:image, or "" if none found.
    """
    # ── Download ─────────────────────────────────────────────────────────────
    request_headers = {
        # Some sites block clients without a browser User-Agent; identify as
        # a polite bot with a browser-like signature.
        "User-Agent": (
            "Mozilla/5.0 (compatible; SegmentoPulse-ImageBot/1.0; "
            "+https://segmento.in)"
        ),
        # Hint that we only need HTML β€” not all servers honour it, but it
        # costs nothing to ask.
        "Accept": "text/html",
    }

    try:
        async with httpx.AsyncClient(timeout=HTTP_FETCH_TIMEOUT) as client:
            response = await client.get(
                url,
                headers=request_headers,
                follow_redirects=True,
            )
            if response.status_code != 200:
                return ""
            page_html = response.text
    except Exception:
        # Network error, timeout, SSL failure, etc. β€” treat as "no image".
        return ""

    # ── Parse ────────────────────────────────────────────────────────────────
    # og:image lives in <head>, always near the top of the document, so the
    # first 10,000 characters are plenty and keep parsing fast on huge pages.
    head_slice = page_html[:10_000]

    # Prefer the fast lxml backend; fall back to the built-in parser if the
    # HTML is malformed enough to trip lxml.
    soup = None
    for parser_name in ("lxml", "html.parser"):
        try:
            soup = BeautifulSoup(head_slice, parser_name)
            break
        except Exception:
            continue
    if soup is None:
        return ""

    def _usable_content(tag) -> str:
        """Return the tag's content attribute if it is a usable http URL."""
        if not tag:
            return ""
        candidate = (tag.get("content") or "").strip()
        return candidate if candidate.startswith("http") else ""

    # ── Priority 1: Open Graph image (most reliable) ─────────────────────────
    og_image = _usable_content(soup.find("meta", property="og:image"))
    if og_image:
        logger.debug(f"[ImageEnricher] og:image found for {url[:50]}")
        return og_image

    # ── Priority 2: Twitter Card image (common fallback) ─────────────────────
    tw_image = _usable_content(soup.find("meta", attrs={"name": "twitter:image"}))
    if tw_image:
        logger.debug(f"[ImageEnricher] twitter:image found for {url[:50]}")
        return tw_image

    # No image tag found β€” return empty, let the banner fallback handle it.
    logger.debug(f"[ImageEnricher] No meta image tag found for: {url[:60]}")
    return ""
app/services/utils/provider_state.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app/services/utils/provider_state.py
3
+ ─────────────────────────────────────────────────────────────────────────────
4
+ Phase 15: Unified Redis State Architecture
5
+
6
+ What this does:
7
+ Saves and restores provider "state" β€” things like "when did we last call
8
+ OpenRSS?" and "how many times have we called Webz today?" β€” to our
9
+ Upstash Redis instance.
10
+
11
+ Why we need this:
12
+ Our backend runs on Hugging Face Spaces, which can restart at any time.
13
+ When a restart happens, all Python RAM is wiped. Without this utility:
14
+ - OpenRSS's 60-minute cooldown resets to 0, so we hammer them on
15
+ every restart and eventually get an IP ban.
16
+ - Webz's monthly budget counter resets, so we can burn our entire
17
+ month's calls in a single bad restart day.
18
+
19
+ With this utility:
20
+ - Even if the server restarts 10 times in an hour, Redis remembers
21
+ the exact timestamp of the last OpenRSS call and the exact number
22
+ of Webz calls made today. Provider quotas are now restart-proof.
23
+
24
+ How it works:
25
+ Two pairs of async functions:
26
+ 1. Timestamps (for cooldown timers like OpenRSS):
27
+ get_provider_timestamp("openrss") β†’ float (Unix timestamp)
28
+ set_provider_timestamp("openrss", time.time())
29
+
30
+ 2. Counters (for daily/monthly budgets like Webz, WorldNewsAI):
31
+ get_provider_counter("webz", "2026-03-03") β†’ int
32
+ increment_provider_counter("webz", "2026-03-03")
33
+
34
+ Redis key format:
35
+ Timestamps: provider:state:{provider_name}:last_fetch
36
+ Counters: provider:state:{provider_name}:calls:{date_key}
37
+
38
+ Mirrored directly from circuit_breaker.py's approach:
39
+ - Same get_upstash_cache() import
40
+ - Same _execute_command([...]) API
41
+ - Same fail-safe try/except pattern
42
+
43
+ Fail-open vs Fail-safe design:
44
+ - get_provider_timestamp: returns 0.0 on Redis failure
45
+ β†’ Provider assumes "never fetched before" β†’ allowed to run
46
+ β†’ This is CORRECT for free providers (OpenRSS). Missing one cooldown
47
+ check is less dangerous than permanently blocking the provider.
48
+
49
+ - get_provider_counter: returns 999999 on Redis failure
50
+ β†’ Provider assumes "budget exhausted" β†’ safely skips the run
51
+ β†’ This is CORRECT for paid providers (Webz, WorldNewsAI). We would
52
+ rather miss one run than accidentally overspend our API budget.
53
+
54
+ Thread safety:
55
+ asyncio is single-threaded. All functions below use `await`. Only one
56
+ coroutine runs at a time, so there are no race conditions to worry about
57
+ within a single Python process. No locks needed.
58
+ """
59
+
60
+ import logging
61
+ from typing import Optional
62
+
63
+ logger = logging.getLogger(__name__)
64
+
65
+
66
+ # ── Key Builders ─────────────────────────────────────────────────────────────
67
+ # Centralizing the key format here means if we ever need to change it,
68
+ # we change it in one place and every provider picks up the fix automatically.
69
+
70
+ def _timestamp_key(provider_name: str) -> str:
71
+ """
72
+ Build the Redis key string for a provider's last-fetch timestamp.
73
+
74
+ Example:
75
+ provider_name = "openrss"
76
+ β†’ "provider:state:openrss:last_fetch"
77
+ """
78
+ return f"provider:state:{provider_name}:last_fetch"
79
+
80
+
81
+ def _counter_key(provider_name: str, date_key: str) -> str:
82
+ """
83
+ Build the Redis key string for a provider's daily call counter.
84
+
85
+ date_key is normally a date string like "2026-03-03" so the key
86
+ automatically changes every day without needing a manual reset.
87
+
88
+ Example:
89
+ provider_name = "webz", date_key = "2026-03-03"
90
+ β†’ "provider:state:webz:calls:2026-03-03"
91
+ """
92
+ return f"provider:state:{provider_name}:calls:{date_key}"
93
+
94
+
95
+ # ── Timestamp Functions (for cooldown timers) ─────────────────────────────────
96
+
97
async def get_provider_timestamp(provider_name: str) -> float:
    """
    Fetch a provider's last-fetch Unix timestamp from Redis.

    Fail-open contract: any problem at all (Redis down, key missing, value
    unparseable) yields 0.0, which the caller reads as "never fetched" and
    therefore lets the provider run. That is the right default for free
    providers with cooldown timers (OpenRSS) — one extra call is far
    cheaper than a permanently blocked provider.

    Args:
        provider_name (str): Short name like "openrss".

    Returns:
        float: Unix timestamp of the last fetch, or 0.0 when unknown.
    """
    try:
        from app.services.upstash_cache import get_upstash_cache

        redis = get_upstash_cache()
        stored = await redis._execute_command(["GET", _timestamp_key(provider_name)])
        # Redis GET hands back a string like "1740000000.123", or None
        # when the key has never been written (or has already expired).
        return 0.0 if stored is None else float(stored)

    except Exception as exc:
        # Redis unreachable or the stored value was garbage — fail open so
        # the provider is still allowed to run on this scheduler cycle.
        logger.warning(
            "[provider_state] get_provider_timestamp('%s') failed (%s) "
            "— returning 0.0 (fail-open: provider will be allowed to run).",
            provider_name, exc
        )
        return 0.0
140
+
141
+
142
async def set_provider_timestamp(
    provider_name: str,
    timestamp: float,
    expire_seconds: int = 7200,  # Default TTL: 2 hours
) -> None:
    """
    Save a provider's last-fetch timestamp to Redis.

    Always call this BEFORE you start the actual network request, not after.
    If you save it AFTER and the request crashes halfway through, the provider
    will think "I was never blocked" and fire again immediately on the next
    scheduler cycle — the exact opposite of what the cooldown is supposed to do.

    The TTL (expire_seconds) is a safety net. If the key is never explicitly
    deleted, Redis removes it automatically after 2 hours so it doesn't sit
    in memory forever. 2 hours is safely above the 60-minute cooldown.

    Args:
        provider_name (str): Short name like "openrss".
        timestamp (float): Unix timestamp (use time.time() for the current one).
        expire_seconds (int): How long to keep this key in Redis. Default: 7200s (2h).
    """
    try:
        from app.services.upstash_cache import get_upstash_cache
        cache = get_upstash_cache()

        key = _timestamp_key(provider_name)
        # Store the float as a string — Redis stores all values as strings anyway.
        # "SET key value EX seconds" sets both the value and the TTL in one call.
        # FIX: expire_seconds is stringified like every other numeric argument
        # sent through _execute_command in this module (see str(amount) and
        # str(expire_seconds) in increment_provider_counter), so the Upstash
        # REST command array is always a uniform list of strings.
        await cache._execute_command(["SET", key, str(timestamp), "EX", str(expire_seconds)])

        logger.debug(
            "[provider_state] Saved last_fetch timestamp for '%s' to Redis (TTL=%ds).",
            provider_name, expire_seconds
        )

    except Exception as e:
        # Redis write failed. This is recoverable — the cooldown will just
        # fall back to RAM-based tracking for this run. Log it and move on.
        logger.warning(
            "[provider_state] set_provider_timestamp('%s') failed (%s) "
            "— cooldown state will not survive a server restart for this run.",
            provider_name, e
        )
186
+
187
+
188
+ # ── Counter Functions (for daily/monthly API budgets) ─────────────────────────
189
+
190
async def get_provider_counter(provider_name: str, date_key: str) -> int:
    """
    Read how many API calls a provider has made on `date_key` from Redis.

    Fail-safe contract: if Redis can't be reached, return 999999 so the
    caller believes the budget is already spent and skips the run. For the
    paid providers this guards (Webz, WorldNewsAI) a skipped run costs
    nothing, while an overspent budget could cost real money.

    Args:
        provider_name (str): Short name like "webz" or "worldnewsai".
        date_key (str): Date string like "2026-03-03" (use UTC date).
            Keying by date makes the counter roll over automatically
            each morning — yesterday's key simply expires on its own.

    Returns:
        int: Calls made on that date, or 999999 when Redis is unavailable.
    """
    try:
        from app.services.upstash_cache import get_upstash_cache

        redis = get_upstash_cache()
        stored = await redis._execute_command(
            ["GET", _counter_key(provider_name, date_key)]
        )
        # A missing key means no calls have been recorded for this date yet.
        return 0 if stored is None else int(stored)

    except Exception as exc:
        # Fail SAFE: pretend the budget is exhausted rather than risk
        # burning paid API credits while Redis is down.
        logger.warning(
            "[provider_state] get_provider_counter('%s', '%s') failed (%s) "
            "— returning 999999 (fail-safe: provider will be skipped this run).",
            provider_name, date_key, exc
        )
        return 999999
232
+
233
+
234
async def increment_provider_counter(
    provider_name: str,
    date_key: str,
    amount: int = 1,
    expire_seconds: int = 86400,  # Default TTL: 24 hours (one full day)
) -> None:
    """
    Add `amount` to a provider's daily call counter in Redis.

    INCRBY is atomic on the Redis side, so concurrent writers cannot lose
    updates — mostly a nicety here, since this backend is single-process
    asyncio. After the increment we always re-arm the key's TTL, so even a
    key created hours earlier gets a fresh 24-hour lifetime from the moment
    of the latest update and cannot vanish mid-day.

    Args:
        provider_name (str): Short name like "webz" or "worldnewsai".
        date_key (str): Date string like "2026-03-03" (use UTC date).
        amount (int): How much to add to the counter. Default: 1.
        expire_seconds (int): Key TTL in seconds. Default: 86400 (24 hours).
    """
    try:
        from app.services.upstash_cache import get_upstash_cache

        redis = get_upstash_cache()
        counter_key = _counter_key(provider_name, date_key)

        # Atomically add `amount`; Redis creates the key at 0 when it's new.
        await redis._execute_command(["INCRBY", counter_key, str(amount)])

        # Refresh the expiry countdown so the key survives the rest of the day.
        await redis._execute_command(["EXPIRE", counter_key, str(expire_seconds)])

        logger.debug(
            "[provider_state] Incremented call counter for '%s' on '%s' by %d.",
            provider_name, date_key, amount
        )

    except Exception as exc:
        # The Redis write failed, so this call isn't reflected in Redis —
        # the in-memory tracking (request_count) still enforces the limit.
        logger.warning(
            "[provider_state] increment_provider_counter('%s', '%s') failed (%s) "
            "— this call will not be counted in Redis. In-memory limit still applies.",
            provider_name, date_key, exc
        )
app/utils/data_validation.py CHANGED
@@ -268,158 +268,272 @@ def calculate_quality_score(article: Dict) -> int:
268
  return min(max(score, 0), 100)
269
 
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  def is_relevant_to_category(article: Union[Dict, 'Article'], category: str) -> bool:
272
  """
273
- Validate that article is relevant to the specified category
274
-
275
- HOTFIX: Now handles both Pydantic Article objects and dicts
276
-
277
- Prevents category pollution (e.g., "Apple pie" in Tech)
278
-
279
- Returns True only if article contains category-specific keywords
 
 
 
 
 
 
 
 
 
280
  """
281
- # HOTFIX: Convert to dict if needed
282
  if hasattr(article, 'model_dump'):
283
  article_dict = article.model_dump()
284
  elif hasattr(article, 'dict'):
285
  article_dict = article.dict()
286
  else:
287
  article_dict = article
288
-
289
- # Category keyword dictionaries
290
- # Each category has a list of words we scan for in the article's title,
291
- # description, AND URL path. If at least one word matches, the article passes.
292
- CATEGORY_KEYWORDS = {
293
- 'ai': [
294
- 'ai', 'artificial intelligence', 'machine learning', 'deep learning',
295
- 'neural network', 'gpt', 'llm', 'chatgpt', 'generative ai',
296
- 'computer vision', 'nlp', 'natural language', 'transformer'
297
- ],
298
- 'data-security': [
299
- 'security', 'cybersecurity', 'data breach', 'hacking', 'vulnerability',
300
- 'encryption', 'malware', 'ransomware', 'firewall', 'threat'
301
- ],
302
- 'data-governance': [
303
- 'governance', 'compliance', 'regulation', 'audit', 'policy',
304
- 'data quality', 'metadata', 'lineage', 'stewardship'
305
- ],
306
- 'data-privacy': [
307
- 'privacy', 'gdpr', 'ccpa', 'consent', 'personal data',
308
- 'pii', 'anonymization', 'data protection', 'privacy law'
309
- ],
310
- 'data-engineering': [
311
- 'data engineering', 'pipeline', 'etl', 'big data', 'spark',
312
- 'hadoop', 'kafka', 'airflow', 'data warehouse', 'snowflake'
313
- ],
314
- 'data-management': [
315
- 'data management', 'master data', 'mdm', 'data catalog',
316
- 'data quality', 'data lineage', 'data stewardship',
317
- 'data governance', 'data integration', 'reference data'
318
- ],
319
- 'business-intelligence': [
320
- 'business intelligence', 'bi', 'analytics', 'dashboard',
321
- 'tableau', 'power bi', 'looker', 'reporting', 'kpi'
322
- ],
323
- 'business-analytics': [
324
- 'analytics', 'analysis', 'insights', 'metrics', 'data-driven',
325
- 'business analytics', 'predictive', 'forecasting'
326
- ],
327
- 'customer-data-platform': [
328
- 'cdp', 'customer data', 'customer platform', 'crm',
329
- 'customer experience', 'personalization', 'segmentation'
330
- ],
331
- 'data-centers': [
332
- 'data center', 'data centre', 'datacenter', 'server', 'infrastructure',
333
- 'colocation', 'edge computing', 'hyperscale'
334
- ],
335
- 'cloud-computing': [
336
- 'cloud', 'aws', 'azure', 'google cloud', 'gcp', 'salesforce',
337
- 'alibaba cloud', 'tencent cloud', 'huawei cloud', 'cloudflare',
338
- 'saas', 'paas', 'iaas', 'serverless', 'kubernetes'
339
- ],
340
- # ── Cloud sub-categories (each maps to a specific provider) ──────────
341
- 'cloud-aws': [
342
- 'aws', 'amazon web services', 's3', 'ec2', 'lambda',
343
- 'cloudfront', 'sagemaker', 'dynamodb', 'amazon'
344
- ],
345
- 'cloud-azure': [
346
- 'azure', 'microsoft azure', 'azure devops', 'azure ml',
347
- 'azure openai', 'microsoft cloud'
348
- ],
349
- 'cloud-gcp': [
350
- 'gcp', 'google cloud', 'bigquery', 'vertex ai',
351
- 'cloud run', 'dataflow', 'google cloud platform'
352
- ],
353
- 'cloud-oracle': [
354
- 'oracle cloud', 'oci', 'oracle database', 'oracle fusion',
355
- 'oracle cloud infrastructure'
356
- ],
357
- 'cloud-ibm': [
358
- 'ibm cloud', 'ibm watson', 'red hat', 'openshift', 'ibm z'
359
- ],
360
- 'cloud-alibaba': [
361
- 'alibaba cloud', 'aliyun', 'alicloud'
362
- ],
363
- 'cloud-digitalocean': [
364
- 'digitalocean', 'droplet', 'app platform'
365
- ],
366
- 'cloud-huawei': [
367
- 'huawei cloud', 'huaweicloud'
368
- ],
369
- 'cloud-cloudflare': [
370
- 'cloudflare', 'cloudflare workers', 'cloudflare r2',
371
- 'cloudflare pages', 'zero trust'
372
- ],
373
- # ── Content / publishing categories ───────────────────────────────────
374
- 'medium-article': [
375
- 'medium', 'article', 'blog', 'writing', 'publishing',
376
- 'content', 'story', 'author', 'blogging'
377
- ],
378
- 'magazines': [
379
- 'technology', 'tech', 'innovation', 'digital', 'startup',
380
- 'software', 'hardware', 'gadget'
381
- ]
382
- }
383
-
384
- # Get keywords for this category
385
- keywords = CATEGORY_KEYWORDS.get(category, [])
386
-
387
- if not keywords:
388
- # Unknown category - allow (don't reject)
389
  return True
390
-
391
- # Build the text we will search for keywords.
392
- # We use title + description as the primary source.
393
- # We also append the article's URL path because RSS feeds (especially Google News)
394
- # often return empty descriptions. The URL itself usually tells you what the
395
- # article is about β€” e.g. "/aws-launches-new-s3-feature" clearly contains 'aws' and 's3'.
396
- # Hyphens and slashes are replaced with spaces so words can be matched individually.
397
- title = (article_dict.get('title') or '').lower()
 
398
  description = (article_dict.get('description') or '').lower()
399
 
400
- # Extract the URL path safely.
401
  raw_url = article_dict.get('url') or ''
402
  url_str = str(raw_url).lower()
403
  try:
404
  parsed_url = urlparse(url_str)
405
- # Replace hyphens and slashes with spaces so
406
- # "/aws-new-s3-launch" becomes "aws new s3 launch".
407
  url_words = parsed_url.path.replace('-', ' ').replace('/', ' ')
408
  except Exception:
409
  url_words = ''
410
 
411
- text = f"{title} {description} {url_words}"
412
-
413
- # Count keyword matches
414
- matches = sum(1 for keyword in keywords if keyword.lower() in text)
415
-
416
- # Require at least 1 keyword match (lenient for now)
417
- # Can increase to 2+ for stricter filtering
418
- if matches >= 1:
419
  return True
420
-
421
- # Log rejection for monitoring
422
- print(f"🚫 Rejected '{article_dict.get('title', 'Unknown')[:50]}' from {category} (0 keyword matches)")
 
 
 
423
  return False
424
 
425
 
 
268
  return min(max(score, 0), 100)
269
 
270
 
271
+ # ==============================================================================
272
+ # MASTER CATEGORY TAXONOMY (Phase 19 β€” Expanded Entity-Based Keywords)
273
+ # ==============================================================================
274
+ #
275
+ # This dictionary is the SINGLE SOURCE OF TRUTH for category routing.
276
+ # Every category has a rich list of keywords covering:
277
+ # β€’ The topic itself (e.g., "machine learning")
278
+ # β€’ Major companies (e.g., "openai", "anthropic")
279
+ # β€’ Flagship products (e.g., "chatgpt", "sagemaker")
280
+ # β€’ Industry acronyms (e.g., "llm", "etl", "gcp")
281
+ #
282
+ # ⚠️ IMPORTANT β€” word-boundary safety:
283
+ # Short acronyms like "ai", "bi", "aws" MUST live here β€” we protect them
284
+ # with \b regex word boundaries in COMPILED_CATEGORY_REGEX below.
285
+ # Do NOT add single-letter keywords; they can never be safe.
286
+ #
287
+ # NOTE: 'cloud-computing' is kept here because it is an active category in
288
+ # config.py, news_aggregator.py, and several providers. Removing it would
289
+ # break article routing for all generic cloud news. β€” Phase 19
290
+ # ==============================================================================
291
+ CATEGORY_KEYWORDS = {
292
+
293
+ # ── Artificial Intelligence ────────────────────────────────────────────────
294
+ 'ai': [
295
+ 'artificial intelligence', 'machine learning', 'deep learning',
296
+ 'neural network', 'gpt', 'llm', 'chatgpt', 'generative ai',
297
+ 'computer vision', 'nlp', 'natural language processing', 'transformer',
298
+ 'openai', 'anthropic', 'sam altman', 'claude', 'gemini', 'mistral',
299
+ 'llama', 'copilot', 'midjourney', 'stable diffusion', 'hugging face',
300
+ 'rag', 'vector database', 'prompt engineering', 'agi', 'agentic ai',
301
+ ],
302
+
303
+ # ── Cloud β€” generic umbrella category (must stay: used in config.py) ──────
304
+ 'cloud-computing': [
305
+ 'cloud computing', 'cloud services', 'aws', 'azure', 'google cloud',
306
+ 'gcp', 'salesforce', 'alibaba cloud', 'tencent cloud', 'huawei cloud',
307
+ 'cloudflare', 'saas', 'paas', 'iaas', 'serverless', 'kubernetes',
308
+ 'multi-cloud', 'hybrid cloud',
309
+ ],
310
+
311
+ # ── Cloud sub-categories (provider-specific) ───────────────────────────────
312
+ 'cloud-aws': [
313
+ 'aws', 'amazon web services', 's3', 'ec2', 'lambda', 'cloudfront',
314
+ 'sagemaker', 'dynamodb', 'amazon bedrock', 'aws reinvent',
315
+ 'fargate', 'aws graviton', 'elastic beanstalk',
316
+ ],
317
+ 'cloud-azure': [
318
+ 'azure', 'microsoft azure', 'azure devops', 'azure ml',
319
+ 'azure openai', 'microsoft cloud', 'azure synapse', 'cosmos db',
320
+ 'azure arc', 'microsoft entra',
321
+ ],
322
+ 'cloud-gcp': [
323
+ 'gcp', 'google cloud', 'bigquery', 'vertex ai', 'cloud run',
324
+ 'dataflow', 'google kubernetes engine', 'gke', 'google spanner',
325
+ 'anthos', 'cloud sql', 'gemini for google cloud',
326
+ ],
327
+ 'cloud-alibaba': [
328
+ 'alibaba cloud', 'aliyun', 'alicloud', 'polar db', 'maxcompute',
329
+ 'elastic compute service', 'tongyi qianwen', 'qwen',
330
+ ],
331
+ 'cloud-huawei': [
332
+ 'huawei cloud', 'huaweicloud', 'pangu model',
333
+ 'harmonyos', 'kunpeng', 'ascend ai',
334
+ ],
335
+ 'cloud-digitalocean': [
336
+ 'digitalocean', 'digital ocean', 'do droplet', 'digitalocean spaces',
337
+ 'digitalocean app platform', 'managed kubernetes', 'cloudways',
338
+ ],
339
+ 'cloud-oracle': [
340
+ 'oracle cloud', 'oci', 'oracle database', 'oracle fusion',
341
+ 'oracle cloud infrastructure', 'mysql heatwave', 'oracle apex',
342
+ ],
343
+ 'cloud-ibm': [
344
+ 'ibm cloud', 'ibm watson', 'red hat', 'openshift',
345
+ 'ibm z', 'watsonx', 'ibm mainframe',
346
+ ],
347
+ 'cloud-cloudflare': [
348
+ 'cloudflare', 'cloudflare workers', 'cloudflare r2',
349
+ 'cloudflare pages', 'zero trust',
350
+ ],
351
+
352
+ # ── Data Engineering ───────────────────────────────────────────────────────
353
+ 'data-engineering': [
354
+ 'data engineering', 'data pipeline', 'etl', 'elt', 'big data',
355
+ 'apache spark', 'hadoop', 'kafka', 'airflow', 'data warehouse',
356
+ 'snowflake', 'databricks', 'dbt', 'fivetran', 'apache iceberg',
357
+ 'delta lake', 'data lakehouse',
358
+ ],
359
+
360
+ # ── Data Security ─────────────────────────────────────────────────────────
361
+ 'data-security': [
362
+ 'security', 'cybersecurity', 'data breach', 'hacking', 'vulnerability',
363
+ 'encryption', 'malware', 'ransomware', 'firewall', 'zero trust',
364
+ 'phishing', 'soc2', 'infosec', 'penetration testing',
365
+ ],
366
+
367
+ # ── Data Governance ───────────────────────────────────────────────────────
368
+ 'data-governance': [
369
+ 'data governance', 'compliance', 'regulation', 'audit', 'data policy',
370
+ 'metadata management', 'data lineage', 'data stewardship',
371
+ 'regulatory compliance',
372
+ ],
373
+
374
+ # ── Data Privacy ──────────────────────────────────────────────────────────
375
+ 'data-privacy': [
376
+ 'data privacy', 'gdpr', 'ccpa', 'user consent', 'personal data',
377
+ 'pii', 'anonymization', 'data protection', 'privacy law',
378
+ 'hipaa', 'cookie tracking',
379
+ ],
380
+
381
+ # ── Data Management ───────────────────────────────────────────────────────
382
+ 'data-management': [
383
+ 'data management', 'master data', 'mdm', 'data catalog',
384
+ 'data quality', 'reference data', 'data lifecycle', 'data architecture',
385
+ ],
386
+
387
+ # ── Business Intelligence ─────────────────────────────────────────────────
388
+ 'business-intelligence': [
389
+ 'business intelligence', 'bi', 'analytics dashboard', 'tableau',
390
+ 'power bi', 'looker', 'data reporting', 'kpi', 'quicksight', 'qlik',
391
+ ],
392
+
393
+ # ── Business Analytics ────────────────────────────────────────────────────
394
+ 'business-analytics': [
395
+ 'data analytics', 'data analysis', 'business insights', 'business metrics',
396
+ 'data-driven', 'business analytics', 'predictive analytics', 'forecasting',
397
+ ],
398
+
399
+ # ── Customer Data Platform ────────────────────────────────────────────────
400
+ 'customer-data-platform': [
401
+ 'cdp', 'customer data platform', 'crm', 'customer experience',
402
+ 'personalization engine', 'audience segmentation',
403
+ 'segment.com', 'salesforce data cloud',
404
+ ],
405
+
406
+ # ── Data Centers ──────────────────────────────────────────────────────────
407
+ 'data-centers': [
408
+ 'data center', 'data centre', 'datacenter', 'server rack', 'colocation',
409
+ 'edge computing', 'hyperscale', 'hpc', 'liquid cooling',
410
+ 'data center cooling',
411
+ ],
412
+
413
+ # ── Publishing categories ─────────────────────────────────────────────────
414
+ 'medium-article': [
415
+ 'medium', 'article', 'blog', 'writing', 'publishing',
416
+ 'content', 'story', 'author', 'blogging',
417
+ ],
418
+ 'magazines': [
419
+ 'technology', 'tech', 'innovation', 'digital', 'startup',
420
+ 'software', 'hardware', 'gadget',
421
+ ],
422
+ }
423
+
424
+
425
+ # ==============================================================================
426
+ # PRE-COMPILED REGEX ENGINE (Phase 19 β€” Word-Boundary Patterns)
427
+ # ==============================================================================
428
+ #
429
+ # Problem this solves:
430
+ # Old code: "ai" in text β†’ matches "tr[ai]n", "ava[i]lable" β€” garbage hits.
431
+ # New code: \bai\b in text β†’ only "AI" as a standalone word β€” clean hits.
432
+ #
433
+ # Why pre-compile?
434
+ # Building a regex from scratch takes CPU time. If we do it inside the
435
+ # validation function, it runs once per article Γ— 22 categories = thousands of
436
+ # compilations per scheduler cycle. By compiling ONCE at import time and
437
+ # storing the result, all subsequent lookups are instant memory reads.
438
+ #
439
+ # How each pattern is built:
440
+ # For every keyword in a category we do:
441
+ # re.escape(keyword) β†’ safely escapes dots, plus signs, brackets etc.
442
+ # \b ... \b β†’ word boundaries so "aws" won't match "kawasaki"
443
+ # All keywords in one category are joined with | (OR), so a single
444
+ # re.search() call checks every keyword at once β€” maximum speed.
445
+ #
446
+ # Example β€” 'ai' category compiles to:
447
+ # \bartificial intelligence\b|\bmachine learning\b|\bgpt\b|\bllm\b|...
448
+ # ==============================================================================
449
+ def _build_category_regex(keywords: list) -> 're.Pattern':
450
+ """
451
+ Turn a list of keywords into one pre-compiled word-boundary OR pattern.
452
+
453
+ Example:
454
+ ['gpt', 'llm', 'openai']
455
+ β†’ re.compile(r'\\bgpt\\b|\\bllm\\b|\\bopenai\\b', re.IGNORECASE)
456
+ """
457
+ parts = [r'\b' + re.escape(kw) + r'\b' for kw in keywords]
458
+ return re.compile('|'.join(parts), re.IGNORECASE)
459
+
460
+
461
# Built exactly once, at import time.
# Maps each category slug (e.g. 'ai', 'cloud-aws') to its compiled
# word-boundary pattern, so the per-article relevance check is just a
# dictionary read plus a single re.search() — no regex compilation ever
# happens on the hot path.
COMPILED_CATEGORY_REGEX: dict = {
    slug: _build_category_regex(words)
    for slug, words in CATEGORY_KEYWORDS.items()
}
468
+
469
+
470
def is_relevant_to_category(article: Union[Dict, 'Article'], category: str) -> bool:
    """
    Decide whether an article belongs to the given category.

    Matching uses the pre-compiled word-boundary patterns in
    COMPILED_CATEGORY_REGEX (built once at server start), which means:
      • Short acronyms like "bi" or "aws" only hit as standalone words —
        "trail" no longer matches an 'ai'-style substring and "kawasaki"
        cannot match 'aws'.
      • Multi-word phrases ("amazon web services") are matched exactly.
      • Categories missing from the taxonomy pass automatically (True)
        rather than silently dropping articles routed to them.

    Scanned text: article title + description + URL path. The URL path is
    included because some feeds (e.g. Google RSS) ship empty descriptions,
    while the path itself ("/aws-launches-sagemaker-feature") still names
    the topic; hyphens and slashes become spaces so path words are
    individually matchable.

    Returns:
        True  — at least one category keyword matched.
        False — nothing matched; the article is rejected for this category.
    """
    # Normalize the input: Pydantic v2 model, Pydantic v1 model, or plain dict.
    if hasattr(article, 'model_dump'):
        article_dict = article.model_dump()
    elif hasattr(article, 'dict'):
        article_dict = article.dict()
    else:
        article_dict = article

    # Look up the pattern compiled for this category at import time.
    pattern = COMPILED_CATEGORY_REGEX.get(category)
    if pattern is None:
        # Unmapped category — allow rather than silently reject.
        return True

    # Assemble the searchable text: headline, summary, then URL path tokens.
    pieces = [
        (article_dict.get('title') or '').lower(),
        (article_dict.get('description') or '').lower(),
    ]

    url_str = str(article_dict.get('url') or '').lower()
    try:
        parsed_url = urlparse(url_str)
        # Hyphens/slashes -> spaces so the word-boundary regex can see
        # each path segment word on its own.
        pieces.append(parsed_url.path.replace('-', ' ').replace('/', ' '))
    except Exception:
        pieces.append('')

    search_text = ' '.join(pieces)

    # One regex pass checks every keyword at once; IGNORECASE is already
    # compiled into the pattern, so no further lowercasing is required.
    if pattern.search(search_text):
        return True

    # Nothing matched — log the rejection so category pollution stays visible.
    print(
        f"🚫 Rejected '{article_dict.get('title', 'Unknown')[:50]}' "
        f"from {category} (0 keyword matches)"
    )
    return False
538
 
539