SHAFI commited on
Commit Β·
ff4f05b
1
Parent(s): 7d4e625
added Massive Tech News Ingestion , more than 10+ news providers added to the ingestion part
Browse files- app/config.py +6 -0
- app/services/circuit_breaker.py +18 -2
- app/services/news_aggregator.py +122 -2
- app/services/providers/__init__.py +23 -0
- app/services/providers/base.py +174 -0
- app/services/providers/direct_rss/__init__.py +10 -0
- app/services/providers/direct_rss/client.py +378 -0
- app/services/providers/hackernews/__init__.py +9 -0
- app/services/providers/hackernews/client.py +365 -0
- app/services/providers/inshorts/__init__.py +11 -0
- app/services/providers/inshorts/client.py +346 -0
- app/services/providers/openrss/__init__.py +15 -0
- app/services/providers/openrss/client.py +384 -0
- app/services/providers/sauravkanchan/__init__.py +12 -0
- app/services/providers/sauravkanchan/client.py +375 -0
- app/services/providers/thenewsapi/__init__.py +11 -0
- app/services/providers/thenewsapi/client.py +347 -0
- app/services/providers/webz/__init__.py +15 -0
- app/services/providers/webz/client.py +404 -0
- app/services/providers/wikinews/__init__.py +15 -0
- app/services/providers/wikinews/client.py +435 -0
- app/services/providers/worldnewsai/__init__.py +19 -0
- app/services/providers/worldnewsai/client.py +359 -0
- app/services/scheduler.py +180 -5
- app/services/utils/__init__.py +8 -0
- app/services/utils/image_enricher.py +190 -0
- app/services/utils/provider_state.py +283 -0
- app/utils/data_validation.py +245 -131
app/config.py
CHANGED
|
@@ -29,6 +29,12 @@ class Settings(BaseSettings):
|
|
| 29 |
GNEWS_API_KEY: str = ""
|
| 30 |
NEWSAPI_API_KEY: str = ""
|
| 31 |
NEWSDATA_API_KEY: str = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
# Provider priority (will try in order until successful)
|
| 34 |
NEWS_PROVIDER_PRIORITY: List[str] = ["gnews", "newsapi", "newsdata", "google_rss"]
|
|
|
|
| 29 |
GNEWS_API_KEY: str = ""
|
| 30 |
NEWSAPI_API_KEY: str = ""
|
| 31 |
NEWSDATA_API_KEY: str = ""
|
| 32 |
+
# Phase 5: TheNewsAPI.com β 100 req/day free tier, position 4 in PAID_CHAIN
|
| 33 |
+
THENEWSAPI_API_KEY: str = ""
|
| 34 |
+
# Phase 8: WorldNewsAI.com β point-based quota, position 5 in PAID_CHAIN
|
| 35 |
+
WORLDNEWS_API_KEY: str = ""
|
| 36 |
+
# Phase 10: Webz.io β 1,000 calls/month free tier, position 6 in PAID_CHAIN
|
| 37 |
+
WEBZ_API_KEY: str = ""
|
| 38 |
|
| 39 |
# Provider priority (will try in order until successful)
|
| 40 |
NEWS_PROVIDER_PRIORITY: List[str] = ["gnews", "newsapi", "newsdata", "google_rss"]
|
app/services/circuit_breaker.py
CHANGED
|
@@ -79,8 +79,24 @@ class ProviderCircuitBreaker:
|
|
| 79 |
self.circuit_open_time: Dict[str, float] = {}
|
| 80 |
self.half_open_attempts: Dict[str, int] = defaultdict(int)
|
| 81 |
|
| 82 |
-
# Known providers β used by the boot-time Redis restore
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
logger.info("=" * 70)
|
| 86 |
logger.info("β‘ [CIRCUIT BREAKER] Provider protection initialized")
|
|
|
|
| 79 |
self.circuit_open_time: Dict[str, float] = {}
|
| 80 |
self.half_open_attempts: Dict[str, int] = defaultdict(int)
|
| 81 |
|
| 82 |
+
# Known providers β used by the boot-time Redis restore.
|
| 83 |
+
# IMPORTANT: Every provider registered in news_aggregator.py MUST be
|
| 84 |
+
# listed here. If a provider is missing, a circuit that was OPEN before
|
| 85 |
+
# a server restart will not be restored β the Space will hammer a broken
|
| 86 |
+
# API on every restart until it fails 3 more times to re-open.
|
| 87 |
+
#
|
| 88 |
+
# Phases 1-2 (legacy): gnews, newsapi, newsdata, google_rss, medium, official_cloud
|
| 89 |
+
# Phases 3-11 (new modules): hacker_news, direct_rss, thenewsapi, inshorts,
|
| 90 |
+
# saurav_static, worldnewsai, openrss, webz, wikinews
|
| 91 |
+
self._known_providers = [
|
| 92 |
+
# ββ Legacy providers (Phases 1-2) ββββββββββββββββββββββββββββββββ
|
| 93 |
+
"gnews", "newsapi", "newsdata",
|
| 94 |
+
"google_rss", "medium", "official_cloud",
|
| 95 |
+
# ββ New modular providers (Phases 3-11) βββββββββββββββββββββββββββ
|
| 96 |
+
"hacker_news", "direct_rss", "thenewsapi",
|
| 97 |
+
"inshorts", "saurav_static", "worldnewsai",
|
| 98 |
+
"openrss", "webz", "wikinews",
|
| 99 |
+
]
|
| 100 |
|
| 101 |
logger.info("=" * 70)
|
| 102 |
logger.info("β‘ [CIRCUIT BREAKER] Provider protection initialized")
|
app/services/news_aggregator.py
CHANGED
|
@@ -18,6 +18,20 @@ from app.config import settings
|
|
| 18 |
from app.services.api_quota import get_quota_tracker
|
| 19 |
from app.services.circuit_breaker import get_circuit_breaker
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
class NewsAggregator:
|
| 22 |
"""Service for aggregating news from multiple sources with automatic failover"""
|
| 23 |
|
|
@@ -48,12 +62,54 @@ class NewsAggregator:
|
|
| 48 |
|
| 49 |
# Official Cloud Provider (Strict Isolation)
|
| 50 |
self.providers['official_cloud'] = OfficialCloudProvider()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
# ββ Provider role lists ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 53 |
# PAID_CHAIN: tried in order, stop after the first success (save credits)
|
| 54 |
# FREE_SOURCES: always tried, always in parallel (no cost, no limits)
|
| 55 |
-
self.PAID_CHAIN = ['gnews', 'newsapi', 'newsdata']
|
| 56 |
-
self.FREE_SOURCES = ['google_rss', 'medium', 'official_cloud']
|
| 57 |
|
| 58 |
# Medium only publishes articles for a small set of topics.
|
| 59 |
# Calling it for 'data-centers' or 'cloud-oracle' would return nothing.
|
|
@@ -70,6 +126,30 @@ class NewsAggregator:
|
|
| 70 |
'cloud-huawei', 'cloud-cloudflare'
|
| 71 |
]
|
| 72 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
# Cloud provider RSS feeds
|
| 75 |
self.cloud_rss_urls = {
|
|
@@ -227,6 +307,46 @@ class NewsAggregator:
|
|
| 227 |
free_tasks.append(official.fetch_news(category, limit=10))
|
| 228 |
free_names.append('official_cloud')
|
| 229 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
if free_tasks:
|
| 231 |
print(f"[FREE] Launching {len(free_tasks)} free source(s) in parallel for '{category}'...")
|
| 232 |
free_results = await asyncio.gather(*free_tasks, return_exceptions=True)
|
|
|
|
| 18 |
from app.services.api_quota import get_quota_tracker
|
| 19 |
from app.services.circuit_breaker import get_circuit_breaker
|
| 20 |
|
| 21 |
+
# ββ Phases 3-11: New modular providers (Strangler Fig pattern) ββββββββββββββ
|
| 22 |
+
# These live in providers/ folder. The legacy news_providers.py is NOT touched.
|
| 23 |
+
# We import each new provider here and the aggregator runs both old and new
|
| 24 |
+
# providers side-by-side safely.
|
| 25 |
+
from app.services.providers.hackernews.client import HackerNewsProvider
|
| 26 |
+
from app.services.providers.direct_rss.client import DirectRSSProvider
|
| 27 |
+
from app.services.providers.thenewsapi.client import TheNewsAPIProvider
|
| 28 |
+
from app.services.providers.inshorts.client import InshortsProvider
|
| 29 |
+
from app.services.providers.sauravkanchan.client import SauravKanchanProvider
|
| 30 |
+
from app.services.providers.worldnewsai.client import WorldNewsAIProvider
|
| 31 |
+
from app.services.providers.openrss.client import OpenRSSProvider
|
| 32 |
+
from app.services.providers.webz.client import WebzProvider
|
| 33 |
+
from app.services.providers.wikinews.client import WikinewsProvider
|
| 34 |
+
|
| 35 |
class NewsAggregator:
|
| 36 |
"""Service for aggregating news from multiple sources with automatic failover"""
|
| 37 |
|
|
|
|
| 62 |
|
| 63 |
# Official Cloud Provider (Strict Isolation)
|
| 64 |
self.providers['official_cloud'] = OfficialCloudProvider()
|
| 65 |
+
|
| 66 |
+
# Direct RSS from premium tech publications (TechCrunch, Wired, The Verge,
|
| 67 |
+
# Engadget, Ars Technica). Free, no key, great images and descriptions.
|
| 68 |
+
# Runs for ALL categories β the keyword gate filters off-topic results.
|
| 69 |
+
self.providers['direct_rss'] = DirectRSSProvider()
|
| 70 |
+
|
| 71 |
+
# TheNewsAPI.com β Position 4 in the PAID_CHAIN (failover after the
|
| 72 |
+
# existing 3 paid providers). 100 requests/day on the free tier.
|
| 73 |
+
# Only registered when the API key is present in the environment.
|
| 74 |
+
if settings.THENEWSAPI_API_KEY:
|
| 75 |
+
self.providers['thenewsapi'] = TheNewsAPIProvider(
|
| 76 |
+
api_key=settings.THENEWSAPI_API_KEY
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
# WorldNewsAI.com β Position 5 in the PAID_CHAIN (final paid failover).
|
| 80 |
+
# Point-based quota, conservative daily_limit = 50 calls.
|
| 81 |
+
# Gives global, non-US-centric news from tens of thousands of sources.
|
| 82 |
+
# Only registered when the API key is present in the environment.
|
| 83 |
+
if settings.WORLDNEWS_API_KEY:
|
| 84 |
+
self.providers['worldnewsai'] = WorldNewsAIProvider(
|
| 85 |
+
api_key=settings.WORLDNEWS_API_KEY
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
# OpenRSS.org β generates feeds for sites with no native RSS.
|
| 89 |
+
# Free, no key. Has strict 60-minute internal cooldown to avoid IP ban.
|
| 90 |
+
# Runs for ALL categories β no category guardrail needed.
|
| 91 |
+
# The cooldown timer is the only protection this provider needs.
|
| 92 |
+
self.providers['openrss'] = OpenRSSProvider()
|
| 93 |
+
|
| 94 |
+
# Webz.io β Position 6 in the PAID_CHAIN (deepest paid failover).
|
| 95 |
+
# Enterprise-grade crawl from 3.5M articles/day. Rich, global coverage.
|
| 96 |
+
# 1,000 calls/month free tier β paced to 30/day = ~900/month (10% margin).
|
| 97 |
+
# Only registered when the API key is present in the environment.
|
| 98 |
+
if settings.WEBZ_API_KEY:
|
| 99 |
+
self.providers['webz'] = WebzProvider(
|
| 100 |
+
api_key=settings.WEBZ_API_KEY
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
# Wikinews β Public Domain, copyright-bulletproof tech news.
|
| 104 |
+
# Free, no key. Searches 'Computing' and 'Internet' categories concurrently.
|
| 105 |
+
# Gated behind GENERAL_TECH_CATEGORIES (broad tech content only).
|
| 106 |
+
self.providers['wikinews'] = WikinewsProvider()
|
| 107 |
|
| 108 |
# ββ Provider role lists ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 109 |
# PAID_CHAIN: tried in order, stop after the first success (save credits)
|
| 110 |
# FREE_SOURCES: always tried, always in parallel (no cost, no limits)
|
| 111 |
+
self.PAID_CHAIN = ['gnews', 'newsapi', 'newsdata', 'thenewsapi', 'worldnewsai', 'webz']
|
| 112 |
+
self.FREE_SOURCES = ['google_rss', 'medium', 'official_cloud', 'direct_rss', 'hacker_news', 'inshorts', 'saurav_static', 'openrss', 'wikinews']
|
| 113 |
|
| 114 |
# Medium only publishes articles for a small set of topics.
|
| 115 |
# Calling it for 'data-centers' or 'cloud-oracle' would return nothing.
|
|
|
|
| 126 |
'cloud-huawei', 'cloud-cloudflare'
|
| 127 |
]
|
| 128 |
}
|
| 129 |
+
|
| 130 |
+
# ββ Phase 3: Hacker News Category Guardrail ββββββββββββββββββββββββββ
|
| 131 |
+
# Hacker News gives broad tech news β it does NOT know about "cloud-alibaba"
|
| 132 |
+
# or "data-governance". Asking it for niche categories wastes CPU cycles
|
| 133 |
+
# and risks polluting those collections with off-topic articles.
|
| 134 |
+
# Only enable Hacker News for the broad categories below where it adds value.
|
| 135 |
+
self.GENERAL_TECH_CATEGORIES = {
|
| 136 |
+
'ai', 'magazines', 'data-engineering', 'cloud-computing',
|
| 137 |
+
'data-security', 'business-intelligence'
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
# Register the Hacker News provider (free, no key needed).
|
| 141 |
+
# It lives in providers/hackernews/client.py β completely isolated from
|
| 142 |
+
# the legacy news_providers.py file.
|
| 143 |
+
self.providers['hacker_news'] = HackerNewsProvider()
|
| 144 |
+
|
| 145 |
+
# Inshorts β 60-word tech summaries. Free, no key, broad tech topics.
|
| 146 |
+
# Gated behind GENERAL_TECH_CATEGORIES (same as Hacker News).
|
| 147 |
+
self.providers['inshorts'] = InshortsProvider()
|
| 148 |
+
|
| 149 |
+
# SauravKanchan static JSON β reads two GitHub Pages files (IN + US).
|
| 150 |
+
# Zero cost, zero rate limits, NewsAPI-format data structure.
|
| 151 |
+
# Gated behind GENERAL_TECH_CATEGORIES (broad tech news only).
|
| 152 |
+
self.providers['saurav_static'] = SauravKanchanProvider()
|
| 153 |
|
| 154 |
# Cloud provider RSS feeds
|
| 155 |
self.cloud_rss_urls = {
|
|
|
|
| 307 |
free_tasks.append(official.fetch_news(category, limit=10))
|
| 308 |
free_names.append('official_cloud')
|
| 309 |
|
| 310 |
+
# ββ Phase 3: Hacker News Guardrail ββββββββββββββββββββββββββββββββββββ
|
| 311 |
+
# Only fire Hacker News when the category is a broad tech topic.
|
| 312 |
+
# For niche categories (e.g., cloud-alibaba), we skip it entirely.
|
| 313 |
+
if category in self.GENERAL_TECH_CATEGORIES:
|
| 314 |
+
hn = self.providers.get('hacker_news')
|
| 315 |
+
if hn and not self.circuit.should_skip('hacker_news'):
|
| 316 |
+
if hn.is_available():
|
| 317 |
+
free_tasks.append(hn.fetch_news(category, limit=30))
|
| 318 |
+
free_names.append('hacker_news')
|
| 319 |
+
|
| 320 |
+
# ββ Phase 6: Inshorts Guardrail βββββββββββββββββββββββββββββββββββββ
|
| 321 |
+
# Same rule as Hacker News: only fire for broad tech categories.
|
| 322 |
+
# Inshorts covers general tech, not niche cloud or governance topics.
|
| 323 |
+
if category in self.GENERAL_TECH_CATEGORIES:
|
| 324 |
+
inshorts = self.providers.get('inshorts')
|
| 325 |
+
if inshorts and not self.circuit.should_skip('inshorts'):
|
| 326 |
+
if inshorts.is_available():
|
| 327 |
+
free_tasks.append(inshorts.fetch_news(category, limit=20))
|
| 328 |
+
free_names.append('inshorts')
|
| 329 |
+
|
| 330 |
+
# ββ Phase 7: SauravKanchan Guardrail βββββββββββββββββββββββββββββββββ
|
| 331 |
+
# Static JSON files (IN + US). Same guardrail as Hacker News and Inshorts.
|
| 332 |
+
# Broad tech content only β niche categories get no value from these files.
|
| 333 |
+
if category in self.GENERAL_TECH_CATEGORIES:
|
| 334 |
+
saurav = self.providers.get('saurav_static')
|
| 335 |
+
if saurav and not self.circuit.should_skip('saurav_static'):
|
| 336 |
+
if saurav.is_available():
|
| 337 |
+
free_tasks.append(saurav.fetch_news(category, limit=50))
|
| 338 |
+
free_names.append('saurav_static')
|
| 339 |
+
|
| 340 |
+
# ββ Phase 11: Wikinews Guardrail ββββββββββββββββββββββββββββββββββ
|
| 341 |
+
# Wikinews searches broad tech categories (Computing + Internet).
|
| 342 |
+
# No value for niche collections like cloud-alibaba or data-governance.
|
| 343 |
+
if category in self.GENERAL_TECH_CATEGORIES:
|
| 344 |
+
wikinews = self.providers.get('wikinews')
|
| 345 |
+
if wikinews and not self.circuit.should_skip('wikinews'):
|
| 346 |
+
if wikinews.is_available():
|
| 347 |
+
free_tasks.append(wikinews.fetch_news(category, limit=20))
|
| 348 |
+
free_names.append('wikinews')
|
| 349 |
+
|
| 350 |
if free_tasks:
|
| 351 |
print(f"[FREE] Launching {len(free_tasks)} free source(s) in parallel for '{category}'...")
|
| 352 |
free_results = await asyncio.gather(*free_tasks, return_exceptions=True)
|
app/services/providers/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# providers/__init__.py
|
| 3 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
# This file marks the 'providers' folder as a Python package so that
|
| 5 |
+
# Python knows it can import code from inside it.
|
| 6 |
+
#
|
| 7 |
+
# ββ HOW TO ADD A NEW PROVIDER ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 8 |
+
# 1. Create a new folder under providers/ (e.g., providers/hackernews/)
|
| 9 |
+
# 2. Inside that folder, create __init__.py (empty) and client.py
|
| 10 |
+
# 3. In client.py, write a class that inherits from base.NewsProvider
|
| 11 |
+
# 4. Add the import line below so the aggregator can find it easily:
|
| 12 |
+
# from app.services.providers.hackernews.client import HackerNewsProvider
|
| 13 |
+
#
|
| 14 |
+
# ββ ROUTING RULE (CRITICAL) ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 15 |
+
# Every provider MUST set a 'category' on each Article it returns.
|
| 16 |
+
# If a provider cannot determine a category, it MUST leave category as ""
|
| 17 |
+
# or "magazines". DO NOT LEAVE IT AS None.
|
| 18 |
+
#
|
| 19 |
+
# When category is empty or unrecognized, appwrite_db.get_collection_id()
|
| 20 |
+
# automatically routes the article to the DEFAULT 'News Articles' collection.
|
| 21 |
+
# This is intentional and safe. Never invent a category name that doesn't
|
| 22 |
+
# exist in config.py CATEGORIES β it will silently break routing.
|
| 23 |
+
# =============================================================================
|
app/services/providers/base.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
providers/base.py
|
| 3 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
The Foundation β every news provider in this system inherits from this file.
|
| 5 |
+
|
| 6 |
+
Think of this like a "job contract" for a news provider. Any class that wants
|
| 7 |
+
to act as a news provider MUST sign this contract by:
|
| 8 |
+
1. Inheriting from the NewsProvider class below.
|
| 9 |
+
2. Implementing the fetch_news() method with real logic.
|
| 10 |
+
|
| 11 |
+
If a class inherits from NewsProvider but does NOT implement fetch_news(),
|
| 12 |
+
Python will throw a TypeError at startup β which is exactly what we want.
|
| 13 |
+
It forces every developer to write proper fetching logic.
|
| 14 |
+
|
| 15 |
+
ββ RULE: THE CATEGORY ROUTING CONTRACT βββββββββββββββββββββββββββββββββββββ
|
| 16 |
+
|
| 17 |
+
Every Article produced by a provider MUST have a 'category' field.
|
| 18 |
+
The category value routes the article to the correct Appwrite collection.
|
| 19 |
+
|
| 20 |
+
Current routing rules (defined in appwrite_db.get_collection_id):
|
| 21 |
+
"ai" β AI collection
|
| 22 |
+
"cloud-*" β Cloud collection
|
| 23 |
+
"data-*" / "business-*" / "customer-data-platform" β Data collection
|
| 24 |
+
"magazines" β Magazine collection
|
| 25 |
+
"medium-article" β Medium collection
|
| 26 |
+
"" (empty)
|
| 27 |
+
or any unknown β DEFAULT 'News Articles' collection β SAFE FALLBACK
|
| 28 |
+
|
| 29 |
+
β οΈ IMPORTANT FOR ALL PROVIDER DEVELOPERS:
|
| 30 |
+
If your provider fetches general tech news and cannot determine a specific
|
| 31 |
+
category, set category = "magazines".
|
| 32 |
+
If your provider truly cannot figure out a category, set category = "".
|
| 33 |
+
The default collection will catch it safely.
|
| 34 |
+
NEVER set category = None β that will cause a Pydantic validation error.
|
| 35 |
+
NEVER invent a category string that is not in config.py CATEGORIES list.
|
| 36 |
+
|
| 37 |
+
ββ HOW CLIENT-SIDE FILTERING WORKS βββββββββββββββββββββββββββββββββββββββββ
|
| 38 |
+
|
| 39 |
+
Many providers (Hacker News, RSS Feeds, static files) do NOT support
|
| 40 |
+
filtering by date or keyword in their API request. That is okay.
|
| 41 |
+
|
| 42 |
+
Do NOT try to add date filters in the URL if the API doesn't support them.
|
| 43 |
+
Our data_validation pipeline enforces all constraints AFTER the fetch:
|
| 44 |
+
- Freshness gate: rejects articles older than midnight IST today
|
| 45 |
+
- Keyword gate: rejects articles with no matching category keywords
|
| 46 |
+
- Redis dedup: rejects URLs we have already saved in the last 48 hours
|
| 47 |
+
|
| 48 |
+
So your job in fetch_news() is simple: fetch as many articles as the
|
| 49 |
+
provider gives you, map them to Article objects, and return them.
|
| 50 |
+
The pipeline does the rest.
|
| 51 |
+
"""
|
| 52 |
+
|
| 53 |
+
# ββ Imports ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 54 |
+
# Standard library
|
| 55 |
+
from abc import ABC, abstractmethod # ABC = Abstract Base Class toolkit
|
| 56 |
+
from typing import List, Optional
|
| 57 |
+
from datetime import datetime, timezone, timedelta
|
| 58 |
+
from zoneinfo import ZoneInfo # Timezone handling (Python 3.9+ built-in)
|
| 59 |
+
from enum import Enum
|
| 60 |
+
|
| 61 |
+
# Third-party (all already in requirements.txt β no new installs needed)
|
| 62 |
+
import httpx # Async HTTP client for API calls
|
| 63 |
+
|
| 64 |
+
# Internal
|
| 65 |
+
from app.models import Article # The standard Article shape every provider must return
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# ββ Provider Status ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 69 |
+
|
| 70 |
+
class ProviderStatus(Enum):
|
| 71 |
+
"""
|
| 72 |
+
Represents the health of a provider at any given moment.
|
| 73 |
+
|
| 74 |
+
ACTIVE β Provider is working fine. Calls proceed normally.
|
| 75 |
+
RATE_LIMITED β Provider hit its API limit. Calls are paused.
|
| 76 |
+
ERROR β Provider had a hard failure. Circuit breaker may kick in.
|
| 77 |
+
"""
|
| 78 |
+
ACTIVE = "active"
|
| 79 |
+
RATE_LIMITED = "rate_limited"
|
| 80 |
+
ERROR = "error"
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# ββ Abstract Base Class ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 84 |
+
|
| 85 |
+
class NewsProvider(ABC):
|
| 86 |
+
"""
|
| 87 |
+
The contract that every news provider must follow.
|
| 88 |
+
|
| 89 |
+
Subclass this, implement fetch_news(), and your provider
|
| 90 |
+
is automatically compatible with the NewsAggregator, circuit breaker,
|
| 91 |
+
quota tracker, and the full validation pipeline.
|
| 92 |
+
|
| 93 |
+
Example of a minimal valid provider:
|
| 94 |
+
|
| 95 |
+
from app.services.providers.base import NewsProvider, ProviderStatus
|
| 96 |
+
from app.models import Article
|
| 97 |
+
from typing import List
|
| 98 |
+
|
| 99 |
+
class MyProvider(NewsProvider):
|
| 100 |
+
async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
|
| 101 |
+
# 1. Call your API / RSS feed
|
| 102 |
+
# 2. Map the response to Article objects
|
| 103 |
+
# 3. Return the list (can be empty if nothing found)
|
| 104 |
+
return []
|
| 105 |
+
"""
|
| 106 |
+
|
| 107 |
+
def __init__(self, api_key: Optional[str] = None):
|
| 108 |
+
# The API key for paid providers. Free providers leave this as None.
|
| 109 |
+
self.api_key = api_key
|
| 110 |
+
|
| 111 |
+
# Starts as ACTIVE. The aggregator or circuit breaker may change this.
|
| 112 |
+
self.status = ProviderStatus.ACTIVE
|
| 113 |
+
|
| 114 |
+
# Tracks how many API calls this provider has made today.
|
| 115 |
+
self.request_count: int = 0
|
| 116 |
+
|
| 117 |
+
# Maximum calls per day. 0 = no limit (used by free providers).
|
| 118 |
+
self.daily_limit: int = 0
|
| 119 |
+
|
| 120 |
+
# The name of this provider. Used in logging and circuit breaker tracking.
|
| 121 |
+
# Automatically takes the class name (e.g., "HackerNewsProvider").
|
| 122 |
+
self.name: str = self.__class__.__name__
|
| 123 |
+
|
| 124 |
+
@abstractmethod
|
| 125 |
+
async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
|
| 126 |
+
"""
|
| 127 |
+
REQUIRED: Fetch news articles for the given category.
|
| 128 |
+
|
| 129 |
+
Args:
|
| 130 |
+
category (str): The internal Segmento Pulse category name.
|
| 131 |
+
Example: "ai", "cloud-aws", "magazines"
|
| 132 |
+
limit (int): Maximum number of articles to return.
|
| 133 |
+
This is a guideline β providers may return fewer.
|
| 134 |
+
|
| 135 |
+
Returns:
|
| 136 |
+
List[Article]: A list of Article objects. Return [] on failure.
|
| 137 |
+
Never raise an unhandled exception from here.
|
| 138 |
+
Wrap all network calls in try/except.
|
| 139 |
+
|
| 140 |
+
Remember the ROUTING RULE at the top of this file:
|
| 141 |
+
Every Article MUST have a category string.
|
| 142 |
+
Use "magazines" for general tech. Use "" for truly unknown.
|
| 143 |
+
"""
|
| 144 |
+
pass
|
| 145 |
+
|
| 146 |
+
# ββ Utility Methods (inherited by all providers, no need to override) ββββββ
|
| 147 |
+
|
| 148 |
+
def is_available(self) -> bool:
|
| 149 |
+
"""
|
| 150 |
+
Check if this provider is ready to accept a fetch request.
|
| 151 |
+
|
| 152 |
+
Returns False if:
|
| 153 |
+
- It is currently rate-limited or in an error state.
|
| 154 |
+
- It has used up its daily API call limit.
|
| 155 |
+
"""
|
| 156 |
+
return (
|
| 157 |
+
self.status == ProviderStatus.ACTIVE
|
| 158 |
+
and (self.daily_limit == 0 or self.request_count < self.daily_limit)
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
def mark_rate_limited(self):
|
| 162 |
+
"""
|
| 163 |
+
Call this when the API returns a 429 (Too Many Requests).
|
| 164 |
+
The status changes to RATE_LIMITED so the aggregator knows to skip it.
|
| 165 |
+
"""
|
| 166 |
+
self.status = ProviderStatus.RATE_LIMITED
|
| 167 |
+
|
| 168 |
+
def reset_daily_quota(self):
|
| 169 |
+
"""
|
| 170 |
+
Reset this provider's call counter back to zero.
|
| 171 |
+
Called once per day (midnight UTC) by the scheduler to restore access.
|
| 172 |
+
"""
|
| 173 |
+
self.request_count = 0
|
| 174 |
+
self.status = ProviderStatus.ACTIVE
|
app/services/providers/direct_rss/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# providers/direct_rss/__init__.py
|
| 2 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3 |
+
# This file marks the 'direct_rss' folder as a Python package.
|
| 4 |
+
# To use the Direct RSS provider, import it like this:
|
| 5 |
+
#
|
| 6 |
+
# from app.services.providers.direct_rss.client import DirectRSSProvider
|
| 7 |
+
#
|
| 8 |
+
# This provider fetches XML feeds from premium tech publications
|
| 9 |
+
# (TechCrunch, Wired, The Verge, Engadget, Ars Technica) completely for free.
|
| 10 |
+
# No API keys. No rate limits. Just clean, honest RSS.
|
app/services/providers/direct_rss/client.py
ADDED
|
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
providers/direct_rss/client.py
|
| 3 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
The Direct RSS Provider for Segmento Pulse.
|
| 5 |
+
|
| 6 |
+
What this does:
|
| 7 |
+
Fetches the latest technology articles from the RSS feeds of the world's
|
| 8 |
+
best tech publications: TechCrunch, Wired, The Verge, Engadget, and
|
| 9 |
+
Ars Technica.
|
| 10 |
+
|
| 11 |
+
Why Direct RSS instead of using rss_parser.parse_provider_rss()?
|
| 12 |
+
The existing rss_parser.parse_provider_rss() function is built for a
|
| 13 |
+
specific use case: fetching official CLOUD PROVIDER blogs (AWS, GCP etc.)
|
| 14 |
+
It hardcodes category = f'cloud-{provider}' on every article it creates.
|
| 15 |
+
|
| 16 |
+
If we ran TechCrunch through that function, every TechCrunch article
|
| 17 |
+
would be tagged "category = cloud-TechCrunch". Appwrite would not know
|
| 18 |
+
where to route it, and articles would end up in the wrong collection β
|
| 19 |
+
or worse, be silently dropped.
|
| 20 |
+
|
| 21 |
+
So instead, we use the feedparser library directly (the same library
|
| 22 |
+
rss_parser.py uses internally). We follow the exact same parsing pattern
|
| 23 |
+
but set the category correctly from what the aggregator tells us.
|
| 24 |
+
|
| 25 |
+
We DO still reuse two helper methods from rss_parser.py for consistency:
|
| 26 |
+
- _extract_image_from_entry() β finds images from media/enclosure tags
|
| 27 |
+
- _parse_date() β handles all date format variations
|
| 28 |
+
|
| 29 |
+
How it works:
|
| 30 |
+
Step 1: Build a list of async HTTP tasks β one per RSS feed URL.
|
| 31 |
+
Step 2: Fire all tasks at the same time using asyncio.gather().
|
| 32 |
+
Step 3: Feed each successful XML response into feedparser.
|
| 33 |
+
Step 4: Map each feedparser entry to a Pulse Article object.
|
| 34 |
+
Step 5: Return the combined list from all feeds.
|
| 35 |
+
|
| 36 |
+
Client-side constraint note:
|
| 37 |
+
RSS feeds give us whatever was published recently by that outlet β
|
| 38 |
+
we cannot ask them for "only today's AI articles".
|
| 39 |
+
The freshness gate (is_valid_article) and keyword gate
|
| 40 |
+
(is_relevant_to_category) in data_validation.py handle all filtering
|
| 41 |
+
after we return these articles. That is by design.
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
# ββ Standard Library ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 45 |
+
import asyncio
|
| 46 |
+
import logging
|
| 47 |
+
import re
|
| 48 |
+
import time
|
| 49 |
+
from typing import List
|
| 50 |
+
|
| 51 |
+
# ββ Third-party (already in requirements.txt) βββββββββββββββββββββββββββββββββ
|
| 52 |
+
import feedparser # XML/RSS feed parser β already used by rss_parser.py
|
| 53 |
+
import httpx # Async HTTP client
|
| 54 |
+
|
| 55 |
+
# ββ Internal ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 56 |
+
from app.services.providers.base import NewsProvider
|
| 57 |
+
from app.services.rss_parser import RSSParser # Reuse helper methods, not the methods with hardcoded categories
|
| 58 |
+
from app.models import Article
|
| 59 |
+
|
| 60 |
+
logger = logging.getLogger(__name__)
|
| 61 |
+
|
| 62 |
+
# ── RSS Feed Registry ─────────────────────────────────────────────────────────
#
# Direct RSS feed URLs for the most trusted tech publications, each entry a
# (feed_url, source_name) tuple. "source_name" is the human-readable label
# stored on every article and shown in the Segmento Pulse UI next to the
# headline. Adding a new outlet is a one-line change here — the rest of the
# code discovers it automatically.
TECH_RSS_FEEDS: List[tuple] = [
    ("https://techcrunch.com/feed", "TechCrunch"),
    ("https://www.wired.com/feed/rss", "Wired"),
    ("https://www.theverge.com/rss/tech/index.xml", "The Verge"),
    ("https://www.engadget.com/rss.xml", "Engadget"),
    ("https://feeds.arstechnica.com/arstechnica/technology-lab", "Ars Technica"),
]

# Per-feed article cap: 10 per feed x 5 feeds = at most 50 articles per run.
MAX_ARTICLES_PER_FEED = 10

# Seconds to wait for a feed before giving up on it.
HTTP_TIMEOUT_SECONDS = 12.0
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
class DirectRSSProvider(NewsProvider):
    """
    Fetches articles directly from the RSS feeds of premium tech publications.

    Free. No API key needed. No rate limits. These feeds are professionally
    edited, so they provide the best descriptions and images of all our free
    providers.

    Usage (wired into the aggregator):
        provider = DirectRSSProvider()
        articles = await provider.fetch_news(category="ai", limit=50)
    """

    # Cache lifetime. 45 minutes sits safely inside the hourly freshness
    # window enforced by data_validation, while avoiding re-downloading the
    # same XML for every one of the ~22 categories.
    CACHE_TTL_SECONDS = 2700.0

    def __init__(self):
        # Free provider — no API key, no daily quota (0 == unlimited).
        super().__init__(api_key=None)
        self.daily_limit = 0

        # Fetch-once, fan-out cache. The same 5 XML feeds serve every
        # category, so the first caller downloads them and every other
        # category request within the TTL is served from memory:
        # 5 outbound HTTP requests per run instead of ~110.
        self._cached_articles: List[Article] = []
        self._cache_time: float = 0.0

        # Guards the first fill. asyncio.gather() invokes fetch_news() for
        # many categories at once; without the lock every caller would see
        # an empty cache and start its own 5-feed fetch. With it, only the
        # first caller fetches — the rest wait, then read the cache.
        self._lock = asyncio.Lock()

        # Reuse only the category-neutral helpers from RSSParser
        # (_extract_image_from_entry, _parse_date). Its parse_* entry points
        # hardcode categories and would break Appwrite routing.
        self._rss_helpers = RSSParser()

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT — called by the aggregator
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 50) -> List[Article]:
        """
        Fetch articles from all premium tech RSS feeds concurrently.

        Args:
            category: Category string from the aggregator. Every returned
                article is tagged with it so the pipeline can route it to the
                correct Appwrite collection; the keyword gate filters out
                articles that do not actually match.
            limit: Not strictly enforced here — MAX_ARTICLES_PER_FEED controls
                volume and aggregator deduplication handles the rest.

        Returns:
            List[Article]: Articles from all feeds (possibly served from the
            45-minute cache). [] if the network is down for all feeds.
        """
        # Fast path: fresh cache, no HTTP, no parsing.
        if self._cache_is_fresh():
            logger.debug(
                "[DirectRSS] Cache hit — returning %d cached articles for "
                "category='%s'. No HTTP calls made.",
                len(self._cached_articles), category,
            )
            return self._retagged_for(category)

        async with self._lock:
            # Double-checked locking: the coroutine that held the lock before
            # us may already have refilled the cache while we waited.
            if self._cache_is_fresh():
                logger.debug(
                    "[DirectRSS] Cache hit after lock (another task fetched it) "
                    "— returning %d cached articles.",
                    len(self._cached_articles),
                )
                return self._retagged_for(category)

            # Cache genuinely stale — this coroutine won the race.
            logger.info("[DirectRSS] Cache stale/empty. Fetching all 5 RSS feeds...")
            try:
                async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
                    # One task per feed, all launched simultaneously so the
                    # total wall time stays around one round-trip.
                    results = await asyncio.gather(
                        *(
                            self._fetch_and_parse_feed(client, url, source_name, category)
                            for url, source_name in TECH_RSS_FEEDS
                        ),
                        return_exceptions=True,
                    )

                # Merge per-feed results; a failed feed never cancels the rest.
                all_articles: List[Article] = []
                for (_, source_name), result in zip(TECH_RSS_FEEDS, results):
                    if isinstance(result, Exception):
                        logger.warning(
                            f"[DirectRSS] [{source_name}] Feed fetch failed: {result}"
                        )
                    elif isinstance(result, list):
                        all_articles.extend(result)

                logger.info(
                    "[DirectRSS] Fetched %d articles across %d feeds. "
                    "Caching for 45 minutes.",
                    len(all_articles), len(TECH_RSS_FEEDS),
                )

                self._cached_articles = all_articles
                self._cache_time = time.time()
                # No-op retag for the caller that just fetched, but keeps the
                # return path uniform with cache hits.
                return self._retagged_for(category)

            except Exception as e:
                logger.error(f"[DirectRSS] Unexpected error: {e}", exc_info=True)
                return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPERS
    # ─────────────────────────────────────────────────────────────────────────

    def _cache_is_fresh(self) -> bool:
        """True when the cache is non-empty and younger than CACHE_TTL_SECONDS."""
        return bool(self._cached_articles) and (
            time.time() - self._cache_time
        ) < self.CACHE_TTL_SECONDS

    def _retagged_for(self, category: str) -> List[Article]:
        """
        Return the cached articles tagged with *category*.

        Bug fix: the cache stores Article objects tagged with whichever
        category happened to trigger the original fetch. Handing those
        unchanged to a caller asking for a *different* category would
        mis-route them downstream (category drives the Appwrite collection
        choice), so we return copies carrying the caller's category instead.
        Articles already tagged correctly are passed through as-is.
        """
        retagged: List[Article] = []
        for art in self._cached_articles:
            if art.category == category:
                retagged.append(art)
            else:
                # pydantic v2 exposes model_copy(); v1 uses copy().
                copier = getattr(art, "model_copy", None) or art.copy
                retagged.append(copier(update={"category": category}))
        return retagged

    async def _fetch_and_parse_feed(
        self,
        client: httpx.AsyncClient,
        url: str,
        source_name: str,
        category: str,
    ) -> List[Article]:
        """
        Fetch one RSS feed URL and parse it into Article objects.

        Args:
            client: Shared HTTP client from fetch_news().
            url: The RSS feed URL (e.g., https://techcrunch.com/feed).
            source_name: Human-readable name (e.g., "TechCrunch").
            category: Category from the aggregator — stored on each article.

        Returns:
            List[Article]: Parsed articles from this feed; [] on any failure.
        """
        try:
            response = await client.get(
                url,
                # Politely identify ourselves; some servers block unknown UAs.
                headers={"User-Agent": "SegmentoPulse-RSS-Reader/1.0"},
                follow_redirects=True,
            )
            if response.status_code != 200:
                logger.warning(
                    f"[DirectRSS] [{source_name}] HTTP {response.status_code} — skipping."
                )
                return []
            xml_text = response.text
        except httpx.TimeoutException:
            logger.warning(f"[DirectRSS] [{source_name}] Timed out — skipping.")
            return []
        except Exception as e:
            logger.warning(f"[DirectRSS] [{source_name}] Fetch error: {e}")
            return []

        # feedparser handles all RSS/Atom variants (RSS 2.0, Atom 1.0, ...).
        return self._parse_feed_xml(xml_text, source_name, category)

    def _parse_feed_xml(
        self,
        xml_text: str,
        source_name: str,
        category: str,
    ) -> List[Article]:
        """
        Parse raw feed XML into Article objects.

        Reuses RSSParser's helpers for image extraction and date parsing so
        the logic stays consistent across all RSS sources in the system.

        Args:
            xml_text: Raw XML string from the HTTP response.
            source_name: Name of the publication (e.g., "Wired").
            category: Category to tag on every article.

        Returns:
            List[Article]: Parsed articles; may be [] if the feed is malformed.
        """
        try:
            feed = feedparser.parse(xml_text)
        except Exception as e:
            logger.warning(f"[DirectRSS] [{source_name}] feedparser failed: {e}")
            return []

        articles: List[Article] = []
        for entry in feed.entries[:MAX_ARTICLES_PER_FEED]:
            # Title and clickable link are mandatory.
            title = (entry.get("title") or "").strip()
            if not title:
                continue
            url = (entry.get("link") or "").strip()
            if not url or not url.startswith("http"):
                continue

            # Strip HTML from the summary, then cap at 200 characters.
            raw_desc = entry.get("summary", "") or ""
            description = re.sub(r"<[^>]+>", "", raw_desc).strip()
            if len(description) > 200:
                description = description[:200] + "..."

            # Shared helpers: media/enclosure image lookup + tolerant dates.
            image_url = self._rss_helpers._extract_image_from_entry(entry)
            published_at = self._rss_helpers._parse_date(entry.get("published", "") or "")

            try:
                articles.append(
                    Article(
                        title=title,
                        description=description,
                        url=url,
                        image_url=image_url,
                        published_at=published_at,
                        source=source_name,
                        # ROUTING RULE: tag with the aggregator's category;
                        # the keyword gate safely rejects mismatches later.
                        category=category,
                    )
                )
            except Exception as e:
                # One bad entry must never cancel the rest of the feed.
                logger.debug(
                    f"[DirectRSS] [{source_name}] Skipped entry '{title[:50]}': {e}"
                )
                continue

        logger.info(f"[DirectRSS] [{source_name}] Parsed {len(articles)} articles.")
        return articles
|
app/services/providers/hackernews/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# providers/hackernews/__init__.py
|
| 2 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3 |
+
# This file marks the 'hackernews' folder as a Python package.
|
| 4 |
+
# To use the Hacker News provider, import it like this:
|
| 5 |
+
#
|
| 6 |
+
# from app.services.providers.hackernews.client import HackerNewsProvider
|
| 7 |
+
#
|
| 8 |
+
# This provider is entirely self-contained in this folder.
|
| 9 |
+
# It does not touch news_providers.py, news_aggregator.py, or anything else.
|
app/services/providers/hackernews/client.py
ADDED
|
@@ -0,0 +1,365 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
providers/hackernews/client.py
|
| 3 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
The Hacker News Provider for Segmento Pulse.
|
| 5 |
+
|
| 6 |
+
What this does:
|
| 7 |
+
Fetches the top stories from Hacker News β a community-voted list of the
|
| 8 |
+
best tech articles on the internet. It is completely free to use and has
|
| 9 |
+
no rate limits or API key requirement.
|
| 10 |
+
|
| 11 |
+
How the Hacker News API works (Two-Step Process):
|
| 12 |
+
Step 1: Ask HN for a list of top story IDs (one big list)
|
| 13 |
+
Step 2: For each ID, ask HN for that story's actual details
|
| 14 |
+
|
| 15 |
+
We only take the top 30 IDs. If we tried 500 IDs (the full list),
|
| 16 |
+
it would take too long and put unnecessary load on their server.
|
| 17 |
+
30 is a safe, polite number that still gives us great content.
|
| 18 |
+
|
| 19 |
+
What we do about missing data:
|
| 20 |
+
- No URL? β Skip this story entirely (it's an "Ask HN" self-post).
|
| 21 |
+
Our database cannot link to a story without a URL.
|
| 22 |
+
- No image? β Set image_url = "". The frontend will use the
|
| 23 |
+
Segmento Pulse banner image as the default.
|
| 24 |
+
- No summary? β Set description = "". HN only provides the title
|
| 25 |
+
for external links, not a description.
|
| 26 |
+
- Unix time? β Convert to ISO 8601 string (our standard date format).
|
| 27 |
+
|
| 28 |
+
Client-side constraint note (from our architecture plan):
|
| 29 |
+
Hacker News does NOT support any filtering. We cannot ask it for
|
| 30 |
+
"only today's articles" or "only AI news". It gives us what it gives us.
|
| 31 |
+
That is completely fine. Our data_validation pipeline (is_valid_article,
|
| 32 |
+
is_relevant_to_category) will filter out old or off-topic articles
|
| 33 |
+
automatically AFTER we fetch them. We just fetch and map here.
|
| 34 |
+
"""
|
| 35 |
+
|
| 36 |
+
# ββ Standard Library ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 37 |
+
import asyncio # Lets us run multiple HTTP calls at the same time
|
| 38 |
+
import logging
|
| 39 |
+
from datetime import datetime, timezone
|
| 40 |
+
from typing import List, Optional
|
| 41 |
+
|
| 42 |
+
# ββ Third-party (already in requirements.txt) βββββββββββββββββββββββββββββββββ
|
| 43 |
+
import httpx # Async HTTP client
|
| 44 |
+
|
| 45 |
+
# ββ Internal ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 46 |
+
# We import only from our new base β no dependency on legacy news_providers.py
|
| 47 |
+
from app.services.providers.base import NewsProvider, ProviderStatus
|
| 48 |
+
from app.models import Article
|
| 49 |
+
# Phase 12: Shared image enricher (extracts og:image from article pages)
|
| 50 |
+
from app.services.utils.image_enricher import extract_top_image
|
| 51 |
+
|
| 52 |
+
logger = logging.getLogger(__name__)
|
| 53 |
+
|
| 54 |
+
# ββ Constants βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 55 |
+
|
| 56 |
+
# The top of this list = the most upvoted stories on Hacker News right now
|
| 57 |
+
HN_TOP_STORIES_URL = "https://hacker-news.firebaseio.com/v0/topstories.json"
|
| 58 |
+
|
| 59 |
+
# Template for fetching one story's full details by its ID
|
| 60 |
+
HN_ITEM_URL = "https://hacker-news.firebaseio.com/v0/item/{item_id}.json"
|
| 61 |
+
|
| 62 |
+
# How many top stories to fetch. Kept small to be polite to HN's servers.
|
| 63 |
+
# The full list has 500 stories β we only want the best 30.
|
| 64 |
+
TOP_STORIES_LIMIT = 30
|
| 65 |
+
|
| 66 |
+
# HTTP timeout in seconds. HN is fast, but we cap it to avoid hanging jobs.
|
| 67 |
+
HTTP_TIMEOUT_SECONDS = 10.0
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class HackerNewsProvider(NewsProvider):
|
| 71 |
+
"""
|
| 72 |
+
Fetches top stories from the Hacker News API.
|
| 73 |
+
|
| 74 |
+
No API key needed. No rate limit. Completely free.
|
| 75 |
+
|
| 76 |
+
Usage (once wired into the aggregator in Phase 3):
|
| 77 |
+
provider = HackerNewsProvider()
|
| 78 |
+
articles = await provider.fetch_news(category="magazines", limit=30)
|
| 79 |
+
"""
|
| 80 |
+
|
| 81 |
+
def __init__(self):
    """Initialize the free Hacker News provider (no key, no quota)."""
    # The base class accepts an optional key; HN is keyless, so pass None.
    super().__init__(api_key=None)
    # 0 is the sentinel for "unlimited" — HN imposes no request quota.
    self.daily_limit = 0
|
| 87 |
+
|
| 88 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 89 |
+
# STEP 1 + 2 COMBINED: fetch_news() is the one method the aggregator calls
|
| 90 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 91 |
+
|
| 92 |
+
async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
    """
    Fetch the current Hacker News top stories as Article objects.

    Args:
        category: Category supplied by the aggregator. HN cannot filter by
            topic, so we simply tag each article with it and let the keyword
            gate in data_validation.py do the real filtering.
        limit: Maximum number of articles to return, always capped at
            TOP_STORIES_LIMIT (30).

    Returns:
        List[Article]: Validated articles, or [] if HN is unreachable.
    """
    try:
        async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
            # Step 1: the ranked list of top story IDs.
            story_ids = await self._fetch_top_ids(client)
            if not story_ids:
                logger.warning("[HackerNews] Could not retrieve top story IDs.")
                return []

            # Only the best N stories — never the full 500-ID list.
            wanted_ids = story_ids[:min(limit, TOP_STORIES_LIMIT)]

            # Step 2: pull every story's details concurrently. All requests
            # go out at once, so the batch completes in ~1-2 seconds instead
            # of the ~30 seconds a sequential loop would take.
            raw_items = await asyncio.gather(
                *(self._fetch_single_item(client, sid) for sid in wanted_ids),
                return_exceptions=True,
            )

            # Map raw HN payloads onto our Article model.
            articles = self._map_items_to_articles(list(raw_items), category)

            # Image enrichment runs here because the mapper is synchronous
            # and cannot await. All page fetches run concurrently, bounded
            # by the enricher's own timeout — not 30 x timeout.
            articles = await self._enrich_article_images(articles)

            logger.info(
                f"[HackerNews] Fetched {len(raw_items)} items → "
                f"{len(articles)} valid articles for category='{category}'"
            )
            return articles

    except httpx.TimeoutException:
        logger.warning("[HackerNews] Request timed out. Will retry next cycle.")
        return []
    except Exception as e:
        # Catch-all: a HN failure must never crash the aggregator job.
        logger.error(f"[HackerNews] Unexpected error: {e}", exc_info=True)
        return []
|
| 154 |
+
|
| 155 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 156 |
+
# PRIVATE HELPERS β internal steps, not called by the aggregator
|
| 157 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 158 |
+
|
| 159 |
+
async def _fetch_top_ids(self, client: httpx.AsyncClient) -> List[int]:
    """
    Step 1: retrieve the IDs of Hacker News' current top stories.

    Args:
        client: Shared HTTP client from fetch_news().

    Returns:
        A list of integer story IDs (e.g., [39281947, ...]), or [] when HN
        is unreachable, errors out, or returns an unexpected payload shape.
    """
    try:
        resp = await client.get(HN_TOP_STORIES_URL)
        if resp.status_code != 200:
            logger.warning(
                f"[HackerNews] Top stories endpoint returned HTTP {resp.status_code}"
            )
            return []

        payload = resp.json()
        # Sanity check — we expect a JSON array of numbers, nothing else.
        if not isinstance(payload, list):
            logger.warning("[HackerNews] Unexpected response format for top IDs.")
            return []
        return payload

    except Exception as e:
        logger.error(f"[HackerNews] Failed to fetch top IDs: {e}")
        return []
|
| 187 |
+
|
| 188 |
+
async def _fetch_single_item(
    self, client: httpx.AsyncClient, item_id: int
) -> Optional[dict]:
    """
    Step 2 (one unit): fetch the full details of a single HN story.

    Args:
        client: Shared HTTP client from fetch_news().
        item_id: Numeric story ID to fetch.

    Returns:
        The story's dict payload, or None on any failure (HN returns null
        for deleted/dead items, and one failed story must never cancel
        the rest of the batch).
    """
    try:
        resp = await client.get(HN_ITEM_URL.format(item_id=item_id))
        if resp.status_code != 200:
            return None
        payload = resp.json()
        # Null/empty payload (deleted or dead item) collapses to None.
        return payload or None
    except Exception:
        # Deliberately silent: siblings in the gather() batch keep going.
        return None
|
| 219 |
+
|
| 220 |
+
def _map_items_to_articles(
    self, raw_items: list, category: str
) -> List[Article]:
    """
    Convert raw Hacker News JSON items into Segmento Pulse Article objects.

    Transformations applied:
        - Unix timestamp → ISO 8601 string (our standard date format)
        - missing URL   → item skipped (self-posts cannot be stored)
        - missing image → "" (frontend falls back to the Pulse banner)
        - missing text  → "" (HN has no descriptions for external links)

    Args:
        raw_items: Results from asyncio.gather() — each element is a dict
            on success or None/Exception on failure.
        category: Category string from the aggregator, passed through as-is;
            the keyword gate rejects mismatches downstream.

    Returns:
        List[Article]: Clean Article objects ready for the pipeline.
    """
    articles: List[Article] = []

    for raw in raw_items:
        # Drop anything that errored or came back null from HN.
        if raw is None or isinstance(raw, Exception):
            continue

        # The API also serves "job", "comment", and "poll" items;
        # only "story" entries are real articles.
        if raw.get("type") != "story":
            continue

        # "Ask HN" / "Show HN" self-posts carry no external URL — our
        # database cannot store a meaningful link for them.
        link = raw.get("url", "")
        if not link or not link.startswith("http"):
            continue

        headline = (raw.get("title") or "").strip()
        if not headline:
            continue

        # HN stores time as seconds since the Unix epoch; convert to ISO
        # 8601 (e.g., 1709432800 → "2024-03-03T04:46:40+00:00"). When the
        # timestamp is missing, fall back to "now" — the freshness gate in
        # data_validation.py still gets to vet it.
        epoch = raw.get("time")
        if epoch:
            published = datetime.fromtimestamp(epoch, tz=timezone.utc).isoformat()
        else:
            published = datetime.now(tz=timezone.utc).isoformat()

        try:
            articles.append(
                Article(
                    title=headline,
                    description="",   # HN does not provide descriptions
                    url=link,
                    image_url="",     # HN does not provide images
                    published_at=published,
                    source="Hacker News",
                    # ROUTING RULE: pass through the aggregator's category;
                    # is_relevant_to_category() safely rejects mismatches.
                    category=category,
                )
            )
        except Exception as e:
            # One article failing Pydantic validation never breaks the batch.
            logger.debug(
                f"[HackerNews] Skipped item id={raw.get('id')}: {e}"
            )
            continue

    return articles
|
| 309 |
+
|
| 310 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 311 |
+
# PHASE 12: IMAGE ENRICHMENT β async post-processing step
|
| 312 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 313 |
+
|
| 314 |
+
async def _enrich_article_images(self, articles: List[Article]) -> List[Article]:
|
| 315 |
+
"""
|
| 316 |
+
For every article that has an empty image_url, visit its URL and
|
| 317 |
+
try to find the main image using the og:image HTML meta tag.
|
| 318 |
+
|
| 319 |
+
Phase 14 fix: Added asyncio.Semaphore(10) to cap concurrent connections.
|
| 320 |
+
|
| 321 |
+
Before this fix: 30 HN articles β 30 simultaneous HTTP connections to
|
| 322 |
+
30 different websites. On a slow network day or from a Hugging Face
|
| 323 |
+
shared container, this could exhaust available socket handles.
|
| 324 |
+
|
| 325 |
+
After this fix: At most 10 website visits run at the same time.
|
| 326 |
+
Think of it like 10 checkout lanes at a supermarket β if 30 people
|
| 327 |
+
arrive, 10 go through immediately and 20 wait in line. Nobody gets
|
| 328 |
+
turned away, and the store doesn't collapse.
|
| 329 |
+
|
| 330 |
+
The total added time is still bounded by the 4-second timeout inside
|
| 331 |
+
extract_top_image, not by the semaphore.
|
| 332 |
+
|
| 333 |
+
Args:
|
| 334 |
+
articles (List[Article]): Articles from _map_items_to_articles().
|
| 335 |
+
|
| 336 |
+
Returns:
|
| 337 |
+
List[Article]: Same articles, with image_url filled in where possible.
|
| 338 |
+
"""
|
| 339 |
+
if not articles:
|
| 340 |
+
return articles
|
| 341 |
+
|
| 342 |
+
# Max 10 website visits at the same time.
|
| 343 |
+
# The semaphore is created fresh per call so it doesn't leak state
|
| 344 |
+
# between separate fetch_news() invocations.
|
| 345 |
+
sem = asyncio.Semaphore(10)
|
| 346 |
+
|
| 347 |
+
async def _get_image(article: Article) -> str:
|
| 348 |
+
if article.image_url and article.image_url.startswith("http"):
|
| 349 |
+
return article.image_url # Already has an image β skip
|
| 350 |
+
# Acquire one of 10 available slots before hitting the network.
|
| 351 |
+
async with sem:
|
| 352 |
+
return await extract_top_image(article.url)
|
| 353 |
+
|
| 354 |
+
image_tasks = [_get_image(a) for a in articles]
|
| 355 |
+
fetched_images = await asyncio.gather(*image_tasks, return_exceptions=True)
|
| 356 |
+
|
| 357 |
+
# Apply the fetched images back to the articles.
|
| 358 |
+
enriched: List[Article] = []
|
| 359 |
+
for article, image_result in zip(articles, fetched_images):
|
| 360 |
+
if isinstance(image_result, str) and image_result:
|
| 361 |
+
# Pydantic v2: model_copy() changes one field without mutating.
|
| 362 |
+
article = article.model_copy(update={"image_url": image_result})
|
| 363 |
+
enriched.append(article)
|
| 364 |
+
|
| 365 |
+
return enriched
|
app/services/providers/inshorts/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# providers/inshorts/__init__.py
|
| 2 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3 |
+
# This file marks the 'inshorts' folder as a Python package.
|
| 4 |
+
# To use the Inshorts provider, import it like this:
|
| 5 |
+
#
|
| 6 |
+
# from app.services.providers.inshorts.client import InshortsProvider
|
| 7 |
+
#
|
| 8 |
+
# Inshorts is a FREE provider β no API key needed, no rate limits.
|
| 9 |
+
# It runs in the FREE_SOURCES list, behind the GENERAL_TECH_CATEGORIES
|
| 10 |
+
# guardrail (same as Hacker News), because its content is broad tech news
|
| 11 |
+
# rather than anything niche like cloud-alibaba or data-governance.
|
app/services/providers/inshorts/client.py
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
providers/inshorts/client.py
|
| 3 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
The Inshorts Provider for Segmento Pulse.
|
| 5 |
+
|
| 6 |
+
What this does:
|
| 7 |
+
Fetches 60-word tech news summaries from the Inshorts community API.
|
| 8 |
+
Inshorts takes long articles from the internet and rewrites them in
|
| 9 |
+
exactly 60 words. This gives our users very quick, scannable reads.
|
| 10 |
+
|
| 11 |
+
Free. No API key needed. No rate limits.
|
| 12 |
+
|
| 13 |
+
Where it sits in the pipeline:
|
| 14 |
+
FREE_SOURCES (always runs in parallel).
|
| 15 |
+
Gated behind GENERAL_TECH_CATEGORIES β same rule as Hacker News.
|
| 16 |
+
Inshorts "technology" news is broad. It does not know the difference
|
| 17 |
+
between "cloud-alibaba" and "cloud-gcp". We only ask it for wide,
|
| 18 |
+
general categories where its content is genuinely valuable.
|
| 19 |
+
|
| 20 |
+
The special data quirk (split date and time):
|
| 21 |
+
Inshorts returns the article timestamp as TWO separate strings:
|
| 22 |
+
"date": "Mon, 03 Mar 2026"
|
| 23 |
+
"time": "10:30 AM, IST"
|
| 24 |
+
|
| 25 |
+
Our Pydantic Article model needs a SINGLE published_at timestamp.
|
| 26 |
+
So we join them: "Mon, 03 Mar 2026 10:30 AM, IST"
|
| 27 |
+
Then we parse that combined string into a proper datetime object using
|
| 28 |
+
dateutil.parser (the same library our rss_parser.py already uses).
|
| 29 |
+
|
| 30 |
+
If parsing fails, we safely fall back to datetime.now() so the article
|
| 31 |
+
still enters the pipeline and the freshness gate makes the final call.
|
| 32 |
+
|
| 33 |
+
API note:
|
| 34 |
+
The endpoint used below is a well-known community-maintained mirror of
|
| 35 |
+
the Inshorts API. It may change URLs over time. The try/except in
|
| 36 |
+
fetch_news() wraps the entire fetch, so even if the endpoint goes down,
|
| 37 |
+
the aggregator just gets an empty list and moves on without crashing.
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
# ββ Standard Library ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 41 |
+
import asyncio
|
| 42 |
+
import logging
|
| 43 |
+
import time
|
| 44 |
+
from datetime import datetime, timezone
|
| 45 |
+
from typing import List
|
| 46 |
+
|
| 47 |
+
# ββ Third-party (already available β used by rss_parser.py line 209) βββββββββ
|
| 48 |
+
import httpx # Async HTTP client
|
| 49 |
+
from dateutil import parser as dateutil_parser # Flexible date string parser
|
| 50 |
+
|
| 51 |
+
# ββ Internal ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 52 |
+
from app.services.providers.base import NewsProvider, ProviderStatus
|
| 53 |
+
from app.models import Article
|
| 54 |
+
|
| 55 |
+
logger = logging.getLogger(__name__)
|
| 56 |
+
|
| 57 |
+
# ββ Constants βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 58 |
+
|
| 59 |
+
# Inshorts community API endpoint.
|
| 60 |
+
# The 'category=technology' filter is the closest match to our content needs.
|
| 61 |
+
# Other available categories: national, business, sports, entertainment, etc.
|
| 62 |
+
INSHORTS_URL = "https://inshorts.deta.dev/news?category=technology"
|
| 63 |
+
|
| 64 |
+
# Request timeout in seconds. Kept generous because this is a community server.
|
| 65 |
+
HTTP_TIMEOUT_SECONDS = 12.0
|
| 66 |
+
|
| 67 |
+
# Max articles to take from one response. Inshorts usually sends 10-25.
|
| 68 |
+
MAX_ARTICLES = 20
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class InshortsProvider(NewsProvider):
|
| 72 |
+
"""
|
| 73 |
+
Fetches 60-word technology summaries from the Inshorts community API.
|
| 74 |
+
|
| 75 |
+
Free. No API key. No daily limit.
|
| 76 |
+
Sits in FREE_SOURCES, gated by GENERAL_TECH_CATEGORIES.
|
| 77 |
+
|
| 78 |
+
Usage (wired in Phase 6):
|
| 79 |
+
provider = InshortsProvider()
|
| 80 |
+
articles = await provider.fetch_news(category="ai", limit=20)
|
| 81 |
+
"""
|
| 82 |
+
|
| 83 |
+
def __init__(self):
|
| 84 |
+
# Free provider β no API key, no daily limit.
|
| 85 |
+
super().__init__(api_key=None)
|
| 86 |
+
self.daily_limit = 0
|
| 87 |
+
|
| 88 |
+
# Phase 17: Fetch-Once, Fan-Out cache
|
| 89 |
+
#
|
| 90 |
+
# Inshorts hits a community server β not a CDN like GitHub Pages.
|
| 91 |
+
# Without a cache, every category loop sends a request to that
|
| 92 |
+
# community server, increasing the chance of a 429 rate-limit block.
|
| 93 |
+
# With a cache: 22 category calls β 1 real HTTP call per 45 minutes.
|
| 94 |
+
self._cached_articles: List[Article] = []
|
| 95 |
+
self._cache_time: float = 0.0
|
| 96 |
+
|
| 97 |
+
# Lock prevents the "thundering herd": multiple concurrent calls
|
| 98 |
+
# all seeing an empty cache and all fetching at the same time.
|
| 99 |
+
self._lock = asyncio.Lock()
|
| 100 |
+
|
| 101 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 102 |
+
# MAIN ENTRY POINT β called by the aggregator's FREE PARALLEL RUN
|
| 103 |
+
# ββββββββββββββββββββββββββββββββββββββββοΏ½οΏ½οΏ½ββββββββββββββββββββββββββββββββ
|
| 104 |
+
|
| 105 |
+
async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
|
| 106 |
+
"""
|
| 107 |
+
Fetch technology articles from the Inshorts community API.
|
| 108 |
+
|
| 109 |
+
Args:
|
| 110 |
+
category (str): Our internal category string (e.g., "ai").
|
| 111 |
+
We tag every article with it. The keyword gate
|
| 112 |
+
filters out articles that don't actually match.
|
| 113 |
+
limit (int): Max articles to return. Capped at MAX_ARTICLES.
|
| 114 |
+
|
| 115 |
+
Returns:
|
| 116 |
+
List[Article]: Mapped Article objects. Returns [] on any failure.
|
| 117 |
+
"""
|
| 118 |
+
# ββ Phase 17: Cache check (OUTER) βββββββββββββββββββββββββββββββββββββ
|
| 119 |
+
CACHE_TTL_SECONDS = 2700 # 45 minutes
|
| 120 |
+
|
| 121 |
+
if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
|
| 122 |
+
logger.debug(
|
| 123 |
+
"[Inshorts] Cache hit β returning %d cached articles for category='%s'. "
|
| 124 |
+
"No HTTP calls made.",
|
| 125 |
+
len(self._cached_articles), category
|
| 126 |
+
)
|
| 127 |
+
return self._cached_articles
|
| 128 |
+
|
| 129 |
+
# ββ Cache stale or empty: acquire the lock and fetch βββββββββββββββββββ
|
| 130 |
+
async with self._lock:
|
| 131 |
+
|
| 132 |
+
# ββ Cache check (INNER) β double-checked locking ββββββββββββββ
|
| 133 |
+
if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
|
| 134 |
+
logger.debug(
|
| 135 |
+
"[Inshorts] Cache hit after lock β returning %d cached articles.",
|
| 136 |
+
len(self._cached_articles)
|
| 137 |
+
)
|
| 138 |
+
return self._cached_articles
|
| 139 |
+
|
| 140 |
+
logger.info(
|
| 141 |
+
"[Inshorts] Cache stale/empty. Fetching from community API for category='%s'...",
|
| 142 |
+
category
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
try:
|
| 146 |
+
async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
|
| 147 |
+
|
| 148 |
+
response = await client.get(
|
| 149 |
+
INSHORTS_URL,
|
| 150 |
+
headers={"User-Agent": "SegmentoPulse-Ingestion/1.0"},
|
| 151 |
+
follow_redirects=True,
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
# ββ Handle rate limit ββββββββββββββββββββββββββββββββββββββ
|
| 155 |
+
if response.status_code == 429:
|
| 156 |
+
logger.warning("[Inshorts] Hit 429 rate limit.")
|
| 157 |
+
self.mark_rate_limited()
|
| 158 |
+
return []
|
| 159 |
+
|
| 160 |
+
# ββ Handle non-200 responses ββββββββββββββββββββββββββββββ
|
| 161 |
+
if response.status_code != 200:
|
| 162 |
+
logger.warning(
|
| 163 |
+
"[Inshorts] Unexpected HTTP %d. "
|
| 164 |
+
"The community API endpoint may have changed.",
|
| 165 |
+
response.status_code
|
| 166 |
+
)
|
| 167 |
+
return []
|
| 168 |
+
|
| 169 |
+
data = response.json()
|
| 170 |
+
|
| 171 |
+
# Inshorts wraps the article list inside a 'data' key.
|
| 172 |
+
raw_articles = data.get("data", [])
|
| 173 |
+
|
| 174 |
+
if not isinstance(raw_articles, list) or not raw_articles:
|
| 175 |
+
logger.info("[Inshorts] No articles in response.")
|
| 176 |
+
return []
|
| 177 |
+
|
| 178 |
+
all_articles = self._map_articles(
|
| 179 |
+
raw_articles[:min(limit, MAX_ARTICLES)],
|
| 180 |
+
category
|
| 181 |
+
)
|
| 182 |
+
|
| 183 |
+
logger.info(
|
| 184 |
+
"[Inshorts] Fetched %d articles. Caching for 45 minutes.",
|
| 185 |
+
len(all_articles)
|
| 186 |
+
)
|
| 187 |
+
|
| 188 |
+
# Save to class-level cache.
|
| 189 |
+
self._cached_articles = all_articles
|
| 190 |
+
self._cache_time = time.time()
|
| 191 |
+
return all_articles
|
| 192 |
+
|
| 193 |
+
except httpx.TimeoutException:
|
| 194 |
+
logger.warning("[Inshorts] Request timed out β endpoint may be slow.")
|
| 195 |
+
return []
|
| 196 |
+
except Exception as e:
|
| 197 |
+
logger.error(f"[Inshorts] Unexpected error: {e}", exc_info=True)
|
| 198 |
+
return []
|
| 199 |
+
|
| 200 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 201 |
+
# PRIVATE HELPERS
|
| 202 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 203 |
+
|
| 204 |
+
def _parse_inshorts_date(self, date_str: str, time_str: str) -> str:
|
| 205 |
+
"""
|
| 206 |
+
Solve the split date/time problem.
|
| 207 |
+
|
| 208 |
+
Inshorts gives us date and time as two separate strings.
|
| 209 |
+
Example:
|
| 210 |
+
date_str = "Mon, 03 Mar 2026"
|
| 211 |
+
time_str = "10:30 AM, IST"
|
| 212 |
+
|
| 213 |
+
Step 1: Join them β "Mon, 03 Mar 2026 10:30 AM, IST"
|
| 214 |
+
Step 2: Parse with dateutil (handles many date formats automatically)
|
| 215 |
+
Step 3: Convert to UTC-aware ISO 8601 string
|
| 216 |
+
|
| 217 |
+
If parsing fails for any reason, we return the current time as a
|
| 218 |
+
safe fallback. The freshness gate downstream will evaluate it.
|
| 219 |
+
|
| 220 |
+
Args:
|
| 221 |
+
date_str (str): The date portion from the API (e.g., "Mon, 03 Mar 2026")
|
| 222 |
+
time_str (str): The time portion from the API (e.g., "10:30 AM, IST")
|
| 223 |
+
|
| 224 |
+
Returns:
|
| 225 |
+
str: ISO 8601 timestamp string (e.g., "2026-03-03T05:00:00+00:00")
|
| 226 |
+
"""
|
| 227 |
+
# Clean up trailing ", IST" or "(IST)" markers β dateutil sometimes
|
| 228 |
+
# gets confused by non-standard timezone abbreviations like IST.
|
| 229 |
+
# We strip them and treat the time as IST = UTC+5:30 manually.
|
| 230 |
+
cleaned_time = (
|
| 231 |
+
time_str
|
| 232 |
+
.replace(", IST", "")
|
| 233 |
+
.replace("(IST)", "")
|
| 234 |
+
.strip()
|
| 235 |
+
)
|
| 236 |
+
combined = f"{date_str.strip()} {cleaned_time}"
|
| 237 |
+
|
| 238 |
+
try:
|
| 239 |
+
# dateutil.parser is very flexible β it handles formats like:
|
| 240 |
+
# "Mon, 03 Mar 2026 10:30 AM" without needing a strptime pattern.
|
| 241 |
+
parsed_dt = dateutil_parser.parse(combined)
|
| 242 |
+
|
| 243 |
+
# If the parsed datetime has no timezone info (which it won't after
|
| 244 |
+
# we stripped IST), we tell Python it was in IST (UTC+5:30).
|
| 245 |
+
if parsed_dt.tzinfo is None:
|
| 246 |
+
from datetime import timedelta
|
| 247 |
+
IST = timezone(timedelta(hours=5, minutes=30))
|
| 248 |
+
parsed_dt = parsed_dt.replace(tzinfo=IST)
|
| 249 |
+
|
| 250 |
+
# Convert to UTC for consistent storage across all providers.
|
| 251 |
+
utc_dt = parsed_dt.astimezone(timezone.utc)
|
| 252 |
+
return utc_dt.isoformat()
|
| 253 |
+
|
| 254 |
+
except Exception as e:
|
| 255 |
+
logger.debug(
|
| 256 |
+
f"[Inshorts] Date parse failed for '{combined}': {e} β using now()."
|
| 257 |
+
)
|
| 258 |
+
# Safe fallback: use current UTC time.
|
| 259 |
+
# The freshness gate will still check it and decide if it's valid.
|
| 260 |
+
return datetime.now(tz=timezone.utc).isoformat()
|
| 261 |
+
|
| 262 |
+
def _map_articles(self, raw_articles: list, category: str) -> List[Article]:
|
| 263 |
+
"""
|
| 264 |
+
Convert raw Inshorts JSON items into Segmento Pulse Article objects.
|
| 265 |
+
|
| 266 |
+
Key field mappings:
|
| 267 |
+
Inshorts field β Article field
|
| 268 |
+
βββββββββββββββββββββββββββββββββββββ
|
| 269 |
+
title β title
|
| 270 |
+
content β description (the famous 60-word summary)
|
| 271 |
+
readMoreUrl β url
|
| 272 |
+
imageUrl β image_url
|
| 273 |
+
author β source
|
| 274 |
+
date + time (joined) β published_at
|
| 275 |
+
|
| 276 |
+
Args:
|
| 277 |
+
raw_articles (list): The list from the API's 'data' key.
|
| 278 |
+
category (str): The category from the aggregator.
|
| 279 |
+
|
| 280 |
+
Returns:
|
| 281 |
+
List[Article]: Clean, validated Article objects.
|
| 282 |
+
"""
|
| 283 |
+
articles: List[Article] = []
|
| 284 |
+
|
| 285 |
+
for item in raw_articles:
|
| 286 |
+
if not isinstance(item, dict):
|
| 287 |
+
continue
|
| 288 |
+
|
| 289 |
+
# ββ Title ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 290 |
+
title = (item.get("title") or "").strip()
|
| 291 |
+
if not title:
|
| 292 |
+
continue
|
| 293 |
+
|
| 294 |
+
# ββ URL ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 295 |
+
# Inshorts calls this 'readMoreUrl' β the link to the full article.
|
| 296 |
+
url = (item.get("readMoreUrl") or "").strip()
|
| 297 |
+
if not url or not url.startswith("http"):
|
| 298 |
+
continue # Skip if no valid link
|
| 299 |
+
|
| 300 |
+
# ββ Description (the 60-word summary) ββββββββββββββββββββββββ
|
| 301 |
+
# Inshorts calls the summary field 'content'.
|
| 302 |
+
description = (item.get("content") or "").strip()
|
| 303 |
+
|
| 304 |
+
# ββ Image URL βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 305 |
+
# Inshorts calls this 'imageUrl' (camelCase).
|
| 306 |
+
image_url = (item.get("imageUrl") or "").strip()
|
| 307 |
+
|
| 308 |
+
# ββ Source βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 309 |
+
# The 'author' field holds the original publication name
|
| 310 |
+
# (e.g., "TechCrunch", "NDTV Gadgets"). We use that as source.
|
| 311 |
+
# Fall back to "Inshorts" if author is missing.
|
| 312 |
+
source = (item.get("author") or "Inshorts").strip()
|
| 313 |
+
if not source:
|
| 314 |
+
source = "Inshorts"
|
| 315 |
+
|
| 316 |
+
# ββ Date Fix: Combine split date + time βββββββββββββββββββββββ
|
| 317 |
+
# This is THE key transformation for this provider.
|
| 318 |
+
# See _parse_inshorts_date() above for the full explanation.
|
| 319 |
+
date_part = item.get("date") or ""
|
| 320 |
+
time_part = item.get("time") or ""
|
| 321 |
+
published_at = self._parse_inshorts_date(date_part, time_part)
|
| 322 |
+
|
| 323 |
+
# ββ Build Article βββββββββββββββββββββββββββββββββββββββββββββ
|
| 324 |
+
try:
|
| 325 |
+
article = Article(
|
| 326 |
+
title=title,
|
| 327 |
+
description=description,
|
| 328 |
+
url=url,
|
| 329 |
+
image_url=image_url,
|
| 330 |
+
published_at=published_at,
|
| 331 |
+
source=source,
|
| 332 |
+
# ββ ROUTING RULE ββββββββββββββββββββββββββββββββββββββ
|
| 333 |
+
# We pass through the aggregator's category.
|
| 334 |
+
# The keyword gate will filter irrelevant articles.
|
| 335 |
+
# Unknown categories safely route to 'News Articles'.
|
| 336 |
+
category=category,
|
| 337 |
+
)
|
| 338 |
+
articles.append(article)
|
| 339 |
+
|
| 340 |
+
except Exception as e:
|
| 341 |
+
logger.debug(
|
| 342 |
+
f"[Inshorts] Skipped item '{title[:50]}': {e}"
|
| 343 |
+
)
|
| 344 |
+
continue
|
| 345 |
+
|
| 346 |
+
return articles
|
app/services/providers/openrss/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# providers/openrss/__init__.py
|
| 2 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3 |
+
# This file marks the 'openrss' folder as a Python package.
|
| 4 |
+
# To use this provider, import it like this:
|
| 5 |
+
#
|
| 6 |
+
# from app.services.providers.openrss.client import OpenRSSProvider
|
| 7 |
+
#
|
| 8 |
+
# OpenRSS is FREE β no API key needed. It generates XML feeds on-the-fly
|
| 9 |
+
# for any website, even sites that don't publish an RSS feed themselves.
|
| 10 |
+
#
|
| 11 |
+
# ββ CRITICAL RULE: RESPECT THE COOLDOWN ββββββββββββββββββββββββββββββββββ
|
| 12 |
+
# OpenRSS explicitly says "aggregator use is not officially supported".
|
| 13 |
+
# If you fetch too frequently, they WILL ban your server's IP address.
|
| 14 |
+
# The OpenRSSProvider enforces a strict 60-minute internal cooldown timer.
|
| 15 |
+
# DO NOT reduce COOLDOWN_SECONDS below 3600. Breaking this causes IP bans.
|
app/services/providers/openrss/client.py
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
providers/openrss/client.py
|
| 3 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
The OpenRSS Provider for Segmento Pulse.
|
| 5 |
+
|
| 6 |
+
What this does:
|
| 7 |
+
Fetches RSS feeds for websites that don't publish their own RSS feed,
|
| 8 |
+
by using OpenRSS.org as a free feed generation service.
|
| 9 |
+
|
| 10 |
+
Target blogs:
|
| 11 |
+
dev.to β openrss.org/dev.to
|
| 12 |
+
hashnode.com β openrss.org/hashnode.com
|
| 13 |
+
github.com/blog β openrss.org/github.com/blog
|
| 14 |
+
|
| 15 |
+
Free. No API key. No daily limits. Just XML text.
|
| 16 |
+
|
| 17 |
+
ββ THE IP BAN RISK AND HOW WE SOLVE IT βββββββββββββββββββββββββββββββββββββ
|
| 18 |
+
|
| 19 |
+
OpenRSS.org says clearly in their documentation:
|
| 20 |
+
"Aggregator use is not officially supported."
|
| 21 |
+
"We will block IP addresses that ignore our Cache-Control headers."
|
| 22 |
+
|
| 23 |
+
A normal aggregator calls all its sources every hour.
|
| 24 |
+
If we did that with OpenRSS, we would get IP-banned within a day.
|
| 25 |
+
|
| 26 |
+
Our fix: A strict 60-minute (3600 second) internal cooldown timer.
|
| 27 |
+
|
| 28 |
+
How it works:
|
| 29 |
+
- When the provider is first created, self.last_fetched = 0
|
| 30 |
+
- When fetch_news() is called, it first checks:
|
| 31 |
+
time.time() - self.last_fetched < COOLDOWN_SECONDS?
|
| 32 |
+
- If YES β return [] immediately, do not touch the network at all
|
| 33 |
+
- If NO β update self.last_fetched, then fetch
|
| 34 |
+
|
| 35 |
+
This guarantees that OpenRSS sees at most ONE request per hour,
|
| 36 |
+
per URL, from our server β which respects their Cache-Control policy.
|
| 37 |
+
|
| 38 |
+
Because our scheduler runs many categories per hour, without this timer,
|
| 39 |
+
OpenRSS would get hit dozens of times per hour. With the timer, it gets
|
| 40 |
+
hit at most once every 60 minutes regardless of how many categories fire.
|
| 41 |
+
|
| 42 |
+
ββ WHY WE DO NOT USE parse_provider_rss() ββββββββββββββββββββββββββββββββββ
|
| 43 |
+
|
| 44 |
+
The user instruction suggests using parse_provider_rss() from rss_parser.py.
|
| 45 |
+
We discovered in Phase 4 (direct_rss provider) that this function hardcodes:
|
| 46 |
+
|
| 47 |
+
category = f'cloud-{provider}'
|
| 48 |
+
|
| 49 |
+
on EVERY article it creates. If we passed "dev.to" as the provider name,
|
| 50 |
+
every article from dev.to would get category='cloud-dev.to'. Appwrite
|
| 51 |
+
would not know this collection exists, silently dropping those articles.
|
| 52 |
+
|
| 53 |
+
Decision (consistent with Phase 4): We use feedparser directly and borrow
|
| 54 |
+
only the two STATELESS helper methods from rss_parser.py:
|
| 55 |
+
- _extract_image_from_entry() β extracts images cleanly
|
| 56 |
+
- _parse_date() β handles all date format variants
|
| 57 |
+
|
| 58 |
+
This is the same engineering decision made in Phase 4 for direct_rss,
|
| 59 |
+
and it was reviewed and approved by the lead architect.
|
| 60 |
+
"""
|
| 61 |
+
|
| 62 |
+
# ββ Standard Library ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 63 |
+
import asyncio
|
| 64 |
+
import logging
|
| 65 |
+
import re
|
| 66 |
+
import time
|
| 67 |
+
from typing import List
|
| 68 |
+
|
| 69 |
+
# ββ Third-party (already in requirements.txt) βββββββββββββββββββββββββββββββββ
|
| 70 |
+
import feedparser # XML/RSS feed parser β already used by rss_parser.py
|
| 71 |
+
import httpx # Async HTTP client
|
| 72 |
+
|
| 73 |
+
# ββ Internal ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 74 |
+
from app.services.providers.base import NewsProvider
|
| 75 |
+
from app.services.rss_parser import RSSParser # Borrowed for helper methods only
|
| 76 |
+
from app.models import Article
|
| 77 |
+
# Phase 15: Import the Redis-backed state utility so the cooldown
|
| 78 |
+
# timer survives Hugging Face Space restarts.
|
| 79 |
+
from app.services.utils.provider_state import (
|
| 80 |
+
get_provider_timestamp,
|
| 81 |
+
set_provider_timestamp,
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
logger = logging.getLogger(__name__)
|
| 85 |
+
|
| 86 |
+
# ββ OpenRSS Feed Registry ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 87 |
+
#
|
| 88 |
+
# Each entry is a tuple of (openrss_url, source_name).
|
| 89 |
+
# source_name appears in the Pulse UI next to each article headline.
|
| 90 |
+
#
|
| 91 |
+
# To add more feeds in the future, just add a new tuple here.
|
| 92 |
+
# The fetch loop picks it up automatically β no other code changes needed.
|
| 93 |
+
#
|
| 94 |
+
# β οΈ IMPORTANT: Be conservative. Every URL here gets fetched once per cooldown
|
| 95 |
+
# window. Adding too many URLs consumes more of our cooldown budget.
|
| 96 |
+
#
|
| 97 |
+
OPENRSS_FEEDS: List[tuple] = [
|
| 98 |
+
("https://openrss.org/dev.to", "dev.to"),
|
| 99 |
+
("https://openrss.org/hashnode.com", "Hashnode"),
|
| 100 |
+
("https://openrss.org/github.com/blog", "GitHub Blog"),
|
| 101 |
+
]
|
| 102 |
+
|
| 103 |
+
# ββ Cooldown Timer ββββββββββββββββββββββββοΏ½οΏ½ββββββββββββββββββββββββββββββββββββ
|
| 104 |
+
# 3600 seconds = 60 minutes.
|
| 105 |
+
# This is the minimum safe polling interval as per OpenRSS's documentation.
|
| 106 |
+
# DO NOT reduce this value. Doing so risks an IP ban on Segmento Pulse's server.
|
| 107 |
+
COOLDOWN_SECONDS = 3600
|
| 108 |
+
|
| 109 |
+
# HTTP request timeout. OpenRSS is a third-party service; give it enough time.
|
| 110 |
+
HTTP_TIMEOUT_SECONDS = 15.0
|
| 111 |
+
|
| 112 |
+
# Max articles to take from each individual feed per cooldown window.
|
| 113 |
+
MAX_ARTICLES_PER_FEED = 10
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
class OpenRSSProvider(NewsProvider):
    """
    Fetches RSS feeds from dev.to, Hashnode, and GitHub Blog via OpenRSS.org.

    Free. No API key. Strictly rate-self-limited to once per 60 minutes
    (COOLDOWN_SECONDS), with the cooldown timestamp persisted in Redis so
    it survives process restarts. Runs for ALL categories in FREE_SOURCES —
    no category guardrail needed because the cooldown timer is the primary
    protection mechanism.

    Usage (wired in Phase 9):
        provider = OpenRSSProvider()
        articles = await provider.fetch_news(category="ai", limit=30)
    """

    def __init__(self):
        # Free provider — no API key, no daily limit.
        super().__init__(api_key=None)
        self.daily_limit = 0

        # Phase 15: The cooldown timer has moved to Redis.
        # self.last_fetched is kept as a local fallback cache: if Redis is
        # unreachable on startup, we fall back to 0.0 (fail-open — allowed
        # to run). On every successful Redis read in fetch_news(), this
        # local value is updated so it stays in sync.
        self.last_fetched: float = 0.0

        # Borrow stateless helpers (_extract_image_from_entry, _parse_date)
        # from the existing RSSParser. We do NOT call parse_provider_rss()
        # because it hardcodes category='cloud-{provider}'.
        self._rss_helpers = RSSParser()

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT — called by the aggregator's FREE PARALLEL RUN
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 30) -> List[Article]:
        """
        Fetch articles from all OpenRSS feeds — but only if 60 minutes have
        passed since the last successful fetch.

        Args:
            category (str): The aggregator's category — tagged on every article.
                            The keyword gate filters irrelevant articles downstream.
            limit (int): Soft cap on total articles to return.
                         NOTE(review): currently unused — the real caps are
                         MAX_ARTICLES_PER_FEED per feed; confirm intended.

        Returns:
            List[Article]: Combined articles from all feeds.
                           Returns [] immediately if we are still in cooldown.
        """
        # ── SAFETY CHECK: Are we still in the cooldown window? ────────────────
        # Phase 15: Read the last-fetch timestamp from Redis instead of RAM.
        #
        #   Before Phase 15:  self.last_fetched (pure RAM, wiped on restart)
        #   After Phase 15:   Redis key "provider:state:openrss:last_fetch"
        #                     survives restarts, deployments, and OOM kills.
        #
        # If Redis is down: get_provider_timestamp returns 0.0 (fail-open).
        # One extra OpenRSS call is far safer than permanently blocking the
        # provider because Redis was briefly unreachable during a cold boot.
        redis_last_fetched = await get_provider_timestamp("openrss")

        # Keep the local RAM value in sync for logging and debugging purposes.
        # This does NOT affect the cooldown logic — only redis_last_fetched does.
        self.last_fetched = redis_last_fetched

        seconds_since_last_fetch = time.time() - redis_last_fetched
        if seconds_since_last_fetch < COOLDOWN_SECONDS:
            minutes_remaining = int(
                (COOLDOWN_SECONDS - seconds_since_last_fetch) / 60
            )
            logger.info(
                "[OpenRSS] Cooldown active — %d minute(s) remaining before next fetch. "
                "Skipping to protect against IP ban.",
                minutes_remaining
            )
            return []

        # ── OK to fetch: save the new timestamp to Redis BEFORE hitting the network ──
        # We write BEFORE the network calls, not after. Here is why:
        # If we saved the timestamp AFTER and the fetch crashed halfway through,
        # the next scheduler cycle would see "last_fetched = 0" and fire again
        # immediately — hammering OpenRSS with rapid retries, the exact
        # behaviour that triggers IP bans. By writing FIRST, any crash still
        # waits the full 60 minutes before the next attempt. Better to miss
        # one batch than to risk a permanent IP ban.
        current_time = time.time()
        self.last_fetched = current_time  # Keep RAM copy in sync
        await set_provider_timestamp("openrss", current_time)

        logger.info(
            "[OpenRSS] Cooldown clear (Redis-backed). Starting fetch of %d feeds...",
            len(OPENRSS_FEEDS)
        )

        try:
            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:

                # Build one fetch task per feed URL — all fire simultaneously.
                fetch_tasks = [
                    self._fetch_and_parse_feed(client, url, source_name, category)
                    for url, source_name in OPENRSS_FEEDS
                ]

                # Wait for all feeds to complete at the same time.
                # return_exceptions=True: one failed feed must not sink the rest.
                results = await asyncio.gather(*fetch_tasks, return_exceptions=True)

                # Combine all articles from all feeds.
                all_articles: List[Article] = []
                for (_, source_name), result in zip(OPENRSS_FEEDS, results):
                    if isinstance(result, Exception):
                        logger.warning(
                            f"[OpenRSS] [{source_name}] Feed fetch failed: {result}"
                        )
                    elif isinstance(result, list):
                        all_articles.extend(result)

                logger.info(
                    f"[OpenRSS] Collected {len(all_articles)} articles "
                    f"from {len(OPENRSS_FEEDS)} feeds for category='{category}'"
                )
                return all_articles

        except Exception as e:
            logger.error(f"[OpenRSS] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPERS
    # ─────────────────────────────────────────────────────────────────────────

    async def _fetch_and_parse_feed(
        self,
        client: httpx.AsyncClient,
        url: str,
        source_name: str,
        category: str,
    ) -> List[Article]:
        """
        Fetch one OpenRSS feed URL and parse its XML into Article objects.

        Args:
            client (httpx.AsyncClient): Shared HTTP client from fetch_news().
            url (str): Full OpenRSS URL (e.g., openrss.org/dev.to).
            source_name (str): Human-readable label (e.g., "dev.to").
            category (str): The aggregator's category — tagged on each article.

        Returns:
            List[Article]: Parsed articles. Returns [] on any failure.
        """
        try:
            response = await client.get(
                url,
                headers={
                    "User-Agent": "SegmentoPulse-RSS-Reader/1.0",
                    # Sending Cache-Control: no-cache would be rude.
                    # We rely on our cooldown timer to manage freshness,
                    # not by asking OpenRSS to skip their cache.
                },
                follow_redirects=True,
            )

            if response.status_code == 429:
                # Rate-limited despite our cooldown. We only log and skip:
                # NOTE(review): no timer reset happens here — fetch_news()
                # already wrote the timestamp BEFORE the network calls, so a
                # full COOLDOWN_SECONDS wait applies regardless. If 429s
                # recur, raise COOLDOWN_SECONDS rather than retrying sooner.
                logger.warning(
                    f"[OpenRSS] [{source_name}] HTTP 429 — rate-limited despite "
                    "cooldown. Consider increasing COOLDOWN_SECONDS."
                )
                return []

            if response.status_code != 200:
                logger.warning(
                    f"[OpenRSS] [{source_name}] HTTP {response.status_code} — skipping."
                )
                return []

            xml_text = response.text

        except httpx.TimeoutException:
            logger.warning(f"[OpenRSS] [{source_name}] Timed out — skipping.")
            return []
        except Exception as e:
            logger.warning(f"[OpenRSS] [{source_name}] Fetch error: {e}")
            return []

        return self._parse_feed_xml(xml_text, source_name, category)

    def _parse_feed_xml(
        self,
        xml_text: str,
        source_name: str,
        category: str,
    ) -> List[Article]:
        """
        Parse raw XML from an OpenRSS feed into Article objects.

        Uses feedparser directly — not parse_provider_rss() — because
        parse_provider_rss hardcodes category='cloud-{provider}'.
        We borrow _extract_image_from_entry and _parse_date for consistency.

        Args:
            xml_text (str): Raw XML string from the HTTP response.
            source_name (str): The blog name (e.g., "dev.to").
            category (str): Aggregator category — tagged on every article.

        Returns:
            List[Article]: Parsed article objects.
        """
        try:
            feed = feedparser.parse(xml_text)
        except Exception as e:
            logger.warning(f"[OpenRSS] [{source_name}] feedparser failed: {e}")
            return []

        articles: List[Article] = []

        for entry in feed.entries[:MAX_ARTICLES_PER_FEED]:

            # ── Title: required — skip entries without one ────────────────
            title = (entry.get("title") or "").strip()
            if not title:
                continue

            # ── URL: required, must be absolute http(s) ───────────────────
            url = (entry.get("link") or "").strip()
            if not url or not url.startswith("http"):
                continue

            # ── Description: strip HTML tags, truncate to ~200 chars ──────
            raw_desc = entry.get("summary", "") or ""
            description = re.sub(r"<[^>]+>", "", raw_desc).strip()
            if len(description) > 200:
                description = description[:200] + "..."

            # ── Image URL ─────────────────────────────────────────────────
            # Reuse rss_parser's helper — checks media:content, enclosures, etc.
            image_url = self._rss_helpers._extract_image_from_entry(entry)

            # ── Published Date ────────────────────────────────────────────
            # Reuse rss_parser's _parse_date — handles all date format variants.
            raw_date = entry.get("published", "") or ""
            published_at = self._rss_helpers._parse_date(raw_date)

            # ── Build Article ─────────────────────────────────────────────
            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source=source_name,
                    # ── ROUTING RULE ──────────────────────────────────────
                    # Tag with the aggregator's category so the pipeline
                    # can route this correctly. Unknown categories safely
                    # fall back to the default 'News Articles' collection.
                    category=category,
                )
                articles.append(article)

            except Exception as e:
                # Validation failures are expected noise — debug level only.
                logger.debug(
                    f"[OpenRSS] [{source_name}] Skipped entry '{title[:50]}': {e}"
                )
                continue

        logger.info(f"[OpenRSS] [{source_name}] Parsed {len(articles)} articles.")
        return articles
|
app/services/providers/sauravkanchan/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# providers/sauravkanchan/__init__.py
|
| 2 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3 |
+
# This file marks the 'sauravkanchan' folder as a Python package.
|
| 4 |
+
# To use this provider, import it like this:
|
| 5 |
+
#
|
| 6 |
+
# from app.services.providers.sauravkanchan.client import SauravKanchanProvider
|
| 7 |
+
#
|
| 8 |
+
# This is a FREE, zero-rate-limit provider β it reads static JSON files
|
| 9 |
+
# hosted on GitHub Pages by developer Saurav Kanchan. No API key needed.
|
| 10 |
+
# It fetches tech headlines from both India (in.json) and the US (us.json)
|
| 11 |
+
# simultaneously, doubling volume with a single aggregator call.
|
| 12 |
+
# Gated behind GENERAL_TECH_CATEGORIES (same as Hacker News & Inshorts).
|
app/services/providers/sauravkanchan/client.py
ADDED
|
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
providers/sauravkanchan/client.py
|
| 3 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
The SauravKanchan Static JSON Provider for Segmento Pulse.
|
| 5 |
+
|
| 6 |
+
What this does:
|
| 7 |
+
Reads two static JSON files hosted on GitHub Pages by a developer named
|
| 8 |
+
Saurav Kanchan. These files are automatically updated by a GitHub Action
|
| 9 |
+
that scrapes the top tech headlines from NewsAPI.org and saves them as
|
| 10 |
+
plain JSON files anyone can read for free.
|
| 11 |
+
|
| 12 |
+
We fetch TWO files at the same time:
|
| 13 |
+
in.json β Top tech headlines from India
|
| 14 |
+
us.json β Top tech headlines from the United States
|
| 15 |
+
|
| 16 |
+
Fetching both simultaneously means we get double the volume and double
|
| 17 |
+
the geographic coverage in roughly the same time as fetching just one.
|
| 18 |
+
|
| 19 |
+
Why this is zero-cost and zero-rate-limit:
|
| 20 |
+
These are not API calls β they are just reading a text file from the
|
| 21 |
+
internet. GitHub Pages has no rate limit for public static file reads.
|
| 22 |
+
No API key. No signup. No credit card. Completely free forever.
|
| 23 |
+
|
| 24 |
+
Why the data is high quality:
|
| 25 |
+
The JSON structure is identical to the paid NewsAPI.org format, which
|
| 26 |
+
means we get proper titles, descriptions, image URLs, publication dates,
|
| 27 |
+
and source names β all cleanly pre-formatted for us.
|
| 28 |
+
|
| 29 |
+
Freshness note (important):
|
| 30 |
+
Saurav's GitHub Action runs on its own schedule β typically a few times
|
| 31 |
+
per day. This means some articles in the file may be several hours old
|
| 32 |
+
by the time we read them. That is perfectly fine. Our freshness gate in
|
| 33 |
+
data_validation.is_valid_article() will automatically reject anything
|
| 34 |
+
older than our midnight IST cutoff. We never need to pre-filter here.
|
| 35 |
+
|
| 36 |
+
Client-side constraint note:
|
| 37 |
+
These are static files β we cannot add query parameters. We get
|
| 38 |
+
whatever is in the file. The keyword gate handles topic filtering.
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
# ββ Standard Library ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 42 |
+
import asyncio
|
| 43 |
+
import logging
|
| 44 |
+
import time
|
| 45 |
+
from typing import List, Optional
|
| 46 |
+
|
| 47 |
+
# ββ Third-party (already in requirements.txt) βββββββββββββββββββββββββββββββββ
|
| 48 |
+
import httpx # Async HTTP client
|
| 49 |
+
|
| 50 |
+
# ββ Internal ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 51 |
+
from app.services.providers.base import NewsProvider, ProviderStatus
|
| 52 |
+
from app.models import Article
|
| 53 |
+
|
| 54 |
+
logger = logging.getLogger(__name__)
|
| 55 |
+
|
| 56 |
+
# ββ Static JSON URLs βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 57 |
+
#
|
| 58 |
+
# Both files are hosted on GitHub Pages and updated automatically by a
|
| 59 |
+
# GitHub Action. They follow the exact same JSON structure as NewsAPI.org.
|
| 60 |
+
#
|
| 61 |
+
# To change regions or add new ones (e.g., gb.json), just add a new entry here.
|
| 62 |
+
# The fetch loop picks it up automatically.
|
| 63 |
+
#
|
| 64 |
+
STATIC_FEED_URLS: List[tuple] = [
|
| 65 |
+
(
|
| 66 |
+
"https://saurav.tech/NewsAPI/top-headlines/category/technology/in.json",
|
| 67 |
+
"in", # Region code β used only in log messages
|
| 68 |
+
),
|
| 69 |
+
(
|
| 70 |
+
"https://saurav.tech/NewsAPI/top-headlines/category/technology/us.json",
|
| 71 |
+
"us", # Region code β used only in log messages
|
| 72 |
+
),
|
| 73 |
+
]
|
| 74 |
+
|
| 75 |
+
# HTTP request timeout. Static files are fast, but we keep this generous
|
| 76 |
+
# because GitHub Pages occasionally has slow cold starts.
|
| 77 |
+
HTTP_TIMEOUT_SECONDS = 12.0
|
| 78 |
+
|
| 79 |
+
# Max articles to take from each regional file.
|
| 80 |
+
# 100 articles per file Γ 2 files = up to 200 raw articles per call.
|
| 81 |
+
# The freshness gate will reject most of the older ones, leaving us
|
| 82 |
+
# with the freshest and most relevant subset.
|
| 83 |
+
MAX_ARTICLES_PER_REGION = 100
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class SauravKanchanProvider(NewsProvider):
    """
    Reads top tech headlines from two static JSON files on GitHub Pages.

    Covers India (in.json) and the United States (us.json) simultaneously.
    Free. Zero rate limits. No API key required.
    Gated behind GENERAL_TECH_CATEGORIES in the aggregator.

    Usage (wired in Phase 7):
        provider = SauravKanchanProvider()
        articles = await provider.fetch_news(category="ai", limit=50)
    """

    def __init__(self):
        # Free provider — no key, no daily limit.
        super().__init__(api_key=None)
        self.daily_limit = 0

        # Phase 17: Fetch-Once, Fan-Out cache.
        #
        # Saurav's JSON files are identical regardless of which category
        # asks for them. Without a cache the aggregator would download the
        # IN + US files once per category; with it, we download once and
        # serve every category from memory for 45 minutes.
        #
        # We store the FINAL Pydantic Article objects, not the raw JSON,
        # so cache hits involve zero re-parsing.
        self._cached_articles: List[Article] = []
        self._cache_time: float = 0.0

        # The lock prevents the "thundering herd" problem:
        # if several categories hit this provider at the same moment
        # (which asyncio.gather() will do), only the first one fetches.
        # The others wait at the lock, then return from cache.
        self._lock = asyncio.Lock()

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT — called by the aggregator's FREE PARALLEL RUN
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 50) -> List[Article]:
        """
        Fetch tech headlines from the India and US static JSON files.

        Both files are downloaded at the same time using asyncio.gather().
        Their article lists are then combined into one big list and returned.

        Args:
            category (str): The aggregator's category string (e.g., "ai").
                            Every returned article is tagged with it; the
                            keyword gate later filters which ones are relevant.
            limit (int): Soft cap on total articles to return.
                         The per-region MAX_ARTICLES_PER_REGION cap is
                         the real control lever.

        Returns:
            List[Article]: Combined articles from IN + US feeds.
                           Returns [] if both feeds fail.
        """
        CACHE_TTL_SECONDS = 2700  # 45 minutes

        # ── Phase 17: Cache check (OUTER) ─────────────────────────────────────
        if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
            logger.debug(
                "[SauravKanchan] Cache hit — returning %d cached articles for category='%s'. "
                "No HTTP calls made.",
                len(self._cached_articles), category
            )
            # BUG FIX: cached Article objects carry the category of whichever
            # caller triggered the original download. Returning them as-is
            # would mis-route articles for every OTHER category (e.g. an "ai"
            # fetch would poison "cloud-gcp" with category='ai'). Re-tag
            # shallow copies with the caller's category instead.
            return self._retagged(category)

        # ── Cache stale or empty: acquire the lock and fetch ──────────────────
        async with self._lock:

            # ── Cache check (INNER) — double-checked locking ──────────────
            # Another category may have refreshed the cache while we waited.
            if time.time() - self._cache_time < CACHE_TTL_SECONDS and self._cached_articles:
                logger.debug(
                    "[SauravKanchan] Cache hit after lock — returning %d cached articles.",
                    len(self._cached_articles)
                )
                return self._retagged(category)

            logger.info("[SauravKanchan] Cache stale/empty. Fetching IN + US JSON files...")

            try:
                async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:

                    # One fetch task per regional URL — both fire at the same time.
                    fetch_tasks = [
                        self._fetch_single_region(client, url, region_code, category)
                        for url, region_code in STATIC_FEED_URLS
                    ]

                    # Wait for both regional fetches to complete simultaneously.
                    # return_exceptions=True: one failed region must not sink the other.
                    results = await asyncio.gather(*fetch_tasks, return_exceptions=True)

                    # Combine articles from both regions into one flat list.
                    all_articles: List[Article] = []
                    for (_, region_code), result in zip(STATIC_FEED_URLS, results):
                        if isinstance(result, Exception):
                            logger.warning(
                                f"[SauravKanchan] [{region_code.upper()}] "
                                f"Fetch failed: {result}"
                            )
                        elif isinstance(result, list):
                            all_articles.extend(result)

                    logger.info(
                        "[SauravKanchan] Fetched %d articles from %d regions. "
                        "Caching for 45 minutes.",
                        len(all_articles), len(STATIC_FEED_URLS)
                    )

                    # Store the fully-mapped Pydantic Article objects.
                    # Future category calls get typed objects with zero
                    # re-parsing (re-tagged to their own category above).
                    self._cached_articles = all_articles
                    self._cache_time = time.time()
                    return all_articles

            except Exception as e:
                logger.error(f"[SauravKanchan] Unexpected error: {e}", exc_info=True)
                return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPERS
    # ─────────────────────────────────────────────────────────────────────────

    def _retagged(self, category: str) -> List[Article]:
        """
        Return shallow copies of the cached articles re-tagged with *category*.

        The cache holds one shared snapshot, but each caller needs articles
        tagged with ITS category for correct downstream routing. Copying
        (rather than mutating in place) keeps earlier callers' lists intact.

        Note: .copy(update=...) works on both Pydantic v1 and v2 (in v2 it is
        a deprecated alias of model_copy) — TODO confirm the project's Pydantic
        major version and migrate to model_copy if on v2.

        Args:
            category (str): The requesting aggregator category.

        Returns:
            List[Article]: Copies of the cached articles, category re-tagged.
        """
        return [
            article.copy(update={"category": category})
            for article in self._cached_articles
        ]

    async def _fetch_single_region(
        self,
        client: httpx.AsyncClient,
        url: str,
        region_code: str,
        category: str,
    ) -> List[Article]:
        """
        Download one regional JSON file and parse its articles.

        Args:
            client (httpx.AsyncClient): Shared HTTP client from fetch_news().
            url (str): The full static JSON URL to fetch.
            region_code (str): Short label for logging (e.g., "us", "in").
            category (str): The aggregator's category — tagged on articles.

        Returns:
            List[Article]: Parsed articles from this region. Returns [] on failure.
        """
        try:
            response = await client.get(
                url,
                headers={"User-Agent": "SegmentoPulse-Ingestion/1.0"},
                follow_redirects=True,
            )

            if response.status_code != 200:
                logger.warning(
                    f"[SauravKanchan] [{region_code.upper()}] "
                    f"HTTP {response.status_code} — skipping."
                )
                return []

            data = response.json()

        except httpx.TimeoutException:
            logger.warning(
                f"[SauravKanchan] [{region_code.upper()}] Timed out — skipping."
            )
            return []
        except Exception as e:
            logger.warning(
                f"[SauravKanchan] [{region_code.upper()}] Fetch error: {e}"
            )
            return []

        # The JSON has the same shape as NewsAPI.org:
        #   { "status": "ok", "totalResults": 20, "articles": [ ... ] }
        raw_articles = data.get("articles", [])

        if not isinstance(raw_articles, list) or not raw_articles:
            logger.info(
                f"[SauravKanchan] [{region_code.upper()}] "
                "No articles found in response."
            )
            return []

        articles = self._map_articles(
            raw_articles[:MAX_ARTICLES_PER_REGION],
            region_code,
            category,
        )
        logger.info(
            f"[SauravKanchan] [{region_code.upper()}] "
            f"Parsed {len(articles)} articles."
        )
        return articles

    def _map_articles(
        self,
        raw_articles: list,
        region_code: str,
        category: str,
    ) -> List[Article]:
        """
        Convert raw NewsAPI-format JSON items into Segmento Pulse Article objects.

        Field names in this JSON are camelCase (like JavaScript), so:
            urlToImage   →  image_url
            publishedAt  →  published_at
            source.name  →  source

        Everything else maps directly.

        Args:
            raw_articles (list): The 'articles' array from the JSON response.
            region_code (str): "in" or "us" — appended to the source name
                               so we know where the article came from.
            category (str): The aggregator's category string.

        Returns:
            List[Article]: Clean Article objects for the pipeline.
        """
        articles: List[Article] = []

        for item in raw_articles:
            if not isinstance(item, dict):
                continue

            # ── Title ────────────────────────────────────────────────────
            title = (item.get("title") or "").strip()
            # NewsAPI sometimes puts "[Removed]" as a title for deleted articles
            if not title or title == "[Removed]":
                continue

            # ── URL ──────────────────────────────────────────────────────
            url = (item.get("url") or "").strip()
            if not url or not url.startswith("http"):
                continue

            # ── Description ──────────────────────────────────────────────
            description = (item.get("description") or "").strip()
            # Skip "[Removed]" placeholder descriptions too
            if description == "[Removed]":
                description = ""

            # ── Image URL (camelCase: urlToImage) ────────────────────────
            image_url = (item.get("urlToImage") or "").strip()

            # ── Published Date (camelCase: publishedAt) ──────────────────
            # NewsAPI format is already ISO 8601 (e.g., "2026-03-03T06:00:00Z").
            # Our Pydantic Article model accepts this directly — no conversion.
            published_at = item.get("publishedAt") or ""

            # ── Source Name (nested object) ──────────────────────────────
            # NewsAPI wraps the source as { "id": "...", "name": "..." }.
            # We only want the 'name' string.
            source_obj = item.get("source") or {}
            raw_source_name = (source_obj.get("name") or "").strip()

            # Append the region code so it's clear in the UI where this
            # article came from, e.g., "The Verge (IN)" or "Wired (US)".
            if raw_source_name:
                source = f"{raw_source_name} ({region_code.upper()})"
            else:
                source = f"SauravKanchan ({region_code.upper()})"

            # ── Build Article ────────────────────────────────────────────
            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source=source,
                    # ── ROUTING RULE ─────────────────────────────────────
                    # Pass through the aggregator's category.
                    # The keyword gate filters out off-topic articles.
                    # Unknown or empty categories safely route to
                    # the default 'News Articles' collection.
                    category=category,
                )
                articles.append(article)

            except Exception as e:
                # Validation failures are expected noise — debug level only.
                logger.debug(
                    f"[SauravKanchan] Skipped item '{title[:50]}...': {e}"
                )
                continue

        return articles
|
app/services/providers/thenewsapi/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# providers/thenewsapi/__init__.py
|
| 2 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3 |
+
# This file marks the 'thenewsapi' folder as a Python package.
|
| 4 |
+
# To use TheNewsAPI provider, import it like this:
|
| 5 |
+
#
|
| 6 |
+
# from app.services.providers.thenewsapi.client import TheNewsAPIProvider
|
| 7 |
+
#
|
| 8 |
+
# This is a PAID provider β it requires the THENEWSAPI_API_KEY environment
|
| 9 |
+
# variable to be set. It has a daily_limit of 100 requests (free tier).
|
| 10 |
+
# It lives in the PAID_CHAIN, meaning it only fires if all providers above
|
| 11 |
+
# it in the chain (GNews, NewsAPI, NewsData) have already failed.
|
app/services/providers/thenewsapi/client.py
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
providers/thenewsapi/client.py
|
| 3 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
TheNewsAPI.com Provider for Segmento Pulse.
|
| 5 |
+
|
| 6 |
+
What this does:
|
| 7 |
+
Fetches fresh technology news articles from TheNewsAPI.com.
|
| 8 |
+
This is a paid API but has the cleanest JSON structure of all paid
|
| 9 |
+
providers β most of its field names even match our Pydantic Article model.
|
| 10 |
+
|
| 11 |
+
Free Tier Limits:
|
| 12 |
+
- 100 requests per day (resets midnight UTC)
|
| 13 |
+
- Requires an API key (THENEWSAPI_API_KEY in your .env file)
|
| 14 |
+
|
| 15 |
+
Where it sits in the pipeline:
|
| 16 |
+
PAID_CHAIN position 4 (after GNews β NewsAPI β NewsData).
|
| 17 |
+
Only fires if all three above it have already failed or hit their limits.
|
| 18 |
+
Once it returns articles, the paid chain stops β credits protected.
|
| 19 |
+
|
| 20 |
+
The special data quirk (categories array):
|
| 21 |
+
TheNewsAPI returns a 'categories' field as a LIST, not a single string.
|
| 22 |
+
Example: { "categories": ["tech", "science"] }
|
| 23 |
+
|
| 24 |
+
We grab only the FIRST item from that list.
|
| 25 |
+
Example: "tech"
|
| 26 |
+
|
| 27 |
+
This raw value ("tech") is then passed through our pipeline.
|
| 28 |
+
The keyword gate in data_validation.is_relevant_to_category() handles
|
| 29 |
+
whether the article truly belongs in our system.
|
| 30 |
+
|
| 31 |
+
We do NOT try to translate "tech" β "magazines" ourselves here.
|
| 32 |
+
That mapping belongs in the validation/data layer, not the fetcher layer.
|
| 33 |
+
Keep the fetcher dumb β let the pipeline be smart.
|
| 34 |
+
|
| 35 |
+
Client-side constraint note:
|
| 36 |
+
TheNewsAPI supports date filters (published_after, published_before) and
|
| 37 |
+
language filters (language=en). We use language=en to avoid non-English
|
| 38 |
+
articles. We do NOT apply date filters because the freshness gate in
|
| 39 |
+
data_validation.is_valid_article() handles that more accurately in IST.
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
# ββ Standard Library ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 43 |
+
import logging
|
| 44 |
+
from datetime import datetime, timezone
|
| 45 |
+
from typing import List, Optional
|
| 46 |
+
|
| 47 |
+
# ββ Third-party (already in requirements.txt) βββββββββββββββββββββββββββββββββ
|
| 48 |
+
import httpx # Async HTTP client
|
| 49 |
+
|
| 50 |
+
# ββ Internal ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 51 |
+
from app.services.providers.base import NewsProvider, ProviderStatus
|
| 52 |
+
from app.models import Article
|
| 53 |
+
from app.config import settings # Single source of truth for all keys
|
| 54 |
+
# Phase 16: Import the Redis counter utility to make the daily budget
|
| 55 |
+
# restart-proof. TheNewsAPI only allows 3 real calls per day on the free tier.
|
| 56 |
+
# Without Redis, a server restart resets request_count to 0 and lets us
|
| 57 |
+
# make 3 more calls β potentially 9+ calls on a restart-heavy day.
|
| 58 |
+
from app.services.utils.provider_state import (
|
| 59 |
+
get_provider_counter,
|
| 60 |
+
increment_provider_counter,
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
logger = logging.getLogger(__name__)
|
| 64 |
+
|
| 65 |
+
# ββ Constants βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 66 |
+
|
| 67 |
+
# Base URL for all TheNewsAPI endpoints
|
| 68 |
+
THENEWSAPI_BASE_URL = "https://api.thenewsapi.com/v1/news/all"
|
| 69 |
+
|
| 70 |
+
# How long (seconds) to wait before giving up on a request
|
| 71 |
+
HTTP_TIMEOUT_SECONDS = 10.0
|
| 72 |
+
|
| 73 |
+
# How many articles to request per call. 25 is their recommended page size.
|
| 74 |
+
ARTICLES_PER_REQUEST = 25
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class TheNewsAPIProvider(NewsProvider):
    """
    Fetches technology news from TheNewsAPI.com.

    Paid provider — needs THENEWSAPI_API_KEY in your .env file.
    Sits at position 4 in the PAID_CHAIN (last paid fallback).

    Budget: although the public docs advertise "100 requests/day", the
    Community (free) tier is hard-capped at 3 requests per day in practice
    (Phase 16 audit finding). ``daily_limit`` is therefore 3, and it is
    enforced via a restart-proof Redis counter before any HTTP call.

    Usage (wired into the aggregator in Phase 5):
        provider = TheNewsAPIProvider(api_key="your_key_here")
        articles = await provider.fetch_news(category="ai", limit=25)
    """

    def __init__(self, api_key: Optional[str] = None):
        super().__init__(api_key=api_key)

        # Phase 16 Audit Fix: corrected from 100 -> 3.
        #
        # The free-tier documentation lists "100 requests/day" but in
        # practice the Community (free) tier is hard-capped at 3 requests
        # per day. With daily_limit=3 + Redis persistence we use at most
        # 3 calls/day even across multiple server restarts. The 3rd call
        # is effectively an emergency slot.
        self.daily_limit = 3

        # Category mapping: translate our internal category names into the
        # categories TheNewsAPI actually understands. TheNewsAPI supports:
        # tech, science, sports, business, health, entertainment, general.
        # We map our fine-grained slugs to the closest match.
        self.category_map = {
            'ai': 'tech',
            'data-security': 'tech',
            'data-governance': 'tech',
            'data-privacy': 'tech',
            'data-engineering': 'tech',
            'data-management': 'tech',
            'business-intelligence': 'business',
            'business-analytics': 'business',
            'customer-data-platform': 'business',
            'data-centers': 'tech',
            'cloud-computing': 'tech',
            'magazines': 'tech',
            'data-laws': 'tech',
            # Cloud sub-categories — all map to 'tech' in TheNewsAPI's world
            'cloud-aws': 'tech',
            'cloud-azure': 'tech',
            'cloud-gcp': 'tech',
            'cloud-oracle': 'tech',
            'cloud-ibm': 'tech',
            'cloud-alibaba': 'tech',
            'cloud-digitalocean': 'tech',
            'cloud-huawei': 'tech',
            'cloud-cloudflare': 'tech',
        }

    # βββ MAIN ENTRY POINT — called by the aggregator's PAID WATERFALL βββ

    async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
        """
        Fetch technology articles from TheNewsAPI.com.

        Args:
            category (str): Our internal category (e.g., "ai", "cloud-aws").
                Looked up in self.category_map for the API's category keyword.
            limit (int): Maximum number of articles to return
                (capped at ARTICLES_PER_REQUEST).

        Returns:
            List[Article]: Mapped Article objects. Returns [] on any failure.
        """
        # No API key means this provider cannot run. The aggregator already
        # checks via is_available(), but we double-check here for safety.
        if not self.api_key:
            logger.debug("[TheNewsAPI] No API key configured — skipping.")
            return []

        # ββ PHASE 16: Redis-backed daily budget guard ββββββββββββββββββββ
        # Checked here rather than in is_available() because is_available()
        # is synchronous on the base class while the Redis helpers are
        # async — mixing them would crash at runtime.
        today_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        current_calls = await get_provider_counter("thenewsapi", today_str)

        if current_calls >= self.daily_limit:
            logger.warning(
                "[TheNewsAPI] Daily Redis budget exhausted — %d/%d calls used today. "
                "Skipping to protect the 3-call daily quota.",
                current_calls, self.daily_limit
            )
            self.mark_rate_limited()
            return []

        try:
            # Translate our internal category to TheNewsAPI's keyword.
            # Unknown categories default to 'tech'.
            api_category = self.category_map.get(category, "tech")

            params = {
                "api_token": self.api_key,
                "language": "en",            # English articles only
                "categories": api_category,  # TheNewsAPI category keyword
                "limit": min(limit, ARTICLES_PER_REQUEST),
                # NOTE: no 'published_after'/'published_before' filters.
                # The IST-based freshness gate in data_validation's
                # is_valid_article() enforces the date boundary more
                # accurately downstream, avoiding timezone bugs here.
            }

            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
                logger.info("[TheNewsAPI] Fetching '%s' (api_category='%s')...", category, api_category)
                response = await client.get(THENEWSAPI_BASE_URL, params=params)

                # ββ Rate limit βββββββββββββββββββββββββββββββββββββββββ
                if response.status_code == 429:
                    logger.warning("[TheNewsAPI] Hit 429 rate limit. Marking as rate-limited.")
                    self.mark_rate_limited()
                    return []

                # ββ Authentication failure ββββββββββββββββββββββββββββ
                if response.status_code == 401:
                    logger.error("[TheNewsAPI] 401 Unauthorized — API key is invalid or expired.")
                    self.status = ProviderStatus.ERROR
                    return []

                # ββ Quota exhaustion ββββββββββββββββββββββββββββββββββ
                if response.status_code == 402:
                    logger.warning("[TheNewsAPI] 402 Payment Required — daily quota exhausted.")
                    self.mark_rate_limited()
                    return []

                # ββ Other non-200 responses βββββββββββββββββββββββββββ
                if response.status_code != 200:
                    logger.warning(f"[TheNewsAPI] Unexpected HTTP {response.status_code}.")
                    return []

            # ββ Parse and map the response (response body is already read) β
            self.request_count += 1  # Keep RAM shadow in sync for debugging
            data = response.json()

            # TheNewsAPI wraps articles in a 'data' key at the top level.
            raw_articles = data.get("data", [])
            if not raw_articles:
                logger.info(f"[TheNewsAPI] No articles returned for category='{category}'.")
                return []

            articles = self._map_articles(raw_articles, category)

            # ββ PHASE 16: consume a Redis budget slot ββββββββββββββββββ
            # Only successful 200 responses count against the daily budget;
            # 402/429/timeout failures do not consume a slot.
            await increment_provider_counter("thenewsapi", today_str)

            logger.info("[TheNewsAPI] Got %d articles for '%s'.", len(articles), category)
            return articles

        except httpx.TimeoutException:
            logger.warning("[TheNewsAPI] Request timed out.")
            return []
        except Exception as e:
            logger.error(f"[TheNewsAPI] Unexpected error: {e}", exc_info=True)
            return []

    # βββ PRIVATE HELPER — maps raw JSON items to Article objects βββ

    def _map_articles(self, raw_articles: list, category: str) -> List[Article]:
        """
        Convert TheNewsAPI JSON items into Segmento Pulse Article objects.

        Mapping is almost 1-to-1 with our Pydantic model. One special case:
        'categories' is a LIST, not a string — we take the first element.
        Items with no title or a non-http URL are skipped.

        Args:
            raw_articles (list): The 'data' array from TheNewsAPI's response.
            category (str): Our internal category (fallback when the item
                carries no categories of its own).

        Returns:
            List[Article]: Clean Article objects for the pipeline.
        """
        articles: List[Article] = []

        for item in raw_articles:

            # ββ Title ββ required; skip items without one.
            title = (item.get("title") or "").strip()
            if not title:
                continue

            # ββ URL ββ must be an absolute http(s) link.
            url = (item.get("url") or "").strip()
            if not url or not url.startswith("http"):
                continue

            # ββ Description ββ TheNewsAPI provides real summaries.
            description = (item.get("description") or "").strip()

            # ββ Image URL ββ the field is already named 'image_url'.
            image_url = (item.get("image_url") or "").strip()

            # ββ Published date ββ ISO 8601; the Article model's validator
            # handles this format, so no conversion is needed here.
            published_at = item.get("published_at") or ""

            # ββ Source ββ the live API returns a plain string (publisher
            # domain), but we defensively handle a NewsAPI-style nested dict.
            raw_source = item.get("source") or ""
            if isinstance(raw_source, dict):
                source = (raw_source.get("name") or "TheNewsAPI").strip()
            else:
                source = str(raw_source).strip() or "TheNewsAPI"

            # ββ Category ββ 'categories' is a list; take the first item.
            # ROUTING RULE: empty list falls back to the aggregator's
            # category; unrecognised values route to 'News Articles'.
            raw_categories = item.get("categories") or []
            if raw_categories and isinstance(raw_categories, list):
                article_category = raw_categories[0]
            else:
                article_category = category

            # ββ Build Article ββ Pydantic validation failures skip the item.
            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source=source,
                    category=article_category,
                )
                articles.append(article)

            except Exception as e:
                logger.debug(
                    f"[TheNewsAPI] Skipped item url='{url[:60]}': {e}"
                )
                continue

        return articles
|
app/services/providers/webz/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# providers/webz/__init__.py
|
| 2 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3 |
+
# This file marks the 'webz' folder as a Python package.
|
| 4 |
+
# To use this provider, import it like this:
|
| 5 |
+
#
|
| 6 |
+
# from app.services.providers.webz.client import WebzProvider
|
| 7 |
+
#
|
| 8 |
+
# This is a PAID provider β requires WEBZ_API_KEY in your .env file.
|
| 9 |
+
# Position 6 in the PAID_CHAIN (deepest paid failover).
|
| 10 |
+
#
|
| 11 |
+
# ββ CRITICAL BUDGET WARNING βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 12 |
+
# Webz.io free tier: 1,000 calls per MONTH (not per day).
|
| 13 |
+
# daily_limit is set to 30 inside WebzProvider to pace usage to ~900/month.
|
| 14 |
+
# DO NOT increase daily_limit above 33 β doing so will exhaust the
|
| 15 |
+
# monthly budget before the month ends.
|
app/services/providers/webz/client.py
ADDED
|
@@ -0,0 +1,404 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
providers/webz/client.py
|
| 3 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
The Webz.io Provider for Segmento Pulse.
|
| 5 |
+
|
| 6 |
+
What this does:
|
| 7 |
+
Fetches enterprise-grade news articles from Webz.io's News API Lite.
|
| 8 |
+
Webz crawls 3.5 million articles per day from across the open web,
|
| 9 |
+
making it one of the richest news sources we have available.
|
| 10 |
+
|
| 11 |
+
Paid provider β needs WEBZ_API_KEY in your .env file.
|
| 12 |
+
Position 6 in the PAID_CHAIN (absolute final paid failover).
|
| 13 |
+
|
| 14 |
+
ββ THE MONTHLY BUDGET PROBLEM AND HOW WE SOLVE IT ββββββββββββββββββββββββββ
|
| 15 |
+
|
| 16 |
+
Webz free tier gives us 1,000 calls per MONTH β not per day.
|
| 17 |
+
Our scheduler runs many categories every hour. Without a limit, we would
|
| 18 |
+
exhaust the entire 1,000-call monthly budget in less than 48 hours.
|
| 19 |
+
|
| 20 |
+
Our fix: daily_limit = 30 inside this class.
|
| 21 |
+
The quota tracker caps us at 30 calls per calendar day.
|
| 22 |
+
30 calls/day Γ 30 days = 900 calls/month β safely under 1,000.
|
| 23 |
+
This paces the budget across the whole month as an even, predictable cost.
|
| 24 |
+
|
| 25 |
+
Math visible to future engineers:
|
| 26 |
+
1,000 calls Γ· 30 days = 33.3 calls/day max to exactly hit the limit.
|
| 27 |
+
We use 30 to leave a 10% safety margin for edge cases (month resets,
|
| 28 |
+
server restarts that lose the quota counter's in-memory state, etc.).
|
| 29 |
+
|
| 30 |
+
ββ THE NESTED IMAGE PROBLEM AND HOW WE SOLVE IT βββββββββββββββββββββββββββββ
|
| 31 |
+
|
| 32 |
+
Webz does not put images at the top level of each article object.
|
| 33 |
+
Instead, the image is buried inside a nested 'thread' object like this:
|
| 34 |
+
|
| 35 |
+
{
|
| 36 |
+
"title": "Article Title",
|
| 37 |
+
"url": "https://...",
|
| 38 |
+
"thread": {
|
| 39 |
+
"site_full": "techcrunch.com", β source name is here too
|
| 40 |
+
"main_image": "https://..." β image is here
|
| 41 |
+
},
|
| 42 |
+
"text": "Full article body (thousands of words)..."
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
Our fix: We safely "drill down" using chained .get() calls.
|
| 46 |
+
thread = item.get("thread") or {}
|
| 47 |
+
image_url = thread.get("main_image") or ""
|
| 48 |
+
|
| 49 |
+
If 'thread' is missing β {} (empty dict, no crash)
|
| 50 |
+
If 'main_image' is missing β "" (empty string, no crash)
|
| 51 |
+
Either way, the pipeline gets a clean empty string for the fallback image.
|
| 52 |
+
|
| 53 |
+
ββ THE FULL TEXT BODY PROBLEM AND HOW WE SOLVE IT ββββββββββββββββββββββββββ
|
| 54 |
+
|
| 55 |
+
Webz provides the COMPLETE article body in the 'text' field β this can be
|
| 56 |
+
thousands of words. Storing that in our database is too large and risks
|
| 57 |
+
reproducing copyright-protected content.
|
| 58 |
+
|
| 59 |
+
Our fix: Truncate to the first 200 characters (same approach as Phase 8).
|
| 60 |
+
200 characters is enough for a preview. Our newsletter system uses the
|
| 61 |
+
description field but also has its own 160-char cap, so anything beyond
|
| 62 |
+
200 already has no use downstream.
|
| 63 |
+
"""
|
| 64 |
+
|
| 65 |
+
# ββ Standard Library ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 66 |
+
import logging
|
| 67 |
+
from datetime import datetime, timezone
|
| 68 |
+
from typing import List, Optional
|
| 69 |
+
|
| 70 |
+
# ββ Third-party (already in requirements.txt) βββββββββββββββββββββββββββββββββ
|
| 71 |
+
import httpx # Async HTTP client
|
| 72 |
+
|
| 73 |
+
# ββ Internal ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 74 |
+
from app.services.providers.base import NewsProvider, ProviderStatus
|
| 75 |
+
from app.models import Article
|
| 76 |
+
from app.config import settings
|
| 77 |
+
# Phase 16: Import the Redis counter utility for dual-layer budget protection.
|
| 78 |
+
# Webz has the strictest budget of all three paid providers β 1,000 calls per
|
| 79 |
+
# MONTH. Without restart-proof counters, a restart-heavy day can exhaust the
|
| 80 |
+
# entire monthly budget in a few hours. Two Redis keys protect us:
|
| 81 |
+
# 1. Daily key ("webz", today_str) β caps us at 30/day
|
| 82 |
+
# 2. Monthly key ("webz_month", month_str) β caps us at 900/month total
|
| 83 |
+
from app.services.utils.provider_state import (
|
| 84 |
+
get_provider_counter,
|
| 85 |
+
increment_provider_counter,
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
logger = logging.getLogger(__name__)
|
| 89 |
+
|
| 90 |
+
# ββ Constants βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 91 |
+
|
| 92 |
+
# Webz.io News API Lite endpoint
|
| 93 |
+
WEBZ_API_URL = "https://api.webz.io/newsApiLite"
|
| 94 |
+
|
| 95 |
+
# Request timeout in seconds. Enterprise APIs are usually fast.
|
| 96 |
+
HTTP_TIMEOUT_SECONDS = 12.0
|
| 97 |
+
|
| 98 |
+
# Articles to request per call. Keeping this modest saves the budget
|
| 99 |
+
# because Webz deducts from quota based on results returned, not just calls.
|
| 100 |
+
ARTICLES_PER_REQUEST = 10
|
| 101 |
+
|
| 102 |
+
# Maximum characters to keep from the article body for the description field.
|
| 103 |
+
# Matches Phase 8's WorldNewsAI approach for consistency.
|
| 104 |
+
DESCRIPTION_MAX_CHARS = 200
|
| 105 |
+
|
| 106 |
+
# Category β search query translation.
|
| 107 |
+
# Webz uses free-text query strings (like Google search), so we convert
|
| 108 |
+
# our internal category slugs into descriptive keyword phrases that maximise
|
| 109 |
+
# the quality of results from Webz's index.
|
| 110 |
+
CATEGORY_QUERY_MAP = {
|
| 111 |
+
'ai': 'artificial intelligence machine learning',
|
| 112 |
+
'data-security': 'data security cybersecurity breach hacking',
|
| 113 |
+
'data-governance': 'data governance compliance policy',
|
| 114 |
+
'data-privacy': 'data privacy GDPR regulation',
|
| 115 |
+
'data-engineering': 'data engineering pipeline ETL spark',
|
| 116 |
+
'data-management': 'data management master data catalog',
|
| 117 |
+
'business-intelligence': 'business intelligence analytics BI tools',
|
| 118 |
+
'business-analytics': 'business analytics data-driven decisions',
|
| 119 |
+
'customer-data-platform': 'customer data platform CDP personalization',
|
| 120 |
+
'data-centers': 'data center infrastructure hyperscaler',
|
| 121 |
+
'cloud-computing': 'cloud computing technology platform',
|
| 122 |
+
'magazines': 'technology news innovation',
|
| 123 |
+
'data-laws': 'AI regulation data law privacy act',
|
| 124 |
+
'cloud-aws': 'Amazon AWS cloud services',
|
| 125 |
+
'cloud-azure': 'Microsoft Azure cloud platform',
|
| 126 |
+
'cloud-gcp': 'Google Cloud Platform GCP services',
|
| 127 |
+
'cloud-oracle': 'Oracle Cloud OCI database',
|
| 128 |
+
'cloud-ibm': 'IBM Cloud Red Hat OpenShift',
|
| 129 |
+
'cloud-alibaba': 'Alibaba Cloud Aliyun technology',
|
| 130 |
+
'cloud-digitalocean': 'DigitalOcean cloud developer platform',
|
| 131 |
+
'cloud-huawei': 'Huawei Cloud services technology',
|
| 132 |
+
'cloud-cloudflare': 'Cloudflare CDN security network',
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
class WebzProvider(NewsProvider):
    """
    Fetches enterprise-grade news articles from Webz.io News API Lite.

    Paid provider — 1,000 calls/month free tier, paced to 30/day.
    Position 6 in the PAID_CHAIN (deepest paid failover): only fires when
    all 5 providers above it have failed or hit their limits.
    Requires WEBZ_API_KEY in the .env file.

    Usage (wired in Phase 10):
        provider = WebzProvider(api_key="your_key_here")
        articles = await provider.fetch_news(category="ai", limit=10)
    """

    def __init__(self, api_key: Optional[str] = None):
        super().__init__(api_key=api_key)

        # 30 calls/day x 30 days = 900/month — safely under the 1,000 cap.
        # The Redis quota tracker enforces this limit before each call.
        # 10% safety margin included for server-restart edge cases.
        self.daily_limit = 30

    # ─────────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT — called by the aggregator's PAID WATERFALL
    # ─────────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 10) -> List[Article]:
        """
        Fetch news articles from Webz.io for the given category.

        Args:
            category (str): Our internal category slug (e.g., "ai").
                Translated to a keyword query via CATEGORY_QUERY_MAP.
            limit (int): Max articles to return. Kept at 10 to conserve
                the monthly call budget (Webz charges per result).

        Returns:
            List[Article]: Mapped Article objects. Returns [] on any failure.
        """
        if not self.api_key:
            logger.debug("[Webz] No API key configured — skipping.")
            return []

        # ── PHASE 16: Dual-layer Redis budget guard ────────────────────────
        # Webz is the most budget-constrained provider: 1,000 calls/MONTH.
        # Two independent Redis counters protect it in parallel:
        #   Gate 1 — DAILY:   stops at 30 calls/day to pace spending evenly.
        #                     Key: "provider:state:webz:calls:<YYYY-MM-DD>" (TTL 24h)
        #   Gate 2 — MONTHLY: stops at 900 calls/month (10% margin on 1,000).
        #                     Key: "provider:state:webz_month:calls:<YYYY-MM>" (TTL 30d)
        # The monthly key embeds the month string, so when a new month starts
        # the key name changes automatically and the old key expires via TTL
        # — no manual cleanup needed.
        # Either gate being exhausted blocks the call completely.
        # Fail-safe design: if Redis is down, both counters return 999999,
        # so the call is skipped rather than risking the paid quota.
        today_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        month_str = datetime.now(timezone.utc).strftime("%Y-%m")

        daily_calls = await get_provider_counter("webz", today_str)
        monthly_calls = await get_provider_counter("webz_month", month_str)

        # Hard monthly ceiling: 900 (100 kept as safety buffer on the 1,000 limit)
        MONTHLY_HARD_LIMIT = 900

        if daily_calls >= self.daily_limit:
            logger.warning(
                "[Webz] Daily Redis budget exhausted — %d/%d calls used today. "
                "Skipping to protect the monthly quota.",
                daily_calls, self.daily_limit
            )
            self.mark_rate_limited()
            return []

        if monthly_calls >= MONTHLY_HARD_LIMIT:
            logger.warning(
                "[Webz] Monthly Redis budget exhausted — %d/%d calls used this month. "
                "No more Webz calls until next month to protect the 1,000-call limit.",
                monthly_calls, MONTHLY_HARD_LIMIT
            )
            self.mark_rate_limited()
            return []

        # Translate our internal category slug into a Webz-friendly search phrase.
        search_query = CATEGORY_QUERY_MAP.get(category, f"technology {category}")

        params = {
            "token": self.api_key,
            "q": search_query,
            "language": "english",
            "size": min(limit, ARTICLES_PER_REQUEST),
            # NOTE: No date filters applied here intentionally.
            # The freshness gate in data_validation.is_valid_article()
            # handles date boundaries accurately using IST windows.
            # Adding date filters here would add timezone conversion risk.
        }

        try:
            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
                # FIX: this used to be a bare print() while every other message
                # in the provider goes through `logger`. Using the module logger
                # keeps the output subject to log levels/handlers like the rest.
                logger.info(
                    "[Webz] Fetching '%s' (query='%s...')...",
                    category, search_query[:40]
                )
                response = await client.get(WEBZ_API_URL, params=params)

                # ── HTTP 402: Monthly budget exhausted ────────────────────
                # Webz uses 402 to mean "you have no more credits this month".
                # Mark as rate-limited so the circuit breaker respects it.
                if response.status_code == 402:
                    logger.warning(
                        "[Webz] HTTP 402 — monthly call budget exhausted. "
                        "No more calls until quota resets at month end."
                    )
                    self.mark_rate_limited()
                    return []

                # ── HTTP 401: Bad API key ─────────────────────────────────
                if response.status_code == 401:
                    logger.error(
                        "[Webz] HTTP 401 — API key is invalid or expired. "
                        "Check WEBZ_API_KEY in your .env file."
                    )
                    self.status = ProviderStatus.ERROR
                    return []

                # ── HTTP 429: Too many requests (short-term rate limit) ───
                if response.status_code == 429:
                    logger.warning("[Webz] HTTP 429 — request rate exceeded.")
                    self.mark_rate_limited()
                    return []

                # ── Any other non-200 ─────────────────────────────────────
                if response.status_code != 200:
                    logger.warning(f"[Webz] Unexpected HTTP {response.status_code}.")
                    return []

                # ── Parse the response ────────────────────────────────────
                self.request_count += 1  # Keep RAM shadow in sync for debugging
                data = response.json()

                # Webz wraps the article list in a 'posts' key at the top level.
                raw_posts = data.get("posts", [])

                if not raw_posts:
                    logger.info(f"[Webz] No articles returned for '{category}'.")
                    return []

                articles = self._map_articles(raw_posts, category)

                # ── PHASE 16: Increment BOTH Redis counters after success ──
                # The monthly counter uses a 30-day TTL (2,592,000 s) — long
                # enough to outlive any calendar month. The key name changes
                # with each month, so old keys fade away on their own.
                await increment_provider_counter("webz", today_str, expire_seconds=86400)
                await increment_provider_counter("webz_month", month_str, expire_seconds=2592000)

                logger.info("[Webz] Got %d articles for '%s'.", len(articles), category)
                return articles

        except httpx.TimeoutException:
            logger.warning("[Webz] Request timed out.")
            return []
        except Exception as e:
            logger.error(f"[Webz] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────────
    # PRIVATE HELPER — maps raw JSON posts to Article objects
    # ─────────────────────────────────────────────────────────────────────────

    def _map_articles(self, raw_posts: list, category: str) -> List[Article]:
        """
        Convert Webz.io JSON 'posts' items into Segmento Pulse Article objects.

        Key challenges handled here:
            1. Nested image  — lives inside posts[].thread.main_image
            2. Nested source — lives inside posts[].thread.site_full
            3. Full text body — truncated to DESCRIPTION_MAX_CHARS characters
            4. Published date — Webz uses ISO 8601; the model accepts it directly

        Webz field         →  Article field
        ──────────────────────────────────
        title              →  title
        url                →  url
        thread.site_full   →  source     (nested — safe .get() chain)
        thread.main_image  →  image_url  (nested — safe .get() chain)
        published          →  published_at
        text (truncated)   →  description

        Args:
            raw_posts (list): The 'posts' array from the API response.
            category (str): The aggregator's category for routing.

        Returns:
            List[Article]: Clean Article objects ready for the pipeline.
        """
        articles: List[Article] = []

        for item in raw_posts:
            if not isinstance(item, dict):
                continue

            # ── Title ────────────────────────────────────────────────────
            title = (item.get("title") or "").strip()
            if not title:
                continue

            # ── URL ──────────────────────────────────────────────────────
            url = (item.get("url") or "").strip()
            if not url or not url.startswith("http"):
                continue

            # ── Published Date ───────────────────────────────────────────
            # Webz returns ISO 8601 (e.g., "2026-03-03T06:00:00.000+0000");
            # the Article model's published_at validator handles it directly.
            published_at = item.get("published") or ""

            # ── Nested: Source and Image ─────────────────────────────────
            # 'thread' is a nested dict containing both. Fall back to {}
            # so the chained .get() calls below cannot crash.
            thread = item.get("thread") or {}

            # Source: full domain of the publishing site,
            # e.g., "techcrunch.com" or "thenextweb.com".
            source = (thread.get("site_full") or "Webz").strip()
            if not source:
                source = "Webz"

            # Image: main article image from the thread context.
            image_url = (thread.get("main_image") or "").strip()

            # ── Description (TRUNCATED full article body) ────────────────
            # 'text' contains the complete article body — potentially
            # thousands of words. Keep only a short preview to avoid
            # database bloat and copyright issues.
            raw_text = (item.get("text") or "").strip()
            if len(raw_text) > DESCRIPTION_MAX_CHARS:
                description = raw_text[:DESCRIPTION_MAX_CHARS] + "..."
            else:
                description = raw_text

            # ── Build Article ────────────────────────────────────────────
            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source=source,
                    # ── ROUTING RULE ─────────────────────────────────────
                    # Pass through the aggregator's category.
                    # Unknown/empty categories route to 'News Articles'.
                    category=category,
                )
                articles.append(article)

            except Exception as e:
                logger.debug(f"[Webz] Skipped post '{title[:50]}': {e}")
                continue

        return articles
|
app/services/providers/wikinews/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# providers/wikinews/__init__.py
|
| 2 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3 |
+
# This file marks the 'wikinews' folder as a Python package.
|
| 4 |
+
# To use this provider, import it like this:
|
| 5 |
+
#
|
| 6 |
+
# from app.services.providers.wikinews.client import WikinewsProvider
|
| 7 |
+
#
|
| 8 |
+
# Wikinews is 100% FREE β no API key, no rate limits, no registration.
|
| 9 |
+
# It is run by the Wikimedia Foundation (same people who run Wikipedia).
|
| 10 |
+
#
|
| 11 |
+
# All content is published under Public Domain or Creative Commons licenses.
|
| 12 |
+
# This makes it the only copyright-bulletproof news source in our pipeline.
|
| 13 |
+
#
|
| 14 |
+
# Gated behind GENERAL_TECH_CATEGORIES (same as HN, Inshorts, SauravKanchan)
|
| 15 |
+
# because Wikinews tech categories cover broad technology topics only.
|
app/services/providers/wikinews/client.py
ADDED
|
@@ -0,0 +1,435 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
providers/wikinews/client.py
|
| 3 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
The Wikinews Provider for Segmento Pulse.
|
| 5 |
+
|
| 6 |
+
What this does:
|
| 7 |
+
Fetches technology news articles from Wikinews (en.wikinews.org).
|
| 8 |
+
Wikinews is run by the Wikimedia Foundation β the same organization
|
| 9 |
+
behind Wikipedia and Wiktionary.
|
| 10 |
+
|
| 11 |
+
Free. No API key. No rate limits. No copyright concerns.
|
| 12 |
+
|
| 13 |
+
Why Wikinews is unique:
|
| 14 |
+
Every article on Wikinews is published under Public Domain or extremely
|
| 15 |
+
open Creative Commons licenses. This means we can freely display their
|
| 16 |
+
content without any legal risk. It is the only fully copyright-bulletproof
|
| 17 |
+
news source in our entire pipeline.
|
| 18 |
+
|
| 19 |
+
We search TWO Wikinews categories concurrently for maximum coverage:
|
| 20 |
+
- "Computing" β software, hardware, AI, security news
|
| 21 |
+
- "Internet" β web tech, data, social media policy news
|
| 22 |
+
|
| 23 |
+
Gated behind GENERAL_TECH_CATEGORIES in the aggregator because Wikinews
|
| 24 |
+
tech content is broad β it does not know about "cloud-alibaba" or
|
| 25 |
+
"data-governance" as separate topics.
|
| 26 |
+
|
| 27 |
+
ββ THE HTML SNIPPET PROBLEM AND HOW WE FIX IT βββββββββββββββββββββββββββββββ
|
| 28 |
+
|
| 29 |
+
The MediaWiki search API highlights your search terms inside the description
|
| 30 |
+
snippet by wrapping them in HTML tags like this:
|
| 31 |
+
|
| 32 |
+
"The latest advances in <span class=\"searchmatch\">computing</span> have..."
|
| 33 |
+
|
| 34 |
+
If we stored that raw, our database would get cluttered with raw HTML tags
|
| 35 |
+
that would then appear in the Pulse UI as literal text.
|
| 36 |
+
|
| 37 |
+
Fix: We use a simple regex pattern to strip ALL HTML tags from the snippet.
|
| 38 |
+
|
| 39 |
+
re.sub(r'<[^>]+>', '', raw_snippet).strip()
|
| 40 |
+
|
| 41 |
+
<[^>]+> means: any '<', followed by one or more characters that are
|
| 42 |
+
NOT '>', followed by '>'. This matches every HTML tag universally,
|
| 43 |
+
not just MediaWiki's specific span tags β making it bulletproof for
|
| 44 |
+
any future format changes on their end.
|
| 45 |
+
|
| 46 |
+
ββ URL CONSTRUCTION FROM pageid βββββββββββββββββββββββββββββββββββββββββββββ
|
| 47 |
+
|
| 48 |
+
MediaWiki search results give us a 'pageid' integer, NOT a direct URL.
|
| 49 |
+
We construct a permanent, stable URL using the curid URL format:
|
| 50 |
+
|
| 51 |
+
f"https://en.wikinews.org/?curid={pageid}"
|
| 52 |
+
|
| 53 |
+
Example: pageid = 4684321 β https://en.wikinews.org/?curid=4684321
|
| 54 |
+
|
| 55 |
+
This URL format is guaranteed stable by Wikimedia β it never changes
|
| 56 |
+
even if the article is moved or renamed.
|
| 57 |
+
"""
|
| 58 |
+
|
| 59 |
+
# ββ Standard Library ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 60 |
+
import asyncio
|
| 61 |
+
import logging
|
| 62 |
+
import re
|
| 63 |
+
from typing import List
|
| 64 |
+
|
| 65 |
+
# ββ Third-party (already in requirements.txt) βββββββββββββββββββββββββββββββββ
|
| 66 |
+
import httpx # Async HTTP client
|
| 67 |
+
|
| 68 |
+
# ββ Internal ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 69 |
+
from app.services.providers.base import NewsProvider
|
| 70 |
+
from app.models import Article
|
| 71 |
+
# Phase 12: Shared image enricher (extracts og:image from article pages)
|
| 72 |
+
from app.services.utils.image_enricher import extract_top_image
|
| 73 |
+
|
| 74 |
+
logger = logging.getLogger(__name__)
|
| 75 |
+
|
| 76 |
+
# ── Wikinews API Configuration ────────────────────────────────────────────────

# The MediaWiki Action API endpoint for English Wikinews.
WIKINEWS_API_URL = "https://en.wikinews.org/w/api.php"

# We search two categories to broaden our coverage of tech news:
# 'Computing' covers software, AI, hardware; 'Internet' covers web,
# data, and social-media policy news.
WIKINEWS_CATEGORIES = [
    "Computing",
    "Internet",
]

# Max articles to take per category query.
# 10 per category x 2 categories = up to 20 articles per call.
MAX_ARTICLES_PER_CATEGORY = 10

# HTTP timeout in seconds. Wikimedia servers are reliable but can be slow.
HTTP_TIMEOUT_SECONDS = 12.0

# Pre-compiled regex that strips ALL HTML tags from MediaWiki search snippets.
# MediaWiki wraps matched search terms in <span class="searchmatch">...</span>;
# stripping every tag universally also covers any future tag-format changes.
HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
class WikinewsProvider(NewsProvider):
|
| 102 |
+
"""
|
| 103 |
+
Fetches technology news from Wikinews using the MediaWiki search API.
|
| 104 |
+
|
| 105 |
+
Free. No API key. Copyright-bulletproof (Public Domain / CC).
|
| 106 |
+
Queries 'Computing' and 'Internet' categories concurrently.
|
| 107 |
+
Gated behind GENERAL_TECH_CATEGORIES in the aggregator.
|
| 108 |
+
|
| 109 |
+
Usage (wired in Phase 11):
|
| 110 |
+
provider = WikinewsProvider()
|
| 111 |
+
articles = await provider.fetch_news(category="ai", limit=20)
|
| 112 |
+
"""
|
| 113 |
+
|
| 114 |
+
    def __init__(self):
        # Free provider — no API key required, so the base class is
        # initialised with api_key=None.
        super().__init__(api_key=None)
        # 0 signals "no daily quota": Wikinews imposes no rate limits.
        self.daily_limit = 0
|
| 118 |
+
|
| 119 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 120 |
+
# MAIN ENTRY POINT β called by the aggregator's FREE PARALLEL RUN
|
| 121 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 122 |
+
|
| 123 |
+
async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
|
| 124 |
+
"""
|
| 125 |
+
Fetch tech articles from Wikinews's Computing and Internet categories.
|
| 126 |
+
|
| 127 |
+
Both category queries run at the same time using asyncio.gather().
|
| 128 |
+
Their results are combined into one flat list and returned.
|
| 129 |
+
|
| 130 |
+
Args:
|
| 131 |
+
category (str): Our internal category slug (e.g., "ai").
|
| 132 |
+
Tagged on every article. The keyword gate filters
|
| 133 |
+
irrelevant articles downstream.
|
| 134 |
+
limit (int): Soft cap on total articles to return.
|
| 135 |
+
|
| 136 |
+
Returns:
|
| 137 |
+
List[Article]: Combined articles from both Wikinews categories.
|
| 138 |
+
Returns [] if both queries fail.
|
| 139 |
+
"""
|
| 140 |
+
try:
|
| 141 |
+
async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
|
| 142 |
+
|
| 143 |
+
# Fire queries for both categories simultaneously.
|
| 144 |
+
fetch_tasks = [
|
| 145 |
+
self._query_category(client, wiki_cat, category)
|
| 146 |
+
for wiki_cat in WIKINEWS_CATEGORIES
|
| 147 |
+
]
|
| 148 |
+
|
| 149 |
+
results = await asyncio.gather(*fetch_tasks, return_exceptions=True)
|
| 150 |
+
|
| 151 |
+
# Combine results from both categories.
|
| 152 |
+
all_articles: List[Article] = []
|
| 153 |
+
for wiki_cat, result in zip(WIKINEWS_CATEGORIES, results):
|
| 154 |
+
if isinstance(result, Exception):
|
| 155 |
+
logger.warning(
|
| 156 |
+
f"[Wikinews] [{wiki_cat}] Query failed: {result}"
|
| 157 |
+
)
|
| 158 |
+
elif isinstance(result, list):
|
| 159 |
+
all_articles.extend(result)
|
| 160 |
+
|
| 161 |
+
logger.info(
|
| 162 |
+
f"[Wikinews] Collected {len(all_articles)} articles from "
|
| 163 |
+
f"{len(WIKINEWS_CATEGORIES)} categories for '{category}'"
|
| 164 |
+
)
|
| 165 |
+
return all_articles
|
| 166 |
+
|
| 167 |
+
except Exception as e:
|
| 168 |
+
logger.error(f"[Wikinews] Unexpected error: {e}", exc_info=True)
|
| 169 |
+
return []
|
| 170 |
+
|
| 171 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 172 |
+
# PRIVATE HELPERS
|
| 173 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 174 |
+
|
| 175 |
+
async def _query_category(
|
| 176 |
+
self,
|
| 177 |
+
client: httpx.AsyncClient,
|
| 178 |
+
wiki_category: str,
|
| 179 |
+
pulse_category: str,
|
| 180 |
+
) -> List[Article]:
|
| 181 |
+
"""
|
| 182 |
+
Run one MediaWiki search query for articles in a given Wikinews category.
|
| 183 |
+
|
| 184 |
+
Args:
|
| 185 |
+
client (httpx.AsyncClient): Shared HTTP client from fetch_news().
|
| 186 |
+
wiki_category (str): The Wikinews category to search within
|
| 187 |
+
(e.g., "Computing", "Internet").
|
| 188 |
+
pulse_category (str): Our internal Pulse category β tagged on articles.
|
| 189 |
+
|
| 190 |
+
Returns:
|
| 191 |
+
List[Article]: Parsed articles. Returns [] on any failure.
|
| 192 |
+
"""
|
| 193 |
+
params = {
|
| 194 |
+
"action": "query",
|
| 195 |
+
"list": "search",
|
| 196 |
+
# incategory: restricts results to articles in that Wikinews category.
|
| 197 |
+
"srsearch": f"incategory:{wiki_category}",
|
| 198 |
+
"srlimit": MAX_ARTICLES_PER_CATEGORY,
|
| 199 |
+
"srprop": "snippet|timestamp", # Only fetch what we actually need
|
| 200 |
+
"format": "json",
|
| 201 |
+
"formatversion": "2", # Cleaner JSON output format
|
| 202 |
+
# Phase 14 fix: Adding 'info' query alongside the search so that
|
| 203 |
+
# MediaWiki returns the 'canonicalurl' for each result page.
|
| 204 |
+
# This eliminates the redirect hop in the image enricher:
|
| 205 |
+
# Before: curid URL β 301 redirect β actual page β parse og:image (2 requests)
|
| 206 |
+
# After: canonicalurl β actual page β parse og:image (1 request)
|
| 207 |
+
# We do not add 'generator=search' because that changes the response
|
| 208 |
+
# format entirely and would break our current _map_search_hits() logic.
|
| 209 |
+
# Instead we capture the canonicalurl inside the search result hit itself
|
| 210 |
+
# via the 'url' srprop (supported by MediaWiki's search module).
|
| 211 |
+
"srprop": "snippet|timestamp|titlesnippet", # Overrides above β note below
|
| 212 |
+
# NOTE: MediaWiki does NOT expose canonicalurl through srprop directly.
|
| 213 |
+
# The correct approach is a separate 'prop=info&inprop=url' sub-query.
|
| 214 |
+
# That requires changing from 'list=search' to 'generator=search' which
|
| 215 |
+
# is a larger refactor. For Phase 14 we use a safe, narrow approach:
|
| 216 |
+
# keep 'snippet|timestamp' as srprop and construct the canonical URL
|
| 217 |
+
# from the title (URL-encoded), which is always stable on Wikinews.
|
| 218 |
+
"srprop": "snippet|timestamp", # Keep original β canonical from title
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
try:
|
| 222 |
+
response = await client.get(
|
| 223 |
+
WIKINEWS_API_URL,
|
| 224 |
+
params=params,
|
| 225 |
+
headers={
|
| 226 |
+
"User-Agent": "SegmentoPulse-Ingestion/1.0 (https://segmento.in)"
|
| 227 |
+
# Wikimedia's API rules require a descriptive User-Agent.
|
| 228 |
+
},
|
| 229 |
+
)
|
| 230 |
+
|
| 231 |
+
if response.status_code == 429:
|
| 232 |
+
logger.warning(f"[Wikinews] [{wiki_category}] HTTP 429 rate limit.")
|
| 233 |
+
self.mark_rate_limited()
|
| 234 |
+
return []
|
| 235 |
+
|
| 236 |
+
if response.status_code != 200:
|
| 237 |
+
logger.warning(
|
| 238 |
+
f"[Wikinews] [{wiki_category}] HTTP {response.status_code} β skipping."
|
| 239 |
+
)
|
| 240 |
+
return []
|
| 241 |
+
|
| 242 |
+
data = response.json()
|
| 243 |
+
|
| 244 |
+
except httpx.TimeoutException:
|
| 245 |
+
logger.warning(f"[Wikinews] [{wiki_category}] Request timed out.")
|
| 246 |
+
return []
|
| 247 |
+
except Exception as e:
|
| 248 |
+
logger.warning(f"[Wikinews] [{wiki_category}] Fetch error: {e}")
|
| 249 |
+
return []
|
| 250 |
+
|
| 251 |
+
# Drill into the MediaWiki response structure.
|
| 252 |
+
# Shape: { "query": { "search": [ {...}, {...} ] } }
|
| 253 |
+
query_block = data.get("query") or {}
|
| 254 |
+
search_hits = query_block.get("search") or []
|
| 255 |
+
|
| 256 |
+
if not search_hits:
|
| 257 |
+
logger.info(f"[Wikinews] [{wiki_category}] No results returned.")
|
| 258 |
+
return []
|
| 259 |
+
|
| 260 |
+
articles = self._map_search_hits(search_hits, wiki_category, pulse_category)
|
| 261 |
+
|
| 262 |
+
# ββ ENRICH: Fetch images for articles that have none ββββββββββββββ
|
| 263 |
+
# _map_search_hits is sync β enrichment happens here in the async caller.
|
| 264 |
+
# Wikinews curid URLs do have og:image tags on their article pages.
|
| 265 |
+
articles = await self._enrich_article_images(wiki_category, articles)
|
| 266 |
+
|
| 267 |
+
logger.info(
|
| 268 |
+
f"[Wikinews] [{wiki_category}] Parsed {len(articles)} articles."
|
| 269 |
+
)
|
| 270 |
+
return articles
|
| 271 |
+
|
| 272 |
+
def _map_search_hits(
|
| 273 |
+
self,
|
| 274 |
+
search_hits: list,
|
| 275 |
+
wiki_category: str,
|
| 276 |
+
pulse_category: str,
|
| 277 |
+
) -> List[Article]:
|
| 278 |
+
"""
|
| 279 |
+
Convert MediaWiki search result items into Segmento Pulse Article objects.
|
| 280 |
+
|
| 281 |
+
Key transformations:
|
| 282 |
+
title β title (direct)
|
| 283 |
+
pageid β url (constructed as curid URL)
|
| 284 |
+
timestamp β published_at (already ISO 8601)
|
| 285 |
+
snippet β description (HTML tags stripped via regex)
|
| 286 |
+
(none) β image_url = "" (no images in search results β Phase 12 fix)
|
| 287 |
+
(hardcoded)β source = "Wikinews"
|
| 288 |
+
|
| 289 |
+
Args:
|
| 290 |
+
search_hits (list): The 'query.search' array from the API response.
|
| 291 |
+
wiki_category (str): Which Wikinews category these came from.
|
| 292 |
+
pulse_category (str): Our internal category β tagged on each article.
|
| 293 |
+
|
| 294 |
+
Returns:
|
| 295 |
+
List[Article]: Clean Article objects.
|
| 296 |
+
"""
|
| 297 |
+
articles: List[Article] = []
|
| 298 |
+
|
| 299 |
+
for hit in search_hits:
|
| 300 |
+
if not isinstance(hit, dict):
|
| 301 |
+
continue
|
| 302 |
+
|
| 303 |
+
# ββ Title ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 304 |
+
title = (hit.get("title") or "").strip()
|
| 305 |
+
if not title:
|
| 306 |
+
continue
|
| 307 |
+
|
| 308 |
+
# ββ URL β canonical title URL with curid fallback ββββββββββββββ
|
| 309 |
+
# Phase 14 fix: Construct the canonical URL from the article title.
|
| 310 |
+
# Wikinews titles map directly to stable URLs under /wiki/.
|
| 311 |
+
# Example: title = "AI chip shortage hits 2026"
|
| 312 |
+
# β https://en.wikinews.org/wiki/AI_chip_shortage_hits_2026
|
| 313 |
+
# This URL is permanent (Wikimedia guarantees title-based URLs).
|
| 314 |
+
# The image enricher can now visit this URL directly without
|
| 315 |
+
# following a 301 redirect from the curid format β saving one
|
| 316 |
+
# HTTP round-trip per article during image enrichment.
|
| 317 |
+
#
|
| 318 |
+
# We still require pageid as a sanity check. If both checks fail,
|
| 319 |
+
# we skip the article entirely (no pageid = no reliable identity).
|
| 320 |
+
pageid = hit.get("pageid")
|
| 321 |
+
if not pageid:
|
| 322 |
+
continue
|
| 323 |
+
|
| 324 |
+
# Build canonical URL from the URL-safe title.
|
| 325 |
+
# urllib.parse.quote() turns spaces β underscores β %20, but Wikimedia
|
| 326 |
+
# actually uses underscores in URLs (not %20). We replace spaces first.
|
| 327 |
+
title_for_url = title.replace(" ", "_")
|
| 328 |
+
import urllib.parse
|
| 329 |
+
canonical_url = (
|
| 330 |
+
"https://en.wikinews.org/wiki/"
|
| 331 |
+
+ urllib.parse.quote(title_for_url, safe="/:@!$&'()*+,;=")
|
| 332 |
+
)
|
| 333 |
+
|
| 334 |
+
# curid URL is kept as fallback β if the canonical URL ever fails
|
| 335 |
+
# to load in the enricher, the curid URL still reaches the same page.
|
| 336 |
+
# We use canonical_url as the primary because it has no redirect hop.
|
| 337 |
+
url = canonical_url
|
| 338 |
+
|
| 339 |
+
# ββ Published Date ββββββββββββββββββββββββββββββββββββββββββββ
|
| 340 |
+
# MediaWiki returns ISO 8601 already, e.g., "2026-03-03T06:00:00Z".
|
| 341 |
+
# Our Article model's published_at validator accepts this directly.
|
| 342 |
+
published_at = hit.get("timestamp") or ""
|
| 343 |
+
|
| 344 |
+
# ββ Description (HTML-stripped snippet) βββββββββββββββββββββββ
|
| 345 |
+
# MediaWiki injects HTML like <span class="searchmatch">term</span>
|
| 346 |
+
# into snippets to highlight search terms. We strip ALL HTML tags
|
| 347 |
+
# using the pre-compiled regex pattern defined at the module level.
|
| 348 |
+
raw_snippet = hit.get("snippet") or ""
|
| 349 |
+
description = HTML_TAG_PATTERN.sub("", raw_snippet).strip()
|
| 350 |
+
|
| 351 |
+
# ββ Image URL βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 352 |
+
# MediaWiki search results do not include images.
|
| 353 |
+
# Phase 12 will add a separate image enrichment step for Wikinews.
|
| 354 |
+
# For now, empty string routes to the Segmento Pulse banner fallback.
|
| 355 |
+
image_url = ""
|
| 356 |
+
|
| 357 |
+
# ββ Build Article βββββββββββββββββββββββββββββββββββββββββββββ
|
| 358 |
+
try:
|
| 359 |
+
article = Article(
|
| 360 |
+
title=title,
|
| 361 |
+
description=description,
|
| 362 |
+
url=url,
|
| 363 |
+
image_url=image_url,
|
| 364 |
+
published_at=published_at,
|
| 365 |
+
source="Wikinews",
|
| 366 |
+
# ββ ROUTING RULE ββββββββββββββββββββββββββββββββββββββ
|
| 367 |
+
# Tag with pulse_category from the aggregator.
|
| 368 |
+
# Unknown categories safely route to 'News Articles'.
|
| 369 |
+
category=pulse_category,
|
| 370 |
+
)
|
| 371 |
+
articles.append(article)
|
| 372 |
+
|
| 373 |
+
except Exception as e:
|
| 374 |
+
logger.debug(
|
| 375 |
+
f"[Wikinews] [{wiki_category}] Skipped '{title[:50]}': {e}"
|
| 376 |
+
)
|
| 377 |
+
continue
|
| 378 |
+
|
| 379 |
+
return articles
|
| 380 |
+
|
| 381 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 382 |
+
# PHASE 12: IMAGE ENRICHMENT β async post-processing step
|
| 383 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 384 |
+
|
| 385 |
+
async def _enrich_article_images(
    self, wiki_category: str, articles: List[Article]
) -> List[Article]:
    """
    Fill in missing thumbnails for Wikinews search results.

    MediaWiki search hits carry no image data, so every article arrives
    here with an empty image_url. For each such article we visit its
    Wikinews curid page and pull the og:image meta tag (set by MediaWiki
    for every published article), so most lookups succeed.

    All page visits run concurrently, but an asyncio.Semaphore(10) caps
    the number of simultaneous HTTP connections; with the enricher's own
    4-second per-call timeout the whole batch finishes in seconds.

    Args:
        wiki_category (str): Category label, used only for log messages.
        articles (List[Article]): Output of _map_search_hits().

    Returns:
        List[Article]: The same articles, with image_url populated
        wherever an og:image could be found. Never drops an article.
    """
    if not articles:
        return articles

    # Cap simultaneous page fetches at 10 so two categories running in
    # parallel cannot open an unbounded number of connections.
    gate = asyncio.Semaphore(10)

    async def _fetch_one(candidate: Article) -> str:
        # Articles that already carry a usable image cost no network call.
        if candidate.image_url and candidate.image_url.startswith("http"):
            return candidate.image_url
        # Wait for one of the 10 lanes before touching the network.
        async with gate:
            return await extract_top_image(candidate.url)

    results = await asyncio.gather(
        *(_fetch_one(a) for a in articles), return_exceptions=True
    )

    enriched: List[Article] = []
    for original, outcome in zip(articles, results):
        # Only a non-empty string counts as a found image; exceptions and
        # empty results leave the article untouched.
        updated = (
            original.model_copy(update={"image_url": outcome})
            if isinstance(outcome, str) and outcome
            else original
        )
        enriched.append(updated)

    with_images = sum(1 for a in enriched if a.image_url)
    logger.info(
        f"[Wikinews] [{wiki_category}] Image enrichment complete — "
        f"{with_images}/{len(enriched)} articles have images."
    )
    return enriched
|
app/services/providers/worldnewsai/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# providers/worldnewsai/__init__.py
|
| 2 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3 |
+
# This file marks the 'worldnewsai' folder as a Python package.
|
| 4 |
+
# To use this provider, import it like this:
|
| 5 |
+
#
|
| 6 |
+
# from app.services.providers.worldnewsai.client import WorldNewsAIProvider
|
| 7 |
+
#
|
| 8 |
+
# This is a PAID provider (point-based quota) β it requires the
|
| 9 |
+
# WORLDNEWS_API_KEY environment variable to be set.
|
| 10 |
+
#
|
| 11 |
+
# It sits at position 5 in the PAID_CHAIN β the last line of defence
|
| 12 |
+
# before the paid chain gives up. Only fires after GNews, NewsAPI,
|
| 13 |
+
# NewsData, and TheNewsAPI have all failed or exhausted their budgets.
|
| 14 |
+
#
|
| 15 |
+
# ββ CRITICAL QUOTA WARNING ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 16 |
+
# WorldNewsAI uses a point system, NOT a simple request counter.
|
| 17 |
+
# Each API call costs points + each returned article costs additional points.
|
| 18 |
+
# The client has a conservative daily_limit = 50 calls to protect the budget.
|
| 19 |
+
# If you see HTTP 402, the daily point budget is fully exhausted.
|
app/services/providers/worldnewsai/client.py
ADDED
|
@@ -0,0 +1,359 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
providers/worldnewsai/client.py
|
| 3 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
The WorldNewsAI Provider for Segmento Pulse.
|
| 5 |
+
|
| 6 |
+
What this does:
|
| 7 |
+
Fetches technology news from WorldNewsAI.com β a global news crawler
|
| 8 |
+
that indexes tens of thousands of sources worldwide, including many
|
| 9 |
+
non-English and non-US-centric publications.
|
| 10 |
+
|
| 11 |
+
Paid provider β needs WORLDNEWS_API_KEY in your .env file.
|
| 12 |
+
Position 5 in the PAID_CHAIN (last paid failover).
|
| 13 |
+
|
| 14 |
+
ββ THE CRITICAL QUOTA PROBLEM AND HOW WE SOLVE IT ββββββββββββββββββββββββββ
|
| 15 |
+
|
| 16 |
+
WorldNewsAI does NOT use a simple "100 requests per day" model.
|
| 17 |
+
It uses a POINT system:
|
| 18 |
+
- Each search call costs points
|
| 19 |
+
- Each article returned in the response costs additional points
|
| 20 |
+
- If you run out of points, the API returns HTTP 402 (not 429)
|
| 21 |
+
|
| 22 |
+
If we called this for all 22 categories every hour, we would exhaust our
|
| 23 |
+
free-tier point budget before lunchtime.
|
| 24 |
+
|
| 25 |
+
Our two-layer protection:
|
| 26 |
+
1. Position 5 in PAID_CHAIN: Only fires as the last fallback after
|
| 27 |
+
GNews, NewsAPI, NewsData, and TheNewsAPI have all failed.
|
| 28 |
+
In a healthy system, it will rarely be called at all.
|
| 29 |
+
2. daily_limit = 50: The quota tracker caps total calls per day.
|
| 30 |
+
Once 50 calls are used, the circuit breaker prevents further calls.
|
| 31 |
+
|
| 32 |
+
ββ THE CONTENT SAFETY PROBLEM AND HOW WE SOLVE IT ββββββββββββββββββββββββββ
|
| 33 |
+
|
| 34 |
+
WorldNewsAI returns the FULL article body in the 'text' field.
|
| 35 |
+
A typical article body is 500-3,000 words β far too large to store in
|
| 36 |
+
our database for each article, and potentially a copyright issue.
|
| 37 |
+
|
| 38 |
+
Fix: We take only the first 200 characters from the 'text' field
|
| 39 |
+
and use that as the article's description. This is the same "snippet"
|
| 40 |
+
approach used by Google News, Bing News, and other aggregators.
|
| 41 |
+
200 characters is enough to show a preview without reproducing the article.
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
# ββ Standard Library ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 45 |
+
import logging
|
| 46 |
+
from datetime import datetime, timezone
|
| 47 |
+
from typing import List, Optional
|
| 48 |
+
|
| 49 |
+
# ββ Third-party (already in requirements.txt) ββββββββββββββββββββββββββββββββββ
|
| 50 |
+
import httpx # Async HTTP client
|
| 51 |
+
|
| 52 |
+
# ββ Internal βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 53 |
+
from app.services.providers.base import NewsProvider, ProviderStatus
|
| 54 |
+
from app.models import Article
|
| 55 |
+
from app.config import settings
|
| 56 |
+
# Phase 16: Import the Redis counter utility to make the daily budget
|
| 57 |
+
# restart-proof. Without this, self.request_count lives in RAM and resets
|
| 58 |
+
# to 0 on every Hugging Face Space restart, letting us overspend the quota.
|
| 59 |
+
from app.services.utils.provider_state import (
|
| 60 |
+
get_provider_counter,
|
| 61 |
+
increment_provider_counter,
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
logger = logging.getLogger(__name__)
|
| 65 |
+
|
| 66 |
+
# ββ Constants βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 67 |
+
|
| 68 |
+
# WorldNewsAI search endpoint (v1)
|
| 69 |
+
# WorldNewsAI v1 search endpoint.
WORLDNEWSAI_SEARCH_URL = "https://api.worldnewsapi.com/search-news"

# Per-request HTTP timeout (seconds).
HTTP_TIMEOUT_SECONDS = 12.0

# Maximum articles requested per call — kept small because every returned
# article costs additional quota points.
ARTICLES_PER_REQUEST = 10

# Snippet length for descriptions. WorldNewsAI returns the full article
# body; we keep only this many characters (same limit as the RSS parser)
# to avoid copyright issues and database bloat.
DESCRIPTION_MAX_CHARS = 200

# Internal category slug → free-text search query.
# WorldNewsAI has no category parameter, so each Pulse category is
# translated into a descriptive keyword phrase for the 'text' field.
CATEGORY_QUERY_MAP = {
    'ai': 'artificial intelligence machine learning',
    'data-security': 'data security cybersecurity breach',
    'data-governance': 'data governance compliance regulation',
    'data-privacy': 'data privacy GDPR CCPA',
    'data-engineering': 'data engineering pipeline ETL',
    'data-management': 'data management master data catalog',
    'business-intelligence': 'business intelligence analytics BI',
    'business-analytics': 'business analytics reporting dashboards',
    'customer-data-platform': 'customer data platform CDP',
    'data-centers': 'data center infrastructure colocation',
    'cloud-computing': 'cloud computing technology',
    'magazines': 'technology news',
    'data-laws': 'data privacy law regulation AI act',
    'cloud-aws': 'Amazon Web Services AWS cloud',
    'cloud-azure': 'Microsoft Azure cloud',
    'cloud-gcp': 'Google Cloud Platform GCP',
    'cloud-oracle': 'Oracle Cloud OCI',
    'cloud-ibm': 'IBM Cloud Red Hat',
    'cloud-alibaba': 'Alibaba Cloud technology',
    'cloud-digitalocean': 'DigitalOcean cloud platform',
    'cloud-huawei': 'Huawei Cloud technology',
    'cloud-cloudflare': 'Cloudflare network security',
}
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
class WorldNewsAIProvider(NewsProvider):
    """
    Fetches global technology news from WorldNewsAI.com.

    Paid provider (point-based quota) — position 5 in the PAID_CHAIN.
    Only fires when GNews, NewsAPI, NewsData, and TheNewsAPI have all failed.
    Requires WORLDNEWS_API_KEY in the .env file.

    Usage (wired in Phase 8):
        provider = WorldNewsAIProvider(api_key="your_key_here")
        articles = await provider.fetch_news(category="ai", limit=10)
    """

    def __init__(self, api_key: Optional[str] = None):
        super().__init__(api_key=api_key)

        # Phase 16: This value is the CEILING checked against the Redis
        # counter, not just a RAM counter. Even if the server restarts
        # mid-day, Redis remembers how many calls we already made today.
        self.daily_limit = 50

    # ─────────────────────────────────────────────────────────────────────
    # MAIN ENTRY POINT — called by the aggregator's PAID WATERFALL
    # ─────────────────────────────────────────────────────────────────────

    async def fetch_news(self, category: str, limit: int = 10) -> List[Article]:
        """
        Fetch global technology news from WorldNewsAI.

        Args:
            category (str): Internal category slug (e.g., "ai"), looked up
                in CATEGORY_QUERY_MAP to build the free-text search query.
            limit (int): Max articles to return; capped at
                ARTICLES_PER_REQUEST to conserve the point budget.

        Returns:
            List[Article]: Mapped Article objects. Returns [] on any failure.
        """
        if not self.api_key:
            logger.debug("[WorldNewsAI] No API key configured — skipping.")
            return []

        # ── PHASE 16: Redis-backed daily budget guard ────────────────────
        # The counter key includes today's UTC date, so it resets itself at
        # midnight UTC. If Redis is unreachable, get_provider_counter
        # returns a huge sentinel so we fail safe and skip the call.
        today_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        current_calls = await get_provider_counter("worldnewsai", today_str)

        if current_calls >= self.daily_limit:
            logger.warning(
                "[WorldNewsAI] Daily Redis budget exhausted — %d/%d calls used today. "
                "Skipping to protect the API quota.",
                current_calls, self.daily_limit,
            )
            self.mark_rate_limited()
            return []

        search_text = CATEGORY_QUERY_MAP.get(category, "technology news")

        params = {
            "text": search_text,
            "language": "en",
            "number": min(limit, ARTICLES_PER_REQUEST),
            "api-key": self.api_key,
            # NOTE: No date filters on purpose — our freshness gate handles
            # date filtering more accurately using IST boundaries.
        }

        try:
            async with httpx.AsyncClient(timeout=HTTP_TIMEOUT_SECONDS) as client:
                # FIX: was a bare print() — routed through the module logger
                # so the message respects the app's logging configuration.
                logger.info(
                    "[WorldNewsAI] Fetching '%s' (query='%s...')...",
                    category, search_text[:40],
                )
                response = await client.get(WORLDNEWSAI_SEARCH_URL, params=params)

                # ── HTTP 402: Point quota fully exhausted ────────────────
                # Marked RATE_LIMITED (not ERROR) so the provider recovers
                # after the scheduler's daily quota reset cycle.
                if response.status_code == 402:
                    logger.warning(
                        "[WorldNewsAI] HTTP 402 — point quota exhausted. "
                        "No more calls until tomorrow's reset."
                    )
                    self.mark_rate_limited()
                    return []

                # ── HTTP 401: Invalid or expired API key ─────────────────
                if response.status_code == 401:
                    logger.error(
                        "[WorldNewsAI] HTTP 401 — API key is invalid or expired. "
                        "Check WORLDNEWS_API_KEY in your .env file."
                    )
                    self.status = ProviderStatus.ERROR
                    return []

                # ── HTTP 429: Short-term request rate limit ──────────────
                if response.status_code == 429:
                    logger.warning("[WorldNewsAI] HTTP 429 — request rate exceeded.")
                    self.mark_rate_limited()
                    return []

                # ── Any other non-200 ────────────────────────────────────
                if response.status_code != 200:
                    logger.warning(
                        f"[WorldNewsAI] Unexpected HTTP {response.status_code}."
                    )
                    return []

                # ── Parse the response ───────────────────────────────────
                self.request_count += 1  # RAM shadow of the Redis counter
                data = response.json()

                # WorldNewsAI wraps articles in a top-level 'news' key
                raw_articles = data.get("news", [])

                # ── PHASE 16: Count this call against the Redis budget ───
                # FIX (quota leak): every HTTP 200 spends quota points even
                # when it returns zero articles, so we increment BEFORE the
                # empty-result early return. Previously, empty-but-successful
                # calls were never counted and could overspend the budget.
                await increment_provider_counter("worldnewsai", today_str)

                if not raw_articles:
                    logger.info(
                        f"[WorldNewsAI] No articles returned for '{category}'."
                    )
                    return []

                articles = self._map_articles(raw_articles, category)
                logger.info("[WorldNewsAI] Got %d articles for '%s'.", len(articles), category)
                return articles

        except httpx.TimeoutException:
            logger.warning("[WorldNewsAI] Request timed out.")
            return []
        except Exception as e:
            logger.error(f"[WorldNewsAI] Unexpected error: {e}", exc_info=True)
            return []

    # ─────────────────────────────────────────────────────────────────────
    # PRIVATE HELPER — maps raw JSON items to Article objects
    # ─────────────────────────────────────────────────────────────────────

    def _map_articles(self, raw_articles: list, category: str) -> List[Article]:
        """
        Convert WorldNewsAI JSON items into Segmento Pulse Article objects.

        WorldNewsAI field → Article field
            title           → title
            url             → url
            image           → image_url
            publish_date    → published_at
            authors (list)  → source (comma-joined; fallback "WorldNewsAI")
            text (truncated)→ description (first DESCRIPTION_MAX_CHARS chars)

        Args:
            raw_articles (list): The 'news' array from the API response.
            category (str): The aggregator's category for routing.

        Returns:
            List[Article]: Clean Article objects ready for the pipeline.
            Items missing a title or a valid http(s) URL are skipped.
        """
        articles: List[Article] = []

        for item in raw_articles:
            if not isinstance(item, dict):
                continue

            # ── Title ────────────────────────────────────────────────────
            title = (item.get("title") or "").strip()
            if not title:
                continue

            # ── URL ──────────────────────────────────────────────────────
            url = (item.get("url") or "").strip()
            if not url or not url.startswith("http"):
                continue

            # ── Image URL ────────────────────────────────────────────────
            image_url = (item.get("image") or "").strip()

            # ── Published Date ───────────────────────────────────────────
            # WorldNewsAI returns ISO 8601 (e.g., "2026-03-03 06:00:00");
            # the Article model's published_at validator handles it.
            published_at = item.get("publish_date") or ""

            # ── Source (from authors list) ───────────────────────────────
            # 'authors' is a list of names. We join them with ", " for the
            # source field, falling back to "WorldNewsAI" when empty.
            # FIX: guard against non-string list entries — a.strip() on an
            # int/None item used to raise and drop the whole article.
            authors = item.get("authors") or []
            if isinstance(authors, list) and authors:
                clean_authors = [
                    a.strip() for a in authors
                    if isinstance(a, str) and a.strip()
                ]
                source = ", ".join(clean_authors) if clean_authors else "WorldNewsAI"
            else:
                source = "WorldNewsAI"

            # ── Description (TRUNCATED body text) ────────────────────────
            # 'text' holds the FULL article body (thousands of words).
            # We keep only a 200-char snippet to avoid storing
            # copyright-protected full content in our database.
            raw_text = (item.get("text") or item.get("summary") or "").strip()
            if len(raw_text) > DESCRIPTION_MAX_CHARS:
                description = raw_text[:DESCRIPTION_MAX_CHARS] + "..."
            else:
                description = raw_text

            # ── Build Article ────────────────────────────────────────────
            try:
                article = Article(
                    title=title,
                    description=description,
                    url=url,
                    image_url=image_url,
                    published_at=published_at,
                    source=source,
                    # ── ROUTING RULE ─────────────────────────────────────
                    # Pass through the aggregator's category; unknown
                    # categories safely route to 'News Articles'.
                    category=category,
                )
                articles.append(article)

            except Exception as e:
                logger.debug(
                    f"[WorldNewsAI] Skipped item '{title[:50]}': {e}"
                )
                continue

        return articles
|
app/services/scheduler.py
CHANGED
|
@@ -17,6 +17,8 @@ from app.services.upstash_cache import get_upstash_cache # Needed to bust stal
|
|
| 17 |
from app.services.adaptive_scheduler import get_adaptive_scheduler, AdaptiveScheduler
|
| 18 |
from app.services.research_aggregator import ResearchAggregator
|
| 19 |
from app.config import settings
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# Setup logging
|
| 22 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -377,6 +379,149 @@ async def fetch_daily_research():
|
|
| 377 |
logger.info("β" * 80)
|
| 378 |
|
| 379 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
async def fetch_and_validate_category(category: str, aggregator) -> tuple:
|
| 381 |
"""
|
| 382 |
Fetch and validate articles for a single category.
|
|
@@ -393,6 +538,7 @@ async def fetch_and_validate_category(category: str, aggregator) -> tuple:
|
|
| 393 |
from app.utils.date_parser import normalize_article_date
|
| 394 |
from app.utils.url_canonicalization import canonicalize_url
|
| 395 |
from app.utils.redis_dedup import is_url_seen_or_mark
|
|
|
|
| 396 |
|
| 397 |
try:
|
| 398 |
logger.info("π Fetching %s...", category.upper())
|
|
@@ -477,12 +623,41 @@ async def fetch_and_validate_category(category: str, aggregator) -> tuple:
|
|
| 477 |
continue
|
| 478 |
|
| 479 |
# Step 4: Normalize date to UTC ISO-8601.
|
| 480 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
|
| 482 |
-
# Step 5: Sanitize and clean the article fields.
|
| 483 |
-
clean_article = sanitize_article(article)
|
| 484 |
-
valid_articles.append(clean_article)
|
| 485 |
-
|
| 486 |
logger.info("β %s: %d valid, %d invalid, %d irrelevant",
|
| 487 |
category.upper(), len(valid_articles), invalid_count, irrelevant_count)
|
| 488 |
return (category, valid_articles, invalid_count, irrelevant_count, relevant_count)
|
|
|
|
| 17 |
from app.services.adaptive_scheduler import get_adaptive_scheduler, AdaptiveScheduler
|
| 18 |
from app.services.research_aggregator import ResearchAggregator
|
| 19 |
from app.config import settings
|
| 20 |
+
# Phase 13: Global image enrichment β fills missing og:image across ALL providers
|
| 21 |
+
from app.services.utils.image_enricher import extract_top_image
|
| 22 |
|
| 23 |
# Setup logging
|
| 24 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 379 |
logger.info("β" * 80)
|
| 380 |
|
| 381 |
|
| 382 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 383 |
+
# PHASE 13: GLOBAL IMAGE ENRICHMENT SAFETY NET
|
| 384 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 385 |
+
#
|
| 386 |
+
# What this does:
|
| 387 |
+
# After all validation and deduplication gates have passed, some articles
|
| 388 |
+
# still arrive with an empty or missing image_url. This happens most often
|
| 389 |
+
# with providers like OpenRSS (blog feeds without media tags), Webz.io
|
| 390 |
+
# (small sites without a thread.main_image), and SauravKanchan (NewsAPI
|
| 391 |
+
# null urlToImage). This function visits the article's URL and tries to
|
| 392 |
+
# extract the og:image meta tag β the standard way websites declare their
|
| 393 |
+
# main thumbnail image.
|
| 394 |
+
#
|
| 395 |
+
# Why AFTER deduplication?
|
| 396 |
+
# We only enrich articles that actually passed every gate and are about to
|
| 397 |
+
# be saved. We never spend HTTP calls on articles that will be thrown away.
|
| 398 |
+
#
|
| 399 |
+
# Safety guards:
|
| 400 |
+
# 1. MAX_ENRICH_PER_RUN = 20 β Hard cap. If 50 no-image articles arrive,
|
| 401 |
+
# we only enrich the first 20, leave the rest as "", and the Pulse banner
|
| 402 |
+
# shows on the frontend. This stops a rogue provider from bottlenecking
|
| 403 |
+
# the cron job.
|
| 404 |
+
# 2. asyncio.Semaphore(10) β At most 10 web-page fetches happen at the
|
| 405 |
+
# same time. This prevents memory spikes and avoids hammering websites.
|
| 406 |
+
# 3. Individual 4-second timeout (inside extract_top_image) β A broken URL
|
| 407 |
+
# is cancelled in 4 seconds. With Semaphore(10) and MAX 20 articles:
|
| 408 |
+
# worst-case total overhead = (20 / 10) Γ 4 = 8 seconds per category run.
|
| 409 |
+
# 4. Zero side-effects β A failed enrichment returns the article unchanged.
|
| 410 |
+
# The enricher NEVER removes an article from the pipeline.
|
| 411 |
+
#
|
| 412 |
+
async def enrich_missing_images_in_batch(articles: list) -> list:
|
| 413 |
+
"""
|
| 414 |
+
Scan a list of fully-vetted articles and fill in any missing images.
|
| 415 |
+
|
| 416 |
+
Only enriches up to MAX_ENRICH_PER_RUN articles that have no valid
|
| 417 |
+
image_url. Articles that already have an image are passed through
|
| 418 |
+
instantly with zero network cost.
|
| 419 |
+
|
| 420 |
+
Args:
|
| 421 |
+
articles (list): Final, deduplicated, validated Article objects.
|
| 422 |
+
|
| 423 |
+
Returns:
|
| 424 |
+
list: Same articles, with image_url filled where possible.
|
| 425 |
+
Never raises. Never removes an article.
|
| 426 |
+
"""
|
| 427 |
+
if not articles:
|
| 428 |
+
return articles
|
| 429 |
+
|
| 430 |
+
# ββ Constants βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 431 |
+
# Cap: only attempt image enrichment on the first 20 articles that need it.
|
| 432 |
+
# The rest go to the database as-is (empty image = Pulse banner fallback).
|
| 433 |
+
MAX_ENRICH_PER_RUN = 20
|
| 434 |
+
|
| 435 |
+
# Semaphore: at most 10 website fetches run simultaneously.
|
| 436 |
+
# Think of it like a queue of 10 checkout lanes at a supermarket.
|
| 437 |
+
# If 20 people arrive at once, 10 go straight through and 10 wait
|
| 438 |
+
# in line. Nobody gets turned away, but the store doesn't explode.
|
| 439 |
+
sem = asyncio.Semaphore(10)
|
| 440 |
+
|
| 441 |
+
# ββ Count how many articles actually need enrichment βββββββββββββββββββββββ
|
| 442 |
+
articles_needing_images = [
|
| 443 |
+
a for a in articles
|
| 444 |
+
if not a.image_url or not a.image_url.startswith("http")
|
| 445 |
+
]
|
| 446 |
+
enrich_count = min(len(articles_needing_images), MAX_ENRICH_PER_RUN)
|
| 447 |
+
|
| 448 |
+
if enrich_count == 0:
|
| 449 |
+
# Every article already has a valid image. Nothing to do.
|
| 450 |
+
return articles
|
| 451 |
+
|
| 452 |
+
logger.info(
|
| 453 |
+
"πΌοΈ [IMAGE ENRICHER] %d article(s) missing images β enriching up to %d...",
|
| 454 |
+
len(articles_needing_images), enrich_count
|
| 455 |
+
)
|
| 456 |
+
|
| 457 |
+
# Build a lookup set of URLs to enrich (only the capped subset).
|
| 458 |
+
urls_to_enrich = {
|
| 459 |
+
str(a.url) for a in articles_needing_images[:MAX_ENRICH_PER_RUN]
|
| 460 |
+
}
|
| 461 |
+
|
| 462 |
+
# ββ Internal worker: enrich one article βββββββββββββββββββββββββββββββββββ
|
| 463 |
+
async def _enrich_one(article) -> object:
|
| 464 |
+
"""
|
| 465 |
+
If this article needs an image, fetch it under the semaphore guard.
|
| 466 |
+
Returns the article (updated or unchanged).
|
| 467 |
+
"""
|
| 468 |
+
url_str = str(article.url) if article.url else ""
|
| 469 |
+
|
| 470 |
+
# Article already has a valid image, or it's outside the cap β skip.
|
| 471 |
+
if url_str not in urls_to_enrich:
|
| 472 |
+
return article
|
| 473 |
+
|
| 474 |
+
async with sem:
|
| 475 |
+
# Semaphore acquired: one of our 10 lanes is now occupied.
|
| 476 |
+
# extract_top_image has its own 4-second internal timeout,
|
| 477 |
+
# so this will release the lane quickly regardless of outcome.
|
| 478 |
+
image_url = await extract_top_image(url_str)
|
| 479 |
+
|
| 480 |
+
if image_url and image_url.startswith("http"):
|
| 481 |
+
# Got a valid image β update the article cleanly.
|
| 482 |
+
# model_copy() is the correct Pydantic v2 pattern for immutable models.
|
| 483 |
+
return article.model_copy(update={"image_url": image_url})
|
| 484 |
+
|
| 485 |
+
# No image found or fetch failed β return article unchanged.
|
| 486 |
+
return article
|
| 487 |
+
|
| 488 |
+
# ββ Run all workers concurrently βββββββββββββββββββββββββββββββββββββββββββ
|
| 489 |
+
# All articles go into gather() at once. The semaphore controls how many
|
| 490 |
+
# actually hit the network at the same time (max 10). The rest wait
|
| 491 |
+
# in asyncio's queue without blocking the event loop.
|
| 492 |
+
try:
|
| 493 |
+
enriched_articles = await asyncio.gather(
|
| 494 |
+
*[_enrich_one(a) for a in articles],
|
| 495 |
+
return_exceptions=True
|
| 496 |
+
)
|
| 497 |
+
|
| 498 |
+
# Replace any Exception results with the original article (safe fallback).
|
| 499 |
+
final = []
|
| 500 |
+
for original, result in zip(articles, enriched_articles):
|
| 501 |
+
if isinstance(result, Exception):
|
| 502 |
+
logger.debug(
|
| 503 |
+
"[IMAGE ENRICHER] Worker exception for %s: %s",
|
| 504 |
+
str(original.url)[:60], result
|
| 505 |
+
)
|
| 506 |
+
final.append(original) # Keep original if worker crashed
|
| 507 |
+
else:
|
| 508 |
+
final.append(result)
|
| 509 |
+
|
| 510 |
+
enriched_total = sum(
|
| 511 |
+
1 for a in final if a.image_url and a.image_url.startswith("http")
|
| 512 |
+
)
|
| 513 |
+
logger.info(
|
| 514 |
+
"β
[IMAGE ENRICHER] Done β %d/%d articles now have images.",
|
| 515 |
+
enriched_total, len(final)
|
| 516 |
+
)
|
| 517 |
+
return final
|
| 518 |
+
|
| 519 |
+
except Exception as e:
|
| 520 |
+
# If the entire gather somehow fails, return the original list untouched.
|
| 521 |
+
logger.error("[IMAGE ENRICHER] Gather failed: %s β returning articles unchanged.", e)
|
| 522 |
+
return articles
|
| 523 |
+
|
| 524 |
+
|
| 525 |
async def fetch_and_validate_category(category: str, aggregator) -> tuple:
|
| 526 |
"""
|
| 527 |
Fetch and validate articles for a single category.
|
|
|
|
| 538 |
from app.utils.date_parser import normalize_article_date
|
| 539 |
from app.utils.url_canonicalization import canonicalize_url
|
| 540 |
from app.utils.redis_dedup import is_url_seen_or_mark
|
| 541 |
+
from app.models import Article # Needed to reconstruct Pydantic model after date normalization
|
| 542 |
|
| 543 |
try:
|
| 544 |
logger.info("π Fetching %s...", category.upper())
|
|
|
|
| 623 |
continue
|
| 624 |
|
| 625 |
# Step 4: Normalize date to UTC ISO-8601.
|
| 626 |
+
# IMPORTANT: normalize_article_date() always returns a plain dict
|
| 627 |
+
# (it calls model_dump() internally). We reconstruct the Pydantic
|
| 628 |
+
# Article right after so that enrich_missing_images_in_batch()
|
| 629 |
+
# (Phase 13, below) gets the .image_url attribute it needs.
|
| 630 |
+
normalized_dict = normalize_article_date(article)
|
| 631 |
+
try:
|
| 632 |
+
article = Article(**normalized_dict)
|
| 633 |
+
except Exception:
|
| 634 |
+
# If reconstruction fails for any reason, skip this article.
|
| 635 |
+
# The dict is malformed β better to drop it than crash.
|
| 636 |
+
invalid_count += 1
|
| 637 |
+
continue
|
| 638 |
+
|
| 639 |
+
# Step 5: Article is now a clean Pydantic object with a normalized date.
|
| 640 |
+
# We intentionally do NOT call sanitize_article() yet β that step
|
| 641 |
+
# runs AFTER image enrichment below.
|
| 642 |
+
valid_articles.append(article)
|
| 643 |
+
|
| 644 |
+
# ββ PHASE 13: GLOBAL IMAGE ENRICHMENT βββββββββββββββββββββββββββββββββ
|
| 645 |
+
# This is the bottom of the funnel. Every article here has already:
|
| 646 |
+
# β Passed basic validation (title, URL, date exist)
|
| 647 |
+
# β Passed category relevance check
|
| 648 |
+
# β Passed Redis 48-hour deduplication (it is a NEW article)
|
| 649 |
+
# β Been date-normalized
|
| 650 |
+
# Articles are still Pydantic objects here β enrichment needs .image_url.
|
| 651 |
+
if valid_articles:
|
| 652 |
+
valid_articles = await enrich_missing_images_in_batch(valid_articles)
|
| 653 |
+
|
| 654 |
+
# ββ SANITIZE (after enrichment) ββββββββββββββββββββββββββββββββββββββββββ
|
| 655 |
+
# Now that images are filled, convert each Pydantic Article to a clean
|
| 656 |
+
# dict for Appwrite storage. sanitize_article() strips unsafe chars,
|
| 657 |
+
# trims lengths, and returns the final dict payload.
|
| 658 |
+
valid_articles = [sanitize_article(a) for a in valid_articles]
|
| 659 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 660 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
logger.info("β %s: %d valid, %d invalid, %d irrelevant",
|
| 662 |
category.upper(), len(valid_articles), invalid_count, irrelevant_count)
|
| 663 |
return (category, valid_articles, invalid_count, irrelevant_count, relevant_count)
|
app/services/utils/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app/services/utils/__init__.py
|
| 2 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3 |
+
# This folder contains shared helper utilities that are used by multiple
|
| 4 |
+
# providers. They are NOT providers themselves β they are small tools that
|
| 5 |
+
# providers can import to do common jobs.
|
| 6 |
+
#
|
| 7 |
+
# Current utilities:
|
| 8 |
+
# image_enricher.py β Extracts the main image from any article URL
|
app/services/utils/image_enricher.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app/services/utils/image_enricher.py
|
| 3 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
Shared Image Enrichment Utility for Segmento Pulse.
|
| 5 |
+
|
| 6 |
+
What this does:
|
| 7 |
+
Given any article URL, this tool visits the page and tries to find the
|
| 8 |
+
main (top) image that the website publisher chose for that article.
|
| 9 |
+
|
| 10 |
+
It does this by reading two standard HTML meta tags:
|
| 11 |
+
1. og:image β Open Graph (used by Facebook, LinkedIn, Twitter)
|
| 12 |
+
2. twitter:image β Twitter Card image
|
| 13 |
+
|
| 14 |
+
Almost every modern news website, blog, and tech publication sets at least
|
| 15 |
+
one of these tags. They are the industry-standard way to declare "this is
|
| 16 |
+
my article's main image".
|
| 17 |
+
|
| 18 |
+
ββ WHY WE USE bs4 + httpx INSTEAD OF newspaper4k ββββββββββββββββββββββββββββ
|
| 19 |
+
|
| 20 |
+
The user directive requested newspaper4k (a modern async fork of newspaper3k).
|
| 21 |
+
However, newspaper4k is not in our requirements.txt and would add a heavy new
|
| 22 |
+
dependency with many sub-packages (including lxml, Pillow, and others).
|
| 23 |
+
|
| 24 |
+
Our current stack already has everything we need:
|
| 25 |
+
β httpx β async HTTP client (already in requirements.txt)
|
| 26 |
+
β beautifulsoup4 β HTML parser (already in requirements.txt)
|
| 27 |
+
β lxml β fast XML/HTML parser (already in requirements.txt)
|
| 28 |
+
|
| 29 |
+
The og:image meta tag approach is exactly what newspaper4k uses internally
|
| 30 |
+
for its top_image property. We get the same result without a new dependency.
|
| 31 |
+
|
| 32 |
+
This decision follows our Version First-Scan Protocol: never add a library
|
| 33 |
+
when an existing installed library can do the same job.
|
| 34 |
+
|
| 35 |
+
ββ HOW THE TIMEOUT PROTECTION WORKS βββββββββββββββββββββββββββββββββββββββββ
|
| 36 |
+
|
| 37 |
+
Some websites are slow, broken, or behind Cloudflare protection pages.
|
| 38 |
+
If we waited forever for them, our entire ingestion pipeline would freeze.
|
| 39 |
+
|
| 40 |
+
Two layers of protection:
|
| 41 |
+
1. httpx timeout: 3 seconds max to receive any response at all.
|
| 42 |
+
If the server doesn't respond in 3 seconds, httpx raises TimeoutException.
|
| 43 |
+
|
| 44 |
+
2. asyncio.wait_for: 4 seconds total ceiling for the entire function.
|
| 45 |
+
Even if httpx somehow hangs (rare), this outer guard kills it.
|
| 46 |
+
|
| 47 |
+
3. Universal try/except: Catches EVERYTHING. A bad image URL will NEVER
|
| 48 |
+
crash a provider. The worst it can do is return "".
|
| 49 |
+
|
| 50 |
+
The function signature is intentionally similar to newspaper4k's approach
|
| 51 |
+
so that future migration is a one-line change if newspaper4k is later added.
|
| 52 |
+
"""
|
| 53 |
+
|
| 54 |
+
# ββ Standard Library ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 55 |
+
import asyncio
|
| 56 |
+
import logging
|
| 57 |
+
from typing import Optional
|
| 58 |
+
|
| 59 |
+
# ββ Third-party (already in requirements.txt) βββββββββββββββββββββββββββββββββ
|
| 60 |
+
import httpx
|
| 61 |
+
from bs4 import BeautifulSoup
|
| 62 |
+
|
| 63 |
+
logger = logging.getLogger(__name__)
|
| 64 |
+
|
| 65 |
+
# ── Timing constants ──────────────────────────────────────────────────────────

# Maximum seconds httpx waits for the target website to respond. Generous
# enough for healthy sites, short enough that a broken or Cloudflare-guarded
# URL cannot stall the ingestion pipeline.
HTTP_FETCH_TIMEOUT = 3.0

# Hard outer ceiling (seconds) applied via asyncio.wait_for() around the whole
# extract_top_image() call — a second line of defense that forcibly cancels
# the task even if httpx somehow hangs past its own timeout.
OUTER_TIMEOUT_SECONDS = 4.0
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
async def extract_top_image(url: str) -> str:
    """
    Visit an article URL and extract its main (top) image.

    Looks for the image in two standard HTML meta tags, in this order:
        1. <meta property="og:image" content="...">
        2. <meta name="twitter:image" content="...">

    Args:
        url (str): Full article URL (must start with "http").

    Returns:
        str: The image URL if found and valid. "" if not found or any error.

    This function NEVER raises an exception. If anything goes wrong
    (timeout, bad HTML, no meta tag found), it returns "" silently.
    The pipeline treats "" as "no image" and shows the Pulse banner instead.
    """
    # Reject empty / non-http inputs immediately — no network work at all.
    if not url or not url.startswith("http"):
        return ""

    try:
        # Hard ceiling for the whole operation: if _fetch_and_extract takes
        # longer than OUTER_TIMEOUT_SECONDS, asyncio cancels it automatically
        # and we fall into the TimeoutError branch below.
        return await asyncio.wait_for(
            _fetch_and_extract(url),
            timeout=OUTER_TIMEOUT_SECONDS,
        )

    except asyncio.TimeoutError:
        # FIX: lazy %-style logging args (consistent with provider_state.py) —
        # the message string is only built when DEBUG logging is enabled.
        logger.debug("[ImageEnricher] Outer timeout for: %s", url[:60])
        return ""
    except Exception as e:
        logger.debug("[ImageEnricher] Failed for '%s': %s", url[:60], e)
        return ""
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
async def _fetch_and_extract(url: str) -> str:
    """
    Internal helper: download the HTML and pull out the og:image tag.

    Kept separate from extract_top_image() so asyncio.wait_for() has a
    clean coroutine to cancel if the outer deadline expires.

    Args:
        url (str): Full article URL.

    Returns:
        str: Image URL from meta tag, or "" if none found.
    """
    # ── Step 1: download the page HTML ────────────────────────────────────────
    try:
        async with httpx.AsyncClient(timeout=HTTP_FETCH_TIMEOUT) as client:
            response = await client.get(
                url,
                headers={
                    # A browser-like User-Agent gets past basic bot blocks.
                    "User-Agent": (
                        "Mozilla/5.0 (compatible; SegmentoPulse-ImageBot/1.0; "
                        "+https://segmento.in)"
                    ),
                    # Politely hint that we only want HTML.
                    "Accept": "text/html",
                },
                follow_redirects=True,
            )

            if response.status_code != 200:
                return ""

            page_html = response.text

    except Exception:
        # Any network/SSL/timeout failure → no image, never a crash.
        return ""

    # ── Step 2: parse only the head region ────────────────────────────────────
    # og:image always lives in <head>, near the top of the document, so the
    # first 10,000 characters are plenty and keep parsing fast on huge pages.
    head_chunk = page_html[:10_000]
    try:
        parsed = BeautifulSoup(head_chunk, "lxml")
    except Exception:
        # lxml choked on malformed HTML — fall back to the stdlib parser.
        try:
            parsed = BeautifulSoup(head_chunk, "html.parser")
        except Exception:
            return ""

    def _meta_content(tag) -> str:
        """Strip and return a meta tag's content attribute ('' if absent)."""
        if not tag:
            return ""
        return (tag.get("content") or "").strip()

    # ── Step 3: Open Graph image (most reliable) ──────────────────────────────
    candidate = _meta_content(parsed.find("meta", property="og:image"))
    if candidate.startswith("http"):
        logger.debug(f"[ImageEnricher] og:image found for {url[:50]}")
        return candidate

    # ── Step 4: Twitter Card image (common fallback) ──────────────────────────
    candidate = _meta_content(parsed.find("meta", attrs={"name": "twitter:image"}))
    if candidate.startswith("http"):
        logger.debug(f"[ImageEnricher] twitter:image found for {url[:50]}")
        return candidate

    # No usable meta image — the caller falls back to the Pulse banner.
    logger.debug(f"[ImageEnricher] No meta image tag found for: {url[:60]}")
    return ""
|
app/services/utils/provider_state.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app/services/utils/provider_state.py
|
| 3 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
Phase 15: Unified Redis State Architecture
|
| 5 |
+
|
| 6 |
+
What this does:
|
| 7 |
+
Saves and restores provider "state" β things like "when did we last call
|
| 8 |
+
OpenRSS?" and "how many times have we called Webz today?" β to our
|
| 9 |
+
Upstash Redis instance.
|
| 10 |
+
|
| 11 |
+
Why we need this:
|
| 12 |
+
Our backend runs on Hugging Face Spaces, which can restart at any time.
|
| 13 |
+
When a restart happens, all Python RAM is wiped. Without this utility:
|
| 14 |
+
- OpenRSS's 60-minute cooldown resets to 0, so we hammer them on
|
| 15 |
+
every restart and eventually get an IP ban.
|
| 16 |
+
- Webz's monthly budget counter resets, so we can burn our entire
|
| 17 |
+
month's calls in a single bad restart day.
|
| 18 |
+
|
| 19 |
+
With this utility:
|
| 20 |
+
- Even if the server restarts 10 times in an hour, Redis remembers
|
| 21 |
+
the exact timestamp of the last OpenRSS call and the exact number
|
| 22 |
+
of Webz calls made today. Provider quotas are now restart-proof.
|
| 23 |
+
|
| 24 |
+
How it works:
|
| 25 |
+
Two pairs of async functions:
|
| 26 |
+
1. Timestamps (for cooldown timers like OpenRSS):
|
| 27 |
+
get_provider_timestamp("openrss") β float (Unix timestamp)
|
| 28 |
+
set_provider_timestamp("openrss", time.time())
|
| 29 |
+
|
| 30 |
+
2. Counters (for daily/monthly budgets like Webz, WorldNewsAI):
|
| 31 |
+
get_provider_counter("webz", "2026-03-03") β int
|
| 32 |
+
increment_provider_counter("webz", "2026-03-03")
|
| 33 |
+
|
| 34 |
+
Redis key format:
|
| 35 |
+
Timestamps: provider:state:{provider_name}:last_fetch
|
| 36 |
+
Counters: provider:state:{provider_name}:calls:{date_key}
|
| 37 |
+
|
| 38 |
+
Mirrored directly from circuit_breaker.py's approach:
|
| 39 |
+
- Same get_upstash_cache() import
|
| 40 |
+
- Same _execute_command([...]) API
|
| 41 |
+
- Same fail-safe try/except pattern
|
| 42 |
+
|
| 43 |
+
Fail-open vs Fail-safe design:
|
| 44 |
+
- get_provider_timestamp: returns 0.0 on Redis failure
|
| 45 |
+
β Provider assumes "never fetched before" β allowed to run
|
| 46 |
+
β This is CORRECT for free providers (OpenRSS). Missing one cooldown
|
| 47 |
+
check is less dangerous than permanently blocking the provider.
|
| 48 |
+
|
| 49 |
+
- get_provider_counter: returns 999999 on Redis failure
|
| 50 |
+
β Provider assumes "budget exhausted" β safely skips the run
|
| 51 |
+
β This is CORRECT for paid providers (Webz, WorldNewsAI). We would
|
| 52 |
+
rather miss one run than accidentally overspend our API budget.
|
| 53 |
+
|
| 54 |
+
Thread safety:
|
| 55 |
+
asyncio is single-threaded. All functions below use `await`. Only one
|
| 56 |
+
coroutine runs at a time, so there are no race conditions to worry about
|
| 57 |
+
within a single Python process. No locks needed.
|
| 58 |
+
"""
|
| 59 |
+
|
| 60 |
+
import logging
|
| 61 |
+
from typing import Optional
|
| 62 |
+
|
| 63 |
+
logger = logging.getLogger(__name__)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ββ Key Builders βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 67 |
+
# Centralizing the key format here means if we ever need to change it,
|
| 68 |
+
# we change it in one place and every provider picks up the fix automatically.
|
| 69 |
+
|
| 70 |
+
def _timestamp_key(provider_name: str) -> str:
|
| 71 |
+
"""
|
| 72 |
+
Build the Redis key string for a provider's last-fetch timestamp.
|
| 73 |
+
|
| 74 |
+
Example:
|
| 75 |
+
provider_name = "openrss"
|
| 76 |
+
β "provider:state:openrss:last_fetch"
|
| 77 |
+
"""
|
| 78 |
+
return f"provider:state:{provider_name}:last_fetch"
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _counter_key(provider_name: str, date_key: str) -> str:
|
| 82 |
+
"""
|
| 83 |
+
Build the Redis key string for a provider's daily call counter.
|
| 84 |
+
|
| 85 |
+
date_key is normally a date string like "2026-03-03" so the key
|
| 86 |
+
automatically changes every day without needing a manual reset.
|
| 87 |
+
|
| 88 |
+
Example:
|
| 89 |
+
provider_name = "webz", date_key = "2026-03-03"
|
| 90 |
+
β "provider:state:webz:calls:2026-03-03"
|
| 91 |
+
"""
|
| 92 |
+
return f"provider:state:{provider_name}:calls:{date_key}"
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# ββ Timestamp Functions (for cooldown timers) βββββββββββββββββββββββββββββββββ
|
| 96 |
+
|
| 97 |
+
async def get_provider_timestamp(provider_name: str) -> float:
    """
    Read the last-fetch timestamp for a provider from Redis.

    Returns a Unix timestamp (seconds since 1970). When Redis is unreachable
    or the key has never been written, 0.0 comes back, which the caller reads
    as "never fetched before" — the provider is allowed to run immediately.

    This is the FAIL-OPEN direction, appropriate for free providers with
    cooldown timers (OpenRSS): one extra call is far less dangerous than
    permanently blocking the provider behind an infra hiccup.

    Args:
        provider_name (str): Short name like "openrss".

    Returns:
        float: Unix timestamp of the last fetch, or 0.0 if not found.
    """
    try:
        from app.services.upstash_cache import get_upstash_cache

        # GET yields a stringified float like "1740000000.123", or None
        # when the key does not exist.
        stored = await get_upstash_cache()._execute_command(
            ["GET", _timestamp_key(provider_name)]
        )
        return 0.0 if stored is None else float(stored)

    except Exception as exc:
        # Redis down / unreachable / bad payload — fail open with 0.0 so the
        # provider still runs. Safe direction for free providers.
        logger.warning(
            "[provider_state] get_provider_timestamp('%s') failed (%s) "
            "→ returning 0.0 (fail-open: provider will be allowed to run).",
            provider_name, exc
        )
        return 0.0
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
async def set_provider_timestamp(
    provider_name: str,
    timestamp: float,
    expire_seconds: int = 7200,  # Default TTL: 2 hours
) -> None:
    """
    Save a provider's last-fetch timestamp to Redis.

    Always call this BEFORE starting the actual network request, not after.
    If saved AFTER and the request crashes halfway through, the provider
    thinks "I was never blocked" and fires again on the next scheduler
    cycle — the opposite of what the cooldown is for.

    The TTL (expire_seconds) is a safety net: Redis removes the key
    automatically after 2 hours so it never lingers. 2h sits safely above
    the 60-minute OpenRSS cooldown.

    Args:
        provider_name (str): Short name like "openrss".
        timestamp (float): Unix timestamp (use time.time()).
        expire_seconds (int): Key TTL in seconds. Default: 7200 (2h).
    """
    try:
        from app.services.upstash_cache import get_upstash_cache
        cache = get_upstash_cache()

        key = _timestamp_key(provider_name)
        # "SET key value EX seconds" — value and TTL in one round trip.
        # FIX: stringify expire_seconds like every other numeric command arg
        # in this module (cf. the EXPIRE/INCRBY calls in
        # increment_provider_counter) — the Upstash REST command array
        # expects string tokens.
        await cache._execute_command(
            ["SET", key, str(timestamp), "EX", str(expire_seconds)]
        )

        logger.debug(
            "[provider_state] Saved last_fetch timestamp for '%s' to Redis (TTL=%ds).",
            provider_name, expire_seconds
        )

    except Exception as e:
        # Recoverable: cooldown falls back to RAM-based tracking this run.
        logger.warning(
            "[provider_state] set_provider_timestamp('%s') failed (%s) "
            "→ cooldown state will not survive a server restart for this run.",
            provider_name, e
        )
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# ββ Counter Functions (for daily/monthly API budgets) βββββββββββββββββββββββββ
|
| 189 |
+
|
| 190 |
+
async def get_provider_counter(provider_name: str, date_key: str) -> int:
    """
    Read a provider's call counter for a specific date from Redis.

    When Redis is unavailable, 999999 comes back — the FAIL-SAFE direction:
    the provider assumes its budget is exhausted and skips the run. One
    missed run costs nothing; an overspent Webz/WorldNewsAI budget could
    cost real money.

    Args:
        provider_name (str): Short name like "webz" or "worldnewsai".
        date_key (str): UTC date string like "2026-03-03". Keying on the
            date makes the counter reset automatically each morning —
            yesterday's key simply expires.

    Returns:
        int: Number of API calls made today, or 999999 if Redis is down.
    """
    try:
        from app.services.upstash_cache import get_upstash_cache

        stored = await get_upstash_cache()._execute_command(
            ["GET", _counter_key(provider_name, date_key)]
        )
        # A missing key means no calls have been recorded for this date.
        return 0 if stored is None else int(stored)

    except Exception as exc:
        # Fail SAFE: pretend the budget is gone so paid providers never
        # overspend because of a Redis outage.
        logger.warning(
            "[provider_state] get_provider_counter('%s', '%s') failed (%s) "
            "→ returning 999999 (fail-safe: provider will be skipped this run).",
            provider_name, date_key, exc
        )
        return 999999
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
async def increment_provider_counter(
    provider_name: str,
    date_key: str,
    amount: int = 1,
    expire_seconds: int = 86400,  # Default TTL: 24 hours (one full day)
) -> None:
    """
    Increment a provider's daily call counter in Redis by `amount`.

    Relies on Redis INCRBY, an atomic increment — safe under concurrency,
    though our single-process asyncio setup makes that mostly good practice
    rather than a hard requirement.

    After every increment the TTL is refreshed with EXPIRE, so even a key
    created yesterday gets a fresh 24-hour lifetime from the moment it is
    touched.

    Args:
        provider_name (str): Short name like "webz" or "worldnewsai".
        date_key (str): UTC date string like "2026-03-03".
        amount (int): How much to add. Default: 1.
        expire_seconds (int): Key TTL. Default: 86400s (24 hours).
    """
    try:
        from app.services.upstash_cache import get_upstash_cache

        redis_cache = get_upstash_cache()
        counter_key = _counter_key(provider_name, date_key)

        # INCRBY creates the key at 0 if absent, then adds `amount` atomically.
        await redis_cache._execute_command(["INCRBY", counter_key, str(amount)])
        # Reset the countdown so the key cannot expire mid-day.
        await redis_cache._execute_command(["EXPIRE", counter_key, str(expire_seconds)])

        logger.debug(
            "[provider_state] Incremented call counter for '%s' on '%s' by %d.",
            provider_name, date_key, amount
        )

    except Exception as exc:
        # Redis write failed — the in-memory request_count tracking still
        # applies, so log and keep going.
        logger.warning(
            "[provider_state] increment_provider_counter('%s', '%s') failed (%s) "
            "→ this call will not be counted in Redis. In-memory limit still applies.",
            provider_name, date_key, exc
        )
|
app/utils/data_validation.py
CHANGED
|
@@ -268,158 +268,272 @@ def calculate_quality_score(article: Dict) -> int:
|
|
| 268 |
return min(max(score, 0), 100)
|
| 269 |
|
| 270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
def is_relevant_to_category(article: Union[Dict, 'Article'], category: str) -> bool:
|
| 272 |
"""
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
"""
|
| 281 |
-
#
|
| 282 |
if hasattr(article, 'model_dump'):
|
| 283 |
article_dict = article.model_dump()
|
| 284 |
elif hasattr(article, 'dict'):
|
| 285 |
article_dict = article.dict()
|
| 286 |
else:
|
| 287 |
article_dict = article
|
| 288 |
-
|
| 289 |
-
#
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
'ai', 'artificial intelligence', 'machine learning', 'deep learning',
|
| 295 |
-
'neural network', 'gpt', 'llm', 'chatgpt', 'generative ai',
|
| 296 |
-
'computer vision', 'nlp', 'natural language', 'transformer'
|
| 297 |
-
],
|
| 298 |
-
'data-security': [
|
| 299 |
-
'security', 'cybersecurity', 'data breach', 'hacking', 'vulnerability',
|
| 300 |
-
'encryption', 'malware', 'ransomware', 'firewall', 'threat'
|
| 301 |
-
],
|
| 302 |
-
'data-governance': [
|
| 303 |
-
'governance', 'compliance', 'regulation', 'audit', 'policy',
|
| 304 |
-
'data quality', 'metadata', 'lineage', 'stewardship'
|
| 305 |
-
],
|
| 306 |
-
'data-privacy': [
|
| 307 |
-
'privacy', 'gdpr', 'ccpa', 'consent', 'personal data',
|
| 308 |
-
'pii', 'anonymization', 'data protection', 'privacy law'
|
| 309 |
-
],
|
| 310 |
-
'data-engineering': [
|
| 311 |
-
'data engineering', 'pipeline', 'etl', 'big data', 'spark',
|
| 312 |
-
'hadoop', 'kafka', 'airflow', 'data warehouse', 'snowflake'
|
| 313 |
-
],
|
| 314 |
-
'data-management': [
|
| 315 |
-
'data management', 'master data', 'mdm', 'data catalog',
|
| 316 |
-
'data quality', 'data lineage', 'data stewardship',
|
| 317 |
-
'data governance', 'data integration', 'reference data'
|
| 318 |
-
],
|
| 319 |
-
'business-intelligence': [
|
| 320 |
-
'business intelligence', 'bi', 'analytics', 'dashboard',
|
| 321 |
-
'tableau', 'power bi', 'looker', 'reporting', 'kpi'
|
| 322 |
-
],
|
| 323 |
-
'business-analytics': [
|
| 324 |
-
'analytics', 'analysis', 'insights', 'metrics', 'data-driven',
|
| 325 |
-
'business analytics', 'predictive', 'forecasting'
|
| 326 |
-
],
|
| 327 |
-
'customer-data-platform': [
|
| 328 |
-
'cdp', 'customer data', 'customer platform', 'crm',
|
| 329 |
-
'customer experience', 'personalization', 'segmentation'
|
| 330 |
-
],
|
| 331 |
-
'data-centers': [
|
| 332 |
-
'data center', 'data centre', 'datacenter', 'server', 'infrastructure',
|
| 333 |
-
'colocation', 'edge computing', 'hyperscale'
|
| 334 |
-
],
|
| 335 |
-
'cloud-computing': [
|
| 336 |
-
'cloud', 'aws', 'azure', 'google cloud', 'gcp', 'salesforce',
|
| 337 |
-
'alibaba cloud', 'tencent cloud', 'huawei cloud', 'cloudflare',
|
| 338 |
-
'saas', 'paas', 'iaas', 'serverless', 'kubernetes'
|
| 339 |
-
],
|
| 340 |
-
# ββ Cloud sub-categories (each maps to a specific provider) ββββββββββ
|
| 341 |
-
'cloud-aws': [
|
| 342 |
-
'aws', 'amazon web services', 's3', 'ec2', 'lambda',
|
| 343 |
-
'cloudfront', 'sagemaker', 'dynamodb', 'amazon'
|
| 344 |
-
],
|
| 345 |
-
'cloud-azure': [
|
| 346 |
-
'azure', 'microsoft azure', 'azure devops', 'azure ml',
|
| 347 |
-
'azure openai', 'microsoft cloud'
|
| 348 |
-
],
|
| 349 |
-
'cloud-gcp': [
|
| 350 |
-
'gcp', 'google cloud', 'bigquery', 'vertex ai',
|
| 351 |
-
'cloud run', 'dataflow', 'google cloud platform'
|
| 352 |
-
],
|
| 353 |
-
'cloud-oracle': [
|
| 354 |
-
'oracle cloud', 'oci', 'oracle database', 'oracle fusion',
|
| 355 |
-
'oracle cloud infrastructure'
|
| 356 |
-
],
|
| 357 |
-
'cloud-ibm': [
|
| 358 |
-
'ibm cloud', 'ibm watson', 'red hat', 'openshift', 'ibm z'
|
| 359 |
-
],
|
| 360 |
-
'cloud-alibaba': [
|
| 361 |
-
'alibaba cloud', 'aliyun', 'alicloud'
|
| 362 |
-
],
|
| 363 |
-
'cloud-digitalocean': [
|
| 364 |
-
'digitalocean', 'droplet', 'app platform'
|
| 365 |
-
],
|
| 366 |
-
'cloud-huawei': [
|
| 367 |
-
'huawei cloud', 'huaweicloud'
|
| 368 |
-
],
|
| 369 |
-
'cloud-cloudflare': [
|
| 370 |
-
'cloudflare', 'cloudflare workers', 'cloudflare r2',
|
| 371 |
-
'cloudflare pages', 'zero trust'
|
| 372 |
-
],
|
| 373 |
-
# ββ Content / publishing categories βββββββββββββββββββββββββββββββββββ
|
| 374 |
-
'medium-article': [
|
| 375 |
-
'medium', 'article', 'blog', 'writing', 'publishing',
|
| 376 |
-
'content', 'story', 'author', 'blogging'
|
| 377 |
-
],
|
| 378 |
-
'magazines': [
|
| 379 |
-
'technology', 'tech', 'innovation', 'digital', 'startup',
|
| 380 |
-
'software', 'hardware', 'gadget'
|
| 381 |
-
]
|
| 382 |
-
}
|
| 383 |
-
|
| 384 |
-
# Get keywords for this category
|
| 385 |
-
keywords = CATEGORY_KEYWORDS.get(category, [])
|
| 386 |
-
|
| 387 |
-
if not keywords:
|
| 388 |
-
# Unknown category - allow (don't reject)
|
| 389 |
return True
|
| 390 |
-
|
| 391 |
-
#
|
| 392 |
-
# We
|
| 393 |
-
#
|
| 394 |
-
#
|
| 395 |
-
#
|
| 396 |
-
#
|
| 397 |
-
|
|
|
|
| 398 |
description = (article_dict.get('description') or '').lower()
|
| 399 |
|
| 400 |
-
# Extract the URL path safely.
|
| 401 |
raw_url = article_dict.get('url') or ''
|
| 402 |
url_str = str(raw_url).lower()
|
| 403 |
try:
|
| 404 |
parsed_url = urlparse(url_str)
|
| 405 |
-
# Replace hyphens and slashes with spaces so
|
| 406 |
-
#
|
| 407 |
url_words = parsed_url.path.replace('-', ' ').replace('/', ' ')
|
| 408 |
except Exception:
|
| 409 |
url_words = ''
|
| 410 |
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
#
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
# Can increase to 2+ for stricter filtering
|
| 418 |
-
if matches >= 1:
|
| 419 |
return True
|
| 420 |
-
|
| 421 |
-
#
|
| 422 |
-
print(
|
|
|
|
|
|
|
|
|
|
| 423 |
return False
|
| 424 |
|
| 425 |
|
|
|
|
| 268 |
return min(max(score, 0), 100)
|
| 269 |
|
| 270 |
|
| 271 |
+
# ==============================================================================
# MASTER CATEGORY TAXONOMY (Phase 19 - Expanded Entity-Based Keywords)
# ==============================================================================
# Single source of truth for category routing. Each category lists:
#   - the topic itself           (e.g. "machine learning")
#   - major companies            (e.g. "openai", "anthropic")
#   - flagship products          (e.g. "chatgpt", "sagemaker")
#   - common industry acronyms   (e.g. "llm", "etl", "gcp")
#
# Word-boundary safety: short acronyms ("ai", "bi", "aws") are safe here
# because COMPILED_CATEGORY_REGEX wraps every keyword in \b boundaries.
# Never add single-letter keywords - no boundary can make those safe.
#
# 'cloud-computing' must stay: it is an active category referenced by
# config.py, news_aggregator.py and several providers; dropping it would
# break routing for generic cloud news. (Phase 19)
# ==============================================================================
CATEGORY_KEYWORDS = {

    # Artificial Intelligence -------------------------------------------------
    # NOTE(review): the bare acronym 'ai' is not in this list, so a headline
    # containing only "AI" will not match this category - confirm intended.
    'ai': [
        'artificial intelligence', 'machine learning', 'deep learning',
        'neural network', 'gpt', 'llm', 'chatgpt', 'generative ai',
        'computer vision', 'nlp', 'natural language processing', 'transformer',
        'openai', 'anthropic', 'sam altman', 'claude', 'gemini', 'mistral',
        'llama', 'copilot', 'midjourney', 'stable diffusion', 'hugging face',
        'rag', 'vector database', 'prompt engineering', 'agi', 'agentic ai',
    ],

    # Cloud - generic umbrella category (must stay: used in config.py) --------
    'cloud-computing': [
        'cloud computing', 'cloud services', 'aws', 'azure', 'google cloud',
        'gcp', 'salesforce', 'alibaba cloud', 'tencent cloud', 'huawei cloud',
        'cloudflare', 'saas', 'paas', 'iaas', 'serverless', 'kubernetes',
        'multi-cloud', 'hybrid cloud',
    ],

    # Cloud sub-categories (one per provider) ---------------------------------
    'cloud-aws': [
        'aws', 'amazon web services', 's3', 'ec2', 'lambda', 'cloudfront',
        'sagemaker', 'dynamodb', 'amazon bedrock', 'aws reinvent',
        'fargate', 'aws graviton', 'elastic beanstalk',
    ],
    'cloud-azure': [
        'azure', 'microsoft azure', 'azure devops', 'azure ml',
        'azure openai', 'microsoft cloud', 'azure synapse', 'cosmos db',
        'azure arc', 'microsoft entra',
    ],
    'cloud-gcp': [
        'gcp', 'google cloud', 'bigquery', 'vertex ai', 'cloud run',
        'dataflow', 'google kubernetes engine', 'gke', 'google spanner',
        'anthos', 'cloud sql', 'gemini for google cloud',
    ],
    'cloud-alibaba': [
        'alibaba cloud', 'aliyun', 'alicloud', 'polar db', 'maxcompute',
        'elastic compute service', 'tongyi qianwen', 'qwen',
    ],
    'cloud-huawei': [
        'huawei cloud', 'huaweicloud', 'pangu model',
        'harmonyos', 'kunpeng', 'ascend ai',
    ],
    'cloud-digitalocean': [
        'digitalocean', 'digital ocean', 'do droplet', 'digitalocean spaces',
        'digitalocean app platform', 'managed kubernetes', 'cloudways',
    ],
    'cloud-oracle': [
        'oracle cloud', 'oci', 'oracle database', 'oracle fusion',
        'oracle cloud infrastructure', 'mysql heatwave', 'oracle apex',
    ],
    'cloud-ibm': [
        'ibm cloud', 'ibm watson', 'red hat', 'openshift',
        'ibm z', 'watsonx', 'ibm mainframe',
    ],
    'cloud-cloudflare': [
        'cloudflare', 'cloudflare workers', 'cloudflare r2',
        'cloudflare pages', 'zero trust',
    ],

    # Data Engineering --------------------------------------------------------
    'data-engineering': [
        'data engineering', 'data pipeline', 'etl', 'elt', 'big data',
        'apache spark', 'hadoop', 'kafka', 'airflow', 'data warehouse',
        'snowflake', 'databricks', 'dbt', 'fivetran', 'apache iceberg',
        'delta lake', 'data lakehouse',
    ],

    # Data Security -----------------------------------------------------------
    'data-security': [
        'security', 'cybersecurity', 'data breach', 'hacking', 'vulnerability',
        'encryption', 'malware', 'ransomware', 'firewall', 'zero trust',
        'phishing', 'soc2', 'infosec', 'penetration testing',
    ],

    # Data Governance ---------------------------------------------------------
    'data-governance': [
        'data governance', 'compliance', 'regulation', 'audit', 'data policy',
        'metadata management', 'data lineage', 'data stewardship',
        'regulatory compliance',
    ],

    # Data Privacy ------------------------------------------------------------
    'data-privacy': [
        'data privacy', 'gdpr', 'ccpa', 'user consent', 'personal data',
        'pii', 'anonymization', 'data protection', 'privacy law',
        'hipaa', 'cookie tracking',
    ],

    # Data Management ---------------------------------------------------------
    'data-management': [
        'data management', 'master data', 'mdm', 'data catalog',
        'data quality', 'reference data', 'data lifecycle', 'data architecture',
    ],

    # Business Intelligence ---------------------------------------------------
    'business-intelligence': [
        'business intelligence', 'bi', 'analytics dashboard', 'tableau',
        'power bi', 'looker', 'data reporting', 'kpi', 'quicksight', 'qlik',
    ],

    # Business Analytics ------------------------------------------------------
    'business-analytics': [
        'data analytics', 'data analysis', 'business insights', 'business metrics',
        'data-driven', 'business analytics', 'predictive analytics', 'forecasting',
    ],

    # Customer Data Platform --------------------------------------------------
    'customer-data-platform': [
        'cdp', 'customer data platform', 'crm', 'customer experience',
        'personalization engine', 'audience segmentation',
        'segment.com', 'salesforce data cloud',
    ],

    # Data Centers ------------------------------------------------------------
    'data-centers': [
        'data center', 'data centre', 'datacenter', 'server rack', 'colocation',
        'edge computing', 'hyperscale', 'hpc', 'liquid cooling',
        'data center cooling',
    ],

    # Publishing categories ---------------------------------------------------
    'medium-article': [
        'medium', 'article', 'blog', 'writing', 'publishing',
        'content', 'story', 'author', 'blogging',
    ],
    'magazines': [
        'technology', 'tech', 'innovation', 'digital', 'startup',
        'software', 'hardware', 'gadget',
    ],
}
| 424 |
+
|
| 425 |
+
# ==============================================================================
|
| 426 |
+
# PRE-COMPILED REGEX ENGINE (Phase 19 β Word-Boundary Patterns)
|
| 427 |
+
# ==============================================================================
|
| 428 |
+
#
|
| 429 |
+
# Problem this solves:
|
| 430 |
+
# Old code: "ai" in text β matches "tr[ai]n", "ava[i]lable" β garbage hits.
|
| 431 |
+
# New code: \bai\b in text β only "AI" as a standalone word β clean hits.
|
| 432 |
+
#
|
| 433 |
+
# Why pre-compile?
|
| 434 |
+
# Building a regex from scratch takes CPU time. If we do it inside the
|
| 435 |
+
# validation function, it runs once per article Γ 22 categories = thousands of
|
| 436 |
+
# compilations per scheduler cycle. By compiling ONCE at import time and
|
| 437 |
+
# storing the result, all subsequent lookups are instant memory reads.
|
| 438 |
+
#
|
| 439 |
+
# How each pattern is built:
|
| 440 |
+
# For every keyword in a category we do:
|
| 441 |
+
# re.escape(keyword) β safely escapes dots, plus signs, brackets etc.
|
| 442 |
+
# \b ... \b β word boundaries so "aws" won't match "kawasaki"
|
| 443 |
+
# All keywords in one category are joined with | (OR), so a single
|
| 444 |
+
# re.search() call checks every keyword at once β maximum speed.
|
| 445 |
+
#
|
| 446 |
+
# Example β 'ai' category compiles to:
|
| 447 |
+
# \bartificial intelligence\b|\bmachine learning\b|\bgpt\b|\bllm\b|...
|
| 448 |
+
# ==============================================================================
|
| 449 |
+
def _build_category_regex(keywords: list) -> 're.Pattern':
|
| 450 |
+
"""
|
| 451 |
+
Turn a list of keywords into one pre-compiled word-boundary OR pattern.
|
| 452 |
+
|
| 453 |
+
Example:
|
| 454 |
+
['gpt', 'llm', 'openai']
|
| 455 |
+
β re.compile(r'\\bgpt\\b|\\bllm\\b|\\bopenai\\b', re.IGNORECASE)
|
| 456 |
+
"""
|
| 457 |
+
parts = [r'\b' + re.escape(kw) + r'\b' for kw in keywords]
|
| 458 |
+
return re.compile('|'.join(parts), re.IGNORECASE)
|
| 459 |
+
|
| 460 |
+
|
| 461 |
+
# Built ONCE at server start:
#   key   -> category slug (e.g. 'ai', 'cloud-aws')
#   value -> compiled word-boundary regex covering every keyword of that slug
COMPILED_CATEGORY_REGEX: dict = {
    slug: _build_category_regex(words)
    for slug, words in CATEGORY_KEYWORDS.items()
}
| 468 |
+
|
| 469 |
+
|
| 470 |
def is_relevant_to_category(article: Union[Dict, 'Article'], category: str) -> bool:
    """
    Check whether an article belongs to the given category.

    Uses pre-compiled word-boundary regex patterns (built once at server start)
    so that:
      - Short acronyms like "ai", "bi", "aws" only match as full words:
        "trail" does NOT match 'ai'; "kawasaki" does NOT match 'aws'.
      - Multi-word phrases like "openai" or "sagemaker" are matched exactly.
      - Unknown categories automatically pass (return True) so we don't
        accidentally drop articles routed to categories we haven't mapped yet.

    Scans: article title + description + URL path (all lowercased).

    Args:
        article: A plain dict, or a model exposing model_dump() (Pydantic v2)
            or dict() (Pydantic v1).
        category: Category slug, e.g. 'ai' or 'cloud-aws'.

    Returns:
        True  - article is relevant (at least 1 keyword matches, or the
                category is unknown).
        False - no keyword matched; article is rejected for this category.
    """
    # -- Step 1: Convert to dict safely (Pydantic v2, v1, or plain dict) ------
    if hasattr(article, 'model_dump'):
        article_dict = article.model_dump()
    elif hasattr(article, 'dict'):
        article_dict = article.dict()
    else:
        article_dict = article

    # -- Step 2: Look up the pre-compiled pattern for this category -----------
    pattern = COMPILED_CATEGORY_REGEX.get(category)

    if pattern is None:
        # Category not in our taxonomy - let it pass rather than silently drop.
        return True

    # -- Step 3: Build the search text ----------------------------------------
    # Three sources:
    #   - title       : the headline, most reliable signal
    #   - description : body summary, adds context
    #   - url_words   : URL path with hyphens/slashes turned into spaces,
    #     which catches articles with empty descriptions (e.g. Google RSS):
    #     "/aws-launches-sagemaker-feature" -> "aws launches sagemaker feature"
    title = (article_dict.get('title') or '').lower()
    description = (article_dict.get('description') or '').lower()

    raw_url = article_dict.get('url') or ''
    url_str = str(raw_url).lower()
    try:
        parsed_url = urlparse(url_str)
        # Hyphens and slashes become spaces so URL path words are treated
        # as individual tokens by the word-boundary regex.
        url_words = parsed_url.path.replace('-', ' ').replace('/', ' ')
    except Exception:
        url_words = ''

    search_text = f"{title} {description} {url_words}"

    # -- Step 4: Run the compiled regex ---------------------------------------
    # re.search() returns a Match object on the FIRST hit, or None.
    # re.IGNORECASE is compiled into the pattern - no need to lower() again.
    if pattern.search(search_text):
        return True

    # No match - log the rejection for monitoring.
    # BUGFIX: dict.get('title', 'Unknown') returns None (not 'Unknown') when
    # the key exists with a None value, and None[:50] raises TypeError.
    # Feed items frequently carry title=None, so coalesce with `or` instead.
    print(
        f"🚫 Rejected '{(article_dict.get('title') or 'Unknown')[:50]}' "
        f"from {category} (0 keyword matches)"
    )
    return False
| 538 |
|
| 539 |
|