Spaces:

WORKWITHSHAFISK
/

segmentopulse-backend

Paused

App Files Files Community

SHAFI commited on 23 days ago

Commit

82bd507

1 Parent(s): 1c599fe

the full ingestion pipline of the pulse is upgraded, worked on flaws and rectified them, solved over 10 major flaws

Browse files

Files changed (16) hide show

app/routes/monitoring.py +2 -2
app/routes/news.py +4 -4
app/services/adaptive_scheduler.py +114 -22
app/services/api_quota.py +96 -1
app/services/news_aggregator.py +177 -35
app/services/news_providers.py +40 -0
app/services/scheduler.py +270 -65
app/services/upstash_cache.py +36 -24
app/utils/data_validation.py +60 -3
app/utils/redis_dedup.py +102 -0
app/verify_manual_fetch.py +0 -87
app/verify_simple.py +0 -30
data/velocity_tracking.json +92 -92
docs/appwrite_schema.md +0 -77
docs/phase2_implementation_guide.md +0 -269
requirements.txt +0 -0

app/routes/monitoring.py CHANGED Viewed

@@ -137,8 +137,8 @@ async def cache_health_check():
         # Test connectivity with a simple PING
         test_key = "_health_check_test"
-        cache.set(test_key, "ok", ttl=10)
-        result = cache.get(test_key)
         if result == "ok":
             return {

         # Test connectivity with a simple PING
         test_key = "_health_check_test"
+        await cache.set(test_key, "ok", ttl=10)
+        result = await cache.get(test_key)
         if result == "ok":
             return {

app/routes/news.py CHANGED Viewed

@@ -59,7 +59,7 @@ async def get_news_by_category(
         # Try Upstash cache first (5 min TTL)
         if upstash_cache.enabled:
-            cached_data = upstash_cache.get(cache_key)
             if cached_data:
                 return NewsResponse(
                     success=True,
@@ -134,7 +134,7 @@ async def get_news_by_category(
         # Cache the result (5 min TTL)
         if upstash_cache.enabled:
-            upstash_cache.set(
                 cache_key,
                 {"articles": articles, "has_more": has_more, "next_cursor": next_cursor},
                 ttl=300  # 5 minutes
@@ -160,7 +160,7 @@ async def get_rss_feed(provider: str):
         # Check Upstash cache
         cache_key = f"rss:{provider}"
         if upstash_cache.enabled:
-            cached_data = upstash_cache.get(cache_key)
             if cached_data:
                 return NewsResponse(
                     success=True,
@@ -176,7 +176,7 @@ async def get_rss_feed(provider: str):
         # Cache in Upstash (10 min TTL for RSS feeds)
         if upstash_cache.enabled:
-            upstash_cache.set(cache_key, articles, ttl=600)
         return NewsResponse(
             success=True,

         # Try Upstash cache first (5 min TTL)
         if upstash_cache.enabled:
+            cached_data = await upstash_cache.get(cache_key)
             if cached_data:
                 return NewsResponse(
                     success=True,
         # Cache the result (5 min TTL)
         if upstash_cache.enabled:
+            await upstash_cache.set(
                 cache_key,
                 {"articles": articles, "has_more": has_more, "next_cursor": next_cursor},
                 ttl=300  # 5 minutes
         # Check Upstash cache
         cache_key = f"rss:{provider}"
         if upstash_cache.enabled:
+            cached_data = await upstash_cache.get(cache_key)
             if cached_data:
                 return NewsResponse(
                     success=True,
         # Cache in Upstash (10 min TTL for RSS feeds)
         if upstash_cache.enabled:
+            await upstash_cache.set(cache_key, articles, ttl=600)
         return NewsResponse(
             success=True,

app/services/adaptive_scheduler.py CHANGED Viewed

@@ -18,6 +18,7 @@ from datetime import datetime
 from typing import Dict, List
 import json
 import os
 class AdaptiveScheduler:
@@ -48,29 +49,76 @@ class AdaptiveScheduler:
                     'total_articles': 0
                 }
     def _load_velocity_data(self) -> Dict:
-        """Load velocity data from disk (persists across restarts)"""
-        data_file = 'data/velocity_tracking.json'
-        if os.path.exists(data_file):
-            try:
-                with open(data_file, 'r') as f:
-                    return json.load(f)
-            except Exception as e:
-                print(f"Warning: Failed to load velocity data: {e}")
         return {}
     def _save_velocity_data(self):
-        """Save velocity data to disk"""
-        data_file = 'data/velocity_tracking.json'
-        os.makedirs('data', exist_ok=True)
         try:
-            with open(data_file, 'w') as f:
-                json.dump(self.velocity_data, f, indent=2)
         except Exception as e:
-            print(f"Warning: Failed to save velocity data: {e}")
     def update_category_velocity(self, category: str, article_count: int):
         """
@@ -115,11 +163,55 @@ class AdaptiveScheduler:
             print(f"📊 {category.upper()}: Moderate velocity ({avg_count:.1f} avg) → 15min interval")
         data['interval'] = new_interval
-        # Persist to disk
-        self._save_velocity_data()
         return new_interval
     def get_interval(self, category: str) -> int:
         """Get current interval for a category"""

 from typing import Dict, List
 import json
 import os
+import httpx
 class AdaptiveScheduler:
                     'total_articles': 0
                 }
+    def _redis_key(self) -> str:
+        """Redis key where velocity data is stored permanently."""
+        return "segmento:adaptive_velocity_state"
+    def _redis_headers(self):
+        """Auth headers for the Upstash Redis REST API."""
+        return {"Authorization": f"Bearer {os.getenv('UPSTASH_REDIS_REST_TOKEN', '')}"}
+    def _redis_url(self) -> str:
+        """Base URL for the Upstash Redis REST API."""
+        return os.getenv("UPSTASH_REDIS_REST_URL", "")
     def _load_velocity_data(self) -> Dict:
+        """
+        Load velocity tracking data from Redis.
+        Fix #4 (Phase 7): The old version wrote to a local JSON file
+        (data/velocity_tracking.json). On cloud platforms (Render, Railway,
+        Heroku), local disks are wiped on every deploy, so the system kept
+        forgetting its trained intervals after restarts.
+        Redis is permanent — the key lives forever (no TTL) and the adaptive
+        scheduler's memory now survives deploys and server restarts.
+        """
+        redis_url = self._redis_url()
+        if not redis_url:
+            # Redis not configured — start with empty data (same as before).
+            return {}
+        try:
+            url = f"{redis_url}/get/{self._redis_key()}"
+            with httpx.Client(timeout=5.0) as client:
+                response = client.get(url, headers=self._redis_headers())
+                data = response.json()
+                # Upstash returns {"result": "<json string>"} or {"result": null}
+                raw = data.get("result")
+                if raw:
+                    return json.loads(raw)
+        except Exception as e:
+            print(f"[ADAPTIVE] Could not load velocity data from Redis ({e}) — starting fresh.")
         return {}
     def _save_velocity_data(self):
+        """
+        Save velocity tracking data to Redis (no expiry — keep forever).
+        Uses the Upstash REST API's SET command. No TTL is set so the data
+        persists indefinitely and we never lose our trained intervals.
+        """
+        redis_url = self._redis_url()
+        if not redis_url:
+            # Redis not configured — silently skip, same as before.
+            return
         try:
+            # Serialize the velocity dict to a JSON string.
+            payload = json.dumps(self.velocity_data)
+            # Upstash REST: POST /set/<key>  with body = value
+            # No EX or PX param = key never expires.
+            url = f"{redis_url}/set/{self._redis_key()}"
+            with httpx.Client(timeout=5.0) as client:
+                client.post(
+                    url,
+                    headers=self._redis_headers(),
+                    content=payload.encode("utf-8")
+                )
         except Exception as e:
+            print(f"[ADAPTIVE] Could not save velocity data to Redis ({e}) — data may be lost on restart.")
     def update_category_velocity(self, category: str, article_count: int):
         """
             print(f"📊 {category.upper()}: Moderate velocity ({avg_count:.1f} avg) → 15min interval")
         data['interval'] = new_interval
+        # NOTE: We no longer call _save_velocity_data() here.
+        # Reason: this method is sync, but it is called from an async job.
+        # Calling a blocking httpx.Client inside an async function freezes the
+        # entire event loop for up to 5 seconds on every category run.
+        # The caller (fetch_single_category_job) is responsible for awaiting
+        # async_persist() AFTER this method returns. That way the save
+        # happens asynchronously without blocking anything.
         return new_interval
+    async def async_persist(self):
+        """
+        Save velocity data to Redis using a non-blocking async HTTP call.
+        Why a separate method?
+        -----------------------
+        update_category_velocity() is a regular (sync) function because it is
+        called from many places, including some that are not async.
+        Putting an async HTTP call directly inside a sync function would block
+        the entire event loop — freezing FastAPI's ability to serve user
+        requests for up to 5 seconds.
+        The fix:
+          update_category_velocity() updates memory only (instant, no I/O).
+          async_persist() does the actual Redis write asynchronously.
+          The caller (fetch_single_category_job) awaits this after the update.
+        """
+        redis_url = self._redis_url()
+        if not redis_url:
+            return
+        try:
+            payload = json.dumps(self.velocity_data)
+            url = f"{redis_url}/set/{self._redis_key()}"
+            # httpx.AsyncClient never blocks the event loop.
+            # Even if the Upstash call takes 200ms, FastAPI keeps serving users.
+            async with httpx.AsyncClient(timeout=5.0) as client:
+                await client.post(
+                    url,
+                    headers=self._redis_headers(),
+                    content=payload.encode("utf-8")
+                )
+        except Exception as e:
+            print(
+                f"[ADAPTIVE] Could not persist velocity data to Redis ({e}) "
+                "\u2014 data is safe in memory for this session."
+            )
     def get_interval(self, category: str) -> int:
         """Get current interval for a category"""

app/services/api_quota.py CHANGED Viewed

@@ -4,7 +4,7 @@ Monitors API usage and prevents hitting rate limits
 """
 from typing import Dict, Optional
-from datetime import datetime, timedelta
 import logging
 logger = logging.getLogger(__name__)
@@ -140,6 +140,101 @@ class APIQuotaTracker:
         return stats
 # Global singleton
 _quota_tracker: Optional[APIQuotaTracker] = None

 """
 from typing import Dict, Optional
+from datetime import datetime, timedelta, date
 import logging
 logger = logging.getLogger(__name__)
         return stats
+    # --------------------------------------------------------------------------
+    # REDIS-BACKED ASYNC METHODS  (Phase 3 additions)
+    # --------------------------------------------------------------------------
+    # These two methods do the same job as can_make_call() and record_call(),
+    # but they also read and write from Upstash Redis.
+    #
+    # Why two sets of methods?  Because the old sync methods are called from
+    # places we do not want to change right now.  The new async ones are called
+    # only from news_aggregator.py, which is already async.
+    #
+    # Redis key format: quota:{provider}:{YYYY-MM-DD}
+    #   e.g.  quota:gnews:2026-02-26
+    # TTL: 86400 seconds (24 hours) — the key naturally disappears at the end
+    # of the day, which is the same as resetting the counter to zero at midnight.
+    # --------------------------------------------------------------------------
+    async def async_can_make_call(self, provider: str, calls: int = 1) -> bool:
+        """
+        Check if we can still call this paid provider today.
+        Reads the current call count from Redis first (so the answer survives
+        server restarts). Falls back to the in-memory count if Redis is down.
+        """
+        if provider not in self.quotas or "calls_per_day" not in self.quotas[provider]:
+            # Unknown or non-daily provider — allow the call.
+            return True
+        limit = self.quotas[provider]["calls_per_day"]
+        try:
+            from app.services.upstash_cache import get_upstash_cache
+            cache = get_upstash_cache()
+            redis_key = f"quota:{provider}:{date.today().isoformat()}"
+            # Ask Redis: how many calls have been made today so far?
+            raw = await cache._execute_command(["GET", redis_key])
+            used_today = int(raw) if raw is not None else 0
+            # Also sync in-memory so the sync path stays accurate.
+            self.quotas[provider]["calls_made"] = used_today
+            can_call = (used_today + calls) <= limit
+            if not can_call:
+                logger.warning(
+                    "[QUOTA] %s daily limit reached: %d/%d (Redis source)",
+                    provider.upper(), used_today, limit
+                )
+            return can_call
+        except Exception as e:
+            # Redis unavailable — fall back to the in-memory counter.
+            logger.debug("[QUOTA] Redis unavailable (%s) — using in-memory fallback.", e)
+            return self.can_make_call(provider, calls)
+    async def async_record_call(self, provider: str, calls: int = 1):
+        """
+        Record that we just used one API credit for this provider.
+        Writes to BOTH in-memory AND Redis so the count is correct
+        whether the server restarts or not.
+        """
+        if provider not in self.quotas or "calls_per_day" not in self.quotas[provider]:
+            return
+        # Always update in-memory immediately (zero latency fast path).
+        self.record_call(provider, calls)
+        # Then persist to Redis in the background so a restart does not lose the count.
+        try:
+            from app.services.upstash_cache import get_upstash_cache
+            cache = get_upstash_cache()
+            redis_key = f"quota:{provider}:{date.today().isoformat()}"
+            # INCR atomically adds 1 to the counter.
+            # If the key does not exist yet, Redis creates it and starts at 0.
+            await cache._execute_command(["INCR", redis_key])
+            # Make sure the key expires at the end of today (24-hour TTL).
+            # EXPIRE only sets it if not already set, so we do not keep
+            # resetting the TTL on every call.
+            await cache._execute_command(["EXPIRE", redis_key, 86400])
+            logger.debug(
+                "[QUOTA] Recorded call for %s in Redis (key: %s).",
+                provider.upper(), redis_key
+            )
+        except Exception as e:
+            # Redis write failed — in-memory was already updated, so we are still
+            # protected within this session. Log and move on.
+            logger.debug(
+                "[QUOTA] Redis write failed for %s (%s) — in-memory count still correct.",
+                provider.upper(), e
+            )
 # Global singleton
 _quota_tracker: Optional[APIQuotaTracker] = None

app/services/news_aggregator.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import httpx
 from typing import List, Dict, Optional
 from datetime import datetime
@@ -14,6 +15,8 @@ from app.services.news_providers import (
     OfficialCloudProvider
 )
 from app.config import settings
 class NewsAggregator:
     """Service for aggregating news from multiple sources with automatic failover"""
@@ -46,8 +49,27 @@ class NewsAggregator:
         # Official Cloud Provider (Strict Isolation)
         self.providers['official_cloud'] = OfficialCloudProvider()
-        # Provider priority order
-        self.provider_priority = settings.NEWS_PROVIDER_PRIORITY
         # Cloud provider RSS feeds
         self.cloud_rss_urls = {
@@ -65,53 +87,173 @@ class NewsAggregator:
             'provider_usage': {},
             'failover_count': 0
         }
     async def fetch_by_category(self, category: str) -> List[Article]:
         """
-        Fetch news by category using hybrid approach with automatic failover
-        Tries providers in priority order until successful
         """
-        self.stats['total_requests'] += 1
-        # Try each provider in priority order
-        for provider_name in self.provider_priority:
             provider = self.providers.get(provider_name)
-            # Skip if provider not configured
             if not provider:
                 continue
-            # Skip if provider is not available (rate limited)
             if not provider.is_available():
-                print(f"[SKIP] [{provider_name.upper()}] Not available (rate limited), trying next...")
-                self.stats['failover_count'] += 1
                 continue
             try:
-                print(f"[FETCH] [{provider_name.upper()}] Attempting to fetch '{category}' news...")
                 articles = await provider.fetch_news(category, limit=20)
-                # If we got articles, return them
                 if articles:
-                    # No need to print here, provider already printed success
-                    # Track usage statistics
-                    if provider_name not in self.stats['provider_usage']:
-                        self.stats['provider_usage'][provider_name] = 0
-                    self.stats['provider_usage'][provider_name] += 1
-                    return articles
                 else:
-                    print(f"[SKIP] [{provider_name.upper()}] No articles returned, trying next provider...")
             except Exception as e:
-                print(f"[ERROR] [{provider_name.upper()}] Error: {e}, trying next...")
-                self.stats['failover_count'] += 1
-                continue
-        # If all providers failed, return empty list
-        print(f"[WARN] [NEWS AGGREGATOR] All providers exhausted for '{category}' - no articles available")
-        return []
     async def fetch_from_provider(self, provider_name: str, category: str) -> List[Article]:
         """Fetch news specifically from a named provider (bypassing priority/failover)"""

+import asyncio
 import httpx
 from typing import List, Dict, Optional
 from datetime import datetime
     OfficialCloudProvider
 )
 from app.config import settings
+from app.services.api_quota import get_quota_tracker
+from app.services.circuit_breaker import get_circuit_breaker
 class NewsAggregator:
     """Service for aggregating news from multiple sources with automatic failover"""
         # Official Cloud Provider (Strict Isolation)
         self.providers['official_cloud'] = OfficialCloudProvider()
+        # ── Provider role lists ──────────────────────────────────────────────
+        # PAID_CHAIN: tried in order, stop after the first success (save credits)
+        # FREE_SOURCES: always tried, always in parallel (no cost, no limits)
+        self.PAID_CHAIN  = ['gnews', 'newsapi', 'newsdata']
+        self.FREE_SOURCES = ['google_rss', 'medium', 'official_cloud']
+        # Medium only publishes articles for a small set of topics.
+        # Calling it for 'data-centers' or 'cloud-oracle' would return nothing.
+        self.MEDIUM_SUPPORTED_CATEGORIES = {
+            'ai', 'data-science', 'cloud-computing', 'programming',
+            'technology', 'data-laws'
+        }
+        # Official Cloud RSS only makes sense for cloud-related categories.
+        self.CLOUD_CATEGORIES = {
+            c for c in [
+                'cloud-computing', 'cloud-aws', 'cloud-azure', 'cloud-gcp',
+                'cloud-oracle', 'cloud-ibm', 'cloud-alibaba', 'cloud-digitalocean',
+                'cloud-huawei', 'cloud-cloudflare'
+            ]
+        }
         # Cloud provider RSS feeds
         self.cloud_rss_urls = {
             'provider_usage': {},
             'failover_count': 0
         }
+        # Async lock — keeps stats correct when 22 category tasks share this one aggregator.
+        # Without this, two tasks updating the same counter at the same time could miss a count.
+        self._lock = asyncio.Lock()
+        # --- Phase 2 additions: infrastructure guards ---
+        # Which providers cost real API credits.
+        # Kept as a set for O(1) lookup inside the waterfall loop.
+        self.paid_providers = set(self.PAID_CHAIN)
+        # The Quota Tracker counts how many API calls we have made today.
+        # It is a module-level singleton — once created it lives in memory for the
+        # entire lifetime of the server process, surviving every hourly scheduler
+        # run without resetting. (It DOES reset if the server itself restarts;
+        # that is acceptable for now and noted as a future improvement.)
+        self.quota = get_quota_tracker()
+        # The Circuit Breaker watches each provider for repeated failures.
+        # If a provider fails 3 times in 5 minutes, we stop calling it for 1 hour
+        # (like hanging up on a broken phone line and trying it again later).
+        # It is also a module-level singleton — same lifetime as the quota tracker.
+        self.circuit = get_circuit_breaker()
     async def fetch_by_category(self, category: str) -> List[Article]:
         """
+        Fetch news from ALL available sources for a category.
+        Strategy (Phase 5 — True Multi-Source Aggregation):
+          STEP A ─ Paid Waterfall:
+            Try GNews → NewsAPI → NewsData in order.
+            Stop as soon as one returns articles.
+            We only want ONE paid call per category to stay inside our daily budget.
+            Think of it like: only knock on the first open door, don't ring every bell.
+          STEP B ─ Free Parallel Run (always runs, even if Step A succeeded):
+            Simultaneously fetch from Google RSS, Medium, and Official Cloud.
+            These are free and have no rate-limit cost, so we always want them.
+            Think of it like: sending postcards to all your free newspaper subscriptions.
+          STEP C ─ Combine:
+            Merge paid + free results into one big list.
+            Duplicates are fine here — the in-batch deduplication in scheduler.py
+            will clean them up right after this function returns.
         """
+        async with self._lock:
+            self.stats['total_requests'] += 1
+        combined_articles: List[Article] = []
+        # ======================================================================
+        # STEP A: PAID WATERFALL — one successful call is all we need
+        # ======================================================================
+        paid_success = False
+        for provider_name in self.PAID_CHAIN:
             provider = self.providers.get(provider_name)
+            # Skip if this paid provider was not configured (no API key set).
             if not provider:
                 continue
+            # Guard 1 ─ Circuit Breaker
+            if self.circuit.should_skip(provider_name):
+                print(f"[CIRCUIT] [{provider_name.upper()}] Circuit OPEN — skipping this run.")
+                async with self._lock:
+                    self.stats['failover_count'] += 1
+                continue
+            # Guard 2 ─ Quota Check (paid only)
+            if not await self.quota.async_can_make_call(provider_name):
+                print(f"[QUOTA]   [{provider_name.upper()}] Daily limit reached — skipping.")
+                continue
+            # Guard 3 ─ Provider's own 429 flag
             if not provider.is_available():
+                print(f"[SKIP]    [{provider_name.upper()}] Provider reported 429 — recording and skipping.")
+                self.circuit.record_failure(provider_name, error_type="rate_limit", status_code=429)
+                async with self._lock:
+                    self.stats['failover_count'] += 1
                 continue
             try:
+                print(f"[PAID]    [{provider_name.upper()}] Fetching '{category}'...")
                 articles = await provider.fetch_news(category, limit=20)
                 if articles:
+                    self.circuit.record_success(provider_name)
+                    await self.quota.async_record_call(provider_name)
+                    async with self._lock:
+                        self.stats['provider_usage'][provider_name] = \
+                            self.stats['provider_usage'].get(provider_name, 0) + 1
+                    combined_articles.extend(articles)
+                    paid_success = True
+                    print(f"[PAID]    [{provider_name.upper()}] Got {len(articles)} articles — stopping paid chain.")
+                    break  # ← KEY: one success is enough, protect our credits
                 else:
+                    print(f"[PAID]    [{provider_name.upper()}] No articles — trying next paid provider.")
             except Exception as e:
+                print(f"[ERROR]   [{provider_name.upper()}] Fetch failed: {e} — recording failure.")
+                self.circuit.record_failure(provider_name, error_type="exception")
+                async with self._lock:
+                    self.stats['failover_count'] += 1
+                continue  # try next paid provider
+        if not paid_success:
+            print(f"[PAID]    No paid provider delivered articles for '{category}'.")
+        # ======================================================================
+        # STEP B: FREE PARALLEL RUN — always fires, no cost
+        # ======================================================================
+        # We build a list of coroutines for free sources, but only include a
+        # provider if it actually supports this category (avoid pointless calls).
+        free_tasks: list = []
+        free_names: list = []  # track which name maps to which task result
+        # Google RSS supports ALL categories.
+        google_rss = self.providers.get('google_rss')
+        if google_rss and not self.circuit.should_skip('google_rss'):
+            if google_rss.is_available():
+                free_tasks.append(google_rss.fetch_news(category, limit=20))
+                free_names.append('google_rss')
+        # Medium only supports a small set of topics.
+        if category in self.MEDIUM_SUPPORTED_CATEGORIES:
+            medium = self.providers.get('medium')
+            if medium and not self.circuit.should_skip('medium'):
+                if medium.is_available():
+                    free_tasks.append(medium.fetch_news(category, limit=10))
+                    free_names.append('medium')
+        # Official Cloud RSS only makes sense for cloud-* categories.
+        if category in self.CLOUD_CATEGORIES:
+            official = self.providers.get('official_cloud')
+            if official and not self.circuit.should_skip('official_cloud'):
+                if official.is_available():
+                    free_tasks.append(official.fetch_news(category, limit=10))
+                    free_names.append('official_cloud')
+        if free_tasks:
+            print(f"[FREE]    Launching {len(free_tasks)} free source(s) in parallel for '{category}'...")
+            free_results = await asyncio.gather(*free_tasks, return_exceptions=True)
+            for name, result in zip(free_names, free_results):
+                if isinstance(result, Exception):
+                    print(f"[ERROR]   [{name.upper()}] Free fetch error: {result}")
+                    self.circuit.record_failure(name, error_type="exception")
+                elif isinstance(result, list) and result:
+                    self.circuit.record_success(name)
+                    combined_articles.extend(result)
+                    print(f"[FREE]    [{name.upper()}] Got {len(result)} articles.")
+                    async with self._lock:
+                        self.stats['provider_usage'][name] = \
+                            self.stats['provider_usage'].get(name, 0) + 1
+        # ======================================================================
+        # STEP C: RETURN COMBINED LIST
+        # ======================================================================
+        # Return everything we collected. Duplicates are expected and welcome —
+        # the in-batch dedup in scheduler.py (Phase 1) will strip them cleanly.
+        if combined_articles:
+            print(f"[DONE]    '{category}': {len(combined_articles)} total articles from all sources.")
+        else:
+            print(f"[WARN]    '{category}': No articles from any source this run.")
+        return combined_articles
     async def fetch_from_provider(self, provider_name: str, category: str) -> List[Article]:
         """Fetch news specifically from a named provider (bypassing priority/failover)"""

app/services/news_providers.py CHANGED Viewed

@@ -58,11 +58,21 @@ class GNewsProvider(NewsProvider):
             'data-governance': 'data governance compliance',
             'data-privacy': 'data privacy GDPR',
             'data-engineering': 'data engineering pipeline',
             'business-intelligence': 'business intelligence BI',
             'business-analytics': 'business analytics',
             'customer-data-platform': 'customer data platform CDP',
             'data-centers': 'data centers infrastructure',
             'cloud-computing': 'cloud computing AWS Azure Google Cloud Salesforce Alibaba Cloud Tencent Cloud Huawei Cloud Cloudflare',
             'medium-article': 'Medium article blog writing publishing',
             'magazines': 'technology news',
             'data-laws': 'data privacy law GDPR CCPA AI regulation compliance',
@@ -146,11 +156,21 @@ class NewsAPIProvider(NewsProvider):
             'data-governance': '"data governance" OR "data management" OR compliance',
             'data-privacy': '"data privacy" OR GDPR OR "privacy regulation"',
             'data-engineering': '"data engineering" OR "data pipeline" OR "big data"',
             'business-intelligence': '"business intelligence" OR "BI tools"',
             'business-analytics': '"business analytics" OR analytics',
             'customer-data-platform': '"customer data platform" OR CDP',
             'data-centers': '"data centers" OR "data centre"',
             'cloud-computing': '"cloud computing" OR AWS OR Azure OR "Google Cloud" OR Salesforce OR "Alibaba Cloud" OR "Tencent Cloud" OR "Huawei Cloud" OR Cloudflare',
             'medium-article': 'Medium OR "Medium article" OR "Medium blog" OR "Medium publishing"',
             'magazines': 'technology',
             'data-laws': '"data privacy law" OR GDPR OR CCPA OR "EU AI Act" OR "data protection act"',
@@ -225,11 +245,21 @@ class NewsDataProvider(NewsProvider):
             'data-governance': 'data governance,compliance',
             'data-privacy': 'data privacy,GDPR',
             'data-engineering': 'data engineering,big data',
             'business-intelligence': 'business intelligence',
             'business-analytics': 'business analytics',
             'customer-data-platform': 'customer data platform',
             'data-centers': 'data centers',
             'cloud-computing': 'cloud computing,AWS,Azure,Google Cloud,Salesforce,Alibaba Cloud,Tencent Cloud,Huawei Cloud,Cloudflare',
             'medium-article': 'Medium,article,blog,writing,publishing',
             'magazines': 'technology',
             'data-laws': 'data privacy law,GDPR,CCPA,AI regulation,compliance',
@@ -310,11 +340,21 @@ class GoogleNewsRSSProvider(NewsProvider):
             'data-governance': 'https://news.google.com/rss/search?q=data+governance+OR+data+management&hl=en-US&gl=US&ceid=US:en',
             'data-privacy': 'https://news.google.com/rss/search?q=data+privacy+OR+GDPR+OR+privacy+regulation&hl=en-US&gl=US&ceid=US:en',
             'data-engineering': 'https://news.google.com/rss/search?q=data+engineering+OR+data+pipeline+OR+big+data&hl=en-US&gl=US&ceid=US:en',
             'business-intelligence': 'https://news.google.com/rss/search?q=business+intelligence+OR+BI+tools&hl=en-US&gl=US&ceid=US:en',
             'business-analytics': 'https://news.google.com/rss/search?q=business+analytics&hl=en-US&gl=US&ceid=US:en',
             'customer-data-platform': 'https://news.google.com/rss/search?q=customer+data+platform+OR+CDP&hl=en-US&gl=US&ceid=US:en',
             'data-centers': 'https://news.google.com/rss/search?q=data+centers+OR+data+centre&hl=en-US&gl=US&ceid=US:en',
             'cloud-computing': 'https://news.google.com/rss/search?q=cloud+computing+OR+AWS+OR+Azure+OR+Google+Cloud+OR+Salesforce+OR+Alibaba+Cloud+OR+Tencent+Cloud+OR+Huawei+Cloud+OR+Cloudflare&hl=en-US&gl=US&ceid=US:en',
             'medium-article': 'https://news.google.com/rss/search?q=Medium+article+OR+Medium+blog+OR+Medium+publishing&hl=en-US&gl=US&ceid=US:en',
             'magazines': 'https://news.google.com/rss/headlines/section/topic/TECHNOLOGY?hl=en-US&gl=US&ceid=US:en',
             'data-laws': 'https://news.google.com/rss/search?q=data+privacy+law+OR+GDPR+OR+CCPA+OR+AI+Regulation&hl=en-US&gl=US&ceid=US:en',

             'data-governance': 'data governance compliance',
             'data-privacy': 'data privacy GDPR',
             'data-engineering': 'data engineering pipeline',
+            'data-management': 'data management master data MDM data catalog data quality',
             'business-intelligence': 'business intelligence BI',
             'business-analytics': 'business analytics',
             'customer-data-platform': 'customer data platform CDP',
             'data-centers': 'data centers infrastructure',
             'cloud-computing': 'cloud computing AWS Azure Google Cloud Salesforce Alibaba Cloud Tencent Cloud Huawei Cloud Cloudflare',
+            'cloud-aws': 'AWS Amazon Web Services S3 EC2 Lambda CloudFront SageMaker',
+            'cloud-azure': 'Microsoft Azure Azure DevOps Azure ML Azure OpenAI',
+            'cloud-gcp': 'Google Cloud Platform GCP BigQuery Vertex AI Cloud Run Dataflow',
+            'cloud-oracle': 'Oracle Cloud OCI Oracle Database Oracle Fusion',
+            'cloud-ibm': 'IBM Cloud IBM Watson Red Hat OpenShift IBM Z',
+            'cloud-alibaba': 'Alibaba Cloud Aliyun AliCloud',
+            'cloud-digitalocean': 'DigitalOcean Droplet App Platform',
+            'cloud-huawei': 'Huawei Cloud HuaweiCloud',
+            'cloud-cloudflare': 'Cloudflare Workers R2 Cloudflare Pages Zero Trust',
             'medium-article': 'Medium article blog writing publishing',
             'magazines': 'technology news',
             'data-laws': 'data privacy law GDPR CCPA AI regulation compliance',
             'data-governance': '"data governance" OR "data management" OR compliance',
             'data-privacy': '"data privacy" OR GDPR OR "privacy regulation"',
             'data-engineering': '"data engineering" OR "data pipeline" OR "big data"',
+            'data-management': '"data management" OR "master data" OR MDM OR "data catalog" OR "data quality" OR "data lineage"',
             'business-intelligence': '"business intelligence" OR "BI tools"',
             'business-analytics': '"business analytics" OR analytics',
             'customer-data-platform': '"customer data platform" OR CDP',
             'data-centers': '"data centers" OR "data centre"',
             'cloud-computing': '"cloud computing" OR AWS OR Azure OR "Google Cloud" OR Salesforce OR "Alibaba Cloud" OR "Tencent Cloud" OR "Huawei Cloud" OR Cloudflare',
+            'cloud-aws': 'AWS OR "Amazon Web Services" OR "Amazon S3" OR EC2 OR Lambda OR CloudFront OR SageMaker',
+            'cloud-azure': 'Azure OR "Microsoft Azure" OR "Azure DevOps" OR "Azure ML" OR "Azure OpenAI"',
+            'cloud-gcp': 'GCP OR "Google Cloud" OR BigQuery OR "Vertex AI" OR "Cloud Run" OR Dataflow',
+            'cloud-oracle': '"Oracle Cloud" OR OCI OR "Oracle Database" OR "Oracle Fusion"',
+            'cloud-ibm': '"IBM Cloud" OR "IBM Watson" OR "Red Hat" OR OpenShift OR "IBM Z"',
+            'cloud-alibaba': '"Alibaba Cloud" OR Aliyun OR AliCloud',
+            'cloud-digitalocean': 'DigitalOcean OR Droplet OR "App Platform"',
+            'cloud-huawei': '"Huawei Cloud" OR HuaweiCloud',
+            'cloud-cloudflare': 'Cloudflare OR "Cloudflare Workers" OR "Cloudflare R2" OR "Zero Trust"',
             'medium-article': 'Medium OR "Medium article" OR "Medium blog" OR "Medium publishing"',
             'magazines': 'technology',
             'data-laws': '"data privacy law" OR GDPR OR CCPA OR "EU AI Act" OR "data protection act"',
             'data-governance': 'data governance,compliance',
             'data-privacy': 'data privacy,GDPR',
             'data-engineering': 'data engineering,big data',
+            'data-management': 'data management,master data,MDM,data catalog,data quality,data lineage',
             'business-intelligence': 'business intelligence',
             'business-analytics': 'business analytics',
             'customer-data-platform': 'customer data platform',
             'data-centers': 'data centers',
             'cloud-computing': 'cloud computing,AWS,Azure,Google Cloud,Salesforce,Alibaba Cloud,Tencent Cloud,Huawei Cloud,Cloudflare',
+            'cloud-aws': 'AWS,Amazon Web Services,Amazon S3,EC2,Lambda,CloudFront,SageMaker',
+            'cloud-azure': 'Azure,Microsoft Azure,Azure DevOps,Azure ML,Azure OpenAI',
+            'cloud-gcp': 'GCP,Google Cloud Platform,BigQuery,Vertex AI,Cloud Run,Dataflow',
+            'cloud-oracle': 'Oracle Cloud,OCI,Oracle Database,Oracle Fusion',
+            'cloud-ibm': 'IBM Cloud,IBM Watson,Red Hat,OpenShift,IBM Z',
+            'cloud-alibaba': 'Alibaba Cloud,Aliyun,AliCloud',
+            'cloud-digitalocean': 'DigitalOcean,Droplet,App Platform',
+            'cloud-huawei': 'Huawei Cloud,HuaweiCloud',
+            'cloud-cloudflare': 'Cloudflare,Cloudflare Workers,Cloudflare R2,Zero Trust',
             'medium-article': 'Medium,article,blog,writing,publishing',
             'magazines': 'technology',
             'data-laws': 'data privacy law,GDPR,CCPA,AI regulation,compliance',
             'data-governance': 'https://news.google.com/rss/search?q=data+governance+OR+data+management&hl=en-US&gl=US&ceid=US:en',
             'data-privacy': 'https://news.google.com/rss/search?q=data+privacy+OR+GDPR+OR+privacy+regulation&hl=en-US&gl=US&ceid=US:en',
             'data-engineering': 'https://news.google.com/rss/search?q=data+engineering+OR+data+pipeline+OR+big+data&hl=en-US&gl=US&ceid=US:en',
+            'data-management': 'https://news.google.com/rss/search?q=%22data+management%22+OR+%22master+data%22+OR+MDM+OR+%22data+catalog%22&hl=en-US&gl=US&ceid=US:en',
             'business-intelligence': 'https://news.google.com/rss/search?q=business+intelligence+OR+BI+tools&hl=en-US&gl=US&ceid=US:en',
             'business-analytics': 'https://news.google.com/rss/search?q=business+analytics&hl=en-US&gl=US&ceid=US:en',
             'customer-data-platform': 'https://news.google.com/rss/search?q=customer+data+platform+OR+CDP&hl=en-US&gl=US&ceid=US:en',
             'data-centers': 'https://news.google.com/rss/search?q=data+centers+OR+data+centre&hl=en-US&gl=US&ceid=US:en',
             'cloud-computing': 'https://news.google.com/rss/search?q=cloud+computing+OR+AWS+OR+Azure+OR+Google+Cloud+OR+Salesforce+OR+Alibaba+Cloud+OR+Tencent+Cloud+OR+Huawei+Cloud+OR+Cloudflare&hl=en-US&gl=US&ceid=US:en',
+            'cloud-aws': 'https://news.google.com/rss/search?q=AWS+OR+%22Amazon+Web+Services%22+OR+%22Amazon+S3%22+OR+EC2+OR+Lambda&hl=en-US&gl=US&ceid=US:en',
+            'cloud-azure': 'https://news.google.com/rss/search?q=Azure+OR+%22Microsoft+Azure%22+OR+%22Azure+DevOps%22&hl=en-US&gl=US&ceid=US:en',
+            'cloud-gcp': 'https://news.google.com/rss/search?q=GCP+OR+%22Google+Cloud%22+OR+BigQuery+OR+%22Vertex+AI%22&hl=en-US&gl=US&ceid=US:en',
+            'cloud-oracle': 'https://news.google.com/rss/search?q=%22Oracle+Cloud%22+OR+OCI+OR+%22Oracle+Database%22&hl=en-US&gl=US&ceid=US:en',
+            'cloud-ibm': 'https://news.google.com/rss/search?q=%22IBM+Cloud%22+OR+%22IBM+Watson%22+OR+OpenShift&hl=en-US&gl=US&ceid=US:en',
+            'cloud-alibaba': 'https://news.google.com/rss/search?q=%22Alibaba+Cloud%22+OR+Aliyun&hl=en-US&gl=US&ceid=US:en',
+            'cloud-digitalocean': 'https://news.google.com/rss/search?q=DigitalOcean+OR+Droplet&hl=en-US&gl=US&ceid=US:en',
+            'cloud-huawei': 'https://news.google.com/rss/search?q=%22Huawei+Cloud%22+OR+HuaweiCloud&hl=en-US&gl=US&ceid=US:en',
+            'cloud-cloudflare': 'https://news.google.com/rss/search?q=Cloudflare+OR+%22Cloudflare+Workers%22+OR+%22Zero+Trust%22&hl=en-US&gl=US&ceid=US:en',
             'medium-article': 'https://news.google.com/rss/search?q=Medium+article+OR+Medium+blog+OR+Medium+publishing&hl=en-US&gl=US&ceid=US:en',
             'magazines': 'https://news.google.com/rss/headlines/section/topic/TECHNOLOGY?hl=en-US&gl=US&ceid=US:en',
             'data-laws': 'https://news.google.com/rss/search?q=data+privacy+law+OR+GDPR+OR+CCPA+OR+AI+Regulation&hl=en-US&gl=US&ceid=US:en',

app/services/scheduler.py CHANGED Viewed

@@ -51,6 +51,43 @@ CATEGORIES = [
     "cloud-cloudflare"
 ]
 async def fetch_all_news():
     """
@@ -78,10 +115,17 @@ async def fetch_all_news():
     total_irrelevant = 0
     category_stats = {}
-    # Parallel fetch all categories at once
     fetch_tasks = []
     for category in CATEGORIES:
-        task = fetch_and_validate_category(category)
         fetch_tasks.append(task)
     # Execute all fetches concurrently with error isolation
@@ -99,7 +143,9 @@ async def fetch_all_news():
             total_errors += 1
             continue
-        category, articles, invalid_count, irrelevant_count = result
         if not articles:
             logger.warning("⚠️  No valid articles for category: %s", category)
@@ -195,22 +241,119 @@ async def fetch_all_news():
     )
     # Update adaptive scheduler intervals
-    from app.services.adaptive_scheduler import get_adaptive_scheduler
-    adaptive = get_adaptive_scheduler(CATEGORIES)
     if adaptive:
-        # Update intervals based on this run's statistics
-        for category, stats in category_stats.items():
             if 'fetched' in stats:
-                new_interval = adaptive.update_category_velocity(
-                    category,
-                    stats['fetched']
-                )
-        # Print adaptive scheduler summary
         adaptive.print_summary()
 async def fetch_daily_research():
     """
     Background Job: Fetch Research Papers from ArXiv
@@ -232,76 +375,115 @@ async def fetch_daily_research():
     logger.info("═" * 80)
-async def fetch_and_validate_category(category: str) -> tuple:
     """
-    Fetch and validate articles for a single category
     Returns: (category, valid_articles, invalid_count, irrelevant_count)
     """
     from app.utils.data_validation import is_valid_article, sanitize_article, is_relevant_to_category
     from app.utils.date_parser import normalize_article_date
     try:
         logger.info("📌 Fetching %s...", category.upper())
-        # Fetch from external APIs
-        news_aggregator = NewsAggregator()
-        # Concurrent fetch from Main Chain + Medium + Official Cloud
-        main_task = news_aggregator.fetch_by_category(category)
-        medium_task = news_aggregator.fetch_from_provider('medium', category)
-        official_task = news_aggregator.fetch_from_provider('official_cloud', category)
-        results = await asyncio.gather(main_task, medium_task, official_task, return_exceptions=True)
-        # Combine results
-        raw_articles = []
-        # Result 0: Main Provider Chain
-        if isinstance(results[0], list):
-            raw_articles.extend(results[0])
-        # Result 1: Medium RSS
-        if isinstance(results[1], list):
-            if results[1]:
-                logger.info("   + Found %d Medium articles for %s", len(results[1]), category)
-                raw_articles.extend(results[1])
-        # Result 2: Official Cloud
-        if isinstance(results[2], list):
-            if results[2]:
-                logger.info("   + Found %d Official Cloud articles for %s", len(results[2]), category)
-                raw_articles.extend(results[2])
         if not raw_articles:
             return (category, [], 0, 0)
         # Validate, filter, and sanitize
         valid_articles = []
         invalid_count = 0
         irrelevant_count = 0
         for article in raw_articles:
-            # Step 1: Basic validation
             if not is_valid_article(article):
                 invalid_count += 1
                 continue
-            # Step 2: Category relevance check
             if not is_relevant_to_category(article, category):
                 irrelevant_count += 1
                 continue
-            # Step 3: Normalize date to UTC ISO-8601
             article = normalize_article_date(article)
-            # Step 4: Sanitize and clean
             clean_article = sanitize_article(article)
             valid_articles.append(clean_article)
-        logger.info("✓ %s: %d valid, %d invalid, %d irrelevant",
                     category.upper(), len(valid_articles), invalid_count, irrelevant_count)
-        return (category, valid_articles, invalid_count, irrelevant_count)
     except asyncio.TimeoutError:
         logger.error("⏱️  Timeout fetching %s (>30s)", category)
@@ -473,18 +655,41 @@ def start_scheduler():
     logger.info("⏰ [SCHEDULER] Initializing background scheduler...")
     logger.info("═" * 80)
-    # News Fetcher Job (Frequency: Every 1 hour)
-    scheduler.add_job(
-        fetch_all_news,
-        trigger=IntervalTrigger(hours=1),
-        id='fetch_all_news',
-        name='News Fetcher (every 1 hour)',
-        replace_existing=True
-    )
     logger.info("")
-    logger.info("✅ Job #1 Registered: 📰 News Fetcher")
-    logger.info("   ⏱️  Schedule: Every 1 hour")
-    logger.info("   📋 Task: Direct Fetch -> Deduplicate -> Store (Appwrite)")
     # Cleanup Job (Frequency: Every 30 minutes)
     scheduler.add_job(

     "cloud-cloudflare"
 ]
+# --------------------------------------------------------------------------
+# MODULE-LEVEL SINGLETONS (Phase 6)
+# --------------------------------------------------------------------------
+# These two objects are created ONCE when the server starts and are shared
+# by all 22 per-category jobs for the entire lifetime of the process.
+#
+# _shared_aggregator  — one NewsAggregator for all categories (Phase 1 fix).
+#   It holds provider state (quota counts, circuit-breaker) that must
+#   survive across job runs. Creating a new one for every job would reset
+#   all that carefully maintained state.
+#
+# _adaptive  — the AdaptiveScheduler that tracks how many articles each
+#   category produces and adjusts its fetch interval accordingly.
+#   Also persists to disk (data/velocity_tracking.json) so intervals
+#   survive server restarts.
+# --------------------------------------------------------------------------
+_shared_aggregator = None
+_adaptive          = None
+def _get_shared_aggregator():
+    """Return (creating if needed) the one shared NewsAggregator instance."""
+    global _shared_aggregator
+    if _shared_aggregator is None:
+        _shared_aggregator = NewsAggregator()
+        logger.info("[AGGREGATOR] Shared NewsAggregator created (singleton).")
+    return _shared_aggregator
+def _get_adaptive():
+    """Return (creating if needed) the one shared AdaptiveScheduler instance."""
+    global _adaptive
+    if _adaptive is None:
+        _adaptive = get_adaptive_scheduler(CATEGORIES)
+        logger.info("[ADAPTIVE] AdaptiveScheduler created for %d categories.", len(CATEGORIES))
+    return _adaptive
 async def fetch_all_news():
     """
     total_irrelevant = 0
     category_stats = {}
+    # Parallel fetch all categories at once.
+    # We create ONE shared aggregator here so all 22 category tasks share
+    # the same provider state (quota counts, circuit states, etc.).
+    # Fix #3 (Phase 7): Use the permanent module-level singleton instead of
+    # creating a fresh instance here. This ensures that even manual triggers
+    # respect the live quota counts and circuit-breaker state from the
+    # adaptive jobs that may already be running.
+    shared_aggregator = _get_shared_aggregator()
     fetch_tasks = []
     for category in CATEGORIES:
+        task = fetch_and_validate_category(category, shared_aggregator)
         fetch_tasks.append(task)
     # Execute all fetches concurrently with error isolation
             total_errors += 1
             continue
+        # Unpack 5-tuple — relevant_count (5th item) is not needed here,
+        # it is only used by fetch_single_category_job for adaptive velocity.
+        category, articles, invalid_count, irrelevant_count, _ = result
         if not articles:
             logger.warning("⚠️  No valid articles for category: %s", category)
     )
     # Update adaptive scheduler intervals
+    # (kept for backward compat — manual trigger may still call this)
+    adaptive = _get_adaptive()
     if adaptive:
+        for cat, stats in category_stats.items():
             if 'fetched' in stats:
+                adaptive.update_category_velocity(cat, stats['fetched'])
         adaptive.print_summary()
+async def fetch_single_category_job(category: str):
+    """
+    Per-category background job (Phase 6).
+    This is what each of the 22 adaptive jobs calls every N minutes.
+    It is a self-contained unit: fetch → validate → save → report → reschedule.
+    In plain English:
+      Think of this like a delivery driver who has a single route (one category).
+      After every delivery run, the dispatcher (adaptive scheduler) checks how
+      many packages were delivered. If the route is always busy (lots of news),
+      the driver gets sent out more often. If the route is quiet, the driver
+      waits longer before going out again.
+    """
+    aggregator = _get_shared_aggregator()
+    adaptive   = _get_adaptive()
+    logger.info("[ADAPTIVE JOB] Starting fetch for category: %s", category.upper())
+    try:
+        # Step 1: Fetch + validate (calls the full Phase 1-4 pipeline).
+        result = await fetch_and_validate_category(category, aggregator)
+        if isinstance(result, Exception):
+            logger.error("[ADAPTIVE JOB] %s fetch failed: %s", category, result)
+            return
+        # Unpack the 5-tuple returned by fetch_and_validate_category.
+        # relevant_count = articles that passed Steps 1+2 (valid + on-topic)
+        # but before Step 3 (Redis 48h dedup) filtered them.
+        # This is the true measure of how active a category's news feed is.
+        cat, articles, invalid_count, irrelevant_count, relevant_count = result
+        if not articles:
+            logger.info("[ADAPTIVE JOB] %s: No valid articles this run.", category.upper())
+            saved_count = 0
+        else:
+            # Step 2: Save to Appwrite.
+            appwrite_db   = get_appwrite_db()
+            cache_service = CacheService()
+            logger.info("[ADAPTIVE JOB] %s: Saving %d articles...", category.upper(), len(articles))
+            saved_count, duplicate_count, error_count, _ = await appwrite_db.save_articles(articles)
+            logger.info(
+                "[ADAPTIVE JOB] %s: %d saved, %d duplicates, %d errors, "
+                "%d invalid, %d irrelevant.",
+                category.upper(), saved_count, duplicate_count, error_count,
+                invalid_count, irrelevant_count
+            )
+            # Step 3: Update Redis article cache so the API serves fresh results.
+            try:
+                await cache_service.set(f"news:{category}", articles, ttl=settings.CACHE_TTL)
+            except Exception as cache_err:
+                logger.debug("[ADAPTIVE JOB] Redis cache update skipped: %s", cache_err)
+        # Step 4: Feed result count back to the adaptive scheduler.
+        # We use relevant_count (articles that passed validation + keyword relevance)
+        # rather than saved_count (articles actually new to Appwrite).
+        #
+        # Why? A busy category with a slow-updating RSS feed will have high
+        # relevant_count but low saved_count (we already have the articles).
+        # Using saved_count would incorrectly mark it as "quiet" and slow it down.
+        # relevant_count correctly reflects: "how much real news is out there?"
+        if adaptive:
+            # Fix #1 (Phase 7): Read old_interval NOW, before update_category_velocity
+            # overwrites data['interval'] inside the AdaptiveScheduler.
+            # The comparison new_interval != old_interval was always False before
+            # because we were reading the interval AFTER it was already updated.
+            old_interval = adaptive.get_interval(category)
+            # Now update velocity with the correct metric (in-memory only — instant).
+            new_interval = adaptive.update_category_velocity(category, relevant_count)
+            # Persist the updated velocity to Redis asynchronously.
+            # async_persist() uses httpx.AsyncClient so it never blocks the event loop.
+            # Think of it like dropping a letter in a post box — we do not stand
+            # and wait for the postman to deliver it. We just drop it and walk on.
+            await adaptive.async_persist()
+            # Step 5: If the interval genuinely changed, tell APScheduler
+            # to reschedule this specific job live — no server restart needed.
+            if new_interval != old_interval:
+                job_id = f"fetch_{category}"
+                try:
+                    scheduler.reschedule_job(
+                        job_id,
+                        trigger=IntervalTrigger(minutes=new_interval)
+                    )
+                    logger.info(
+                        "[ADAPTIVE] %s interval changed: %dmin → %dmin. Job rescheduled live.",
+                        category.upper(), old_interval, new_interval
+                    )
+                except Exception as reschedule_err:
+                    logger.warning(
+                        "[ADAPTIVE] Could not reschedule %s job: %s",
+                        job_id, reschedule_err
+                    )
+    except Exception as e:
+        logger.exception("[ADAPTIVE JOB] Unhandled error for category %s: %s", category, e)
 async def fetch_daily_research():
     """
     Background Job: Fetch Research Papers from ArXiv
     logger.info("═" * 80)
+async def fetch_and_validate_category(category: str, aggregator) -> tuple:
     """
+    Fetch and validate articles for a single category.
+    Args:
+        category:   The news category (e.g. 'ai', 'cloud-aws').
+        aggregator: The shared NewsAggregator instance for this run.
+                    Using a shared instance means all 22 parallel tasks
+                    share the same quota counters and circuit-breaker state.
     Returns: (category, valid_articles, invalid_count, irrelevant_count)
     """
     from app.utils.data_validation import is_valid_article, sanitize_article, is_relevant_to_category
     from app.utils.date_parser import normalize_article_date
+    from app.utils.url_canonicalization import canonicalize_url
+    from app.utils.redis_dedup import is_url_seen_or_mark
     try:
         logger.info("📌 Fetching %s...", category.upper())
+        # Ask the aggregator for all articles from all sources for this category.
+        # fetch_by_category (Phase 5) internally runs:
+        #   1. Paid waterfall  — GNews → NewsAPI → NewsData (stops on first success)
+        #   2. Free parallel   — Google RSS + Medium + Official Cloud, all at once
+        #   3. Returns the merged list
+        # We no longer need to call fetch_from_provider for medium/official_cloud
+        # separately here. That would duplicate the work Phase 5 already does.
+        raw_articles = await aggregator.fetch_by_category(category)
         if not raw_articles:
             return (category, [], 0, 0)
+        # ------------------------------------------------------------------
+        # IN-BATCH DEDUPLICATION
+        # ------------------------------------------------------------------
+        # When 3 providers run at the same time for the same category, they
+        # sometimes return the exact same article (e.g. a TechCrunch AI story
+        # can come from both GNews AND Google RSS in the same fetch cycle).
+        # We catch and remove these same-batch duplicates RIGHT HERE, before
+        # the expensive validation loop even starts.
+        # This is like a quick ID-card check at the entrance before people
+        # join the full security screening queue.
+        _seen_in_batch: set = set()
+        _deduplicated_raw = []
+        for _art in raw_articles:
+            _raw_url = str(_art.url) if _art.url else ''
+            _canonical = canonicalize_url(_raw_url) if _raw_url else ''
+            # If we have a valid canonical URL and we've already seen it → skip
+            if _canonical and _canonical in _seen_in_batch:
+                continue
+            if _canonical:
+                _seen_in_batch.add(_canonical)
+            _deduplicated_raw.append(_art)
+        _batch_dupes_removed = len(raw_articles) - len(_deduplicated_raw)
+        if _batch_dupes_removed > 0:
+            logger.info(
+                "   🔄 [BATCH DEDUP] %s: Removed %d within-batch duplicates before validation",
+                category.upper(), _batch_dupes_removed
+            )
+        raw_articles = _deduplicated_raw
+        # ------------------------------------------------------------------
         # Validate, filter, and sanitize
         valid_articles = []
         invalid_count = 0
         irrelevant_count = 0
+        relevant_count = 0   # articles that are valid + relevant, before Redis dedup
         for article in raw_articles:
+            # Step 1: Basic validation — must have a title, URL, and publication date.
             if not is_valid_article(article):
                 invalid_count += 1
                 continue
+            # Step 2: Category relevance check — title+description must match category keywords.
             if not is_relevant_to_category(article, category):
                 irrelevant_count += 1
                 continue
+            # Checkpoint: count articles that are valid AND relevant, but before
+            # the Redis 48-hour check strips out the ones we have already stored.
+            # This is the true "how much real news is in this category?" signal.
+            # The adaptive scheduler uses this number to decide fetch frequency.
+            # (Fix #2 - Phase 7: was using saved_count, which confused "quiet feed"
+            # with "feed we already have fully stored" — two very different things.)
+            relevant_count += 1
+            # Step 3: Redis 48-hour dedup check — THE MAIN BOUNCER.
+            # Check if we have already stored this exact article URL in the last 48 hours.
+            # If yes, skip silently — it's a repeat. If no, mark it as seen and continue.
+            # This stops the same article being saved every hour from a slow-updating RSS feed.
+            if await is_url_seen_or_mark(str(article.url) if article.url else ''):
+                logger.debug(
+                    "   [REDIS DEDUP] Skipped article already seen in last 48 hours: %s",
+                    str(article.url)[:80]
+                )
+                continue
+            # Step 4: Normalize date to UTC ISO-8601.
             article = normalize_article_date(article)
+            # Step 5: Sanitize and clean the article fields.
             clean_article = sanitize_article(article)
             valid_articles.append(clean_article)
+        logger.info("✓ %s: %d valid, %d invalid, %d irrelevant",
                     category.upper(), len(valid_articles), invalid_count, irrelevant_count)
+        return (category, valid_articles, invalid_count, irrelevant_count, relevant_count)
     except asyncio.TimeoutError:
         logger.error("⏱️  Timeout fetching %s (>30s)", category)
     logger.info("⏰ [SCHEDULER] Initializing background scheduler...")
     logger.info("═" * 80)
+    # ── Job #1: PER-CATEGORY ADAPTIVE NEWS FETCHERS (Phase 6) ───────────
+    # Instead of one giant job that fetches all 22 categories every hour,
+    # we register 22 individual jobs, each on its own timer.
+    #
+    # The timer for each category is read from the adaptive scheduler,
+    # which remembers how "active" each category was in past runs:
+    #   - 'ai' category gets lots of articles → runs every 5 minutes
+    #   - 'cloud-alibaba' is quiet → runs every 60 minutes
+    #   - Most categories start at 15 minutes (the default)
+    #
+    # After every run, the job updates its own timer if the velocity changed.
+    # No server restart needed.
+    # -------------------------------------------------------------------------
+    adaptive = _get_adaptive()   # initializes singleton + loads saved intervals
+    for idx, category in enumerate(CATEGORIES, start=1):
+        initial_interval = adaptive.get_interval(category)  # minutes
+        job_id = f"fetch_{category}"
+        scheduler.add_job(
+            fetch_single_category_job,
+            trigger=IntervalTrigger(minutes=initial_interval),
+            args=[category],
+            id=job_id,
+            name=f"News Fetcher: {category} (every {initial_interval}min)",
+            replace_existing=True
+        )
+        logger.info(
+            "   ✓ [%02d/%02d] %-30s → every %d min",
+            idx, len(CATEGORIES), category, initial_interval
+        )
     logger.info("")
+    logger.info("✅ Job #1 Group Registered: 📰 %d Adaptive News Fetchers", len(CATEGORIES))
+    logger.info("   Intervals range from 5 min (high-velocity) to 60 min (quiet)")
     # Cleanup Job (Frequency: Every 30 minutes)
     scheduler.add_job(

app/services/upstash_cache.py CHANGED Viewed

@@ -61,14 +61,25 @@ class UpstashCache:
         self.enabled = enabled
         self.default_ttl = default_ttl
-        # HTTP client with timeout
-        self.client = httpx.Client(
-            timeout=5.0,  # 5 second timeout
-            headers={
-                "Authorization": f"Bearer {rest_token}",
-                "Content-Type": "application/json"
-            }
-        )
         # Stats tracking
         self.stats = {
@@ -88,7 +99,7 @@ class UpstashCache:
             logger.info(f"   Free Tier: 256 MB data, 50 GB/month bandwidth")
             logger.info("=" * 70)
-    def _execute_command(self, command: list) -> Optional[Any]:
         """
         Execute Redis command via REST API
@@ -102,7 +113,8 @@ class UpstashCache:
             return None
         try:
-            response = self.client.post(
                 f"{self.rest_url}",
                 json=command
             )
@@ -120,7 +132,7 @@ class UpstashCache:
             self.stats['errors'] += 1
             return None
-    def get(self, key: str) -> Optional[Any]:
         """
         Get value from cache
@@ -134,7 +146,7 @@ class UpstashCache:
             return None
         try:
-            result = self._execute_command(["GET", key])
             if result is None:
                 self.stats['misses'] += 1
@@ -152,7 +164,7 @@ class UpstashCache:
             self.stats['errors'] += 1
             return None
-    def set(
         self,
         key: str,
         value: Any,
@@ -185,7 +197,7 @@ class UpstashCache:
             ttl_seconds = ttl if ttl is not None else self.default_ttl
             # SETEX command (set with expiration)
-            result = self._execute_command(["SETEX", key, ttl_seconds, serialized])
             if result == "OK" or result is not None:
                 self.stats['sets'] += 1
@@ -199,7 +211,7 @@ class UpstashCache:
             self.stats['errors'] += 1
             return False
-    def delete(self, key: str) -> bool:
         """
         Delete key from cache
@@ -213,7 +225,7 @@ class UpstashCache:
             return False
         try:
-            result = self._execute_command(["DEL", key])
             deleted = result == 1
             if deleted:
@@ -225,7 +237,7 @@ class UpstashCache:
             logger.error(f"❌ Cache delete error for {key}: {e}")
             return False
-    def invalidate_pattern(self, pattern: str) -> int:
         """
         Invalidate all keys matching pattern
@@ -240,14 +252,14 @@ class UpstashCache:
         try:
             # Get all matching keys
-            keys = self._execute_command(["KEYS", pattern])
             if not keys:
                 return 0
             # Delete all keys
             for key in keys:
-                self._execute_command(["DEL", key])
             logger.info(f"🗑️  Invalidated {len(keys)} keys matching '{pattern}'")
             return len(keys)
@@ -288,7 +300,7 @@ class UpstashCache:
         logger.info("=" * 70)
         logger.info("")
-    def health_check(self) -> bool:
         """
         Check if Upstash is reachable
@@ -296,7 +308,7 @@ class UpstashCache:
             True if healthy, False otherwise
         """
         try:
-            result = self._execute_command(["PING"])
             healthy = result == "PONG"
             if healthy:
@@ -310,10 +322,10 @@ class UpstashCache:
             logger.error(f"❌ Upstash health check error: {e}")
             return False
-    def close(self):
         """Close HTTP client"""
-        if hasattr(self, 'client'):
-            self.client.close()
 # Global singleton instance

         self.enabled = enabled
         self.default_ttl = default_ttl
+        # Stats tracking
+        self.stats = {
+            'hits': 0,
+            'misses': 0,
+            'sets': 0,
+            'errors': 0
+        }
+    def _get_client(self) -> httpx.AsyncClient:
+        """Lazy initialization of httpx client to avoid asyncio loop issues on Windows"""
+        if not hasattr(self, '_client') or self._client is None:
+            self._client = httpx.AsyncClient(
+                timeout=5.0,  # 5 second timeout
+                headers={
+                    "Authorization": f"Bearer {self.rest_token}",
+                    "Content-Type": "application/json"
+                }
+            )
+        return self._client
         # Stats tracking
         self.stats = {
             logger.info(f"   Free Tier: 256 MB data, 50 GB/month bandwidth")
             logger.info("=" * 70)
+    async def _execute_command(self, command: list) -> Optional[Any]:
         """
         Execute Redis command via REST API
             return None
         try:
+            client = self._get_client()
+            response = await client.post(
                 f"{self.rest_url}",
                 json=command
             )
             self.stats['errors'] += 1
             return None
+    async def get(self, key: str) -> Optional[Any]:
         """
         Get value from cache
             return None
         try:
+            result = await self._execute_command(["GET", key])
             if result is None:
                 self.stats['misses'] += 1
             self.stats['errors'] += 1
             return None
+    async def set(
         self,
         key: str,
         value: Any,
             ttl_seconds = ttl if ttl is not None else self.default_ttl
             # SETEX command (set with expiration)
+            result = await self._execute_command(["SETEX", key, ttl_seconds, serialized])
             if result == "OK" or result is not None:
                 self.stats['sets'] += 1
             self.stats['errors'] += 1
             return False
+    async def delete(self, key: str) -> bool:
         """
         Delete key from cache
             return False
         try:
+            result = await self._execute_command(["DEL", key])
             deleted = result == 1
             if deleted:
             logger.error(f"❌ Cache delete error for {key}: {e}")
             return False
+    async def invalidate_pattern(self, pattern: str) -> int:
         """
         Invalidate all keys matching pattern
         try:
             # Get all matching keys
+            keys = await self._execute_command(["KEYS", pattern])
             if not keys:
                 return 0
             # Delete all keys
             for key in keys:
+                await self._execute_command(["DEL", key])
             logger.info(f"🗑️  Invalidated {len(keys)} keys matching '{pattern}'")
             return len(keys)
         logger.info("=" * 70)
         logger.info("")
+    async def health_check(self) -> bool:
         """
         Check if Upstash is reachable
             True if healthy, False otherwise
         """
         try:
+            result = await self._execute_command(["PING"])
             healthy = result == "PONG"
             if healthy:
             logger.error(f"❌ Upstash health check error: {e}")
             return False
+    async def close(self):
         """Close HTTP client"""
+        if hasattr(self, '_client') and self._client is not None:
+            await self._client.aclose()
 # Global singleton instance

app/utils/data_validation.py CHANGED Viewed

@@ -238,6 +238,8 @@ def is_relevant_to_category(article: Union[Dict, 'Article'], category: str) -> b
         article_dict = article
     # Category keyword dictionaries
     CATEGORY_KEYWORDS = {
         'ai': [
             'ai', 'artificial intelligence', 'machine learning', 'deep learning',
@@ -260,6 +262,11 @@ def is_relevant_to_category(article: Union[Dict, 'Article'], category: str) -> b
             'data engineering', 'pipeline', 'etl', 'big data', 'spark',
             'hadoop', 'kafka', 'airflow', 'data warehouse', 'snowflake'
         ],
         'business-intelligence': [
             'business intelligence', 'bi', 'analytics', 'dashboard',
             'tableau', 'power bi', 'looker', 'reporting', 'kpi'
@@ -281,6 +288,40 @@ def is_relevant_to_category(article: Union[Dict, 'Article'], category: str) -> b
             'alibaba cloud', 'tencent cloud', 'huawei cloud', 'cloudflare',
             'saas', 'paas', 'iaas', 'serverless', 'kubernetes'
         ],
         'medium-article': [
             'medium', 'article', 'blog', 'writing', 'publishing',
             'content', 'story', 'author', 'blogging'
@@ -298,11 +339,27 @@ def is_relevant_to_category(article: Union[Dict, 'Article'], category: str) -> b
         # Unknown category - allow (don't reject)
         return True
-    # Combine title and description for checking
-    # FIX: Use (value or '') pattern to handle explicit None values from messy RSS feeds
     title = (article_dict.get('title') or '').lower()
     description = (article_dict.get('description') or '').lower()
-    text = f"{title} {description}"
     # Count keyword matches
     matches = sum(1 for keyword in keywords if keyword.lower() in text)

         article_dict = article
     # Category keyword dictionaries
+    # Each category has a list of words we scan for in the article's title,
+    # description, AND URL path. If at least one word matches, the article passes.
     CATEGORY_KEYWORDS = {
         'ai': [
             'ai', 'artificial intelligence', 'machine learning', 'deep learning',
             'data engineering', 'pipeline', 'etl', 'big data', 'spark',
             'hadoop', 'kafka', 'airflow', 'data warehouse', 'snowflake'
         ],
+        'data-management': [
+            'data management', 'master data', 'mdm', 'data catalog',
+            'data quality', 'data lineage', 'data stewardship',
+            'data governance', 'data integration', 'reference data'
+        ],
         'business-intelligence': [
             'business intelligence', 'bi', 'analytics', 'dashboard',
             'tableau', 'power bi', 'looker', 'reporting', 'kpi'
             'alibaba cloud', 'tencent cloud', 'huawei cloud', 'cloudflare',
             'saas', 'paas', 'iaas', 'serverless', 'kubernetes'
         ],
+        # ── Cloud sub-categories (each maps to a specific provider) ──────────
+        'cloud-aws': [
+            'aws', 'amazon web services', 's3', 'ec2', 'lambda',
+            'cloudfront', 'sagemaker', 'dynamodb', 'amazon'
+        ],
+        'cloud-azure': [
+            'azure', 'microsoft azure', 'azure devops', 'azure ml',
+            'azure openai', 'microsoft cloud'
+        ],
+        'cloud-gcp': [
+            'gcp', 'google cloud', 'bigquery', 'vertex ai',
+            'cloud run', 'dataflow', 'google cloud platform'
+        ],
+        'cloud-oracle': [
+            'oracle cloud', 'oci', 'oracle database', 'oracle fusion',
+            'oracle cloud infrastructure'
+        ],
+        'cloud-ibm': [
+            'ibm cloud', 'ibm watson', 'red hat', 'openshift', 'ibm z'
+        ],
+        'cloud-alibaba': [
+            'alibaba cloud', 'aliyun', 'alicloud'
+        ],
+        'cloud-digitalocean': [
+            'digitalocean', 'droplet', 'app platform'
+        ],
+        'cloud-huawei': [
+            'huawei cloud', 'huaweicloud'
+        ],
+        'cloud-cloudflare': [
+            'cloudflare', 'cloudflare workers', 'cloudflare r2',
+            'cloudflare pages', 'zero trust'
+        ],
+        # ── Content / publishing categories ───────────────────────────────────
         'medium-article': [
             'medium', 'article', 'blog', 'writing', 'publishing',
             'content', 'story', 'author', 'blogging'
         # Unknown category - allow (don't reject)
         return True
+    # Build the text we will search for keywords.
+    # We use title + description as the primary source.
+    # We also append the article's URL path because RSS feeds (especially Google News)
+    # often return empty descriptions. The URL itself usually tells you what the
+    # article is about — e.g. "/aws-launches-new-s3-feature" clearly contains 'aws' and 's3'.
+    # Hyphens and slashes are replaced with spaces so words can be matched individually.
     title = (article_dict.get('title') or '').lower()
     description = (article_dict.get('description') or '').lower()
+    # Extract the URL path safely.
+    raw_url = article_dict.get('url') or ''
+    url_str = str(raw_url).lower()
+    try:
+        parsed_url = urlparse(url_str)
+        # Replace hyphens and slashes with spaces so
+        # "/aws-new-s3-launch" becomes "aws new s3 launch".
+        url_words = parsed_url.path.replace('-', ' ').replace('/', ' ')
+    except Exception:
+        url_words = ''
+    text = f"{title} {description} {url_words}"
     # Count keyword matches
     matches = sum(1 for keyword in keywords if keyword.lower() in text)

app/utils/redis_dedup.py ADDED Viewed

	@@ -0,0 +1,102 @@

+"""
+Redis URL Deduplication Bouncer
+================================
+This is the 48-hour memory for the ingestion pipeline.
+How it works (simple version):
+  Imagine a nightclub bouncer who keeps a list of everyone who came in
+  today and yesterday. If you try to enter again while still on the list,
+  you are turned away. After 48 hours, your name falls off the list and
+  you are welcome back.
+That is exactly what this module does for article URLs.
+Each article URL is:
+  1. Cleaned and normalized (canonicalized).
+  2. Converted to a short SHA-256 fingerprint (so we store 16 chars not full URLs).
+  3. Checked against Upstash Redis with the command: SET key 1 EX 172800 NX
+       - EX 172800 = expire after 172800 seconds = 48 hours
+       - NX         = only set if Not eXists
+Redis response:
+  - "OK"  → key did NOT exist → article is NEW → return False (not seen before)
+  - null  → key already existed → article is DUPLICATE → return True (seen before)
+Fallback:
+  If Upstash is not configured or Redis is unreachable, this function
+  safely returns False (treats every article as new). The Appwrite
+  database constraint is still the final safety net in that case.
+"""
+import logging
+from app.utils.url_canonicalization import canonicalize_url, get_url_hash
+from app.services.upstash_cache import get_upstash_cache
+logger = logging.getLogger(__name__)
+# Redis key prefix for URL deduplication keys.
+# Keeps our keys clearly separate from the article-cache keys.
+_KEY_PREFIX = "seen_url:"
+# 48 hours expressed in seconds.
+# This matches the cleanup janitor in scheduler.py which also deletes
+# articles older than 48 hours. When an article is deleted from the
+# database, its Redis key will also expire around the same time,
+# allowing the article to be re-ingested if it genuinely resurfaces.
+_TTL_SECONDS = 172_800  # 48 * 60 * 60
+async def is_url_seen_or_mark(raw_url: str) -> bool:
+    """
+    Check if we have seen this article URL in the last 48 hours.
+    If we have NOT seen it, mark it as seen so future checks catch it.
+    Args:
+        raw_url: The article URL (any format — we normalize it internally).
+    Returns:
+        True  → We have seen this URL before. It is a duplicate. Skip it.
+        False → This URL is brand new. The article was also marked in Redis
+                so the next run will correctly identify it as a duplicate.
+    """
+    if not raw_url:
+        # No URL means we cannot deduplicate. Let it through.
+        return False
+    try:
+        # Step 1: Normalize the URL so different versions of the same link
+        # (http vs https, trailing slash, utm_ params) all produce the same key.
+        canonical = canonicalize_url(str(raw_url))
+        # Step 2: Convert to a short hash so our Redis keys are tiny and uniform.
+        url_hash = get_url_hash(canonical)
+        redis_key = f"{_KEY_PREFIX}{url_hash}"
+        # Step 3: Get the Upstash client (shared singleton — already used by cache).
+        cache = get_upstash_cache()
+        # Step 4: Try to set the key WITH NX (only if it does not already exist).
+        # This is an atomic check-and-set: no race condition possible.
+        # Command: SET seen_url:{hash} 1 EX 172800 NX
+        result = await cache._execute_command(
+            ["SET", redis_key, "1", "EX", _TTL_SECONDS, "NX"]
+        )
+        if result == "OK":
+            # Redis successfully created the key → this URL is NEW.
+            return False  # Not a duplicate — let the article through.
+        else:
+            # Redis returned null → key already existed → DUPLICATE.
+            logger.debug("[REDIS DEDUP] Duplicate detected: %s", redis_key)
+            return True   # It's a duplicate — skip this article.
+    except Exception as e:
+        # Something went wrong with Redis (network error, timeout, etc.).
+        # We NEVER block an article because of a Redis failure.
+        # The Appwrite database will still catch true duplicates as a safety net.
+        logger.warning(
+            "[REDIS DEDUP] Redis check failed (%s) — letting article through as safe fallback.",
+            e
+        )
+        return False  # Safe fallback: treat as new article.

app/verify_manual_fetch.py DELETED Viewed

@@ -1,87 +0,0 @@
-import asyncio
-import logging
-from app.services.appwrite_db import get_appwrite_db
-from app.config import settings
-from appwrite.query import Query
-from datetime import datetime, timedelta
-# Setup logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger("Verifier")
-async def verify_stored_articles():
-    print("="*60)
-    print("🔍 VERIFYING FETCHED ARTICLES IN APPWRITE")
-    print("="*60)
-    appwrite_db = get_appwrite_db()
-    # Check a few key categories
-    categories_to_check = ["ai", "cloud-computing", "data-security"]
-    total_found = 0
-    for category in categories_to_check:
-        try:
-            # Query for articles created in the last 1 hour
-            # Note: 'created_at' is internal Appwrite, 'publishedAt' is article time
-            # We'll check 'publishedAt' as a proxy for recent content
-            # OR just list the latest documents
-            # Using the collection ID for the category (which is actually just the main collection with category filter in this architecture)
-            # Wait, the architecture uses specific collections for specific types OR one collection with category field?
-            # scheduler.py uses:
-            # settings.APPWRITE_COLLECTION_ID for "Regular News"
-            # settings.APPWRITE_CLOUD_COLLECTION_ID for "Cloud News"
-            # settings.APPWRITE_AI_COLLECTION_ID for "AI News"
-            collection_id = None
-            if category == "ai":
-                collection_id = settings.APPWRITE_AI_COLLECTION_ID
-            elif category == "cloud-computing":
-                collection_id = settings.APPWRITE_CLOUD_COLLECTION_ID
-            else:
-                 collection_id = settings.APPWRITE_COLLECTION_ID # Default/Data
-            print(f"\n📂 Checking collection for category: {category.upper()}")
-            print(f"   ID: {collection_id}")
-            if not collection_id:
-                print("   ⚠️  Collection ID not configured")
-                continue
-            response = appwrite_db.databases.list_documents(
-                database_id=settings.APPWRITE_DATABASE_ID,
-                collection_id=collection_id,
-                queries=[
-                    Query.limit(5),
-                    Query.order_desc('$createdAt') # Get most recently created
-                    # Query.equal('category', category) # Optional if collection is mixed
-                ]
-            )
-            count = len(response['documents'])
-            print(f"   ✅ Found {count} recent documents")
-            if count > 0:
-                for doc in response['documents']:
-                    title = doc.get('title', 'No Title')
-                    created_at = doc.get('$createdAt', 'Unknown')
-                    print(f"      - [{created_at}] {title[:60]}...")
-                    total_found += 1
-            else:
-                 print("   ❌ No documents found. Fetch may have failed.")
-        except Exception as e:
-            print(f"   ❌ Error querying collection: {e}")
-    print("\n" + "="*60)
-    if total_found > 0:
-        print(f"✅ VERIFICATION PASSED: Found {total_found} recent articles.")
-    else:
-        print("❌ VERIFICATION FAILED: No recent articles found.")
-    print("="*60)
-if __name__ == "__main__":
-    asyncio.run(verify_stored_articles())

app/verify_simple.py DELETED Viewed

@@ -1,30 +0,0 @@
-import asyncio
-import sys
-from app.services.appwrite_db import get_appwrite_db
-from app.config import settings
-from appwrite.query import Query
-async def verify():
-    print("STARTING VERIFICATION", flush=True)
-    try:
-        appwrite_db = get_appwrite_db()
-        print(f"DB Initialized: {appwrite_db.initialized}", flush=True)
-        # Check AI news
-        print(f"Checking AI News collection: {settings.APPWRITE_AI_COLLECTION_ID}", flush=True)
-        response = appwrite_db.databases.list_documents(
-            database_id=settings.APPWRITE_DATABASE_ID,
-            collection_id=settings.APPWRITE_AI_COLLECTION_ID,
-            queries=[Query.limit(5), Query.order_desc('$createdAt')]
-        )
-        print(f"Found {len(response['documents'])} docs", flush=True)
-        for doc in response['documents']:
-             print(f"- {doc.get('title', 'No Title')[:50]}...", flush=True)
-    except Exception as e:
-        print(f"ERROR: {e}", flush=True)
-    print("DONE", flush=True)
-if __name__ == "__main__":
-    asyncio.run(verify())

data/velocity_tracking.json CHANGED Viewed

@@ -1,210 +1,210 @@
 {
   "ai": {
-    "interval": 15,
     "history": [
-      26,
       0,
       0,
       6,
-      9
     ],
-    "last_fetch": "2026-02-14T13:58:05.405121",
-    "total_fetches": 21,
-    "total_articles": 283
   },
   "data-security": {
-    "interval": 15,
     "history": [
-      22,
       0,
       0,
       3,
-      2
     ],
-    "last_fetch": "2026-02-14T13:58:05.410615",
-    "total_fetches": 21,
-    "total_articles": 186
   },
   "data-governance": {
     "interval": 60,
     "history": [
-      19,
       0,
       0,
       3,
-      2
     ],
-    "last_fetch": "2026-02-14T13:58:05.416621",
-    "total_fetches": 21,
-    "total_articles": 178
   },
   "data-privacy": {
-    "interval": 15,
     "history": [
-      23,
       0,
       0,
       5,
       5
     ],
-    "last_fetch": "2026-02-14T13:58:05.421392",
-    "total_fetches": 21,
-    "total_articles": 210
   },
   "data-engineering": {
     "interval": 60,
     "history": [
-      14,
       0,
       0,
       2,
       5
     ],
-    "last_fetch": "2026-02-14T13:58:05.424148",
-    "total_fetches": 21,
-    "total_articles": 191
   },
   "data-management": {
     "interval": 15,
     "history": [
-      10,
       0,
       0,
       10,
       10
     ],
-    "last_fetch": "2026-02-14T13:58:05.430647",
-    "total_fetches": 21,
-    "total_articles": 271
   },
   "business-intelligence": {
-    "interval": 15,
     "history": [
-      23,
       0,
       0,
       8,
-      8
     ],
-    "last_fetch": "2026-02-14T13:58:05.435766",
-    "total_fetches": 21,
-    "total_articles": 255
   },
   "business-analytics": {
-    "interval": 15,
     "history": [
-      20,
       0,
       0,
       6,
-      6
     ],
-    "last_fetch": "2026-02-14T13:58:05.438090",
-    "total_fetches": 21,
-    "total_articles": 203
   },
   "customer-data-platform": {
     "interval": 15,
     "history": [
-      26,
       0,
       0,
       9,
       9
     ],
-    "last_fetch": "2026-02-14T13:58:05.440124",
-    "total_fetches": 21,
-    "total_articles": 247
   },
   "data-centers": {
-    "interval": 15,
     "history": [
-      27,
       0,
       0,
       6,
-      6
     ],
-    "last_fetch": "2026-02-14T13:58:05.443427",
-    "total_fetches": 21,
-    "total_articles": 318
   },
   "cloud-computing": {
-    "interval": 15,
     "history": [
-      24,
       0,
       0,
       5,
-      7
     ],
-    "last_fetch": "2026-02-14T13:58:05.448267",
-    "total_fetches": 21,
-    "total_articles": 400
   },
   "magazines": {
     "interval": 60,
     "history": [
-      0,
       0,
       0,
       6,
-      5
     ],
-    "last_fetch": "2026-02-14T13:58:05.451427",
-    "total_fetches": 21,
-    "total_articles": 106
   },
   "data-laws": {
     "interval": 15,
     "history": [
-      29,
       0,
       0,
       9,
-      9
     ],
-    "last_fetch": "2026-02-14T13:58:05.455106",
-    "total_fetches": 21,
-    "total_articles": 327
   },
   "cloud-aws": {
     "interval": 15,
     "history": [
-      29,
       0,
       0,
       10,
       10
     ],
-    "last_fetch": "2026-02-14T13:58:05.456886",
-    "total_fetches": 21,
-    "total_articles": 514
   },
   "cloud-azure": {
     "interval": 15,
     "history": [
-      20,
       0,
       0,
       10,
       10
     ],
-    "last_fetch": "2026-02-14T13:58:05.459585",
-    "total_fetches": 21,
-    "total_articles": 360
   },
   "cloud-gcp": {
     "interval": 60,
     "history": [
-      20,
       0,
       0,
       0,
       0
     ],
-    "last_fetch": "2026-02-14T13:58:05.464655",
-    "total_fetches": 21,
     "total_articles": 380
   },
   "cloud-oracle": {
@@ -216,8 +216,8 @@
       0,
       0
     ],
-    "last_fetch": "2026-02-14T13:58:05.467651",
-    "total_fetches": 21,
     "total_articles": 20
   },
   "cloud-ibm": {
@@ -229,8 +229,8 @@
       0,
       0
     ],
-    "last_fetch": "2026-02-14T13:58:05.471190",
-    "total_fetches": 21,
     "total_articles": 20
   },
   "cloud-alibaba": {
@@ -242,8 +242,8 @@
       0,
       0
     ],
-    "last_fetch": "2026-02-14T13:58:05.472734",
-    "total_fetches": 21,
     "total_articles": 20
   },
   "cloud-digitalocean": {
@@ -255,8 +255,8 @@
       0,
       0
     ],
-    "last_fetch": "2026-02-14T13:58:05.474379",
-    "total_fetches": 21,
     "total_articles": 20
   },
   "cloud-huawei": {
@@ -268,21 +268,21 @@
       0,
       0
     ],
-    "last_fetch": "2026-02-14T13:58:05.478287",
-    "total_fetches": 21,
     "total_articles": 20
   },
   "cloud-cloudflare": {
     "interval": 60,
     "history": [
-      20,
       0,
       0,
       0,
       0
     ],
-    "last_fetch": "2026-02-14T13:58:05.483222",
-    "total_fetches": 21,
     "total_articles": 360
   }
 }

 {
   "ai": {
+    "interval": 60,
     "history": [
       0,
       0,
       6,
+      9,
+      8
     ],
+    "last_fetch": "2026-02-18T12:47:11.378844",
+    "total_fetches": 22,
+    "total_articles": 291
   },
   "data-security": {
+    "interval": 60,
     "history": [
       0,
       0,
       3,
+      2,
+      6
     ],
+    "last_fetch": "2026-02-18T12:47:11.386589",
+    "total_fetches": 22,
+    "total_articles": 192
   },
   "data-governance": {
     "interval": 60,
     "history": [
       0,
       0,
       3,
+      2,
+      6
     ],
+    "last_fetch": "2026-02-18T12:47:11.406642",
+    "total_fetches": 22,
+    "total_articles": 184
   },
   "data-privacy": {
+    "interval": 60,
     "history": [
       0,
       0,
       5,
+      5,
       5
     ],
+    "last_fetch": "2026-02-18T12:47:11.415936",
+    "total_fetches": 22,
+    "total_articles": 215
   },
   "data-engineering": {
     "interval": 60,
     "history": [
       0,
       0,
       2,
+      5,
       5
     ],
+    "last_fetch": "2026-02-18T12:47:11.419143",
+    "total_fetches": 22,
+    "total_articles": 196
   },
   "data-management": {
     "interval": 15,
     "history": [
       0,
       0,
       10,
+      10,
       10
     ],
+    "last_fetch": "2026-02-18T12:47:11.429711",
+    "total_fetches": 22,
+    "total_articles": 281
   },
   "business-intelligence": {
+    "interval": 60,
     "history": [
       0,
       0,
       8,
+      8,
+      7
     ],
+    "last_fetch": "2026-02-18T12:47:11.431810",
+    "total_fetches": 22,
+    "total_articles": 262
   },
   "business-analytics": {
+    "interval": 60,
     "history": [
       0,
       0,
       6,
+      6,
+      7
     ],
+    "last_fetch": "2026-02-18T12:47:11.433518",
+    "total_fetches": 22,
+    "total_articles": 210
   },
   "customer-data-platform": {
     "interval": 15,
     "history": [
       0,
       0,
       9,
+      9,
       9
     ],
+    "last_fetch": "2026-02-18T12:47:11.435120",
+    "total_fetches": 22,
+    "total_articles": 256
   },
   "data-centers": {
+    "interval": 60,
     "history": [
       0,
       0,
       6,
+      6,
+      8
     ],
+    "last_fetch": "2026-02-18T12:47:11.436689",
+    "total_fetches": 22,
+    "total_articles": 326
   },
   "cloud-computing": {
+    "interval": 60,
     "history": [
       0,
       0,
       5,
+      7,
+      8
     ],
+    "last_fetch": "2026-02-18T12:47:11.437988",
+    "total_fetches": 22,
+    "total_articles": 408
   },
   "magazines": {
     "interval": 60,
     "history": [
       0,
       0,
       6,
+      5,
+      1
     ],
+    "last_fetch": "2026-02-18T12:47:11.439034",
+    "total_fetches": 22,
+    "total_articles": 107
   },
   "data-laws": {
     "interval": 15,
     "history": [
       0,
       0,
       9,
+      9,
+      10
     ],
+    "last_fetch": "2026-02-18T12:47:11.440244",
+    "total_fetches": 22,
+    "total_articles": 337
   },
   "cloud-aws": {
     "interval": 15,
     "history": [
       0,
       0,
       10,
+      10,
       10
     ],
+    "last_fetch": "2026-02-18T12:47:11.441717",
+    "total_fetches": 22,
+    "total_articles": 524
   },
   "cloud-azure": {
     "interval": 15,
     "history": [
       0,
       0,
       10,
+      10,
       10
     ],
+    "last_fetch": "2026-02-18T12:47:11.443201",
+    "total_fetches": 22,
+    "total_articles": 370
   },
   "cloud-gcp": {
     "interval": 60,
     "history": [
+      0,
       0,
       0,
       0,
       0
     ],
+    "last_fetch": "2026-02-18T12:47:11.444443",
+    "total_fetches": 22,
     "total_articles": 380
   },
   "cloud-oracle": {
       0,
       0
     ],
+    "last_fetch": "2026-02-18T12:47:11.445683",
+    "total_fetches": 22,
     "total_articles": 20
   },
   "cloud-ibm": {
       0,
       0
     ],
+    "last_fetch": "2026-02-18T12:47:11.446826",
+    "total_fetches": 22,
     "total_articles": 20
   },
   "cloud-alibaba": {
       0,
       0
     ],
+    "last_fetch": "2026-02-18T12:47:11.450091",
+    "total_fetches": 22,
     "total_articles": 20
   },
   "cloud-digitalocean": {
       0,
       0
     ],
+    "last_fetch": "2026-02-18T12:47:11.455437",
+    "total_fetches": 22,
     "total_articles": 20
   },
   "cloud-huawei": {
       0,
       0
     ],
+    "last_fetch": "2026-02-18T12:47:11.461316",
+    "total_fetches": 22,
     "total_articles": 20
   },
   "cloud-cloudflare": {
     "interval": 60,
     "history": [
+      0,
       0,
       0,
       0,
       0
     ],
+    "last_fetch": "2026-02-18T12:47:11.464407",
+    "total_fetches": 22,
     "total_articles": 360
   }
 }

docs/appwrite_schema.md DELETED Viewed

@@ -1,77 +0,0 @@
-# Appwrite Database Schema Configuration
-# Instructions for setting up indexes in Appwrite Console
-## Collection: articles
-### Attributes
-```json
-{
-  "$id": "string (16 chars, auto-generated)",
-  "url_hash": "string (16 chars, required, unique)",
-  "title": "string (500 chars, required)",
-  "url": "string (2048 chars, required)",
-  "description": "string (2000 chars, optional)",
-  "image_url": "string (1000 chars, optional)",
-  "published_at": "string (50 chars, required, ISO format)",
-  "category": "string (50 chars, required)",
-  "source": "string (200 chars, optional)",
-  "fetched_at": "string (50 chars, required, ISO format)",
-  "slug": "string (200 chars, optional)",
-  "quality_score": "integer (optional, default: 50)"
-}
-```
-### Indexes (CRITICAL FOR PERFORMANCE)
-#### 1. Primary Index: url_hash (Unique Constraint)
-- **Type:** unique
-- **Attribute:** url_hash
-- **Order:** ASC
-- **Purpose:** Prevents duplicate articles at database level
-- **Impact:** Enforces data integrity, eliminates dedup logic in code
-#### 2. Composite Index: category + published_at (MOST IMPORTANT)
-- **Type:** key
-- **Attributes:** [category, published_at]
-- **Orders:** [ASC, DESC]
-- **Purpose:** Powers the main query: "Get latest articles for category X"
-- **Impact:** 40x faster than full table scan
-- **Query Example:**
-  ```sql
-  WHERE category = 'ai' ORDER BY published_at DESC LIMIT 20
-  ```
-#### 3. Index: published_at (For Global Feed)
-- **Type:** key
-- **Attribute:** published_at
-- **Order:** DESC
-- **Purpose:** Get latest articles across all categories
-- **Impact:** Fast global news feed
-- **Query Example:**
-  ```sql
-  ORDER BY published_at DESC LIMIT 50
-  ```
-#### 4. Index: source (For Analytics)
-- **Type:** key
-- **Attribute:** source
-- **Order:** ASC
-- **Purpose:** Provider statistics and filtering
-- **Impact:** Fast source-based queries
-## Setup Instructions
-### Via Appwrite Console:
-1. Go to Databases → articles collection
-2. Click "Indexes" tab
-3. Add each index with the specifications above
-### Expected Performance Gains:
-- List query (category filter): 40x faster
-- Global feed query: 30x faster
-- Deduplication: Automatic (no code needed)
-## Migration Notes
-- Existing articles will be automatically indexed
-- Index creation may take a few minutes for large collections
-- No downtime required

docs/phase2_implementation_guide.md DELETED Viewed

@@ -1,269 +0,0 @@
-# Phase 2: Database Schema Enhancement - Implementation Guide
-## Overview
-This guide walks you through adding indexes and new fields to your Appwrite database for FAANG-level performance.
----
-## Step 1: Add New Attributes (Appwrite Console)
-### Navigate to Database
-1. Go to https://cloud.appwrite.io/console
-2. Select your project
-3. Go to Databases → Your Database → `articles` collection
-4. Click "Attributes" tab
-### Add New Attributes
-#### Attribute 1: slug
-- **Key:** `slug`
-- **Type:** String
-- **Size:** 200
-- **Required:** No (will be populated by migration)
-- **Default:** "" (empty string)
-- **Purpose:** SEO-friendly URL slugs
-#### Attribute 2: quality_score
-- **Key:** `quality_score`
-- **Type:** Integer
-- **Required:** No
-- **Default:** 50
-- **Min:** 0
-- **Max:** 100
-- **Purpose:** Article quality ranking
-### Click "Create" for each attribute
----
-## Step 2: Create Indexes (Critical for Performance!)
-### Navigate to Indexes
-1. In the same collection, click "Indexes" tab
-2. Click "Create Index" button
-### Index 1: url_hash (UNIQUE CONSTRAINT)
-- **Key:** `idx_url_hash_unique`
-- **Type:** Unique
-- **Attributes:** Select `url_hash`
-- **Order:** ASC
-- **Purpose:** Prevents duplicate articles automatically
-- **Impact:** Database-level deduplication
-### Index 2: category + published_at (COMPOSITE - MOST IMPORTANT!)
-- **Key:** `idx_category_published`
-- **Type:** Key
-- **Attributes:** Select `category` AND `published_at` (in that order)
-- **Orders:** `category` ASC, `published_at` DESC
-- **Purpose:** Powers main query: "Get latest AI articles"
-- **Impact:** 40x faster than without index
-### Index 3: published_at (GLOBAL FEED)
-- **Key:** `idx_published_desc`
-- **Type:** Key
-- **Attributes:** Select `published_at`
-- **Order:** DESC
-- **Purpose:** Get latest articles across all categories
-- **Impact:** Fast global news feed
-### Index 4: source (ANALYTICS)
-- **Key:** `idx_source`
-- **Type:** Key
-- **Attributes:** Select `source`
-- **Order:** ASC
-- **Purpose:** Provider statistics
-- **Impact:** Fast source-based filtering
-### Click "Create" for each index
----
-## Step 3: Run Migration Script
-The migration script will backfill `slug` and `quality_score` for all existing articles.
-### Option A: Manual Run (Recommended for first time)
-```bash
-# Navigate to backend directory
-cd SegmentoPulse/backend
-# Activate virtual environment (if using)
-source venv/bin/activate  # Linux/Mac
-# or
-.venv\Scripts\activate  # Windows
-# Run migration script
-python scripts/migrate_article_fields.py
-```
-**Expected Output:**
-```
-========================================================
-📊 Appwrite Article Migration Script
-========================================================
-Database: segmento_db
-Collection: articles
-📥 Fetching articles 1 to 100...
-📝 Processing 100 articles...
-  ✓ Updated: Google Announces New AI... (score: 85)
-  ✓ Updated: Data Security Report 2026... (score: 70)
-  ...
-📥 Fetching articles 101 to 200...
-...
-========================================================
-📊 MIGRATION SUMMARY
-========================================================
-✅ Updated: 1,250 articles
-⏭️  Skipped: 0 articles
-❌ Errors: 0 articles
-📈 Total Processed: 1,250
-========================================================
-```
-### Option B: Via Admin API (Future)
-```bash
-# Trigger via admin endpoint (once implemented)
-curl -X POST http://localhost:8000/api/admin/migrate/articles
-```
----
-## Step 4: Verify Implementation
-### Test 1: Check Indexes Are Used
-```python
-# In Python console
-from app.services.appwrite_db import get_appwrite_db
-db = get_appwrite_db()
-articles = await db.get_articles('ai', limit=20)
-# Should see in logs:
-# ✓ Retrieved 20 articles for 'ai' (offset: 0, projection: ON)
-```
-### Test 2: Check New Fields Are Populated
-```python
-# Verify slug and quality_score exist
-for article in articles[:5]:
-    print(f"{article.get('title')}")
-    print(f"  Slug: {article.get('slug')}")
-    print(f"  Quality: {article.get('quality_score')}")
-    print()
-```
-**Expected:**
-```
-Google Announces New AI Model
-  Slug: google-announces-new-ai-model
-  Quality: 85
-Apple Vision Pro 2 Released
-  Slug: apple-vision-pro-2-released
-  Quality: 90
-```
-### Test 3: Verify Deduplication
-```bash
-# Try to trigger a news fetch manually
-curl -X POST http://localhost:8000/api/admin/scheduler/fetch-now
-# Check logs for:
-# ✅ ai: 20 fetched, 2 saved, 18 duplicates
-```
----
-## Step 5: Monitor Performance
-### Before Indexes (Baseline)
-```bash
-# Query time without indexes: ~2000ms for 1000+ articles
-```
-### After Indexes (Expected)
-```bash
-# Query time with indexes: ~50ms (40x faster!) ✅
-```
-### Check Index Usage (Appwrite Console)
-1. Go to your collection
-2. Click "Indexes" tab
-3. Each index should show usage statistics
----
-## Troubleshooting
-### Issue: "Attribute already exists"
-- **Solution:** The attribute was already created. Skip to next step.
-### Issue: "Index creation failed"
-- **Cause:** May need to specify different index type or attributes
-- **Solution:** Check Appwrite documentation for your SDK version
-### Issue: Migration script can't find articles
-- **Cause:** Wrong database/collection ID
-- **Solution:** Verify environment variables:
-  ```bash
-  echo $APPWRITE_DATABASE_ID
-  echo $APPWRITE_COLLECTION_ID
-  ```
-### Issue: Migration is slow
-- **Cause:** Large collection (10k+ articles)
-- **Solution:** This is normal. Script processes 100 articles at a time.
-- **Time estimate:** ~1 minute per 1,000 articles
----
-## Rollback Plan (If Needed)
-### Remove Attributes (if needed)
-1. Go to Appwrite Console → Attributes
-2. Click ⋮ menu next to `slug` or `quality_score`
-3. Select "Delete"
-### Remove Indexes
-1. Go to Appwrite Console → Indexes
-2. Click ⋮ menu next to index
-3. Select "Delete"
-**Note:** Deleting indexes won't delete data, just the index structure.
----
-## Performance Impact Summary
-| Operation | Before | After | Improvement |
-|-----------|--------|-------|-------------|
-| **Category Query** | 2000ms | 50ms | **40x faster** |
-| **Duplicate Check** | App logic | DB unique constraint | **Automatic** |
-| **Deduplication Rate** | ~47% | ~47% | **More reliable** |
-| **Quality Ranking** | Not possible | Sort by score | **New feature** |
----
-## Next Steps
-After completing Phase 2:
-- [ ] Verify all indexes are created
-- [ ] Run migration script successfully
-- [ ] Test query performance
-- [ ] Move to Phase 3: Cursor Pagination
----
-## Questions?
-- **How often should I re-run migration?** Never. New articles automatically get slug and quality_score.
-- **What if I add more articles?** They'll automatically have the new fields from the updated save_articles() method.
-- **Can I skip indexes?** No! Indexes are critical for performance at scale.

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ