Spaces:

WORKWITHSHAFISK
/

segmentopulse-backend

Paused

App Files Files Community

SHAFI committed on Feb 26

Commit

32e07da

1 Parent(s): d3e3e7e

fixed time issue

Browse files

Files changed (3) hide show

app/services/news_providers.py +27 -12
app/services/scheduler.py +26 -1
app/utils/data_validation.py +21 -12

app/services/news_providers.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import httpx
 from typing import List, Optional, Dict
 from datetime import datetime, timezone, timedelta
 from abc import ABC, abstractmethod
 from app.models import Article
 import os
@@ -87,12 +88,21 @@ class GNewsProvider(NewsProvider):
             query = self.category_map.get(category, category)
             url = f"{self.base_url}/search"
-            # Build a window from midnight UTC today to right now.
-            # This is a strict CALENDAR DAY filter, not a rolling 24-hour window.
-            # A job running at 11:59 PM will still only fetch today's articles,
-            # not anything from yesterday.
-            _now    = datetime.now(timezone.utc)
-            _cutoff = _now.replace(hour=0, minute=0, second=0, microsecond=0)  # 00:00:00 UTC today
             params = {
                 'q': query,
@@ -100,8 +110,8 @@ class GNewsProvider(NewsProvider):
                 'country': 'us',
                 'max': min(limit, 10),  # GNews free tier max 10
                 'apikey': self.api_key,
-                'from': _cutoff.strftime('%Y-%m-%dT%H:%M:%SZ'),
-                'to':   _now.strftime('%Y-%m-%dT%H:%M:%SZ'),
             }
             async with httpx.AsyncClient(timeout=10.0) as client:
@@ -195,9 +205,14 @@ class NewsAPIProvider(NewsProvider):
             query = self.category_keywords.get(category, category)
             url = f"{self.base_url}/everything"
-            # Ask NewsAPI for articles published since midnight UTC today.
-            # Calendar day window: resets cleanly at 00:00:00 UTC every day.
-            _cutoff = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
             params = {
                 'q': query,
@@ -205,7 +220,7 @@ class NewsAPIProvider(NewsProvider):
                 'sortBy': 'publishedAt',
                 'pageSize': min(limit, 20),
                 'apiKey': self.api_key,
-                'from': _cutoff.strftime('%Y-%m-%dT%H:%M:%SZ'),
             }
             async with httpx.AsyncClient(timeout=10.0) as client:

 import httpx
 from typing import List, Optional, Dict
 from datetime import datetime, timezone, timedelta
+from zoneinfo import ZoneInfo   # stdlib since Python 3.9; needs the `tzdata` package only where no system tz database exists (e.g. Windows, slim containers)
 from abc import ABC, abstractmethod
 from app.models import Article
 import os
             query = self.category_map.get(category, category)
             url = f"{self.base_url}/search"
+            # Build a window from midnight IST today (converted to UTC) to right now.
+            #
+            # Why IST midnight and not UTC midnight?
+            # IST is UTC+5:30. If we used UTC midnight as the "from" date, GNews
+            # would skip articles published in India between 12:00 AM IST and
+            # 5:30 AM IST — the first 5.5 hours of the Indian day.
+            # By computing IST midnight and converting it to UTC, we tell GNews:
+            # "Give me everything published since the Indian day started".
+            _ist_zone   = ZoneInfo("Asia/Kolkata")
+            _now_ist    = datetime.now(_ist_zone)
+            _cutoff_ist = _now_ist.replace(hour=0, minute=0, second=0, microsecond=0)
+            # Convert IST midnight → UTC so the API gets a valid UTC timestamp.
+            _cutoff_utc = _cutoff_ist.astimezone(timezone.utc)
+            # Current moment in UTC for the "to" bound.
+            _now_utc    = datetime.now(timezone.utc)
             params = {
                 'q': query,
                 'country': 'us',
                 'max': min(limit, 10),  # GNews free tier max 10
                 'apikey': self.api_key,
+                'from': _cutoff_utc.strftime('%Y-%m-%dT%H:%M:%SZ'),  # IST midnight in UTC
+                'to':   _now_utc.strftime('%Y-%m-%dT%H:%M:%SZ'),
             }
             async with httpx.AsyncClient(timeout=10.0) as client:
             query = self.category_keywords.get(category, category)
             url = f"{self.base_url}/everything"
+            # Ask NewsAPI for articles published since midnight IST today.
+            # We compute IST midnight and convert it to UTC before sending it
+            # to the API, because NewsAPI expects UTC timestamps.
+            # This gives Indian users full coverage from their midnight onwards.
+            _ist_zone   = ZoneInfo("Asia/Kolkata")
+            _now_ist    = datetime.now(_ist_zone)
+            _cutoff_ist = _now_ist.replace(hour=0, minute=0, second=0, microsecond=0)
+            _cutoff_utc = _cutoff_ist.astimezone(timezone.utc)  # Convert to UTC for the API
             params = {
                 'q': query,
                 'sortBy': 'publishedAt',
                 'pageSize': min(limit, 20),
                 'apiKey': self.api_key,
+                'from': _cutoff_utc.strftime('%Y-%m-%dT%H:%M:%SZ'),  # IST midnight in UTC
             }
             async with httpx.AsyncClient(timeout=10.0) as client:

app/services/scheduler.py CHANGED Viewed

@@ -13,6 +13,7 @@ import pytz
 from app.services.news_aggregator import NewsAggregator
 from app.services.appwrite_db import get_appwrite_db
 from app.services.cache_service import CacheService
 from app.services.adaptive_scheduler import get_adaptive_scheduler, AdaptiveScheduler
 from app.services.research_aggregator import ResearchAggregator
 from app.config import settings
@@ -301,7 +302,31 @@ async def fetch_single_category_job(category: str):
                 invalid_count, irrelevant_count
             )
-            # Step 3: Update Redis article cache so the API serves fresh results.
             try:
                 await cache_service.set(f"news:{category}", articles, ttl=settings.CACHE_TTL)
             except Exception as cache_err:

 from app.services.news_aggregator import NewsAggregator
 from app.services.appwrite_db import get_appwrite_db
 from app.services.cache_service import CacheService
+from app.services.upstash_cache import get_upstash_cache   # Needed to bust stale news_v3 keys
 from app.services.adaptive_scheduler import get_adaptive_scheduler, AdaptiveScheduler
 from app.services.research_aggregator import ResearchAggregator
 from app.config import settings
                 invalid_count, irrelevant_count
             )
+            # Step 3a: Bust the Upstash news_v3 cache for this category.
+            #
+            # The news route (/api/news/<category>) caches its response in Upstash
+            # under the key  "news_v3:<category>:page:<N>:l<limit>".
+            # Without this delete, a user hitting the page right after an Appwrite
+            # save would still get the stale 5-minute-old response (which may be
+            # empty), because the cache has not expired yet.
+            #
+            # Fix: the moment we save new articles, we surgically delete page-1
+            # of this category's cache. This forces the very next API call to
+            # bypass the cache and read fresh data from Appwrite.
+            if saved_count > 0:
+                try:
+                    upstash = get_upstash_cache()
+                    # Delete the most-visited page (page 1, default limit 20).
+                    # Other pages will expire naturally on their 5-min TTL.
+                    stale_key = f"news_v3:{category}:page:1:l20"
+                    await upstash.delete(stale_key)
+                    logger.info("[CACHE BUST] Deleted stale key '%s' — fresh articles will appear immediately.", stale_key)
+                except Exception as bust_err:
+                    # Cache bust failure is not fatal — articles are already in Appwrite.
+                    # The stale cache will expire on its own in at most 5 minutes.
+                    logger.debug("[CACHE BUST] Could not delete stale key: %s", bust_err)
+            # Step 3b: Also update the legacy Redis L1 article cache.
             try:
                 await cache_service.set(f"news:{category}", articles, ttl=settings.CACHE_TTL)
             except Exception as cache_err:

app/utils/data_validation.py CHANGED Viewed

@@ -9,6 +9,7 @@ EMERGENCY HOTFIX (2026-01-23): Fixed AttributeError 'Article' object has no attr
 from typing import Dict, Optional, List, Union
 from datetime import datetime, timezone, timedelta
 import re
 from urllib.parse import urlparse
 from dateutil import parser as dateutil_parser
@@ -71,16 +72,21 @@ def is_valid_article(article: Union[Dict, 'Article']) -> bool:
         return False
     # ── FRESHNESS GATE ────────────────────────────────────────────────────────
-    # We only want articles published within the last 24 hours.
     #
     # CRITICAL ORDER: This check runs on the RAW date string, before
     # normalize_article_date() gets a chance to run. That function has a
     # silent fallback: if a date is unparseable it stamps the article with
     # 'right now'. Without this guard, a 3-day-old article with a broken
     # date string would survive normalization and appear fresh.
-    #
-    # Here we do the opposite: if we cannot confidently parse the date,
-    # we reject the article. No date = no entry.
     try:
         if isinstance(raw_date, datetime):
             pub_dt = raw_date
@@ -91,15 +97,18 @@ def is_valid_article(article: Union[Dict, 'Article']) -> bool:
         if pub_dt.tzinfo is None:
             pub_dt = pub_dt.replace(tzinfo=timezone.utc)
-        # Midnight UTC today — strict calendar day, not a rolling 24h window.
-        # Example: at 11:59 PM, this is still 00:00:00 of the current day,
-        # so only today's articles pass. Yesterday's articles are rejected
-        # regardless of what time the job fires.
-        cutoff = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
-        if pub_dt < cutoff:
-            # Article is older than 24 hours — reject it here before any
-            # keyword matching or Redis dedup call wastes time on it.
             return False
     except Exception:

 from typing import Dict, Optional, List, Union
 from datetime import datetime, timezone, timedelta
+from zoneinfo import ZoneInfo   # stdlib since Python 3.9; needs the `tzdata` package only where no system tz database exists (e.g. Windows, slim containers)
 import re
 from urllib.parse import urlparse
 from dateutil import parser as dateutil_parser
         return False
     # ── FRESHNESS GATE ────────────────────────────────────────────────────────
+    # We only want articles published today, where "today" is measured in
+    # Indian Standard Time (IST = UTC+5:30) — because that is where our
+    # users are.
+    #
+    # Why IST and not UTC?
+    # With UTC midnight as the cutoff, articles published in India between
+    # 12:00 AM IST and 5:30 AM IST (the first 5.5 hours of the Indian day)
+    # were incorrectly rejected: their timestamps fall before 00:00 UTC
+    # (which is 5:30 AM IST), i.e. on the previous UTC calendar day.
+    # Switching to IST midnight covers the whole Indian calendar day so far.
     #
     # CRITICAL ORDER: This check runs on the RAW date string, before
     # normalize_article_date() gets a chance to run. That function has a
     # silent fallback: if a date is unparseable it stamps the article with
     # 'right now'. Without this guard, a 3-day-old article with a broken
     # date string would survive normalization and appear fresh.
     try:
         if isinstance(raw_date, datetime):
             pub_dt = raw_date
         if pub_dt.tzinfo is None:
             pub_dt = pub_dt.replace(tzinfo=timezone.utc)
+        # Step 1: Find midnight IST today.
+        # We get the current moment in IST, then zero out hours/minutes/seconds.
+        # This gives us "12:00:00 AM of today in India".
+        ist_zone   = ZoneInfo("Asia/Kolkata")
+        now_ist    = datetime.now(ist_zone)
+        cutoff_ist = now_ist.replace(hour=0, minute=0, second=0, microsecond=0)
+        # Step 2: The article timestamp may be in any timezone (UTC, EST, etc.).
+        # Python's datetime comparison handles mixed timezones correctly as long
+        # as both sides are timezone-aware — which they both are here.
+        if pub_dt < cutoff_ist:
+            # Article was published before midnight IST today — reject it.
             return False
     except Exception: