SHAFI committed on
Commit
32e07da
·
1 Parent(s): d3e3e7e

fixed time issue

Browse files
app/services/news_providers.py CHANGED
@@ -1,6 +1,7 @@
1
  import httpx
2
  from typing import List, Optional, Dict
3
  from datetime import datetime, timezone, timedelta
 
4
  from abc import ABC, abstractmethod
5
  from app.models import Article
6
  import os
@@ -87,12 +88,21 @@ class GNewsProvider(NewsProvider):
87
  query = self.category_map.get(category, category)
88
  url = f"{self.base_url}/search"
89
 
90
- # Build a window from midnight UTC today to right now.
91
- # This is a strict CALENDAR DAY filter, not a rolling 24-hour window.
92
- # A job running at 11:59 PM will still only fetch today's articles,
93
- # not anything from yesterday.
94
- _now = datetime.now(timezone.utc)
95
- _cutoff = _now.replace(hour=0, minute=0, second=0, microsecond=0) # 00:00:00 UTC today
 
 
 
 
 
 
 
 
 
96
 
97
  params = {
98
  'q': query,
@@ -100,8 +110,8 @@ class GNewsProvider(NewsProvider):
100
  'country': 'us',
101
  'max': min(limit, 10), # GNews free tier max 10
102
  'apikey': self.api_key,
103
- 'from': _cutoff.strftime('%Y-%m-%dT%H:%M:%SZ'),
104
- 'to': _now.strftime('%Y-%m-%dT%H:%M:%SZ'),
105
  }
106
 
107
  async with httpx.AsyncClient(timeout=10.0) as client:
@@ -195,9 +205,14 @@ class NewsAPIProvider(NewsProvider):
195
  query = self.category_keywords.get(category, category)
196
  url = f"{self.base_url}/everything"
197
 
198
- # Ask NewsAPI for articles published since midnight UTC today.
199
- # Calendar day window: resets cleanly at 00:00:00 UTC every day.
200
- _cutoff = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
 
 
 
 
 
201
 
202
  params = {
203
  'q': query,
@@ -205,7 +220,7 @@ class NewsAPIProvider(NewsProvider):
205
  'sortBy': 'publishedAt',
206
  'pageSize': min(limit, 20),
207
  'apiKey': self.api_key,
208
- 'from': _cutoff.strftime('%Y-%m-%dT%H:%M:%SZ'),
209
  }
210
 
211
  async with httpx.AsyncClient(timeout=10.0) as client:
 
1
  import httpx
2
  from typing import List, Optional, Dict
3
  from datetime import datetime, timezone, timedelta
4
+ from zoneinfo import ZoneInfo # stdlib from Python 3.9+ — no extra install needed
5
  from abc import ABC, abstractmethod
6
  from app.models import Article
7
  import os
 
88
  query = self.category_map.get(category, category)
89
  url = f"{self.base_url}/search"
90
 
91
+ # Build a window from midnight IST today (converted to UTC) to right now.
92
+ #
93
+ # Why IST midnight and not UTC midnight?
94
+ # IST is UTC+5:30. If we used UTC midnight as the "from" date, GNews
95
+ # would skip articles published in India between 12:00 AM IST and
96
+ # 5:30 AM IST — the first 5.5 hours of the Indian day.
97
+ # By computing IST midnight and converting it to UTC, we tell GNews:
98
+ # "Give me everything published since the Indian day started".
99
+ _ist_zone = ZoneInfo("Asia/Kolkata")
100
+ _now_ist = datetime.now(_ist_zone)
101
+ _cutoff_ist = _now_ist.replace(hour=0, minute=0, second=0, microsecond=0)
102
+ # Convert IST midnight → UTC so the API gets a valid UTC timestamp.
103
+ _cutoff_utc = _cutoff_ist.astimezone(timezone.utc)
104
+ # Current moment in UTC for the "to" bound.
105
+ _now_utc = datetime.now(timezone.utc)
106
 
107
  params = {
108
  'q': query,
 
110
  'country': 'us',
111
  'max': min(limit, 10), # GNews free tier max 10
112
  'apikey': self.api_key,
113
+ 'from': _cutoff_utc.strftime('%Y-%m-%dT%H:%M:%SZ'), # IST midnight in UTC
114
+ 'to': _now_utc.strftime('%Y-%m-%dT%H:%M:%SZ'),
115
  }
116
 
117
  async with httpx.AsyncClient(timeout=10.0) as client:
 
205
  query = self.category_keywords.get(category, category)
206
  url = f"{self.base_url}/everything"
207
 
208
+ # Ask NewsAPI for articles published since midnight IST today.
209
+ # We compute IST midnight and convert it to UTC before sending it
210
+ # to the API, because NewsAPI expects UTC timestamps.
211
+ # This gives Indian users full coverage from their midnight onwards.
212
+ _ist_zone = ZoneInfo("Asia/Kolkata")
213
+ _now_ist = datetime.now(_ist_zone)
214
+ _cutoff_ist = _now_ist.replace(hour=0, minute=0, second=0, microsecond=0)
215
+ _cutoff_utc = _cutoff_ist.astimezone(timezone.utc) # Convert to UTC for the API
216
 
217
  params = {
218
  'q': query,
 
220
  'sortBy': 'publishedAt',
221
  'pageSize': min(limit, 20),
222
  'apiKey': self.api_key,
223
+ 'from': _cutoff_utc.strftime('%Y-%m-%dT%H:%M:%SZ'), # IST midnight in UTC
224
  }
225
 
226
  async with httpx.AsyncClient(timeout=10.0) as client:
app/services/scheduler.py CHANGED
@@ -13,6 +13,7 @@ import pytz
13
  from app.services.news_aggregator import NewsAggregator
14
  from app.services.appwrite_db import get_appwrite_db
15
  from app.services.cache_service import CacheService
 
16
  from app.services.adaptive_scheduler import get_adaptive_scheduler, AdaptiveScheduler
17
  from app.services.research_aggregator import ResearchAggregator
18
  from app.config import settings
@@ -301,7 +302,31 @@ async def fetch_single_category_job(category: str):
301
  invalid_count, irrelevant_count
302
  )
303
 
304
- # Step 3: Update Redis article cache so the API serves fresh results.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  try:
306
  await cache_service.set(f"news:{category}", articles, ttl=settings.CACHE_TTL)
307
  except Exception as cache_err:
 
13
  from app.services.news_aggregator import NewsAggregator
14
  from app.services.appwrite_db import get_appwrite_db
15
  from app.services.cache_service import CacheService
16
+ from app.services.upstash_cache import get_upstash_cache # Needed to bust stale news_v3 keys
17
  from app.services.adaptive_scheduler import get_adaptive_scheduler, AdaptiveScheduler
18
  from app.services.research_aggregator import ResearchAggregator
19
  from app.config import settings
 
302
  invalid_count, irrelevant_count
303
  )
304
 
305
+ # Step 3a: Bust the Upstash news_v3 cache for this category.
306
+ #
307
+ # The news route (/api/news/<category>) caches its response in Upstash
308
+ # under the key "news_v3:<category>:page:<N>:l<limit>".
309
+ # Without this delete, a user hitting the page right after an Appwrite
310
+ # save would still get the stale 5-minute-old response (which may be
311
+ # empty), because the cache has not expired yet.
312
+ #
313
+ # Fix: the moment we save new articles, we surgically delete page-1
314
+ # of this category's cache. This forces the very next API call to
315
+ # bypass the cache and read fresh data from Appwrite.
316
+ if saved_count > 0:
317
+ try:
318
+ upstash = get_upstash_cache()
319
+ # Delete the most-visited page (page 1, default limit 20).
320
+ # Other pages will expire naturally on their 5-min TTL.
321
+ stale_key = f"news_v3:{category}:page:1:l20"
322
+ await upstash.delete(stale_key)
323
+ logger.info("[CACHE BUST] Deleted stale key '%s' β€” fresh articles will appear immediately.", stale_key)
324
+ except Exception as bust_err:
325
+ # Cache bust failure is not fatal — articles are already in Appwrite.
326
+ # The stale cache will expire on its own in at most 5 minutes.
327
+ logger.debug("[CACHE BUST] Could not delete stale key: %s", bust_err)
328
+
329
+ # Step 3b: Also update the legacy Redis L1 article cache.
330
  try:
331
  await cache_service.set(f"news:{category}", articles, ttl=settings.CACHE_TTL)
332
  except Exception as cache_err:
app/utils/data_validation.py CHANGED
@@ -9,6 +9,7 @@ EMERGENCY HOTFIX (2026-01-23): Fixed AttributeError 'Article' object has no attr
9
 
10
  from typing import Dict, Optional, List, Union
11
  from datetime import datetime, timezone, timedelta
 
12
  import re
13
  from urllib.parse import urlparse
14
  from dateutil import parser as dateutil_parser
@@ -71,16 +72,21 @@ def is_valid_article(article: Union[Dict, 'Article']) -> bool:
71
  return False
72
 
73
  # ── FRESHNESS GATE ────────────────────────────────────────────────────────
74
- # We only want articles published within the last 24 hours.
 
 
 
 
 
 
 
 
75
  #
76
  # CRITICAL ORDER: This check runs on the RAW date string, before
77
  # normalize_article_date() gets a chance to run. That function has a
78
  # silent fallback: if a date is unparseable it stamps the article with
79
  # 'right now'. Without this guard, a 3-day-old article with a broken
80
  # date string would survive normalization and appear fresh.
81
- #
82
- # Here we do the opposite: if we cannot confidently parse the date,
83
- # we reject the article. No date = no entry.
84
  try:
85
  if isinstance(raw_date, datetime):
86
  pub_dt = raw_date
@@ -91,15 +97,18 @@ def is_valid_article(article: Union[Dict, 'Article']) -> bool:
91
  if pub_dt.tzinfo is None:
92
  pub_dt = pub_dt.replace(tzinfo=timezone.utc)
93
 
94
- # Midnight UTC today — strict calendar day, not a rolling 24h window.
95
- # Example: at 11:59 PM, this is still 00:00:00 of the current day,
96
- # so only today's articles pass. Yesterday's articles are rejected
97
- # regardless of what time the job fires.
98
- cutoff = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
 
99
 
100
- if pub_dt < cutoff:
101
- # Article is older than 24 hours — reject it here before any
102
- # keyword matching or Redis dedup call wastes time on it.
 
 
103
  return False
104
 
105
  except Exception:
 
9
 
10
  from typing import Dict, Optional, List, Union
11
  from datetime import datetime, timezone, timedelta
12
+ from zoneinfo import ZoneInfo # stdlib from Python 3.9+ — no extra install needed
13
  import re
14
  from urllib.parse import urlparse
15
  from dateutil import parser as dateutil_parser
 
72
  return False
73
 
74
  # ── FRESHNESS GATE ────────────────────────────────────────────────────────
75
+ # We only want articles published today, where "today" is measured in
76
+ # Indian Standard Time (IST = UTC+5:30) — because that is where our
77
+ # users are.
78
+ #
79
+ # Why IST and not UTC?
80
+ # With UTC midnight as the cutoff, articles published in India between
81
+ # 12:00 AM IST and 5:30 AM IST (the first 5.5 hours of the Indian day)
82
+ # were incorrectly rejected, because UTC midnight had not yet arrived.
83
+ # Switching to IST midnight gives Indian users a full 24-hour day.
84
  #
85
  # CRITICAL ORDER: This check runs on the RAW date string, before
86
  # normalize_article_date() gets a chance to run. That function has a
87
  # silent fallback: if a date is unparseable it stamps the article with
88
  # 'right now'. Without this guard, a 3-day-old article with a broken
89
  # date string would survive normalization and appear fresh.
 
 
 
90
  try:
91
  if isinstance(raw_date, datetime):
92
  pub_dt = raw_date
 
97
  if pub_dt.tzinfo is None:
98
  pub_dt = pub_dt.replace(tzinfo=timezone.utc)
99
 
100
+ # Step 1: Find midnight IST today.
101
+ # We get the current moment in IST, then zero out hours/minutes/seconds.
102
+ # This gives us "12:00:00 AM of today in India".
103
+ ist_zone = ZoneInfo("Asia/Kolkata")
104
+ now_ist = datetime.now(ist_zone)
105
+ cutoff_ist = now_ist.replace(hour=0, minute=0, second=0, microsecond=0)
106
 
107
+ # Step 2: The article timestamp may be in any timezone (UTC, EST, etc.).
108
+ # Python's datetime comparison handles mixed timezones correctly as long
109
+ # as both sides are timezone-aware — which they both are here.
110
+ if pub_dt < cutoff_ist:
111
+ # Article was published before midnight IST today — reject it.
112
  return False
113
 
114
  except Exception: