SHAFI committed on
Commit ·
32e07da
1
Parent(s): d3e3e7e
fixed time issue
Browse files- app/services/news_providers.py +27 -12
- app/services/scheduler.py +26 -1
- app/utils/data_validation.py +21 -12
app/services/news_providers.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import httpx
|
| 2 |
from typing import List, Optional, Dict
|
| 3 |
from datetime import datetime, timezone, timedelta
|
|
|
|
| 4 |
from abc import ABC, abstractmethod
|
| 5 |
from app.models import Article
|
| 6 |
import os
|
|
@@ -87,12 +88,21 @@ class GNewsProvider(NewsProvider):
|
|
| 87 |
query = self.category_map.get(category, category)
|
| 88 |
url = f"{self.base_url}/search"
|
| 89 |
|
| 90 |
-
# Build a window from midnight
|
| 91 |
-
#
|
| 92 |
-
#
|
| 93 |
-
#
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
params = {
|
| 98 |
'q': query,
|
|
@@ -100,8 +110,8 @@ class GNewsProvider(NewsProvider):
|
|
| 100 |
'country': 'us',
|
| 101 |
'max': min(limit, 10), # GNews free tier max 10
|
| 102 |
'apikey': self.api_key,
|
| 103 |
-
'from':
|
| 104 |
-
'to':
|
| 105 |
}
|
| 106 |
|
| 107 |
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
@@ -195,9 +205,14 @@ class NewsAPIProvider(NewsProvider):
|
|
| 195 |
query = self.category_keywords.get(category, category)
|
| 196 |
url = f"{self.base_url}/everything"
|
| 197 |
|
| 198 |
-
# Ask NewsAPI for articles published since midnight
|
| 199 |
-
#
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
params = {
|
| 203 |
'q': query,
|
|
@@ -205,7 +220,7 @@ class NewsAPIProvider(NewsProvider):
|
|
| 205 |
'sortBy': 'publishedAt',
|
| 206 |
'pageSize': min(limit, 20),
|
| 207 |
'apiKey': self.api_key,
|
| 208 |
-
'from':
|
| 209 |
}
|
| 210 |
|
| 211 |
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
|
|
| 1 |
import httpx
|
| 2 |
from typing import List, Optional, Dict
|
| 3 |
from datetime import datetime, timezone, timedelta
|
| 4 |
+
from zoneinfo import ZoneInfo # stdlib from Python 3.9+ — no extra install needed
|
| 5 |
from abc import ABC, abstractmethod
|
| 6 |
from app.models import Article
|
| 7 |
import os
|
|
|
|
| 88 |
query = self.category_map.get(category, category)
|
| 89 |
url = f"{self.base_url}/search"
|
| 90 |
|
| 91 |
+
# Build a window from midnight IST today (converted to UTC) to right now.
|
| 92 |
+
#
|
| 93 |
+
# Why IST midnight and not UTC midnight?
|
| 94 |
+
# IST is UTC+5:30. If we used UTC midnight as the "from" date, GNews
|
| 95 |
+
# would skip articles published in India between 12:00 AM IST and
|
| 96 |
+
# 5:30 AM IST — the first 5.5 hours of the Indian day.
|
| 97 |
+
# By computing IST midnight and converting it to UTC, we tell GNews:
|
| 98 |
+
# "Give me everything published since the Indian day started".
|
| 99 |
+
_ist_zone = ZoneInfo("Asia/Kolkata")
|
| 100 |
+
_now_ist = datetime.now(_ist_zone)
|
| 101 |
+
_cutoff_ist = _now_ist.replace(hour=0, minute=0, second=0, microsecond=0)
|
| 102 |
+
# Convert IST midnight → UTC so the API gets a valid UTC timestamp.
|
| 103 |
+
_cutoff_utc = _cutoff_ist.astimezone(timezone.utc)
|
| 104 |
+
# Current moment in UTC for the "to" bound.
|
| 105 |
+
_now_utc = datetime.now(timezone.utc)
|
| 106 |
|
| 107 |
params = {
|
| 108 |
'q': query,
|
|
|
|
| 110 |
'country': 'us',
|
| 111 |
'max': min(limit, 10), # GNews free tier max 10
|
| 112 |
'apikey': self.api_key,
|
| 113 |
+
'from': _cutoff_utc.strftime('%Y-%m-%dT%H:%M:%SZ'), # IST midnight in UTC
|
| 114 |
+
'to': _now_utc.strftime('%Y-%m-%dT%H:%M:%SZ'),
|
| 115 |
}
|
| 116 |
|
| 117 |
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
|
|
| 205 |
query = self.category_keywords.get(category, category)
|
| 206 |
url = f"{self.base_url}/everything"
|
| 207 |
|
| 208 |
+
# Ask NewsAPI for articles published since midnight IST today.
|
| 209 |
+
# We compute IST midnight and convert it to UTC before sending it
|
| 210 |
+
# to the API, because NewsAPI expects UTC timestamps.
|
| 211 |
+
# This gives Indian users full coverage from their midnight onwards.
|
| 212 |
+
_ist_zone = ZoneInfo("Asia/Kolkata")
|
| 213 |
+
_now_ist = datetime.now(_ist_zone)
|
| 214 |
+
_cutoff_ist = _now_ist.replace(hour=0, minute=0, second=0, microsecond=0)
|
| 215 |
+
_cutoff_utc = _cutoff_ist.astimezone(timezone.utc) # Convert to UTC for the API
|
| 216 |
|
| 217 |
params = {
|
| 218 |
'q': query,
|
|
|
|
| 220 |
'sortBy': 'publishedAt',
|
| 221 |
'pageSize': min(limit, 20),
|
| 222 |
'apiKey': self.api_key,
|
| 223 |
+
'from': _cutoff_utc.strftime('%Y-%m-%dT%H:%M:%SZ'), # IST midnight in UTC
|
| 224 |
}
|
| 225 |
|
| 226 |
async with httpx.AsyncClient(timeout=10.0) as client:
|
app/services/scheduler.py
CHANGED
|
@@ -13,6 +13,7 @@ import pytz
|
|
| 13 |
from app.services.news_aggregator import NewsAggregator
|
| 14 |
from app.services.appwrite_db import get_appwrite_db
|
| 15 |
from app.services.cache_service import CacheService
|
|
|
|
| 16 |
from app.services.adaptive_scheduler import get_adaptive_scheduler, AdaptiveScheduler
|
| 17 |
from app.services.research_aggregator import ResearchAggregator
|
| 18 |
from app.config import settings
|
|
@@ -301,7 +302,31 @@ async def fetch_single_category_job(category: str):
|
|
| 301 |
invalid_count, irrelevant_count
|
| 302 |
)
|
| 303 |
|
| 304 |
-
# Step
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
try:
|
| 306 |
await cache_service.set(f"news:{category}", articles, ttl=settings.CACHE_TTL)
|
| 307 |
except Exception as cache_err:
|
|
|
|
| 13 |
from app.services.news_aggregator import NewsAggregator
|
| 14 |
from app.services.appwrite_db import get_appwrite_db
|
| 15 |
from app.services.cache_service import CacheService
|
| 16 |
+
from app.services.upstash_cache import get_upstash_cache # Needed to bust stale news_v3 keys
|
| 17 |
from app.services.adaptive_scheduler import get_adaptive_scheduler, AdaptiveScheduler
|
| 18 |
from app.services.research_aggregator import ResearchAggregator
|
| 19 |
from app.config import settings
|
|
|
|
| 302 |
invalid_count, irrelevant_count
|
| 303 |
)
|
| 304 |
|
| 305 |
+
# Step 3a: Bust the Upstash news_v3 cache for this category.
|
| 306 |
+
#
|
| 307 |
+
# The news route (/api/news/<category>) caches its response in Upstash
|
| 308 |
+
# under the key "news_v3:<category>:page:<N>:l<limit>".
|
| 309 |
+
# Without this delete, a user hitting the page right after an Appwrite
|
| 310 |
+
# save would still get the stale 5-minute-old response (which may be
|
| 311 |
+
# empty), because the cache has not expired yet.
|
| 312 |
+
#
|
| 313 |
+
# Fix: the moment we save new articles, we surgically delete page-1
|
| 314 |
+
# of this category's cache. This forces the very next API call to
|
| 315 |
+
# bypass the cache and read fresh data from Appwrite.
|
| 316 |
+
if saved_count > 0:
|
| 317 |
+
try:
|
| 318 |
+
upstash = get_upstash_cache()
|
| 319 |
+
# Delete the most-visited page (page 1, default limit 20).
|
| 320 |
+
# Other pages will expire naturally on their 5-min TTL.
|
| 321 |
+
stale_key = f"news_v3:{category}:page:1:l20"
|
| 322 |
+
await upstash.delete(stale_key)
|
| 323 |
+
logger.info("[CACHE BUST] Deleted stale key '%s' β fresh articles will appear immediately.", stale_key)
|
| 324 |
+
except Exception as bust_err:
|
| 325 |
+
# Cache bust failure is not fatal — articles are already in Appwrite.
|
| 326 |
+
# The stale cache will expire on its own in at most 5 minutes.
|
| 327 |
+
logger.debug("[CACHE BUST] Could not delete stale key: %s", bust_err)
|
| 328 |
+
|
| 329 |
+
# Step 3b: Also update the legacy Redis L1 article cache.
|
| 330 |
try:
|
| 331 |
await cache_service.set(f"news:{category}", articles, ttl=settings.CACHE_TTL)
|
| 332 |
except Exception as cache_err:
|
app/utils/data_validation.py
CHANGED
|
@@ -9,6 +9,7 @@ EMERGENCY HOTFIX (2026-01-23): Fixed AttributeError 'Article' object has no attr
|
|
| 9 |
|
| 10 |
from typing import Dict, Optional, List, Union
|
| 11 |
from datetime import datetime, timezone, timedelta
|
|
|
|
| 12 |
import re
|
| 13 |
from urllib.parse import urlparse
|
| 14 |
from dateutil import parser as dateutil_parser
|
|
@@ -71,16 +72,21 @@ def is_valid_article(article: Union[Dict, 'Article']) -> bool:
|
|
| 71 |
return False
|
| 72 |
|
| 73 |
# ── FRESHNESS GATE ────────────────────────────────────────────────────────
|
| 74 |
-
# We only want articles published
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
#
|
| 76 |
# CRITICAL ORDER: This check runs on the RAW date string, before
|
| 77 |
# normalize_article_date() gets a chance to run. That function has a
|
| 78 |
# silent fallback: if a date is unparseable it stamps the article with
|
| 79 |
# 'right now'. Without this guard, a 3-day-old article with a broken
|
| 80 |
# date string would survive normalization and appear fresh.
|
| 81 |
-
#
|
| 82 |
-
# Here we do the opposite: if we cannot confidently parse the date,
|
| 83 |
-
# we reject the article. No date = no entry.
|
| 84 |
try:
|
| 85 |
if isinstance(raw_date, datetime):
|
| 86 |
pub_dt = raw_date
|
|
@@ -91,15 +97,18 @@ def is_valid_article(article: Union[Dict, 'Article']) -> bool:
|
|
| 91 |
if pub_dt.tzinfo is None:
|
| 92 |
pub_dt = pub_dt.replace(tzinfo=timezone.utc)
|
| 93 |
|
| 94 |
-
#
|
| 95 |
-
#
|
| 96 |
-
#
|
| 97 |
-
|
| 98 |
-
|
|
|
|
| 99 |
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
| 103 |
return False
|
| 104 |
|
| 105 |
except Exception:
|
|
|
|
| 9 |
|
| 10 |
from typing import Dict, Optional, List, Union
|
| 11 |
from datetime import datetime, timezone, timedelta
|
| 12 |
+
from zoneinfo import ZoneInfo # stdlib from Python 3.9+ — no extra install needed
|
| 13 |
import re
|
| 14 |
from urllib.parse import urlparse
|
| 15 |
from dateutil import parser as dateutil_parser
|
|
|
|
| 72 |
return False
|
| 73 |
|
| 74 |
# ── FRESHNESS GATE ────────────────────────────────────────────────────────
|
| 75 |
+
# We only want articles published today, where "today" is measured in
|
| 76 |
+
# Indian Standard Time (IST = UTC+5:30) — because that is where our
|
| 77 |
+
# users are.
|
| 78 |
+
#
|
| 79 |
+
# Why IST and not UTC?
|
| 80 |
+
# With UTC midnight as the cutoff, articles published in India between
|
| 81 |
+
# 12:00 AM IST and 5:30 AM IST (the first 5.5 hours of the Indian day)
|
| 82 |
+
# were incorrectly rejected, because UTC midnight had not yet arrived.
|
| 83 |
+
# Switching to IST midnight gives Indian users a full 24-hour day.
|
| 84 |
#
|
| 85 |
# CRITICAL ORDER: This check runs on the RAW date string, before
|
| 86 |
# normalize_article_date() gets a chance to run. That function has a
|
| 87 |
# silent fallback: if a date is unparseable it stamps the article with
|
| 88 |
# 'right now'. Without this guard, a 3-day-old article with a broken
|
| 89 |
# date string would survive normalization and appear fresh.
|
|
|
|
|
|
|
|
|
|
| 90 |
try:
|
| 91 |
if isinstance(raw_date, datetime):
|
| 92 |
pub_dt = raw_date
|
|
|
|
| 97 |
if pub_dt.tzinfo is None:
|
| 98 |
pub_dt = pub_dt.replace(tzinfo=timezone.utc)
|
| 99 |
|
| 100 |
+
# Step 1: Find midnight IST today.
|
| 101 |
+
# We get the current moment in IST, then zero out hours/minutes/seconds.
|
| 102 |
+
# This gives us "12:00:00 AM of today in India".
|
| 103 |
+
ist_zone = ZoneInfo("Asia/Kolkata")
|
| 104 |
+
now_ist = datetime.now(ist_zone)
|
| 105 |
+
cutoff_ist = now_ist.replace(hour=0, minute=0, second=0, microsecond=0)
|
| 106 |
|
| 107 |
+
# Step 2: The article timestamp may be in any timezone (UTC, EST, etc.).
|
| 108 |
+
# Python's datetime comparison handles mixed timezones correctly as long
|
| 109 |
+
# as both sides are timezone-aware — which they both are here.
|
| 110 |
+
if pub_dt < cutoff_ist:
|
| 111 |
+
# Article was published before midnight IST today — reject it.
|
| 112 |
return False
|
| 113 |
|
| 114 |
except Exception:
|