Spaces:
Running
Running
Devang1290 committed on
Commit Β·
0b32eb4
1
Parent(s): 8a191ee
feat: phase 3 - supabase query cache for instant top 5 results on repeat searches
Browse files- backend/services/database.py +71 -0
- hf_app.py +34 -5
backend/services/database.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import sys
|
| 2 |
from pathlib import Path
|
|
|
|
| 3 |
|
| 4 |
sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
|
| 5 |
from backend.core.logger import logger
|
|
@@ -130,3 +131,73 @@ class DatabaseManager:
|
|
| 130 |
except Exception as e:
|
| 131 |
logger.error(f"Error updating audio_url for {article_id}: {str(e)}")
|
| 132 |
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import sys
|
| 2 |
from pathlib import Path
|
| 3 |
+
from typing import Optional
|
| 4 |
|
| 5 |
sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
|
| 6 |
from backend.core.logger import logger
|
|
|
|
| 131 |
except Exception as e:
|
| 132 |
logger.error(f"Error updating audio_url for {article_id}: {str(e)}")
|
| 133 |
return False
|
| 134 |
+
|
| 135 |
+
def check_query_cache(self, query: str, language: str) -> Optional[list]:
|
| 136 |
+
"""
|
| 137 |
+
Checks if a search query was already served today.
|
| 138 |
+
Returns the cached articles in order, or None if no cache exists.
|
| 139 |
+
"""
|
| 140 |
+
if not self.supabase:
|
| 141 |
+
return None
|
| 142 |
+
|
| 143 |
+
try:
|
| 144 |
+
query = query.strip().lower()
|
| 145 |
+
|
| 146 |
+
# 1. Check if the query exists in cache for today (cache_date is automatically today in DB if not provided,
|
| 147 |
+
# or we sort by created_at DESC)
|
| 148 |
+
res = self.supabase.table("query_cache")\
|
| 149 |
+
.select("article_ids")\
|
| 150 |
+
.eq("query_text", query)\
|
| 151 |
+
.eq("language", language)\
|
| 152 |
+
.order("created_at", desc=True)\
|
| 153 |
+
.limit(1)\
|
| 154 |
+
.execute()
|
| 155 |
+
|
| 156 |
+
if not res.data:
|
| 157 |
+
return None
|
| 158 |
+
|
| 159 |
+
article_ids = res.data[0].get("article_ids", [])
|
| 160 |
+
if not article_ids:
|
| 161 |
+
return None
|
| 162 |
+
|
| 163 |
+
# 2. Fetch the actual articles
|
| 164 |
+
art_res = self.supabase.table("articles").select("*").in_("id", article_ids).execute()
|
| 165 |
+
|
| 166 |
+
# 3. Restore the original sorted order (Top 5 priority)
|
| 167 |
+
article_map = {a["id"]: a for a in art_res.data}
|
| 168 |
+
cached_articles = [article_map[aid] for aid in article_ids if aid in article_map]
|
| 169 |
+
|
| 170 |
+
if cached_articles:
|
| 171 |
+
logger.info(f"Cache hit! Restored {len(cached_articles)} articles for '{query}'")
|
| 172 |
+
return cached_articles
|
| 173 |
+
return None
|
| 174 |
+
|
| 175 |
+
except Exception as e:
|
| 176 |
+
logger.error(f"Error reading query cache for '{query}': {e}")
|
| 177 |
+
return None
|
| 178 |
+
|
| 179 |
+
def write_query_cache(self, query: str, language: str, article_ids: list) -> bool:
|
| 180 |
+
"""
|
| 181 |
+
Saves the resulting top article IDs for a search query.
|
| 182 |
+
"""
|
| 183 |
+
if not self.supabase or not article_ids:
|
| 184 |
+
return False
|
| 185 |
+
|
| 186 |
+
try:
|
| 187 |
+
query = query.strip().lower()
|
| 188 |
+
import datetime
|
| 189 |
+
today = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d")
|
| 190 |
+
|
| 191 |
+
record = {
|
| 192 |
+
"query_text": query,
|
| 193 |
+
"language": language,
|
| 194 |
+
"cache_date": today,
|
| 195 |
+
"article_ids": article_ids
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
self.supabase.table("query_cache").upsert(record).execute()
|
| 199 |
+
logger.info(f"Cached top {len(article_ids)} articles for '{query}'")
|
| 200 |
+
return True
|
| 201 |
+
except Exception as e:
|
| 202 |
+
logger.error(f"Error writing query cache for '{query}': {e}")
|
| 203 |
+
return False
|
hf_app.py
CHANGED
|
@@ -143,6 +143,7 @@ def search(req: SearchRequest):
|
|
| 143 |
"""
|
| 144 |
Phase 1: Scrape & Groq Summarize (Top 5). Returns in ~5s.
|
| 145 |
Phase 2: Silently adds Top 2 to the background TTS Priority Queue (priority=10).
|
|
|
|
| 146 |
"""
|
| 147 |
if req.language not in ["english", "hindi"]:
|
| 148 |
raise HTTPException(status_code=400, detail="Language must be 'english' or 'hindi'")
|
|
@@ -155,7 +156,31 @@ def search(req: SearchRequest):
|
|
| 155 |
print(f"\n{'='*80}")
|
| 156 |
print(f"SEARCH REQUEST: '{query}' ({language})")
|
| 157 |
print(f"{'='*80}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
|
|
|
| 159 |
try:
|
| 160 |
articles = _phase1_scrape_and_summarize(query, language, req.pages, req.no_dedup)
|
| 161 |
except Exception as e:
|
|
@@ -195,7 +220,6 @@ def search(req: SearchRequest):
|
|
| 195 |
audio_pending=True,
|
| 196 |
)
|
| 197 |
|
| 198 |
-
|
| 199 |
@app.post("/generate-audio", status_code=202)
|
| 200 |
def generate_audio_on_demand(req: GenerateAudioRequest):
|
| 201 |
"""
|
|
@@ -220,13 +244,13 @@ def generate_audio_on_demand(req: GenerateAudioRequest):
|
|
| 220 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 221 |
|
| 222 |
def _phase1_scrape_and_summarize(query: str, language: str, pages: int, no_dedup: bool) -> List[Dict]:
|
| 223 |
-
"""Sync Scrape + Groq API + Database Insert."""
|
| 224 |
t0 = time.monotonic()
|
| 225 |
project_root = get_project_root()
|
| 226 |
scraper_script = project_root / "backend" / "web_scraping" / "news_scrape.py"
|
| 227 |
safe_query = sanitize_query_folder(query)
|
| 228 |
|
| 229 |
-
print(f"[Phase 1] Step 1/
|
| 230 |
result = subprocess.run(
|
| 231 |
[sys.executable, str(scraper_script), f"--{language}", "--search", query, "--pages", str(max(1, pages))],
|
| 232 |
capture_output=True,
|
|
@@ -245,17 +269,22 @@ def _phase1_scrape_and_summarize(query: str, language: str, pages: int, no_dedup
|
|
| 245 |
with open(latest_json, "r", encoding="utf-8") as f:
|
| 246 |
articles = json.load(f)
|
| 247 |
|
| 248 |
-
print(f"[Phase 1] Step 2/
|
| 249 |
summarized = summarize_with_groq(articles, language, max_articles=5)
|
| 250 |
|
| 251 |
if not summarized:
|
| 252 |
raise RuntimeError("Groq summarization returned empty results")
|
| 253 |
|
| 254 |
-
print(f"[Phase 1] Step 3/
|
|
|
|
| 255 |
for article in summarized:
|
| 256 |
article["audio_url"] = ""
|
| 257 |
article["audio_status"] = "queued"
|
|
|
|
| 258 |
db.insert_article(article)
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
print(f"[Phase 1] β
Complete in {time.monotonic() - t0:.1f}s β {len(summarized)} articles ready")
|
| 261 |
return summarized
|
|
|
|
| 143 |
"""
|
| 144 |
Phase 1: Scrape & Groq Summarize (Top 5). Returns in ~5s.
|
| 145 |
Phase 2: Silently adds Top 2 to the background TTS Priority Queue (priority=10).
|
| 146 |
+
Now with Supabase Query Caching (Instant return for repeat searches).
|
| 147 |
"""
|
| 148 |
if req.language not in ["english", "hindi"]:
|
| 149 |
raise HTTPException(status_code=400, detail="Language must be 'english' or 'hindi'")
|
|
|
|
| 156 |
print(f"\n{'='*80}")
|
| 157 |
print(f"SEARCH REQUEST: '{query}' ({language})")
|
| 158 |
print(f"{'='*80}\n")
|
| 159 |
+
|
| 160 |
+
# ββ Phase 0: Check Supabase Query Cache ββββββββββββββββββββββββββββββββββ
|
| 161 |
+
if not req.no_dedup:
|
| 162 |
+
cached_articles = db.check_query_cache(query, language)
|
| 163 |
+
if cached_articles:
|
| 164 |
+
print(f"[Phase 0] β
Cache Hit for '{query}'! Returning instantly.")
|
| 165 |
+
|
| 166 |
+
# Even on a cache hit, we should ensure the Top 2 are in the TTS queue
|
| 167 |
+
# if their audio hasn't finished generating yet.
|
| 168 |
+
for art in cached_articles[:2]:
|
| 169 |
+
if art.get("audio_status") == "queued":
|
| 170 |
+
tts_queue.put_nowait((10, time.monotonic(), {
|
| 171 |
+
"article": dict(art),
|
| 172 |
+
"language": language,
|
| 173 |
+
"query": query
|
| 174 |
+
}))
|
| 175 |
+
|
| 176 |
+
return SearchResponse(
|
| 177 |
+
status="cache_hit",
|
| 178 |
+
message=f"Cache hit! Found {len(cached_articles)} articles for '{query}'.",
|
| 179 |
+
articles=[ArticleResponse(**a) for a in cached_articles],
|
| 180 |
+
audio_pending=any(a.get("audio_status") != "ready" for a in cached_articles),
|
| 181 |
+
)
|
| 182 |
|
| 183 |
+
# ββ Phase 1: Not cached. Scrape + Summarize + Insert ββββββββββββββββββββ
|
| 184 |
try:
|
| 185 |
articles = _phase1_scrape_and_summarize(query, language, req.pages, req.no_dedup)
|
| 186 |
except Exception as e:
|
|
|
|
| 220 |
audio_pending=True,
|
| 221 |
)
|
| 222 |
|
|
|
|
| 223 |
@app.post("/generate-audio", status_code=202)
|
| 224 |
def generate_audio_on_demand(req: GenerateAudioRequest):
|
| 225 |
"""
|
|
|
|
| 244 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 245 |
|
| 246 |
def _phase1_scrape_and_summarize(query: str, language: str, pages: int, no_dedup: bool) -> List[Dict]:
|
| 247 |
+
"""Sync Scrape + Groq API + Database Insert + Update Cache."""
|
| 248 |
t0 = time.monotonic()
|
| 249 |
project_root = get_project_root()
|
| 250 |
scraper_script = project_root / "backend" / "web_scraping" / "news_scrape.py"
|
| 251 |
safe_query = sanitize_query_folder(query)
|
| 252 |
|
| 253 |
+
print(f"[Phase 1] Step 1/4: Scraping articles...")
|
| 254 |
result = subprocess.run(
|
| 255 |
[sys.executable, str(scraper_script), f"--{language}", "--search", query, "--pages", str(max(1, pages))],
|
| 256 |
capture_output=True,
|
|
|
|
| 269 |
with open(latest_json, "r", encoding="utf-8") as f:
|
| 270 |
articles = json.load(f)
|
| 271 |
|
| 272 |
+
print(f"[Phase 1] Step 2/4: Summarizing top 5 via Groq...")
|
| 273 |
summarized = summarize_with_groq(articles, language, max_articles=5)
|
| 274 |
|
| 275 |
if not summarized:
|
| 276 |
raise RuntimeError("Groq summarization returned empty results")
|
| 277 |
|
| 278 |
+
print(f"[Phase 1] Step 3/4: Inserting into Supabase...")
|
| 279 |
+
article_ids = []
|
| 280 |
for article in summarized:
|
| 281 |
article["audio_url"] = ""
|
| 282 |
article["audio_status"] = "queued"
|
| 283 |
+
article_ids.append(article.get("id"))
|
| 284 |
db.insert_article(article)
|
| 285 |
+
|
| 286 |
+
print(f"[Phase 1] Step 4/4: Updating Query Cache...")
|
| 287 |
+
db.write_query_cache(query, language, article_ids)
|
| 288 |
|
| 289 |
print(f"[Phase 1] β
Complete in {time.monotonic() - t0:.1f}s β {len(summarized)} articles ready")
|
| 290 |
return summarized
|