Devang1290 committed on
Commit
0b32eb4
Β·
1 Parent(s): 8a191ee

feat: phase 3 - supabase query cache for instant top 5 results on repeat searches

Browse files
Files changed (2) hide show
  1. backend/services/database.py +71 -0
  2. hf_app.py +34 -5
backend/services/database.py CHANGED
@@ -1,5 +1,6 @@
1
  import sys
2
  from pathlib import Path
 
3
 
4
  sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
5
  from backend.core.logger import logger
@@ -130,3 +131,73 @@ class DatabaseManager:
130
  except Exception as e:
131
  logger.error(f"Error updating audio_url for {article_id}: {str(e)}")
132
  return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import sys
2
  from pathlib import Path
3
+ from typing import Optional
4
 
5
  sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
6
  from backend.core.logger import logger
 
131
  except Exception as e:
132
  logger.error(f"Error updating audio_url for {article_id}: {str(e)}")
133
  return False
134
+
135
+ def check_query_cache(self, query: str, language: str) -> Optional[list]:
136
+ """
137
+ Checks if a search query was already served today.
138
+ Returns the cached articles in order, or None if no cache exists.
139
+ """
140
+ if not self.supabase:
141
+ return None
142
+
143
+ try:
144
+ query = query.strip().lower()
145
+
146
+ # 1. Check if the query exists in cache for today (cache_date is automatically today in DB if not provided,
147
+ # or we sort by created_at DESC)
148
+ res = self.supabase.table("query_cache")\
149
+ .select("article_ids")\
150
+ .eq("query_text", query)\
151
+ .eq("language", language)\
152
+ .order("created_at", desc=True)\
153
+ .limit(1)\
154
+ .execute()
155
+
156
+ if not res.data:
157
+ return None
158
+
159
+ article_ids = res.data[0].get("article_ids", [])
160
+ if not article_ids:
161
+ return None
162
+
163
+ # 2. Fetch the actual articles
164
+ art_res = self.supabase.table("articles").select("*").in_("id", article_ids).execute()
165
+
166
+ # 3. Restore the original sorted order (Top 5 priority)
167
+ article_map = {a["id"]: a for a in art_res.data}
168
+ cached_articles = [article_map[aid] for aid in article_ids if aid in article_map]
169
+
170
+ if cached_articles:
171
+ logger.info(f"Cache hit! Restored {len(cached_articles)} articles for '{query}'")
172
+ return cached_articles
173
+ return None
174
+
175
+ except Exception as e:
176
+ logger.error(f"Error reading query cache for '{query}': {e}")
177
+ return None
178
+
179
+ def write_query_cache(self, query: str, language: str, article_ids: list) -> bool:
180
+ """
181
+ Saves the resulting top article IDs for a search query.
182
+ """
183
+ if not self.supabase or not article_ids:
184
+ return False
185
+
186
+ try:
187
+ query = query.strip().lower()
188
+ import datetime
189
+ today = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d")
190
+
191
+ record = {
192
+ "query_text": query,
193
+ "language": language,
194
+ "cache_date": today,
195
+ "article_ids": article_ids
196
+ }
197
+
198
+ self.supabase.table("query_cache").upsert(record).execute()
199
+ logger.info(f"Cached top {len(article_ids)} articles for '{query}'")
200
+ return True
201
+ except Exception as e:
202
+ logger.error(f"Error writing query cache for '{query}': {e}")
203
+ return False
hf_app.py CHANGED
@@ -143,6 +143,7 @@ def search(req: SearchRequest):
143
  """
144
  Phase 1: Scrape & Groq Summarize (Top 5). Returns in ~5s.
145
  Phase 2: Silently adds Top 2 to the background TTS Priority Queue (priority=10).
 
146
  """
147
  if req.language not in ["english", "hindi"]:
148
  raise HTTPException(status_code=400, detail="Language must be 'english' or 'hindi'")
@@ -155,7 +156,31 @@ def search(req: SearchRequest):
155
  print(f"\n{'='*80}")
156
  print(f"SEARCH REQUEST: '{query}' ({language})")
157
  print(f"{'='*80}\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
 
159
  try:
160
  articles = _phase1_scrape_and_summarize(query, language, req.pages, req.no_dedup)
161
  except Exception as e:
@@ -195,7 +220,6 @@ def search(req: SearchRequest):
195
  audio_pending=True,
196
  )
197
 
198
-
199
  @app.post("/generate-audio", status_code=202)
200
  def generate_audio_on_demand(req: GenerateAudioRequest):
201
  """
@@ -220,13 +244,13 @@ def generate_audio_on_demand(req: GenerateAudioRequest):
220
  # ─────────────────────────────────────────────
221
 
222
  def _phase1_scrape_and_summarize(query: str, language: str, pages: int, no_dedup: bool) -> List[Dict]:
223
- """Sync Scrape + Groq API + Database Insert."""
224
  t0 = time.monotonic()
225
  project_root = get_project_root()
226
  scraper_script = project_root / "backend" / "web_scraping" / "news_scrape.py"
227
  safe_query = sanitize_query_folder(query)
228
 
229
- print(f"[Phase 1] Step 1/3: Scraping articles...")
230
  result = subprocess.run(
231
  [sys.executable, str(scraper_script), f"--{language}", "--search", query, "--pages", str(max(1, pages))],
232
  capture_output=True,
@@ -245,17 +269,22 @@ def _phase1_scrape_and_summarize(query: str, language: str, pages: int, no_dedup
245
  with open(latest_json, "r", encoding="utf-8") as f:
246
  articles = json.load(f)
247
 
248
- print(f"[Phase 1] Step 2/3: Summarizing top 5 via Groq...")
249
  summarized = summarize_with_groq(articles, language, max_articles=5)
250
 
251
  if not summarized:
252
  raise RuntimeError("Groq summarization returned empty results")
253
 
254
- print(f"[Phase 1] Step 3/3: Inserting into Supabase...")
 
255
  for article in summarized:
256
  article["audio_url"] = ""
257
  article["audio_status"] = "queued"
 
258
  db.insert_article(article)
 
 
 
259
 
260
  print(f"[Phase 1] βœ… Complete in {time.monotonic() - t0:.1f}s β€” {len(summarized)} articles ready")
261
  return summarized
 
143
  """
144
  Phase 1: Scrape & Groq Summarize (Top 5). Returns in ~5s.
145
  Phase 2: Silently adds Top 2 to the background TTS Priority Queue (priority=10).
146
+ Now with Supabase Query Caching (Instant return for repeat searches).
147
  """
148
  if req.language not in ["english", "hindi"]:
149
  raise HTTPException(status_code=400, detail="Language must be 'english' or 'hindi'")
 
156
  print(f"\n{'='*80}")
157
  print(f"SEARCH REQUEST: '{query}' ({language})")
158
  print(f"{'='*80}\n")
159
+
160
+ # ── Phase 0: Check Supabase Query Cache ──────────────────────────────────
161
+ if not req.no_dedup:
162
+ cached_articles = db.check_query_cache(query, language)
163
+ if cached_articles:
164
+ print(f"[Phase 0] βœ… Cache Hit for '{query}'! Returning instantly.")
165
+
166
+ # Even on a cache hit, we should ensure the Top 2 are in the TTS queue
167
+ # if their audio hasn't finished generating yet.
168
+ for art in cached_articles[:2]:
169
+ if art.get("audio_status") == "queued":
170
+ tts_queue.put_nowait((10, time.monotonic(), {
171
+ "article": dict(art),
172
+ "language": language,
173
+ "query": query
174
+ }))
175
+
176
+ return SearchResponse(
177
+ status="cache_hit",
178
+ message=f"Cache hit! Found {len(cached_articles)} articles for '{query}'.",
179
+ articles=[ArticleResponse(**a) for a in cached_articles],
180
+ audio_pending=any(a.get("audio_status") != "ready" for a in cached_articles),
181
+ )
182
 
183
+ # ── Phase 1: Not cached. Scrape + Summarize + Insert ────────────────────
184
  try:
185
  articles = _phase1_scrape_and_summarize(query, language, req.pages, req.no_dedup)
186
  except Exception as e:
 
220
  audio_pending=True,
221
  )
222
 
 
223
  @app.post("/generate-audio", status_code=202)
224
  def generate_audio_on_demand(req: GenerateAudioRequest):
225
  """
 
244
  # ─────────────────────────────────────────────
245
 
246
  def _phase1_scrape_and_summarize(query: str, language: str, pages: int, no_dedup: bool) -> List[Dict]:
247
+ """Sync Scrape + Groq API + Database Insert + Update Cache."""
248
  t0 = time.monotonic()
249
  project_root = get_project_root()
250
  scraper_script = project_root / "backend" / "web_scraping" / "news_scrape.py"
251
  safe_query = sanitize_query_folder(query)
252
 
253
+ print(f"[Phase 1] Step 1/4: Scraping articles...")
254
  result = subprocess.run(
255
  [sys.executable, str(scraper_script), f"--{language}", "--search", query, "--pages", str(max(1, pages))],
256
  capture_output=True,
 
269
  with open(latest_json, "r", encoding="utf-8") as f:
270
  articles = json.load(f)
271
 
272
+ print(f"[Phase 1] Step 2/4: Summarizing top 5 via Groq...")
273
  summarized = summarize_with_groq(articles, language, max_articles=5)
274
 
275
  if not summarized:
276
  raise RuntimeError("Groq summarization returned empty results")
277
 
278
+ print(f"[Phase 1] Step 3/4: Inserting into Supabase...")
279
+ article_ids = []
280
  for article in summarized:
281
  article["audio_url"] = ""
282
  article["audio_status"] = "queued"
283
+ article_ids.append(article.get("id"))
284
  db.insert_article(article)
285
+
286
+ print(f"[Phase 1] Step 4/4: Updating Query Cache...")
287
+ db.write_query_cache(query, language, article_ids)
288
 
289
  print(f"[Phase 1] βœ… Complete in {time.monotonic() - t0:.1f}s β€” {len(summarized)} articles ready")
290
  return summarized