Spaces:
Running
Running
Devang1290 committed on
Commit Β·
0b32eb4
1
Parent(s): 8a191ee
feat: phase 3 - supabase query cache for instant top 5 results on repeat searches
Browse files- backend/services/database.py +71 -0
- hf_app.py +34 -5
backend/services/database.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import sys
|
| 2 |
from pathlib import Path
|
|
|
|
| 3 |
|
| 4 |
sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
|
| 5 |
from backend.core.logger import logger
|
|
@@ -130,3 +131,73 @@ class DatabaseManager:
|
|
| 130 |
except Exception as e:
|
| 131 |
logger.error(f"Error updating audio_url for {article_id}: {str(e)}")
|
| 132 |
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import sys
|
| 2 |
from pathlib import Path
|
| 3 |
+
from typing import Optional
|
| 4 |
|
| 5 |
sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
|
| 6 |
from backend.core.logger import logger
|
|
|
|
| 131 |
except Exception as e:
|
| 132 |
logger.error(f"Error updating audio_url for {article_id}: {str(e)}")
|
| 133 |
return False
|
| 134 |
+
|
| 135 |
+
def check_query_cache(self, query: str, language: str) -> Optional[list]:
|
| 136 |
+
"""
|
| 137 |
+
Checks if a search query was already served today.
|
| 138 |
+
Returns the cached articles in order, or None if no cache exists.
|
| 139 |
+
"""
|
| 140 |
+
if not self.supabase:
|
| 141 |
+
return None
|
| 142 |
+
|
| 143 |
+
try:
|
| 144 |
+
query = query.strip().lower()
|
| 145 |
+
|
| 146 |
+
# 1. Check if the query exists in cache for today (cache_date is automatically today in DB if not provided,
|
| 147 |
+
# or we sort by created_at DESC)
|
| 148 |
+
res = self.supabase.table("query_cache")\
|
| 149 |
+
.select("article_ids")\
|
| 150 |
+
.eq("query_text", query)\
|
| 151 |
+
.eq("language", language)\
|
| 152 |
+
.order("created_at", desc=True)\
|
| 153 |
+
.limit(1)\
|
| 154 |
+
.execute()
|
| 155 |
+
|
| 156 |
+
if not res.data:
|
| 157 |
+
return None
|
| 158 |
+
|
| 159 |
+
article_ids = res.data[0].get("article_ids", [])
|
| 160 |
+
if not article_ids:
|
| 161 |
+
return None
|
| 162 |
+
|
| 163 |
+
# 2. Fetch the actual articles
|
| 164 |
+
art_res = self.supabase.table("articles").select("*").in_("id", article_ids).execute()
|
| 165 |
+
|
| 166 |
+
# 3. Restore the original sorted order (Top 5 priority)
|
| 167 |
+
article_map = {a["id"]: a for a in art_res.data}
|
| 168 |
+
cached_articles = [article_map[aid] for aid in article_ids if aid in article_map]
|
| 169 |
+
|
| 170 |
+
if cached_articles:
|
| 171 |
+
logger.info(f"Cache hit! Restored {len(cached_articles)} articles for '{query}'")
|
| 172 |
+
return cached_articles
|
| 173 |
+
return None
|
| 174 |
+
|
| 175 |
+
except Exception as e:
|
| 176 |
+
logger.error(f"Error reading query cache for '{query}': {e}")
|
| 177 |
+
return None
|
| 178 |
+
|
| 179 |
+
def write_query_cache(self, query: str, language: str, article_ids: list) -> bool:
|
| 180 |
+
"""
|
| 181 |
+
Saves the resulting top article IDs for a search query.
|
| 182 |
+
"""
|
| 183 |
+
if not self.supabase or not article_ids:
|
| 184 |
+
return False
|
| 185 |
+
|
| 186 |
+
try:
|
| 187 |
+
query = query.strip().lower()
|
| 188 |
+
import datetime
|
| 189 |
+
today = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d")
|
| 190 |
+
|
| 191 |
+
record = {
|
| 192 |
+
"query_text": query,
|
| 193 |
+
"language": language,
|
| 194 |
+
"cache_date": today,
|
| 195 |
+
"article_ids": article_ids
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
self.supabase.table("query_cache").upsert(record).execute()
|
| 199 |
+
logger.info(f"Cached top {len(article_ids)} articles for '{query}'")
|
| 200 |
+
return True
|
| 201 |
+
except Exception as e:
|
| 202 |
+
logger.error(f"Error writing query cache for '{query}': {e}")
|
| 203 |
+
return False
|
hf_app.py
CHANGED
|
@@ -143,6 +143,7 @@ def search(req: SearchRequest):
|
|
| 143 |
"""
|
| 144 |
Phase 1: Scrape & Groq Summarize (Top 5). Returns in ~5s.
|
| 145 |
Phase 2: Silently adds Top 2 to the background TTS Priority Queue (priority=10).
|
|
|
|
| 146 |
"""
|
| 147 |
if req.language not in ["english", "hindi"]:
|
| 148 |
raise HTTPException(status_code=400, detail="Language must be 'english' or 'hindi'")
|
|
@@ -155,7 +156,31 @@ def search(req: SearchRequest):
|
|
| 155 |
print(f"\n{'='*80}")
|
| 156 |
print(f"SEARCH REQUEST: '{query}' ({language})")
|
| 157 |
print(f"{'='*80}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
|
|
|
| 159 |
try:
|
| 160 |
articles = _phase1_scrape_and_summarize(query, language, req.pages, req.no_dedup)
|
| 161 |
except Exception as e:
|
|
@@ -195,7 +220,6 @@ def search(req: SearchRequest):
|
|
| 195 |
audio_pending=True,
|
| 196 |
)
|
| 197 |
|
| 198 |
-
|
| 199 |
@app.post("/generate-audio", status_code=202)
|
| 200 |
def generate_audio_on_demand(req: GenerateAudioRequest):
|
| 201 |
"""
|
|
@@ -220,13 +244,13 @@ def generate_audio_on_demand(req: GenerateAudioRequest):
|
|
| 220 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 221 |
|
| 222 |
def _phase1_scrape_and_summarize(query: str, language: str, pages: int, no_dedup: bool) -> List[Dict]:
|
| 223 |
-
"""Sync Scrape + Groq API + Database Insert."""
|
| 224 |
t0 = time.monotonic()
|
| 225 |
project_root = get_project_root()
|
| 226 |
scraper_script = project_root / "backend" / "web_scraping" / "news_scrape.py"
|
| 227 |
safe_query = sanitize_query_folder(query)
|
| 228 |
|
| 229 |
-
print(f"[Phase 1] Step 1/
|
| 230 |
result = subprocess.run(
|
| 231 |
[sys.executable, str(scraper_script), f"--{language}", "--search", query, "--pages", str(max(1, pages))],
|
| 232 |
capture_output=True,
|
|
@@ -245,17 +269,22 @@ def _phase1_scrape_and_summarize(query: str, language: str, pages: int, no_dedup
|
|
| 245 |
with open(latest_json, "r", encoding="utf-8") as f:
|
| 246 |
articles = json.load(f)
|
| 247 |
|
| 248 |
-
print(f"[Phase 1] Step 2/
|
| 249 |
summarized = summarize_with_groq(articles, language, max_articles=5)
|
| 250 |
|
| 251 |
if not summarized:
|
| 252 |
raise RuntimeError("Groq summarization returned empty results")
|
| 253 |
|
| 254 |
-
print(f"[Phase 1] Step 3/
|
|
|
|
| 255 |
for article in summarized:
|
| 256 |
article["audio_url"] = ""
|
| 257 |
article["audio_status"] = "queued"
|
|
|
|
| 258 |
db.insert_article(article)
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
print(f"[Phase 1] β
Complete in {time.monotonic() - t0:.1f}s β {len(summarized)} articles ready")
|
| 261 |
return summarized
|
|
|
|
| 143 |
"""
|
| 144 |
Phase 1: Scrape & Groq Summarize (Top 5). Returns in ~5s.
|
| 145 |
Phase 2: Silently adds Top 2 to the background TTS Priority Queue (priority=10).
|
| 146 |
+
Now with Supabase Query Caching (Instant return for repeat searches).
|
| 147 |
"""
|
| 148 |
if req.language not in ["english", "hindi"]:
|
| 149 |
raise HTTPException(status_code=400, detail="Language must be 'english' or 'hindi'")
|
|
|
|
| 156 |
print(f"\n{'='*80}")
|
| 157 |
print(f"SEARCH REQUEST: '{query}' ({language})")
|
| 158 |
print(f"{'='*80}\n")
|
| 159 |
+
|
| 160 |
+
# ββ Phase 0: Check Supabase Query Cache ββββββββββββββββββββββββββββββββββ
|
| 161 |
+
if not req.no_dedup:
|
| 162 |
+
cached_articles = db.check_query_cache(query, language)
|
| 163 |
+
if cached_articles:
|
| 164 |
+
print(f"[Phase 0] β
Cache Hit for '{query}'! Returning instantly.")
|
| 165 |
+
|
| 166 |
+
# Even on a cache hit, we should ensure the Top 2 are in the TTS queue
|
| 167 |
+
# if their audio hasn't finished generating yet.
|
| 168 |
+
for art in cached_articles[:2]:
|
| 169 |
+
if art.get("audio_status") == "queued":
|
| 170 |
+
tts_queue.put_nowait((10, time.monotonic(), {
|
| 171 |
+
"article": dict(art),
|
| 172 |
+
"language": language,
|
| 173 |
+
"query": query
|
| 174 |
+
}))
|
| 175 |
+
|
| 176 |
+
return SearchResponse(
|
| 177 |
+
status="cache_hit",
|
| 178 |
+
message=f"Cache hit! Found {len(cached_articles)} articles for '{query}'.",
|
| 179 |
+
articles=[ArticleResponse(**a) for a in cached_articles],
|
| 180 |
+
audio_pending=any(a.get("audio_status") != "ready" for a in cached_articles),
|
| 181 |
+
)
|
| 182 |
|
| 183 |
+
# ββ Phase 1: Not cached. Scrape + Summarize + Insert ββββββββββββββββββββ
|
| 184 |
try:
|
| 185 |
articles = _phase1_scrape_and_summarize(query, language, req.pages, req.no_dedup)
|
| 186 |
except Exception as e:
|
|
|
|
| 220 |
audio_pending=True,
|
| 221 |
)
|
| 222 |
|
|
|
|
| 223 |
@app.post("/generate-audio", status_code=202)
|
| 224 |
def generate_audio_on_demand(req: GenerateAudioRequest):
|
| 225 |
"""
|
|
|
|
| 244 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 245 |
|
| 246 |
def _phase1_scrape_and_summarize(query: str, language: str, pages: int, no_dedup: bool) -> List[Dict]:
|
| 247 |
+
"""Sync Scrape + Groq API + Database Insert + Update Cache."""
|
| 248 |
t0 = time.monotonic()
|
| 249 |
project_root = get_project_root()
|
| 250 |
scraper_script = project_root / "backend" / "web_scraping" / "news_scrape.py"
|
| 251 |
safe_query = sanitize_query_folder(query)
|
| 252 |
|
| 253 |
+
print(f"[Phase 1] Step 1/4: Scraping articles...")
|
| 254 |
result = subprocess.run(
|
| 255 |
[sys.executable, str(scraper_script), f"--{language}", "--search", query, "--pages", str(max(1, pages))],
|
| 256 |
capture_output=True,
|
|
|
|
| 269 |
with open(latest_json, "r", encoding="utf-8") as f:
|
| 270 |
articles = json.load(f)
|
| 271 |
|
| 272 |
+
print(f"[Phase 1] Step 2/4: Summarizing top 5 via Groq...")
|
| 273 |
summarized = summarize_with_groq(articles, language, max_articles=5)
|
| 274 |
|
| 275 |
if not summarized:
|
| 276 |
raise RuntimeError("Groq summarization returned empty results")
|
| 277 |
|
| 278 |
+
print(f"[Phase 1] Step 3/4: Inserting into Supabase...")
|
| 279 |
+
article_ids = []
|
| 280 |
for article in summarized:
|
| 281 |
article["audio_url"] = ""
|
| 282 |
article["audio_status"] = "queued"
|
| 283 |
+
article_ids.append(article.get("id"))
|
| 284 |
db.insert_article(article)
|
| 285 |
+
|
| 286 |
+
print(f"[Phase 1] Step 4/4: Updating Query Cache...")
|
| 287 |
+
db.write_query_cache(query, language, article_ids)
|
| 288 |
|
| 289 |
print(f"[Phase 1] β
Complete in {time.monotonic() - t0:.1f}s β {len(summarized)} articles ready")
|
| 290 |
return summarized
|