Devang1290 committed on
Commit
0147f31
Β·
1 Parent(s): e15f0bd

feat: smart priority queue for TTS with lazy frontend pull architecture

Browse files
Files changed (2) hide show
  1. backend/services/database.py +22 -2
  2. hf_app.py +127 -91
backend/services/database.py CHANGED
@@ -73,6 +73,7 @@ class DatabaseManager:
73
  "content": article_data.get('content', ''),
74
  "summary": article_data.get('summary', ''),
75
  "audio_url": article_data.get('audio_url', ''),
 
76
  "published_at": article_data.get('published_date'),
77
  "scraped_at": article_data.get('scraped_at'),
78
  "summary_generated_at": article_data.get('summary_generated_at')
@@ -94,9 +95,25 @@ class DatabaseManager:
94
  logger.error(f"Error inserting article {article_data.get('id')}: {str(e)}")
95
  return False
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  def update_audio_url(self, article_id: str, audio_url: str) -> bool:
98
  """
99
- Updates the audio_url for a specific article in the articles table.
100
  Called progressively as each TTS clip finishes generating.
101
  """
102
  if not self.supabase:
@@ -104,7 +121,10 @@ class DatabaseManager:
104
 
105
  try:
106
  self.supabase.table("articles").update(
107
- {"audio_url": audio_url}
 
 
 
108
  ).eq("id", article_id).execute()
109
  return True
110
  except Exception as e:
 
73
  "content": article_data.get('content', ''),
74
  "summary": article_data.get('summary', ''),
75
  "audio_url": article_data.get('audio_url', ''),
76
+ "audio_status": article_data.get('audio_status', 'queued'),
77
  "published_at": article_data.get('published_date'),
78
  "scraped_at": article_data.get('scraped_at'),
79
  "summary_generated_at": article_data.get('summary_generated_at')
 
95
  logger.error(f"Error inserting article {article_data.get('id')}: {str(e)}")
96
  return False
97
 
98
+ def update_audio_status(self, article_id: str, status: str) -> bool:
99
+ """
100
+ Updates the progressive audio_status ('queued', 'generating', 'ready').
101
+ """
102
+ if not self.supabase:
103
+ return False
104
+
105
+ try:
106
+ self.supabase.table("articles").update(
107
+ {"audio_status": status}
108
+ ).eq("id", article_id).execute()
109
+ return True
110
+ except Exception as e:
111
+ logger.error(f"Error updating audio_status for {article_id}: {str(e)}")
112
+ return False
113
+
114
  def update_audio_url(self, article_id: str, audio_url: str) -> bool:
115
  """
116
+ Updates the audio_url and sets status to 'ready'.
117
  Called progressively as each TTS clip finishes generating.
118
  """
119
  if not self.supabase:
 
121
 
122
  try:
123
  self.supabase.table("articles").update(
124
+ {
125
+ "audio_url": audio_url,
126
+ "audio_status": "ready"
127
+ }
128
  ).eq("id", article_id).execute()
129
  return True
130
  except Exception as e:
hf_app.py CHANGED
@@ -1,11 +1,12 @@
1
  """
2
- News-Whisper On-Demand Search API (v2 β€” Two-Phase Response)
3
- ===========================================================
4
- Phase 1 (Synchronous): Scrape β†’ Groq Summarize (top 5) β†’ Insert to Supabase β†’ Return articles
5
- Phase 2 (Background): Kokoro TTS per article β†’ Upload to Cloudinary β†’ Update audio_url in Supabase
6
-
7
- The frontend subscribes to Supabase Realtime and progressively unlocks
8
- Play buttons as each audio_url changes from null to a Cloudinary URL.
 
9
  """
10
 
11
  import sys
@@ -13,12 +14,15 @@ import os
13
  import json
14
  import subprocess
15
  import time
 
16
  from pathlib import Path
17
  from typing import List, Dict, Optional
18
  from datetime import datetime, timezone
 
19
 
20
  from fastapi import FastAPI, BackgroundTasks, HTTPException
21
  from fastapi.responses import RedirectResponse
 
22
  from pydantic import BaseModel, Field
23
 
24
  # Ensure project root is in path
@@ -30,19 +34,62 @@ from backend.services.database import DatabaseManager
30
  from backend.services.cloud import upload_file
31
  from backend.common.paths import get_project_root, sanitize_query_folder, find_latest_json
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  app = FastAPI(
34
- title="News-Whisper On-Demand API",
35
- description=(
36
- "Two-phase search API for News Whisper.\n\n"
37
- "**Phase 1:** Returns article summaries in ~5 seconds.\n\n"
38
- "**Phase 2:** Generates audio in the background (~65s). "
39
- "Subscribe to Supabase Realtime to get progressive audio updates."
40
- ),
41
- version="2.0.0",
42
  )
43
 
44
- db = DatabaseManager()
45
-
46
 
47
  # ─────────────────────────────────────────────
48
  # Request / Response Models
@@ -61,6 +108,7 @@ class ArticleResponse(BaseModel):
61
  url: str
62
  author: str
63
  audio_url: Optional[str] = None
 
64
 
65
  class SearchResponse(BaseModel):
66
  status: str
@@ -68,6 +116,11 @@ class SearchResponse(BaseModel):
68
  articles: List[ArticleResponse] = []
69
  audio_pending: bool = False
70
 
 
 
 
 
 
71
 
72
  # ─────────────────────────────────────────────
73
  # Endpoints
@@ -81,17 +134,15 @@ def root_redirect():
81
 
82
  @app.get("/health")
83
  def health_check():
84
- """Keep-alive endpoint. Pinged by GitHub Actions to prevent the HF Space from sleeping."""
85
- return {"status": "alive"}
86
 
87
 
88
  @app.post("/search", response_model=SearchResponse)
89
- def search(req: SearchRequest, background_tasks: BackgroundTasks):
90
  """
91
- Triggers the on-demand search pipeline.
92
-
93
- **Phase 1 (sync, ~5s):** Scrapes articles, summarizes top 5 via Groq, inserts into Supabase.
94
- **Phase 2 (async, ~65s):** Generates Kokoro TTS audio for each article and progressively updates Supabase.
95
  """
96
  if req.language not in ["english", "hindi"]:
97
  raise HTTPException(status_code=400, detail="Language must be 'english' or 'hindi'")
@@ -105,7 +156,6 @@ def search(req: SearchRequest, background_tasks: BackgroundTasks):
105
  print(f"SEARCH REQUEST: '{query}' ({language})")
106
  print(f"{'='*80}\n")
107
 
108
- # ── Phase 1: Synchronous β€” Scrape + Summarize + Insert ────────────────────
109
  try:
110
  articles = _phase1_scrape_and_summarize(query, language, req.pages, req.no_dedup)
111
  except Exception as e:
@@ -117,15 +167,19 @@ def search(req: SearchRequest, background_tasks: BackgroundTasks):
117
  status="empty",
118
  message=f"No articles found for '{query}'.",
119
  articles=[],
120
- audio_pending=False,
121
  )
122
 
123
- # ── Phase 2: Async β€” TTS in background ────────────────────────────────────
124
- background_tasks.add_task(_phase2_generate_audio, articles, language, query)
 
 
 
 
 
125
 
126
  return SearchResponse(
127
  status="ready",
128
- message=f"Found {len(articles)} articles for '{query}'. Audio is generating in the background.",
129
  articles=[
130
  ArticleResponse(
131
  id=a.get("id", ""),
@@ -134,6 +188,7 @@ def search(req: SearchRequest, background_tasks: BackgroundTasks):
134
  url=a.get("url", ""),
135
  author=a.get("author", ""),
136
  audio_url=None,
 
137
  )
138
  for a in articles
139
  ],
@@ -141,25 +196,36 @@ def search(req: SearchRequest, background_tasks: BackgroundTasks):
141
  )
142
 
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  # ─────────────────────────────────────────────
145
- # Phase 1: Scrape + Groq Summarize + Insert
146
  # ─────────────────────────────────────────────
147
 
148
- def _phase1_scrape_and_summarize(
149
- query: str, language: str, pages: int, no_dedup: bool
150
- ) -> List[Dict]:
151
- """
152
- Runs synchronously:
153
- 1. Scrape articles via subprocess (reuses existing scraper)
154
- 2. Summarize top 5 via Groq API
155
- 3. Insert articles into Supabase (audio_url = null)
156
- """
157
  t0 = time.monotonic()
158
  project_root = get_project_root()
159
  scraper_script = project_root / "backend" / "web_scraping" / "news_scrape.py"
160
  safe_query = sanitize_query_folder(query)
161
 
162
- # ── Step 1: Scrape ────────────────────────────────────────────────────────
163
  print(f"[Phase 1] Step 1/3: Scraping articles...")
164
  result = subprocess.run(
165
  [sys.executable, str(scraper_script), f"--{language}", "--search", query, "--pages", str(max(1, pages))],
@@ -171,7 +237,6 @@ def _phase1_scrape_and_summarize(
171
  print(f"[Phase 1] Scraper stderr: {result.stderr[-500:]}")
172
  raise RuntimeError("Web scraping failed")
173
 
174
- # Find the scraped JSON
175
  scraped_dir = project_root / "articles" / language / "search_queries" / safe_query
176
  latest_json = find_latest_json(scraped_dir)
177
  if not latest_json:
@@ -180,74 +245,45 @@ def _phase1_scrape_and_summarize(
180
  with open(latest_json, "r", encoding="utf-8") as f:
181
  articles = json.load(f)
182
 
183
- print(f"[Phase 1] Scraped {len(articles)} articles in {time.monotonic() - t0:.1f}s")
184
-
185
- # ── Step 2: Groq Summarize (top 5) ────────────────────────────────────────
186
  print(f"[Phase 1] Step 2/3: Summarizing top 5 via Groq...")
187
- t1 = time.monotonic()
188
  summarized = summarize_with_groq(articles, language, max_articles=5)
189
- print(f"[Phase 1] Summarized {len(summarized)} articles via Groq in {time.monotonic() - t1:.1f}s")
190
-
191
  if not summarized:
192
  raise RuntimeError("Groq summarization returned empty results")
193
 
194
- # ── Step 3: Insert into Supabase (audio_url = null) ───────────────────────
195
  print(f"[Phase 1] Step 3/3: Inserting into Supabase...")
196
  for article in summarized:
197
- article["audio_url"] = "" # Will be updated by Phase 2
 
198
  db.insert_article(article)
199
 
200
- total = time.monotonic() - t0
201
- print(f"[Phase 1] βœ… Complete in {total:.1f}s β€” {len(summarized)} articles ready")
202
  return summarized
203
 
204
 
205
- # ─────────────────────────────────────────────
206
- # Phase 2: TTS Generation (Background)
207
- # ─────────────────────────────────────────────
 
208
 
209
- def _phase2_generate_audio(articles: List[Dict], language: str, query: str):
210
- """
211
- Runs in the background after the HTTP response is sent.
212
- Generates Kokoro TTS for each article and progressively updates Supabase.
213
- """
214
- print(f"\n[Phase 2] Starting TTS generation for {len(articles)} articles...")
215
- t0 = time.monotonic()
216
- safe_query = sanitize_query_folder(query)
217
 
218
  try:
219
- # Import TTS module
220
- from backend.text_to_speech.tts import generate_audio
221
- from backend.services.delivery import DeliveryService
222
 
223
- delivery = DeliveryService()
224
- output_dir = delivery.get_audio_output_dir(language, query, is_search=True)
 
225
 
226
- # Generate audio for all articles
227
- articles_with_audio = generate_audio(articles, language, output_dir)
228
-
229
- # Upload each audio to Cloudinary and update Supabase progressively
230
  timestamp = delivery._get_timestamp_folder()
231
- parent_folder = "search_queries"
232
  safe_target = query.replace(" ", "_").lower()
 
233
 
234
- for article in articles_with_audio:
235
- article_id = article.get("id")
236
- local_audio = article.get("local_audio_path")
237
-
238
- if local_audio and os.path.exists(local_audio):
239
- cloud_folder = f"audios/{language}/{parent_folder}/{safe_target}/{timestamp}"
240
- audio_url = upload_file(local_audio, cloud_folder, resource_type="auto")
241
-
242
- if audio_url:
243
- # Progressive update: frontend sees this via Supabase Realtime
244
- db.update_audio_url(article_id, audio_url)
245
- print(f"[Phase 2] βœ… Audio ready for {article_id}: {audio_url[:80]}...")
246
-
247
- total = time.monotonic() - t0
248
- print(f"[Phase 2] βœ… TTS complete in {total:.1f}s β€” all audio uploaded")
249
-
250
  except Exception as e:
251
- print(f"[Phase 2] ❌ TTS generation failed: {e}")
252
- import traceback
253
- traceback.print_exc()
 
1
  """
2
+ News-Whisper On-Demand Search API (v3 β€” Lazy/Pull Generation)
3
+ =============================================================
4
+ Phase 1: Sync scrape + Groq Summarize (Top 5) -> Insert to DB as 'queued'.
5
+ Phase 2: Add Top 2 to PriorityQueue (background preload).
6
+ Phase 3: Frontend calls `POST /generate-audio` on scroll/click to jump queue.
7
+
8
+ Includes a single asyncio daemon worker to guarantee Kokoro TTS runs sequentially
9
+ without memory leaks or CPU race conditions on Hugging Face Spaces.
10
  """
11
 
12
  import sys
 
14
  import json
15
  import subprocess
16
  import time
17
+ import asyncio
18
  from pathlib import Path
19
  from typing import List, Dict, Optional
20
  from datetime import datetime, timezone
21
+ import contextlib
22
 
23
  from fastapi import FastAPI, BackgroundTasks, HTTPException
24
  from fastapi.responses import RedirectResponse
25
+ from fastapi.concurrency import run_in_threadpool
26
  from pydantic import BaseModel, Field
27
 
28
  # Ensure project root is in path
 
34
  from backend.services.cloud import upload_file
35
  from backend.common.paths import get_project_root, sanitize_query_folder, find_latest_json
36
 
37
# ─────────────────────────────────────────────
# Global TTS Queue & Worker
# ─────────────────────────────────────────────
db = DatabaseManager()               # shared persistence layer (Supabase-backed)
tts_queue = asyncio.PriorityQueue()  # holds (priority, enqueue_time, task_data) tuples
worker_task = None                   # handle to the daemon task; set in lifespan()
43
+
44
async def tts_worker():
    """Background worker that continuously processes TTS requests one at a time.

    Pulls `(priority, timestamp, task_data)` tuples from the shared
    `tts_queue` (lowest priority value first), marks the article as
    'generating', runs the heavy synchronous TTS pipeline in a thread
    pool, then writes the resulting URL (or a 'failed' status) to the DB.

    Bug fix: `task_done()` is now only reachable after a successful
    `get()`. Previously it sat in a `finally` that also covered
    `await tts_queue.get()`, so a cancellation while idle (the common
    shutdown case) called `task_done()` with no matching `get()`,
    raising ValueError instead of exiting cleanly.
    """
    print("[Queue Worker] Started listening for TTS tasks.")
    while True:
        try:
            priority, _enqueued_at, task_data = await tts_queue.get()
        except asyncio.CancelledError:
            # Shutdown requested while waiting for work — exit cleanly.
            break

        try:
            article = task_data["article"]
            language = task_data["language"]
            query = task_data["query"]
            article_id = article.get("id")

            print(f"\n[Queue Worker] Generating audio for {article_id} (Priority {priority})")
            db.update_audio_status(article_id, "generating")

            # Run heavy TTS in threadpool to prevent blocking the async event loop
            audio_url = await run_in_threadpool(_sync_generate_single_audio, article, language, query)

            if audio_url:
                db.update_audio_url(article_id, audio_url)  # Sets status to 'ready'
                print(f"[Queue Worker] ✅ Audio ready for {article_id}: {audio_url}")
            else:
                db.update_audio_status(article_id, "failed")
                print(f"[Queue Worker] ❌ Audio failed for {article_id}")
        except asyncio.CancelledError:
            # Cancelled mid-task: still account for the dequeued item, then stop.
            break
        except Exception as e:
            print(f"[Queue Worker] ❌ Task failed: {e}")
        finally:
            # Matches exactly one successful get() per iteration.
            tts_queue.task_done()
74
+
75
@contextlib.asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the TTS worker's lifetime alongside the FastAPI app.

    Startup: spawn the single daemon worker that drains `tts_queue`.
    Shutdown: cancel the worker AND await it, so the cancellation is
    actually delivered and the task is not left pending when the event
    loop closes (the previous version only called `.cancel()` without
    awaiting, which can leak a pending task and emit warnings).
    """
    global worker_task
    worker_task = asyncio.create_task(tts_worker())
    yield
    if worker_task:
        worker_task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await worker_task
84
+
85
+
86
  app = FastAPI(
87
+ title="News-Whisper Intelligent API",
88
+ description="Search API with Lazy Audio Generation and Priority Queueing.",
89
+ version="3.0.0",
90
+ lifespan=lifespan,
 
 
 
 
91
  )
92
 
 
 
93
 
94
  # ─────────────────────────────────────────────
95
  # Request / Response Models
 
108
  url: str
109
  author: str
110
  audio_url: Optional[str] = None
111
+ audio_status: str = "queued"
112
 
113
  class SearchResponse(BaseModel):
114
  status: str
 
116
  articles: List[ArticleResponse] = []
117
  audio_pending: bool = False
118
 
119
class GenerateAudioRequest(BaseModel):
    # Payload for POST /generate-audio (frontend lazy-pull).
    article: dict   # full article record as returned by /search; must carry an 'id'
    language: str   # lowercased by the endpoint before queueing
    query: str      # original search query; stripped by the endpoint
123
+
124
 
125
  # ─────────────────────────────────────────────
126
  # Endpoints
 
134
 
135
  @app.get("/health")
136
  def health_check():
137
+ """Keep-alive endpoint."""
138
+ return {"status": "alive", "queue_size": tts_queue.qsize()}
139
 
140
 
141
  @app.post("/search", response_model=SearchResponse)
142
+ def search(req: SearchRequest):
143
  """
144
+ Phase 1: Scrape & Groq Summarize (Top 5). Returns in ~5s.
145
+ Phase 2: Silently adds Top 2 to the background TTS Priority Queue (priority=10).
 
 
146
  """
147
  if req.language not in ["english", "hindi"]:
148
  raise HTTPException(status_code=400, detail="Language must be 'english' or 'hindi'")
 
156
  print(f"SEARCH REQUEST: '{query}' ({language})")
157
  print(f"{'='*80}\n")
158
 
 
159
  try:
160
  articles = _phase1_scrape_and_summarize(query, language, req.pages, req.no_dedup)
161
  except Exception as e:
 
167
  status="empty",
168
  message=f"No articles found for '{query}'.",
169
  articles=[],
 
170
  )
171
 
172
+ # ── Phase 2: Preload Top 2 silently ───────────────────────────────────────
173
+ for article in articles[:2]:
174
+ tts_queue.put_nowait((10, time.monotonic(), {
175
+ "article": dict(article),
176
+ "language": language,
177
+ "query": query
178
+ }))
179
 
180
  return SearchResponse(
181
  status="ready",
182
+ message=f"Found {len(articles)} articles for '{query}'. Preloading audio...",
183
  articles=[
184
  ArticleResponse(
185
  id=a.get("id", ""),
 
188
  url=a.get("url", ""),
189
  author=a.get("author", ""),
190
  audio_url=None,
191
+ audio_status="queued"
192
  )
193
  for a in articles
194
  ],
 
196
  )
197
 
198
 
199
+ @app.post("/generate-audio", status_code=202)
200
+ def generate_audio_on_demand(req: GenerateAudioRequest):
201
+ """
202
+ Phase 3: Frontend Pull.
203
+ Adds a specific article to the FRONT of the queue (priority=1).
204
+ Used when a user scrolls an article into view or explicitly clicks it.
205
+ """
206
+ article_id = req.article.get("id")
207
+ print(f"[API] Priority Generation requested for: {article_id}")
208
+
209
+ tts_queue.put_nowait((1, time.monotonic(), {
210
+ "article": req.article,
211
+ "language": req.language.lower(),
212
+ "query": req.query.strip()
213
+ }))
214
+
215
+ return {"status": "queued", "message": f"Article {article_id} added to priority queue"}
216
+
217
+
218
  # ─────────────────────────────────────────────
219
+ # Pipeline Logic
220
  # ─────────────────────────────────────────────
221
 
222
+ def _phase1_scrape_and_summarize(query: str, language: str, pages: int, no_dedup: bool) -> List[Dict]:
223
+ """Sync Scrape + Groq API + Database Insert."""
 
 
 
 
 
 
 
224
  t0 = time.monotonic()
225
  project_root = get_project_root()
226
  scraper_script = project_root / "backend" / "web_scraping" / "news_scrape.py"
227
  safe_query = sanitize_query_folder(query)
228
 
 
229
  print(f"[Phase 1] Step 1/3: Scraping articles...")
230
  result = subprocess.run(
231
  [sys.executable, str(scraper_script), f"--{language}", "--search", query, "--pages", str(max(1, pages))],
 
237
  print(f"[Phase 1] Scraper stderr: {result.stderr[-500:]}")
238
  raise RuntimeError("Web scraping failed")
239
 
 
240
  scraped_dir = project_root / "articles" / language / "search_queries" / safe_query
241
  latest_json = find_latest_json(scraped_dir)
242
  if not latest_json:
 
245
  with open(latest_json, "r", encoding="utf-8") as f:
246
  articles = json.load(f)
247
 
 
 
 
248
  print(f"[Phase 1] Step 2/3: Summarizing top 5 via Groq...")
 
249
  summarized = summarize_with_groq(articles, language, max_articles=5)
250
+
 
251
  if not summarized:
252
  raise RuntimeError("Groq summarization returned empty results")
253
 
 
254
  print(f"[Phase 1] Step 3/3: Inserting into Supabase...")
255
  for article in summarized:
256
+ article["audio_url"] = ""
257
+ article["audio_status"] = "queued"
258
  db.insert_article(article)
259
 
260
+ print(f"[Phase 1] βœ… Complete in {time.monotonic() - t0:.1f}s β€” {len(summarized)} articles ready")
 
261
  return summarized
262
 
263
 
264
def _sync_generate_single_audio(article: Dict, language: str, query: str) -> Optional[str]:
    """Run Kokoro TTS for one article and upload the clip to Cloudinary.

    Executed inside a threadpool by the queue worker. Returns the
    Cloudinary URL on success, or None when generation produced no
    local file or any step raised.
    """
    # Imported lazily so the heavy TTS stack loads only when needed.
    from backend.text_to_speech.tts import generate_audio
    from backend.services.delivery import DeliveryService

    delivery = DeliveryService()
    output_dir = delivery.get_audio_output_dir(language, query, is_search=True)

    try:
        results = generate_audio([article], language, output_dir)
        if not results:
            return None

        local_path = results[0].get("local_audio_path")
        if not local_path or not os.path.exists(local_path):
            return None

        timestamp = delivery._get_timestamp_folder()
        safe_target = query.replace(" ", "_").lower()
        cloud_folder = f"audios/{language}/search_queries/{safe_target}/{timestamp}"

        return upload_file(local_path, cloud_folder, resource_type="auto")
    except Exception as e:
        print(f"[_sync_generate_single_audio] Error: {e}")
        return None