Devang1290 commited on
Commit
e15f0bd
·
1 Parent(s): d50eed9

feat: two-phase response - Groq summarization + async TTS with progressive Supabase updates

Browse files
backend/services/database.py CHANGED
@@ -93,3 +93,20 @@ class DatabaseManager:
93
  except Exception as e:
94
  logger.error(f"Error inserting article {article_data.get('id')}: {str(e)}")
95
  return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  except Exception as e:
94
  logger.error(f"Error inserting article {article_data.get('id')}: {str(e)}")
95
  return False
96
+
97
+ def update_audio_url(self, article_id: str, audio_url: str) -> bool:
98
+ """
99
+ Updates the audio_url for a specific article in the articles table.
100
+ Called progressively as each TTS clip finishes generating.
101
+ """
102
+ if not self.supabase:
103
+ return False
104
+
105
+ try:
106
+ self.supabase.table("articles").update(
107
+ {"audio_url": audio_url}
108
+ ).eq("id", article_id).execute()
109
+ return True
110
+ except Exception as e:
111
+ logger.error(f"Error updating audio_url for {article_id}: {str(e)}")
112
+ return False
backend/summarize_articles/groq_summarizer.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Groq-based Summarizer for On-Demand Search
3
+ ============================================
4
+ Uses Groq API (llama-3.3-70b) for fast summarization (~3s for 5 articles)
5
+ instead of local t5-small CPU inference (~100s for 15 articles).
6
+
7
+ This module is used ONLY for the on-demand search API.
8
+ The batch feed pipeline continues using t5-small for CI determinism.
9
+ """
10
+
11
+ import os
12
+ import sys
13
+ import time
14
+ from pathlib import Path
15
+ from typing import List, Dict, Optional
16
+ from datetime import datetime, timezone
17
+
18
+ sys.path.append(str(Path(__file__).resolve().parent.parent))
19
+ from core.logger import logger
20
+ from core.config import config
21
+
22
+ try:
23
+ from groq import Groq
24
+ except ImportError:
25
+ Groq = None
26
+
27
+
28
+ _groq_client = None
29
+
30
+
31
def _get_groq_client() -> Optional[object]:
    """Return the module-level Groq client, creating it on first use.

    Returns None (after logging an error) when the API key is absent or
    the groq package is not installed; a later call retries creation.
    """
    global _groq_client
    if _groq_client is not None:
        return _groq_client

    api_key = config.GROQ_API_KEY or os.getenv("GROQ_API_KEY")
    if not api_key:
        logger.error("GROQ_API_KEY is not set. Cannot use Groq summarization.")
        return None
    if Groq is None:
        logger.error("groq package not installed.")
        return None

    _groq_client = Groq(api_key=api_key)
    return _groq_client
44
+
45
+
46
def _summarize_one_english(client, title: str, content: str) -> str:
    """Produce a 2-3 sentence English broadcast-style summary of one article via Groq."""
    system_prompt = (
        "You are a professional news anchor. Summarize the given news article "
        "into a smooth, natural 2-3 sentence broadcast script. "
        "Keep it concise and informative. Write in a conversational tone suitable "
        "for text-to-speech. Output ONLY the summary text, nothing else."
    )
    # Cap article body at 3000 chars to stay within prompt budget.
    user_prompt = f"Title: {title}\n\nArticle:\n{content[:3000]}"

    completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.3,
        max_tokens=300,
    )
    return completion.choices[0].message.content.strip()
69
+
70
+
71
def _summarize_one_hindi(client, title: str, content: str) -> str:
    """Produce a 2-3 sentence Hindi broadcast-style summary of one article via Groq."""
    system_prompt = (
        "You are a Hindi news anchor. Summarize the given Hindi news article "
        "into a natural, smooth 2-3 sentence broadcast script in Hindi. "
        "Use simple words. Write all numbers in Hindi words (e.g. ΰ€¦ΰ€Έ, ΰ€Έΰ€Ύΰ€€). "
        "Output ONLY the Hindi summary text, nothing else, no quotes."
    )
    # Cap article body at 3000 chars to stay within prompt budget.
    user_prompt = f"ΰ€Άΰ₯€ΰ€°ΰ₯ΰ€·ΰ€•: {title}\n\nΰ€²ΰ₯‡ΰ€–:\n{content[:3000]}"

    completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.3,
        # Hindi output tends to use more tokens than English (300 there).
        max_tokens=500,
    )
    return completion.choices[0].message.content.strip()
94
+
95
+
96
def summarize_with_groq(
    articles: List[Dict],
    language: str,
    max_articles: int = 5,
) -> List[Dict]:
    """
    Summarize articles using Groq API.

    Args:
        articles: List of scraped article dicts with 'title' and 'content' keys.
        language: 'english' or 'hindi'
        max_articles: Maximum number of articles to process (default: 5)

    Returns:
        List of article dicts with 'summary' field added and 'summarized'
        set to True/False. Articles whose Groq call fails are kept with a
        truncated-content fallback summary rather than dropped. Returns an
        empty list if the Groq client is unavailable. Input dicts are
        mutated in place.
    """
    client = _get_groq_client()
    if client is None:
        logger.error("Groq client unavailable. Returning empty results.")
        return []

    # Take only top N articles
    articles = articles[:max_articles]
    logger.info(f"Summarizing {len(articles)} articles via Groq ({language})...")

    summarize_fn = _summarize_one_hindi if language == "hindi" else _summarize_one_english
    processed = []

    for idx, article in enumerate(articles, 1):
        title = article.get("title", "Untitled")
        content = article.get("content", "")

        if not content:
            logger.warning(f"[{idx}/{len(articles)}] Skipped (no content): {title[:50]}")
            continue

        try:
            # Re-encode the title defensively so consoles with a narrow
            # encoding cannot break the log call.
            safe_title = title[:50].encode("utf-8", errors="replace").decode("utf-8")
            logger.info(f"[{idx}/{len(articles)}] Summarizing: {safe_title}...")

            summary = summarize_fn(client, title, content)
            article["summary"] = summary
            article["summarized"] = True
            article["summary_generated_at"] = datetime.now(timezone.utc).isoformat()
            processed.append(article)

            # Small delay to respect Groq rate limits (30 RPM free tier)
            if idx < len(articles):
                time.sleep(2)

        except Exception as e:
            logger.error(f"Groq summarization failed for article {article.get('id')}: {e}")
            # Include the article without a real summary rather than dropping it.
            # Fix: only append an ellipsis when the content was actually truncated,
            # so short articles are not suffixed with a misleading "...".
            article["summary"] = content[:500] + ("..." if len(content) > 500 else "")
            article["summarized"] = False
            processed.append(article)

    logger.success(f"Groq summarization complete: {sum(1 for a in processed if a.get('summarized'))} / {len(processed)} articles.")
    return processed
hf_app.py CHANGED
@@ -1,64 +1,253 @@
 
 
 
 
 
 
 
 
 
 
1
  import sys
2
  import os
 
 
 
 
 
 
 
3
  from fastapi import FastAPI, BackgroundTasks, HTTPException
4
- from pydantic import BaseModel
5
- from typing import Optional
6
 
7
- # Ensure project root is in path so we can import main
8
- sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 
9
 
10
- from main import process_search
 
 
 
11
 
12
  app = FastAPI(
13
  title="News-Whisper On-Demand API",
14
- description="Asynchronous API for running the News Whisper pipeline on-demand.",
15
- version="1.0.0"
 
 
 
 
 
16
  )
17
 
 
 
 
 
 
 
 
18
  class SearchRequest(BaseModel):
19
- query: str
20
- language: str # "english" or "hindi"
21
- pages: Optional[int] = 1
22
- no_dedup: Optional[bool] = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  @app.get("/health")
25
  def health_check():
26
- """
27
- Keep-alive endpoint.
28
- A GET request here prevents the Hugging Face space from sleeping after 48h.
29
- """
30
  return {"status": "alive"}
31
 
32
- def run_pipeline(language: str, query: str, no_dedup: bool, pages: int):
33
- """
34
- Wrapper function to execute the heavy ML pipeline in the background.
35
- """
36
- try:
37
- print(f"Starting background search task: '{query}' ({language})")
38
- success = process_search(language, query, no_dedup, pages)
39
- if not success:
40
- print(f"❌ Background pipeline failed for query: {query}")
41
- else:
42
- print(f"βœ… Background pipeline succeeded for query: {query}")
43
- except Exception as e:
44
- print(f"❌ Exception during pipeline execution: {e}")
45
 
46
- @app.post("/search")
47
- def trigger_search(req: SearchRequest, background_tasks: BackgroundTasks):
48
  """
49
- Triggers the on-demand search pipeline asynchronously.
 
 
 
50
  """
51
  if req.language not in ["english", "hindi"]:
52
  raise HTTPException(status_code=400, detail="Language must be 'english' or 'hindi'")
53
-
54
  if not req.query.strip():
55
  raise HTTPException(status_code=400, detail="Search query cannot be empty")
56
 
57
- # Add the heavy pipeline to the BackgroundTasks queue
58
- # This allows FastAPI to return a 202 Accepted instantly while the TTS runs.
59
- background_tasks.add_task(run_pipeline, req.language, req.query, req.no_dedup, req.pages)
60
 
61
- return {
62
- "status": "processing",
63
- "message": f"Search for '{req.query}' in {req.language} has been queued. Listen to Supabase Realtime for the final output."
64
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ News-Whisper On-Demand Search API (v2 β€” Two-Phase Response)
3
+ ===========================================================
4
+ Phase 1 (Synchronous): Scrape β†’ Groq Summarize (top 5) β†’ Insert to Supabase β†’ Return articles
5
+ Phase 2 (Background): Kokoro TTS per article β†’ Upload to Cloudinary β†’ Update audio_url in Supabase
6
+
7
+ The frontend subscribes to Supabase Realtime and progressively unlocks
8
+ Play buttons as each audio_url changes from null to a Cloudinary URL.
9
+ """
10
+
11
  import sys
12
  import os
13
+ import json
14
+ import subprocess
15
+ import time
16
+ from pathlib import Path
17
+ from typing import List, Dict, Optional
18
+ from datetime import datetime, timezone
19
+
20
  from fastapi import FastAPI, BackgroundTasks, HTTPException
21
+ from fastapi.responses import RedirectResponse
22
+ from pydantic import BaseModel, Field
23
 
24
+ # Ensure project root is in path
25
+ PROJECT_ROOT = Path(__file__).parent.resolve()
26
+ sys.path.append(str(PROJECT_ROOT))
27
 
28
+ from backend.summarize_articles.groq_summarizer import summarize_with_groq
29
+ from backend.services.database import DatabaseManager
30
+ from backend.services.cloud import upload_file
31
+ from backend.common.paths import get_project_root, sanitize_query_folder, find_latest_json
32
 
33
  app = FastAPI(
34
  title="News-Whisper On-Demand API",
35
+ description=(
36
+ "Two-phase search API for News Whisper.\n\n"
37
+ "**Phase 1:** Returns article summaries in ~5 seconds.\n\n"
38
+ "**Phase 2:** Generates audio in the background (~65s). "
39
+ "Subscribe to Supabase Realtime to get progressive audio updates."
40
+ ),
41
+ version="2.0.0",
42
  )
43
 
44
+ db = DatabaseManager()
45
+
46
+
47
+ # ─────────────────────────────────────────────
48
+ # Request / Response Models
49
+ # ─────────────────────────────────────────────
50
+
51
class SearchRequest(BaseModel):
    """Request body for POST /search."""
    query: str = Field(..., description="Search term", json_schema_extra={"examples": ["cricket"]})
    language: str = Field(..., description="Language: 'english' or 'hindi'", json_schema_extra={"examples": ["english"]})
    pages: Optional[int] = Field(1, description="Number of search result pages to scrape")
    no_dedup: Optional[bool] = Field(False, description="Skip duplicate article checking")
56
+
57
class ArticleResponse(BaseModel):
    """One summarized article as returned by Phase 1 of /search."""
    id: str
    title: str
    summary: str
    url: str
    author: str
    # Filled in later by the background TTS task; always None in the
    # initial /search response.
    audio_url: Optional[str] = None
64
+
65
class SearchResponse(BaseModel):
    """Envelope returned by POST /search: status, message, and Phase 1 articles."""
    status: str
    message: str
    articles: List[ArticleResponse] = []
    # True while Phase 2 TTS generation is still running in the background.
    audio_pending: bool = False
70
+
71
+
72
+ # ─────────────────────────────────────────────
73
+ # Endpoints
74
+ # ─────────────────────────────────────────────
75
+
76
@app.get("/", include_in_schema=False)
def root_redirect():
    """Send visitors hitting the bare root path to the Swagger UI."""
    return RedirectResponse("/docs")
80
+
81
 
82
  @app.get("/health")
83
  def health_check():
84
+ """Keep-alive endpoint. Pinged by GitHub Actions to prevent the HF Space from sleeping."""
 
 
 
85
  return {"status": "alive"}
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
@app.post("/search", response_model=SearchResponse)
def search(req: SearchRequest, background_tasks: BackgroundTasks):
    """
    Triggers the on-demand search pipeline.

    **Phase 1 (sync, ~5s):** Scrapes articles, summarizes top 5 via Groq, inserts into Supabase.
    **Phase 2 (async, ~65s):** Generates Kokoro TTS audio for each article and progressively updates Supabase.

    Raises 400 for an unknown language or empty query; 500 if Phase 1 fails.
    """
    # Fix: normalize BEFORE validating. Previously "English" / "HINDI" were
    # rejected with a 400 even though the pipeline lowercases the language
    # anyway; this accepts any casing while rejecting the same bad inputs.
    query = req.query.strip()
    language = req.language.strip().lower()

    if language not in ["english", "hindi"]:
        raise HTTPException(status_code=400, detail="Language must be 'english' or 'hindi'")
    if not query:
        raise HTTPException(status_code=400, detail="Search query cannot be empty")

    print(f"\n{'='*80}")
    print(f"SEARCH REQUEST: '{query}' ({language})")
    print(f"{'='*80}\n")

    # Phase 1: synchronous scrape + Groq summarize + Supabase insert.
    try:
        articles = _phase1_scrape_and_summarize(query, language, req.pages, req.no_dedup)
    except Exception as e:
        print(f"❌ Phase 1 failed: {e}")
        raise HTTPException(status_code=500, detail=f"Pipeline failed: {str(e)}")

    if not articles:
        return SearchResponse(
            status="empty",
            message=f"No articles found for '{query}'.",
            articles=[],
            audio_pending=False,
        )

    # Phase 2: TTS runs after this response is sent; the frontend watches
    # Supabase Realtime for each audio_url update.
    background_tasks.add_task(_phase2_generate_audio, articles, language, query)

    return SearchResponse(
        status="ready",
        message=f"Found {len(articles)} articles for '{query}'. Audio is generating in the background.",
        articles=[
            ArticleResponse(
                id=a.get("id", ""),
                title=a.get("title", ""),
                summary=a.get("summary", ""),
                url=a.get("url", ""),
                author=a.get("author", ""),
                audio_url=None,
            )
            for a in articles
        ],
        audio_pending=True,
    )
142
+
143
+
144
+ # ─────────────────────────────────────────────
145
+ # Phase 1: Scrape + Groq Summarize + Insert
146
+ # ─────────────────────────────────────────────
147
+
148
def _phase1_scrape_and_summarize(
    query: str, language: str, pages: int, no_dedup: bool
) -> List[Dict]:
    """
    Runs synchronously:
      1. Scrape articles via subprocess (reuses existing scraper)
      2. Summarize top 5 via Groq API
      3. Insert articles into Supabase (audio_url = null)

    Raises RuntimeError when scraping, output discovery, or summarization
    fails; subprocess.TimeoutExpired if the scraper exceeds 60s.

    NOTE(review): no_dedup is accepted but never forwarded to the scraper —
    confirm whether the scraper CLI supports a dedup flag.
    """
    t0 = time.monotonic()
    project_root = get_project_root()
    scraper_script = project_root / "backend" / "web_scraping" / "news_scrape.py"
    safe_query = sanitize_query_folder(query)

    # ── Step 1: Scrape ────────────────────────────────────────────────────────
    print("[Phase 1] Step 1/3: Scraping articles...")
    # Fix: `pages` comes from an Optional[int] field, so a client sending
    # "pages": null previously crashed max(1, None) with a TypeError.
    page_count = max(1, pages or 1)
    result = subprocess.run(
        [sys.executable, str(scraper_script), f"--{language}", "--search", query, "--pages", str(page_count)],
        capture_output=True,
        text=True,
        timeout=60,
    )
    if result.returncode != 0:
        # Tail of stderr only, to keep logs readable.
        print(f"[Phase 1] Scraper stderr: {result.stderr[-500:]}")
        raise RuntimeError("Web scraping failed")

    # Find the scraped JSON written by the scraper subprocess.
    scraped_dir = project_root / "articles" / language / "search_queries" / safe_query
    latest_json = find_latest_json(scraped_dir)
    if not latest_json:
        raise RuntimeError(f"No scraped articles found in {scraped_dir}")

    with open(latest_json, "r", encoding="utf-8") as f:
        articles = json.load(f)

    print(f"[Phase 1] Scraped {len(articles)} articles in {time.monotonic() - t0:.1f}s")

    # ── Step 2: Groq Summarize (top 5) ────────────────────────────────────────
    print("[Phase 1] Step 2/3: Summarizing top 5 via Groq...")
    t1 = time.monotonic()
    summarized = summarize_with_groq(articles, language, max_articles=5)
    print(f"[Phase 1] Summarized {len(summarized)} articles via Groq in {time.monotonic() - t1:.1f}s")

    if not summarized:
        raise RuntimeError("Groq summarization returned empty results")

    # ── Step 3: Insert into Supabase (audio_url = null) ───────────────────────
    print("[Phase 1] Step 3/3: Inserting into Supabase...")
    for article in summarized:
        article["audio_url"] = ""  # Will be updated by Phase 2
        db.insert_article(article)

    total = time.monotonic() - t0
    print(f"[Phase 1] βœ… Complete in {total:.1f}s β€” {len(summarized)} articles ready")
    return summarized
203
+
204
+
205
+ # ─────────────────────────────────────────────
206
+ # Phase 2: TTS Generation (Background)
207
+ # ─────────────────────────────────────────────
208
+
209
def _phase2_generate_audio(articles: List[Dict], language: str, query: str):
    """
    Runs in the background after the HTTP response is sent.
    Generates Kokoro TTS for each article, uploads each clip to Cloudinary,
    and progressively writes its URL into Supabase via db.update_audio_url
    so the frontend (listening on Supabase Realtime) can unlock Play
    buttons one article at a time.

    Never raises: failures are logged with a traceback and swallowed,
    since there is no caller left to handle them.
    """
    print(f"\n[Phase 2] Starting TTS generation for {len(articles)} articles...")
    t0 = time.monotonic()

    try:
        # Imported lazily so the heavy TTS stack is not loaded at API startup.
        from backend.text_to_speech.tts import generate_audio
        from backend.services.delivery import DeliveryService

        delivery = DeliveryService()
        output_dir = delivery.get_audio_output_dir(language, query, is_search=True)

        # Generate audio for all articles
        articles_with_audio = generate_audio(articles, language, output_dir)

        # Upload each audio to Cloudinary and update Supabase progressively.
        # NOTE(review): _get_timestamp_folder is a private DeliveryService
        # helper — consider exposing a public accessor.
        timestamp = delivery._get_timestamp_folder()
        parent_folder = "search_queries"
        safe_target = query.replace(" ", "_").lower()

        for article in articles_with_audio:
            article_id = article.get("id")
            local_audio = article.get("local_audio_path")

            if not (local_audio and os.path.exists(local_audio)):
                # Fix: previously a missing clip was skipped silently,
                # leaving audio_url empty with no trace in the logs.
                print(f"[Phase 2] WARNING: no local audio for {article_id}; skipping upload")
                continue

            cloud_folder = f"audios/{language}/{parent_folder}/{safe_target}/{timestamp}"
            audio_url = upload_file(local_audio, cloud_folder, resource_type="auto")

            if audio_url:
                # Progressive update: frontend sees this via Supabase Realtime
                db.update_audio_url(article_id, audio_url)
                print(f"[Phase 2] βœ… Audio ready for {article_id}: {audio_url[:80]}...")

        total = time.monotonic() - t0
        print(f"[Phase 2] βœ… TTS complete in {total:.1f}s β€” all audio uploaded")

    except Exception as e:
        print(f"[Phase 2] ❌ TTS generation failed: {e}")
        import traceback
        traceback.print_exc()