|
|
import re |
|
|
from loguru import logger |
|
|
from .serp_utils import serpapi_web_search |
|
|
from .article_extractor import fetch_article_text_from_url |
|
|
|
|
|
# SerpAPI credential placeholder. Within the visible code it is only consulted
# as a truthiness gate in analyze_url before the SERP fallback runs; it is not
# passed to serpapi_web_search from this module.
# NOTE(review): a real key should presumably be loaded from configuration or
# the environment rather than committed to source — confirm with the owner.
SERPAPI_KEY = "your-serpapi-key" |
|
|
|
|
|
def _serp_snippet_text(serp_response, max_chars=3000):
    """Join the non-empty organic-result snippets of a SERP response.

    The expected shape is ``{"result": {"organic_results": [{"snippet": ...}]}}``.
    Any missing, ``None`` or wrongly-typed level yields ``""`` instead of
    raising, so the caller can fall back to headline-only output.

    Args:
        serp_response: Value returned by ``serpapi_web_search``.
        max_chars: Upper bound on the length of the joined text.

    Returns:
        Snippets separated by blank lines, truncated to ``max_chars``
        characters, or ``""`` when no usable snippet is present.
    """
    if not isinstance(serp_response, dict):
        return ""
    # ``dict.get(key, default)`` only uses the default when the key is absent;
    # a key that is present with a ``None`` value has to be handled explicitly.
    payload = serp_response.get("result")
    if not isinstance(payload, dict):
        return ""
    organic_results = payload.get("organic_results") or []
    snippets = [
        entry.get("snippet")
        for entry in organic_results
        if isinstance(entry, dict)
    ]
    usable = [text for text in snippets if isinstance(text, str) and text]
    return "\n\n".join(usable)[:max_chars]


def analyze_url(news_url: str, run_serp: bool):
    """Extract article text for a news URL, with an optional SERP fallback.

    Outcomes, in order of precedence:

    1. The extractor returned text: it is passed through with an empty note.
    2. No text, and the fallback is not possible (no headline, ``run_serp``
       is false, or ``SERPAPI_KEY`` is empty): a "Failed to extract"
       placeholder is returned with an empty note.
    3. No text, fallback ran and produced snippets: the snippets (at most
       3000 characters) become the article text and the note flags this.
    4. No text, fallback ran but produced nothing usable: a headline-only
       placeholder is returned and the note flags limited reliability.

    Args:
        news_url: URL handed to ``fetch_article_text_from_url``.
        run_serp: Whether a web search on the headline may be attempted when
            no article text could be extracted.

    Returns:
        A ``(article_text, headline, qa_fallback_note)`` tuple.
    """
    article_text, headline = fetch_article_text_from_url(news_url)
    qa_fallback_note = ""

    if article_text:
        return article_text, headline, qa_fallback_note

    if not (headline and run_serp and SERPAPI_KEY):
        article_text = f"(Failed to extract article text from URL: {news_url})"
        return article_text, headline, qa_fallback_note

    # The search is keyed on the headline, the only content we managed to get.
    serp_text = _serp_snippet_text(serpapi_web_search(headline, num=8))

    if serp_text:
        article_text = f"(SERP fallback for headline: {headline})\n\n{serp_text}"
        qa_fallback_note = (
            "Note: full article text unavailable — using SERP snippets for analysis. "
            "Please verify date/location in original sources."
        )
    else:
        article_text = f"(No extractable text) Headline: {headline}"
        qa_fallback_note = "Note: only headline extracted — limited reliability."

    return article_text, headline, qa_fallback_note
|
|
|
|
|
|
|
|
def analyze_text(news_text: str):
    """Normalize directly supplied article text.

    Surrounding whitespace is stripped. The result is a three-element tuple
    whose second and third elements are always empty strings, matching the
    three-element return of ``analyze_url``.

    Args:
        news_text: Raw text to analyse; a falsy value is treated as empty.

    Returns:
        ``(stripped_text, "", "")``.

    Raises:
        ValueError: If nothing remains after stripping.
    """
    cleaned = news_text.strip() if news_text else ""
    if cleaned:
        return cleaned, "", ""
    raise ValueError("Empty text input provided.")
|
|
|