"""
NewsAPI.org Adapter

Provides real-time news from 80,000+ sources worldwide.
Best for temporal queries requiring fresh, breaking news.

Features:
- Real-time updates (15-minute refresh)
- 80,000+ sources including African outlets
- Structured data (title, description, content, source, publishedAt)
- Free tier: 100 requests/day
- Paid tier: $449/month for production

Get API key: https://newsapi.org/register
"""

import asyncio
import logging
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

try:
    import httpx
except ImportError:  # Keep the module importable; the adapter disables itself.
    httpx = None

logger = logging.getLogger(__name__)

# httpx signals timeouts with its own exception hierarchy, not asyncio's.
_TIMEOUT_ERRORS = (asyncio.TimeoutError,) + (
    (httpx.TimeoutException,) if httpx is not None else ()
)


class NewsAPIAdapter:
    """
    Adapter for NewsAPI.org real-time news search.

    Provides fresh news results to complement database search.
    Designed to be fast (2s timeout) and resilient (graceful fallbacks):
    every public coroutine returns an empty list instead of raising.
    """

    BASE_URL = "https://newsapi.org/v2"

    # NewsAPI rejects page sizes above this value.
    _MAX_PAGE_SIZE = 100

    # Domains that are NOT news sources — filter these out
    _NON_NEWS_DOMAINS = {
        "pypi.org", "github.com", "stackoverflow.com", "reddit.com",
        "wikipedia.org", "arxiv.org", "researchgate.net", "academia.edu",
        "linkedin.com", "facebook.com", "twitter.com", "x.com",
        "youtube.com", "instagram.com", "tiktok.com", "amazon.com",
        "ebay.com", "etsy.com", "plos.org", "pubmed.ncbi.nlm.nih.gov",
        "springer.com", "stemlynsblog.org",
    }

    # HTTP statuses of /everything with a dedicated log level and message.
    _STATUS_MESSAGES = {
        401: (logging.ERROR, "NewsAPI: Invalid API key"),
        429: (
            logging.WARNING,
            "NewsAPI: Rate limit exceeded (100 requests/day on free tier)",
        ),
        426: (
            logging.WARNING,
            "NewsAPI: Upgrade required (free tier limitations)",
        ),
    }

    # (upper age bound in minutes, score), checked in order.
    _FRESHNESS_TIERS = (
        (10, 1.0),     # < 10 min
        (60, 0.98),    # < 1 hour
        (360, 0.95),   # < 6 hours
        (1440, 0.9),   # < 24 hours
    )
    _FRESHNESS_OLD = 0.85      # Older but still from live search
    _FRESHNESS_UNKNOWN = 0.8   # Missing/unparseable date, assume recent

    def __init__(
        self,
        api_key: str,
        timeout: float = 2.0,
        max_results: int = 20
    ):
        """
        Initialize NewsAPI adapter.

        The adapter disables itself (``api_key`` is reset to ``None``) when
        the key is missing, is the placeholder value, or httpx is not
        installed.

        Args:
            api_key: NewsAPI.org API key
            timeout: Maximum time to wait for results (seconds)
            max_results: Maximum number of results to return
        """
        self.api_key = api_key
        self.timeout = timeout
        self.max_results = max_results
        self.client = None  # httpx.AsyncClient, created lazily

        if httpx is None:
            logger.warning("httpx is not installed - NewsAPI adapter disabled")
            self.api_key = None
        elif not api_key or api_key == "your-newsapi-key-here":
            logger.warning("NewsAPI key not configured - adapter disabled")
            self.api_key = None
        else:
            logger.info(f"NewsAPI adapter initialized (timeout={timeout}s, max={max_results})")

    async def _ensure_client(self):
        """Lazy initialization of HTTP client"""
        if self.client is None:
            self.client = httpx.AsyncClient(
                timeout=self.timeout,
                headers={
                    "X-Api-Key": self.api_key,
                    "User-Agent": "ARKI-AI-RAG/2.5 (Ethiopia News Assistant)"
                }
            )

    @staticmethod
    def _build_query(query: str) -> str:
        """
        Build the NewsAPI ``q`` expression for a free-text query.

        Embedded double quotes are removed so they cannot break the quoted
        terms. A single word is anchored to Ethiopia/Africa; otherwise the
        first three terms are AND-ed together.

        Returns:
            The query expression, or "" when the query has no usable terms.
        """
        words = [
            cleaned
            for cleaned in (raw.replace('"', "") for raw in (query or "").split())
            if cleaned
        ]
        if not words:
            return ""
        if len(words) == 1:
            # Single word: anchor to Ethiopia news explicitly
            return f'"{words[0]}" AND ("Ethiopia" OR "Africa" OR "Horn of Africa")'
        # Short query: AND all terms; longer query: first 3 key terms
        return " AND ".join(f'"{w}"' for w in words[:3])

    @classmethod
    def _is_non_news_domain(cls, domain: str) -> bool:
        """
        Return True if ``domain`` is a blocked domain or a subdomain of one.

        Matching is on whole labels, so "x.com" blocks "x.com" and
        "mobile.x.com" but not "vox.com".
        """
        if not domain:
            return False
        host = domain.lower().split(":")[0]  # drop an explicit port
        return any(
            host == blocked or host.endswith("." + blocked)
            for blocked in cls._NON_NEWS_DOMAINS
        )

    async def search(
        self,
        query: str,
        language: str = "en",
        sort_by: str = "publishedAt",
        from_date: Optional[str] = None,
        max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search NewsAPI for the given query.

        Always anchors to Ethiopia/Africa context for single-word queries.
        Filters out non-news domains (pypi, github, academic, social media).

        Args:
            query: Free-text search query
            language: Language code passed to NewsAPI
            sort_by: NewsAPI ``sortBy`` value
            from_date: Optional lower bound, sent as the ``from`` parameter
            max_results: Override default max_results

        Returns:
            List of normalized search results; empty on any failure,
            when the adapter is disabled, or when the query is blank.
        """
        if not self.api_key:
            logger.warning("NewsAPI unavailable - returning empty results")
            return []

        search_q = self._build_query(query)
        if not search_q:
            # A blank query would only burn a rate-limited request.
            logger.debug("[NewsAPI] Empty query - skipping request")
            return []

        await self._ensure_client()
        max_results = max_results or self.max_results

        params = {
            "q": search_q,
            "language": language,
            "sortBy": sort_by,
            # Fetch extra to allow filtering
            "pageSize": min(max_results * 2, self._MAX_PAGE_SIZE),
        }
        if from_date:
            params["from"] = from_date

        try:
            logger.info(f"[NewsAPI] Searching: '{search_q}' (lang={language})")
            response = await self.client.get(
                f"{self.BASE_URL}/everything", params=params
            )

            if response.status_code != 200:
                level, message = self._STATUS_MESSAGES.get(
                    response.status_code,
                    (
                        logging.WARNING,
                        f"NewsAPI returned status {response.status_code}: {response.text[:200]}",
                    ),
                )
                logger.log(level, message)
                return []

            data = response.json()
            if data.get("status") != "ok":
                logger.warning(f"NewsAPI error: {data.get('message', 'unknown')}")
                return []

            results = []
            filtered_out = 0
            for article in data.get("articles") or []:
                if not isinstance(article, dict):
                    continue

                # Filter non-news domains
                domain = self._extract_domain(article.get("url") or "")
                if self._is_non_news_domain(domain):
                    filtered_out += 1
                    logger.debug(f"[NewsAPI] Filtered non-news: {domain}")
                    continue

                normalized = self._normalize_result(article, language=language)
                if normalized:
                    results.append(normalized)
                if len(results) >= max_results:
                    break

            if filtered_out:
                logger.info(f"[NewsAPI] Filtered {filtered_out} non-news articles")
            logger.info(
                f"[NewsAPI] '{query[:50]}' → {len(results)} results "
                f"(total available: {data.get('totalResults', 0)})"
            )
            return results

        except _TIMEOUT_ERRORS:
            logger.warning(f"NewsAPI timeout ({self.timeout}s)")
            return []
        except Exception as e:
            # Best-effort adapter: never propagate failures to the caller.
            logger.error(f"NewsAPI search error: {e}")
            return []

    async def search_top_headlines(
        self,
        country: str = "us",
        category: Optional[str] = None,
        max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Get top headlines from NewsAPI.

        Args:
            country: Country code (us, gb, etc.) - Note: Ethiopia (et) not supported
            category: Category (business, entertainment, general, health,
                science, sports, technology)
            max_results: Override default max_results

        Returns:
            List of normalized search results; empty on any failure or when
            the adapter is disabled.
        """
        if not self.api_key:
            logger.warning("NewsAPI unavailable - returning empty results")
            return []

        await self._ensure_client()
        max_results = max_results or self.max_results

        params = {
            "country": country,
            "pageSize": min(max_results, self._MAX_PAGE_SIZE),
        }
        if category:
            params["category"] = category

        try:
            logger.info(f"[NewsAPI] Fetching top headlines (country={country}, category={category})")
            response = await self.client.get(
                f"{self.BASE_URL}/top-headlines", params=params
            )

            if response.status_code != 200:
                logger.warning(f"NewsAPI top headlines returned status {response.status_code}")
                return []

            data = response.json()
            if data.get("status") != "ok":
                logger.warning(f"NewsAPI error: {data.get('message', 'unknown')}")
                return []

            results = []
            for article in data.get("articles") or []:
                normalized = self._normalize_result(article)
                if normalized:
                    results.append(normalized)

            logger.info(f"[NewsAPI] Top headlines: {len(results)} results")
            return results

        except _TIMEOUT_ERRORS:
            logger.warning(f"NewsAPI timeout ({self.timeout}s)")
            return []
        except Exception as e:
            # Best-effort adapter: never propagate failures to the caller.
            logger.error(f"NewsAPI top headlines error: {e}")
            return []

    def _normalize_result(
        self,
        article: Dict[str, Any],
        language: str = "en"
    ) -> Optional[Dict[str, Any]]:
        """
        Normalize NewsAPI result to common format.

        Fields that NewsAPI sends as ``null`` (description, content, author,
        source name, ...) are treated as empty strings rather than causing
        the article to be discarded.

        Args:
            article: Raw article from NewsAPI
            language: Language code recorded on the result

        Returns:
            Normalized result dict or None if invalid (not a dict, or
            missing title/URL)
        """
        if not isinstance(article, dict):
            logger.warning(f"Failed to normalize NewsAPI result: {type(article).__name__} is not a dict")
            return None

        def text(key: str) -> str:
            # The key may be present with an explicit null / non-string value.
            value = article.get(key)
            return value.strip() if isinstance(value, str) else ""

        title = text("title")
        url = text("url")

        # Validate required fields
        if not title or not url:
            logger.debug("Skipping invalid result: missing title or URL")
            return None

        description = text("description")
        content = text("content")

        source = article.get("source")
        source_name = ""
        if isinstance(source, dict) and isinstance(source.get("name"), str):
            source_name = source["name"].strip()

        published_at = article.get("publishedAt") or ""
        author = article.get("author") or ""
        url_to_image = article.get("urlToImage") or ""

        # Combine description + content for better context.
        # NewsAPI truncates content with [+X chars];
        # we'll use Jina Reader to get full article later.
        parts = [description] if description else []
        if content and content != description:
            parts.append(content)
        full_content = "\n\n".join(parts)

        return {
            "title": title,
            "url": url,
            "content": full_content or title,  # Use title if no content
            "snippet": description,
            "source": source_name or self._extract_domain(url),
            "published_at": published_at,
            "author": author,
            "image_url": url_to_image,
            "source_type": "live",
            "is_live": True,
            "freshness_score": self._calculate_freshness(published_at),
            "language": language,
            "metadata": {
                "title": title,
                "url": url,
                "source": source_name,
                "published_at": published_at,
                "author": author,
                "search_engine": "newsapi"
            }
        }

    def _calculate_freshness(self, published_at: str) -> float:
        """
        Calculate freshness score based on article age.

        The timestamp's UTC offset is honoured; a timestamp without an
        offset is interpreted as UTC.

        Args:
            published_at: ISO format date string

        Returns:
            Freshness score (0.0 to 1.0); 0.8 when the date is missing or
            cannot be parsed
        """
        if not published_at:
            return self._FRESHNESS_UNKNOWN  # Unknown age, assume recent

        try:
            pub_date = datetime.fromisoformat(published_at.replace("Z", "+00:00"))
        except (AttributeError, TypeError, ValueError):
            return self._FRESHNESS_UNKNOWN  # Default to recent

        if pub_date.tzinfo is None:
            pub_date = pub_date.replace(tzinfo=timezone.utc)

        age_minutes = (datetime.now(timezone.utc) - pub_date).total_seconds() / 60

        # NewsAPI results are very fresh
        for upper_bound, score in self._FRESHNESS_TIERS:
            if age_minutes < upper_bound:
                return score
        return self._FRESHNESS_OLD

    def _extract_domain(self, url: str) -> str:
        """
        Extract domain name from URL.

        Args:
            url: Full URL

        Returns:
            Domain name (e.g., "bbc.com") with a leading "www." removed,
            or "unknown" if the URL cannot be parsed
        """
        try:
            domain = urlparse(url).netloc
            # Remove www. prefix
            if domain.startswith("www."):
                domain = domain[4:]
            return domain
        except (AttributeError, TypeError, ValueError):
            return "unknown"

    def is_available(self) -> bool:
        """
        Check if NewsAPI is available.

        Returns:
            True if API key is configured, False otherwise
        """
        return self.api_key is not None

    async def close(self):
        """Close HTTP client"""
        if self.client:
            await self.client.aclose()
            self.client = None
            logger.debug("NewsAPI client closed")


# ═══════════════════════════════════════════════════════════════════════════
# SINGLETON INSTANCE
# ═══════════════════════════════════════════════════════════════════════════

_default_adapter = None


def get_newsapi_adapter(
    api_key: str,
    timeout: float = 2.0,
    max_results: int = 20
) -> NewsAPIAdapter:
    """
    Get or create the default NewsAPI adapter instance.

    The arguments are only used the first time; later calls return the
    already-created instance unchanged, whatever arguments are passed.

    Args:
        api_key: NewsAPI.org API key
        timeout: Search timeout in seconds
        max_results: Maximum results to return

    Returns:
        NewsAPIAdapter instance
    """
    global _default_adapter

    if _default_adapter is None:
        _default_adapter = NewsAPIAdapter(
            api_key=api_key,
            timeout=timeout,
            max_results=max_results
        )

    return _default_adapter