| """ | |
| RSS feed ingestion with Google News RSS support. | |
| """ | |
| import logging | |
| from datetime import datetime, timezone | |
| from typing import Optional | |
| from urllib.parse import quote_plus | |
| import feedparser | |
| from dateutil import parser as dateutil_parser | |
| from app.settings import get_settings | |
| from app.utils import clean_text, normalize_url | |
| logger = logging.getLogger(__name__) | |


def build_google_news_rss_url(query: str, language: str = "en") -> str:
    """
    Build a Google News RSS URL for a search query.

    Args:
        query: Search query (e.g., "copper price")
        language: Language code (e.g., "en")

    Returns:
        Google News RSS URL
    """
    encoded_query = quote_plus(query)
    # Google News RSS format
    url = f"https://news.google.com/rss/search?q={encoded_query}&hl={language}&gl=US&ceid=US:{language}"
    return url
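
# Example: build_google_news_rss_url("copper price") with the default "en" yields
#   https://news.google.com/rss/search?q=copper+price&hl=en&gl=US&ceid=US:en
# (quote_plus encodes the space as "+").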


def parse_rss_date(date_str: str) -> Optional[datetime]:
    """
    Parse an RSS date string into a timezone-aware datetime.

    Naive timestamps are assumed to be UTC; returns None for empty or
    unparseable input.
    """
    if not date_str:
        return None
    try:
        dt = dateutil_parser.parse(date_str)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt
    except Exception:
        return None
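
# Typical inputs are RFC 822 pubDate strings such as
# "Tue, 05 Mar 2024 10:30:00 GMT" (illustrative), though dateutil also
# handles the ISO 8601 variants some feeds emit.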


def fetch_rss_feed(
    url: str,
    max_items: int = 100
) -> list[dict]:
    """
    Fetch and parse an RSS feed.

    Args:
        url: RSS feed URL
        max_items: Maximum number of items to return

    Returns:
        List of article dicts with keys: title, url, published_at, source, description
    """
    logger.info(f"Fetching RSS feed: {url}")
    try:
        feed = feedparser.parse(url)
        if feed.bozo and feed.bozo_exception:
            logger.warning(f"RSS feed parsing warning: {feed.bozo_exception}")

        articles = []
        for entry in feed.entries[:max_items]:
            try:
                # Extract fields
                title = entry.get("title", "")
                link = entry.get("link", "")
                published = entry.get("published", entry.get("updated", ""))
                source = entry.get("source", {}).get("title", "")

                # Google News embeds the source name in the title,
                # formatted as "Article Title - Source Name"
                if not source and " - " in title:
                    parts = title.rsplit(" - ", 1)
                    if len(parts) == 2:
                        title, source = parts
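                        # e.g. "Copper hits record high - Reuters" splits into
                        # title "Copper hits record high" and source "Reuters"
                        # (illustrative headline, not from a live feed)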

                # Get description/summary
                description = entry.get("summary", entry.get("description", ""))

                # Clean content
                title = clean_text(title)
                description = clean_text(description)
                if not title:
                    continue

                # Parse date
                published_at = parse_rss_date(published)
                if not published_at:
                    published_at = datetime.now(timezone.utc)

                articles.append({
                    "title": title,
                    "url": normalize_url(link) if link else None,
                    "published_at": published_at,
                    "source": source or "Google News",
                    "description": description or None,
                })
            except Exception as e:
                logger.debug(f"Error parsing RSS entry: {e}")
                continue

        logger.info(f"Fetched {len(articles)} articles from RSS")
        return articles
    except Exception as e:
        logger.error(f"Failed to fetch RSS feed: {e}")
        return []
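
# A successful fetch yields dicts shaped like (values illustrative):
#   {"title": "Copper hits record high", "url": "https://example.com/a1",
#    "published_at": datetime(2024, 3, 5, 10, 30, tzinfo=timezone.utc),
#    "source": "Reuters", "description": "..."}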


def fetch_google_news(
    query: Optional[str] = None,
    language: Optional[str] = None,
    max_items: int = 100
) -> list[dict]:
    """
    Fetch articles from Google News RSS.

    Args:
        query: Search query. If None, uses settings.
        language: Language code. If None, uses settings.
        max_items: Maximum articles to fetch

    Returns:
        List of article dicts
    """
    settings = get_settings()
    query = query or settings.news_query
    language = language or settings.news_language
    url = build_google_news_rss_url(query, language)
    return fetch_rss_feed(url, max_items)


def fetch_custom_rss_feeds(
    urls: list[str],
    max_items_per_feed: int = 50
) -> list[dict]:
    """
    Fetch articles from multiple custom RSS feeds.

    Args:
        urls: List of RSS feed URLs
        max_items_per_feed: Max items per feed

    Returns:
        Combined list of article dicts
    """
    all_articles = []
    for url in urls:
        try:
            articles = fetch_rss_feed(url, max_items_per_feed)
            all_articles.extend(articles)
        except Exception as e:
            logger.error(f"Failed to fetch RSS {url}: {e}")
    return all_articles
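

if __name__ == "__main__":
    # Minimal smoke test, assuming app.settings supplies news_query and
    # news_language defaults: fetch a few Google News items and print them.
    logging.basicConfig(level=logging.INFO)
    for article in fetch_google_news(max_items=5):
        print(f"{article['published_at']:%Y-%m-%d} {article['source']}: {article['title']}")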