Spaces:
Runtime error
Runtime error
import json
import os
import re
from datetime import datetime, timedelta

import requests

# --- Configuration ---
# Guardian API key; set THE_GUARDIANS_API_KEY in the environment.
API_KEY = os.environ.get("THE_GUARDIANS_API_KEY")
# Guardian content-search endpoint; set BASE_URL in the environment.
BASE_URL = os.environ.get("BASE_URL")
# Keywords to search (combined with OR by fetch_trending_news).
KEYWORDS = ["climate change", "artificial intelligence", "global economy"]
# Earliest publication date to include: the last 2 days.
DATE_FROM = (datetime.now() - timedelta(days=2)).strftime("%Y-%m-%d")
def clean_article_body(html_body: str) -> str:
    """
    Clean the HTML body of a Guardian article and return plain text.

    Paragraphs and headers become blank-line-separated blocks, list items
    become bullet lines, remaining tags are stripped, and common HTML
    entities are decoded.

    Args:
        html_body: Raw HTML string (may be empty or None).

    Returns:
        Plain-text version of the article, stripped of leading/trailing
        whitespace. Empty string for empty/None input.
    """
    if not html_body:
        return ""

    text = html_body

    # Step 1: Remove HTML tags while preserving paragraph breaks.
    # Paragraph/header openings become blank lines for readability.
    text = re.sub(r'<p[^>]*>', '\n\n', text)
    text = re.sub(r'</p>', '', text)
    text = re.sub(r'<h[1-6][^>]*>', '\n\n', text)
    text = re.sub(r'</h[1-6]>', '\n', text)
    # List items become bullet points.
    text = re.sub(r'<li[^>]*>', '\n• ', text)
    # Strip all remaining tags.
    text = re.sub(r'<[^>]+>', '', text)

    # Step 2: Decode common HTML entities.
    # '&amp;' is decoded LAST so that e.g. '&amp;lt;' cannot be turned
    # into '&lt;' first and then double-decoded to '<'.
    entities = {
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&#39;': "'",
        '&nbsp;': ' ',
        '&rsquo;': "'",
        '&lsquo;': "'",
        '&ldquo;': '"',
        '&rdquo;': '"',
        '&ndash;': '–',
        '&mdash;': '—',
        '&hellip;': '…',
        '&amp;': '&',
    }
    for entity, replacement in entities.items():
        text = text.replace(entity, replacement)

    # Step 3: Fix spacing. Only collapse spaces/tabs ([ \t]+), NOT \s+,
    # because \s also matches the newlines we just inserted as paragraph
    # breaks and would flatten the whole article onto one line.
    text = re.sub(r'[ \t]+', ' ', text)      # collapse runs of spaces/tabs
    text = re.sub(r'\n[ \t]+', '\n', text)   # drop indentation after newlines
    text = re.sub(r'\n{3,}', '\n\n', text)   # cap consecutive newlines at 2

    # Step 4: Final trimming.
    return text.strip()
def fetch_trending_news(keywords, date_from, page_size=5, max_pages=1):
    """
    Fetch trending news articles from The Guardian API based on keywords.

    Args:
        keywords: List of search terms; combined with " OR " into one query.
        date_from: ISO date string (YYYY-MM-DD); only articles published on
            or after this date are returned.
        page_size: Number of results requested per API page.
        max_pages: Maximum number of pages to request.

    Returns:
        A list of article dicts sorted by relevance, with the HTML in the
        "body" and "trailText" fields cleaned to plain text.
    """
    print("Page size", page_size, "Max pages", max_pages)
    articles = []
    query = " OR ".join(keywords)  # combine keywords with OR for the search
    params = {
        "api-key": API_KEY,
        "q": query,
        "from-date": date_from,
        "page-size": page_size,
        "order-by": "relevance",  # sort by relevance to get trending articles
        "show-fields": "headline,webPublicationDate,webUrl,trailText,body",
    }

    for page in range(1, max_pages + 1):
        params["page"] = page
        try:
            # Explicit timeout so a stalled connection cannot hang forever.
            response = requests.get(BASE_URL, params=params, timeout=30)
            response.raise_for_status()  # raise for 4xx/5xx status codes
            data = response.json()

            if "response" in data and "results" in data["response"]:
                articles.extend(data["response"]["results"])
            else:
                print("No results found or unexpected response format.")
                break

            # Stop once the last available page has been consumed.
            # (Original compared with '<', which requested one page past
            # the end; '<=' breaks on the final page.)
            if data["response"].get("pages", 0) <= page:
                break
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page {page}: {e}")
            break

    # Clean the HTML out of each field only when that field is present;
    # trailText can be missing even when body exists.
    for article in articles:
        fields = article.get("fields")
        if not fields:
            continue
        if "body" in fields:
            fields["body"] = clean_article_body(fields["body"])
        if "trailText" in fields:
            fields["trailText"] = clean_article_body(fields["trailText"])

    return articles
| # def main(): | |
| # print(f"Fetching trending news for keywords: {', '.join(KEYWORDS)}") | |
| # print(f"From date: {DATE_FROM}") | |
| # articles = fetch_trending_news(KEYWORDS, DATE_FROM) | |
| # if __name__ == "__main__": | |
| # main() |