# utils/getNews.py
import os
import re
from datetime import datetime, timedelta

import requests

# Configuration
API_KEY = os.environ.get("THE_GUARDIANS_API_KEY")  # Guardian API key, read from the environment
BASE_URL = os.environ.get("BASE_URL")  # Guardian content API endpoint, e.g. https://content.guardianapis.com/search
KEYWORDS = ["climate change", "artificial intelligence", "global economy"]  # Keywords to search
DATE_FROM = (datetime.now() - timedelta(days=2)).strftime("%Y-%m-%d")  # Last 2 days
# PAGE_SIZE = 2
# MAX_PAGES = 1
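# Example environment setup before running (the values below are placeholders,
# not real credentials):
#   export THE_GUARDIANS_API_KEY="your-guardian-api-key"
#   export BASE_URL="https://content.guardianapis.com/search"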
def clean_article_body(html_body: str) -> str:
    """
    Cleans the HTML body of a Guardian article and returns plain text.
    """
    if not html_body:
        return ""

    # Step 1: Remove HTML tags while preserving important paragraph breaks
    text = html_body
    # Replace paragraph tags with newlines for better readability
    text = re.sub(r'<p[^>]*>', '\n\n', text)
    text = re.sub(r'</p>', '', text)
    # Handle headers
    text = re.sub(r'<h[1-6][^>]*>', '\n\n', text)
    text = re.sub(r'</h[1-6]>', '\n', text)
    # Replace list items with bullet points
    text = re.sub(r'<li[^>]*>', '\n• ', text)
    # Remove all other HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Step 2: Decode common HTML entities
    entities = {
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&#39;': "'",
        '&nbsp;': ' ',
        '&rsquo;': "'",
        '&lsquo;': "'",
        '&ldquo;': '"',
        '&rdquo;': '"',
        '&ndash;': '–',
        '&mdash;': '—',
        '&hellip;': '…'
    }
    for entity, replacement in entities.items():
        text = text.replace(entity, replacement)

    # Step 3: Fix spacing issues. Collapse only spaces and tabs so the
    # paragraph breaks inserted above survive.
    text = re.sub(r'[ \t]+', ' ', text)     # Replace runs of spaces/tabs with a single space
    text = re.sub(r'\n[ \t]+', '\n', text)  # Remove spaces after newlines
    text = re.sub(r'\n{3,}', '\n\n', text)  # Replace 3+ consecutive newlines with 2

    # Step 4: Final trimming
    text = text.strip()
    return text
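# A quick, illustrative sanity check for clean_article_body (the HTML snippet
# is made up, not a real Guardian body). The standard library's html.unescape
# would decode all named entities, but the explicit mapping above keeps the
# substitutions predictable:
#
#   sample = "<p>AI &amp; climate</p><ul><li>first</li><li>second</li></ul>"
#   clean_article_body(sample)
#   # -> "AI & climate\n• first\n• second"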
def fetch_trending_news(keywords, date_from, page_size=5, max_pages=1):
    """
    Fetch trending news articles from The Guardian API based on keywords.
    Returns a list of articles sorted by relevance.
    """
    print("Page size", page_size, "Max pages", max_pages)
    articles = []
    query = " OR ".join(keywords)  # Combine keywords with OR for the search query
    params = {
        "api-key": API_KEY,
        "q": query,
        "from-date": date_from,
        "page-size": page_size,
        "order-by": "relevance",  # Sort by relevance to get trending articles
        "show-fields": "headline,webPublicationDate,webUrl,trailText,body"
    }

    for page in range(1, max_pages + 1):
        params["page"] = page
        try:
            response = requests.get(BASE_URL, params=params, timeout=10)
            response.raise_for_status()  # Raise an exception for bad status codes
            data = response.json()

            # Extract articles from the response
            if "response" in data and "results" in data["response"]:
                articles.extend(data["response"]["results"])
            else:
                print("No results found or unexpected response format.")
                break

            # Stop once the last available page has been fetched
            if page >= data["response"].get("pages", page):
                break
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page {page}: {e}")
            break

    # Clean the HTML fields on each article; trailText may be absent, so guard both
    for article in articles:
        fields = article.get("fields", {})
        if "body" in fields:
            fields["body"] = clean_article_body(fields["body"])
        if "trailText" in fields:
            fields["trailText"] = clean_article_body(fields["trailText"])
    return articles
# def main():
#     print(f"Fetching trending news for keywords: {', '.join(KEYWORDS)}")
#     print(f"From date: {DATE_FROM}")
#     articles = fetch_trending_news(KEYWORDS, DATE_FROM)

# if __name__ == "__main__":
#     main()
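# A sketch of how the cleaned results could be consumed (field names follow the
# show-fields list requested above; the 120-character summary cut is illustrative):
#
#   for article in fetch_trending_news(KEYWORDS, DATE_FROM, page_size=3):
#       fields = article.get("fields", {})
#       print(fields.get("headline", "(no headline)"))
#       print(fields.get("trailText", "")[:120])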