# utils/getNews.py
import os
import re
from datetime import datetime, timedelta

import requests

# Configuration
API_KEY = os.environ.get("THE_GUARDIANS_API_KEY")  # Guardian API key, read from the environment
BASE_URL = os.environ.get("BASE_URL")  # Guardian content API endpoint, e.g. https://content.guardianapis.com/search
KEYWORDS = ["climate change", "artificial intelligence", "global economy"]  # Keywords to search
DATE_FROM = (datetime.now() - timedelta(days=2)).strftime("%Y-%m-%d")  # Last 2 days
# PAGE_SIZE = 2
# MAX_PAGES = 1
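# Example environment setup before running (the values below are placeholders,
# not real credentials):
#   export THE_GUARDIANS_API_KEY="your-guardian-api-key"
#   export BASE_URL="https://content.guardianapis.com/search"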
def clean_article_body(html_body: str) -> str:
    """
    Cleans the HTML body of a Guardian article and returns plain text.
    """
    if not html_body:
        return ""

    # Step 1: Remove HTML tags while preserving important paragraph breaks
    text = html_body
    # Replace paragraph tags with newlines for better readability
    text = re.sub(r'<p[^>]*>', '\n\n', text)
    text = re.sub(r'</p>', '', text)
    # Handle headers
    text = re.sub(r'<h[1-6][^>]*>', '\n\n', text)
    text = re.sub(r'</h[1-6]>', '\n', text)
    # Replace list items with bullet points
    text = re.sub(r'<li[^>]*>', '\n• ', text)
    # Remove all other HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Step 2: Decode common HTML entities
    entities = {
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&#39;': "'",
        '&nbsp;': ' ',
        '&rsquo;': "'",
        '&lsquo;': "'",
        '&ldquo;': '"',
        '&rdquo;': '"',
        '&ndash;': '–',
        '&mdash;': '—',
        '&hellip;': '…'
    }
    for entity, replacement in entities.items():
        text = text.replace(entity, replacement)

    # Step 3: Fix spacing issues. Collapse only spaces and tabs so the
    # paragraph breaks inserted above survive.
    text = re.sub(r'[ \t]+', ' ', text)     # Replace runs of spaces/tabs with a single space
    text = re.sub(r'\n[ \t]+', '\n', text)  # Remove spaces after newlines
    text = re.sub(r'\n{3,}', '\n\n', text)  # Replace 3+ consecutive newlines with 2

    # Step 4: Final trimming
    text = text.strip()
    return text
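# A quick, illustrative sanity check for clean_article_body (the HTML snippet
# is made up, not a real Guardian body). The standard library's html.unescape
# would decode all named entities, but the explicit mapping above keeps the
# substitutions predictable:
#
#   sample = "<p>AI &amp; climate</p><ul><li>first</li><li>second</li></ul>"
#   clean_article_body(sample)
#   # -> "AI & climate\n• first\n• second"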
def fetch_trending_news(keywords, date_from, page_size=5, max_pages=1):
    """
    Fetch trending news articles from The Guardian API based on keywords.
    Returns a list of articles sorted by relevance.
    """
    print("Page size", page_size, "Max pages", max_pages)
    articles = []
    query = " OR ".join(keywords)  # Combine keywords with OR for the search query
    params = {
        "api-key": API_KEY,
        "q": query,
        "from-date": date_from,
        "page-size": page_size,
        "order-by": "relevance",  # Sort by relevance to get trending articles
        "show-fields": "headline,webPublicationDate,webUrl,trailText,body"
    }

    for page in range(1, max_pages + 1):
        params["page"] = page
        try:
            response = requests.get(BASE_URL, params=params, timeout=10)
            response.raise_for_status()  # Raise an exception for bad status codes
            data = response.json()

            # Extract articles from the response
            if "response" in data and "results" in data["response"]:
                articles.extend(data["response"]["results"])
            else:
                print("No results found or unexpected response format.")
                break

            # Stop once the last available page has been fetched
            if page >= data["response"].get("pages", page):
                break
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page {page}: {e}")
            break

    # Clean the HTML fields on each article; trailText may be absent, so guard both
    for article in articles:
        fields = article.get("fields", {})
        if "body" in fields:
            fields["body"] = clean_article_body(fields["body"])
        if "trailText" in fields:
            fields["trailText"] = clean_article_body(fields["trailText"])
    return articles
# def main():
#     print(f"Fetching trending news for keywords: {', '.join(KEYWORDS)}")
#     print(f"From date: {DATE_FROM}")
#     articles = fetch_trending_news(KEYWORDS, DATE_FROM)

# if __name__ == "__main__":
#     main()
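# A sketch of how the cleaned results could be consumed (field names follow the
# show-fields list requested above; the 120-character summary cut is illustrative):
#
#   for article in fetch_trending_news(KEYWORDS, DATE_FROM, page_size=3):
#       fields = article.get("fields", {})
#       print(fields.get("headline", "(no headline)"))
#       print(fields.get("trailText", "")[:120])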