New_Aggregator_v1 / scraper.py
saran14's picture
Initial commit: Pakistani News Aggregator Flask App
796e8b5
import requests
from bs4 import BeautifulSoup
import json
import datetime
# Listing page that scrape_guid_news() downloads and parses.
# NOTE: placeholder value - replace with the correct GUID News URL.
URL = "https://www.theguidenews.com/latest-news"
def scrape_guid_news():
    """Scrape the news listing at ``URL`` and save the result to ``news.json``.

    Downloads the page, builds one record per ``div.news-card`` element
    (title, link, image, summary and the time of scraping), and writes the
    collected records to ``news.json`` in the current working directory as
    UTF-8 JSON. Cards that lack a heading, link or paragraph are skipped
    with a printed message; a missing image is stored as ``None``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status. ``news.json`` is not
            touched in that case.
    """
    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(
        URL,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=15,
    )
    # Fail before parsing so an error page can never overwrite news.json
    # with an empty list.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "lxml")
    news_list = []
    # Adjust class based on HTML structure
    articles = soup.find_all("div", class_="news-card")

    for article in articles:
        try:
            title = article.find("h2").get_text(strip=True)
            link = article.find("a")["href"]
            summary = article.find("p").get_text(strip=True)
        except (AttributeError, KeyError, TypeError) as e:
            # find() returned None (AttributeError / TypeError) or the
            # anchor has no href (KeyError): the card is unusable.
            print(f"Skipping one article due to error: {e}")
            continue

        # The image is optional: tolerate both a missing <img> tag and an
        # <img> without a src attribute instead of dropping the card.
        image_tag = article.find("img")
        image = image_tag.get("src") if image_tag is not None else None

        # This is the time of scraping, not the article's publication date.
        date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        news_list.append({
            "title": title,
            "link": link,
            "image": image,
            "summary": summary,
            "date": date,
        })

    with open("news.json", "w", encoding="utf-8") as f:
        json.dump(news_list, f, indent=4, ensure_ascii=False)

    print(f"[+] Scraped {len(news_list)} news articles successfully!")
# Run one scrape when this file is executed directly as a script.
if __name__ == "__main__":
    scrape_guid_news()