# interactive_trading_bot/crypto_news_scraper.py
import pandas as pd
import feedparser
import html
import re
def scrape_crypto_news(urls: list = None, num_entries: int = 10) -> pd.DataFrame:
    """Fetch recent headlines from crypto RSS feeds into a DataFrame.

    Parameters
    ----------
    urls : list of str, optional
        RSS feed URLs to scrape. Defaults to a built-in set of crypto
        news feeds when ``None``.
    num_entries : int, optional
        Maximum number of entries taken from each feed (default 10;
        ``None`` is also treated as 10).

    Returns
    -------
    pd.DataFrame
        Columns ``title, link, published, description, source`` — one row
        per unique title. Empty, but with the same columns, when every
        feed fails or returns nothing.
    """
    if urls is None:
        urls = [
            "https://www.coindesk.com/arc/outboundfeeds/rss/",
            "https://cointelegraph.com/rss",
            "https://cryptopotato.com/feed/",
            "https://decrypt.co/feed",
        ]
    if num_entries is None:
        num_entries = 10

    def get_text(entry):
        # Best available text per entry: full content, then summary,
        # then description; empty string if none present.
        if "content" in entry and entry.content:
            for c in entry.content:
                if c.get("value"):
                    return c["value"]
        if entry.get("summary"):
            return entry["summary"]
        if entry.get("description"):
            return entry["description"]
        return ""

    news_dict = {}
    for url in urls:
        try:
            feed = feedparser.parse(url, request_headers={
                "User-Agent": "Mozilla/5.0 (CryptoNewsBot; +https://example.com)"
            })
            # feedparser sets `bozo` when the feed was malformed; keep going
            # with whatever entries it managed to parse, but surface it.
            if getattr(feed, "bozo", 0):
                print("[warn] bozo_exception:", getattr(feed, "bozo_exception", None))
            for e in feed.entries[:num_entries]:
                title = e.get("title", "").strip()
                if not title:
                    # Title doubles as the dict key; titleless entries would
                    # all overwrite each other under "" — skip them.
                    continue
                link = e.get("link", "")
                raw_html = get_text(e)
                # Strip HTML tags, then collapse runs of whitespace into a
                # single space for a plain-text blurb.
                plain = re.sub(r"<[^>]+>", " ", html.unescape(raw_html))
                plain = re.sub(r"\s+", " ", plain).strip()
                pub = e.get("published", "")
                news_dict[title] = {"link": link, "published": pub, "description": plain, "source": url}
        except Exception as exc:
            # Best-effort: one broken feed must not abort the whole scrape,
            # but the failure is reported rather than silently swallowed.
            print(f"[warn] failed to scrape {url}: {exc}")

    columns = ["title", "link", "published", "description", "source"]
    if not news_dict:
        # Keep the schema stable even when nothing was collected, so callers
        # can rely on these columns being present.
        return pd.DataFrame(columns=columns)
    news_df = pd.DataFrame(news_dict).T
    news_df = news_df.reset_index().rename(columns={"index": "title"})
    return news_df[columns]