# interactive_trading_bot/crypto_news_scraper.py
import pandas as pd
import feedparser
import html
import re
def scrape_crypto_news(urls: list = None, num_entries: int = 10) -> pd.DataFrame:
    """Fetch recent headlines from crypto RSS feeds into a DataFrame.

    Parameters
    ----------
    urls : list of str, optional
        RSS feed URLs to scrape. Defaults to a built-in set of crypto
        news feeds when ``None``.
    num_entries : int, optional
        Maximum number of entries taken from each feed (default 10;
        ``None`` is also treated as 10).

    Returns
    -------
    pd.DataFrame
        Columns ``title, link, published, description, source`` — one row
        per unique title. Empty, but with the same columns, when every
        feed fails or returns nothing.
    """
    if urls is None:
        urls = [
            "https://www.coindesk.com/arc/outboundfeeds/rss/",
            "https://cointelegraph.com/rss",
            "https://cryptopotato.com/feed/",
            "https://decrypt.co/feed",
        ]
    if num_entries is None:
        num_entries = 10

    def get_text(entry):
        # Best available text per entry: full content, then summary,
        # then description; empty string if none present.
        if "content" in entry and entry.content:
            for c in entry.content:
                if c.get("value"):
                    return c["value"]
        if entry.get("summary"):
            return entry["summary"]
        if entry.get("description"):
            return entry["description"]
        return ""

    news_dict = {}
    for url in urls:
        try:
            feed = feedparser.parse(url, request_headers={
                "User-Agent": "Mozilla/5.0 (CryptoNewsBot; +https://example.com)"
            })
            # feedparser sets `bozo` when the feed was malformed; keep going
            # with whatever entries it managed to parse, but surface it.
            if getattr(feed, "bozo", 0):
                print("[warn] bozo_exception:", getattr(feed, "bozo_exception", None))
            for e in feed.entries[:num_entries]:
                title = e.get("title", "").strip()
                if not title:
                    # Title doubles as the dict key; titleless entries would
                    # all overwrite each other under "" — skip them.
                    continue
                link = e.get("link", "")
                raw_html = get_text(e)
                # Strip HTML tags, then collapse runs of whitespace into a
                # single space for a plain-text blurb.
                plain = re.sub(r"<[^>]+>", " ", html.unescape(raw_html))
                plain = re.sub(r"\s+", " ", plain).strip()
                pub = e.get("published", "")
                news_dict[title] = {"link": link, "published": pub, "description": plain, "source": url}
        except Exception as exc:
            # Best-effort: one broken feed must not abort the whole scrape,
            # but the failure is reported rather than silently swallowed.
            print(f"[warn] failed to scrape {url}: {exc}")

    columns = ["title", "link", "published", "description", "source"]
    if not news_dict:
        # Keep the schema stable even when nothing was collected, so callers
        # can rely on these columns being present.
        return pd.DataFrame(columns=columns)
    news_df = pd.DataFrame(news_dict).T
    news_df = news_df.reset_index().rename(columns={"index": "title"})
    return news_df[columns]