Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import feedparser | |
| import html | |
| import re | |
def scrape_crypto_news(urls: list = None, num_entries: int = 10) -> pd.DataFrame:
    """Scrape recent articles from a list of crypto-news RSS feeds.

    Parameters
    ----------
    urls : list of str, optional
        RSS feed URLs to poll. Defaults to a built-in set of crypto
        news feeds when ``None``.
    num_entries : int, optional
        Maximum number of entries taken from each feed (default 10;
        ``None`` is treated as 10).

    Returns
    -------
    pd.DataFrame
        One row per article with columns ``title``, ``link``,
        ``published``, ``description`` (HTML stripped to plain text),
        and ``source`` (the feed URL). Articles sharing a title
        overwrite each other, so later feeds win — this dedups
        cross-posted stories. Feeds that fail to fetch/parse are
        skipped with a warning (best-effort aggregation).
    """
    if urls is None:
        urls = [
            "https://www.coindesk.com/arc/outboundfeeds/rss/",
            "https://cointelegraph.com/rss",
            "https://cryptopotato.com/feed/",
            "https://decrypt.co/feed",
        ]
    if num_entries is None:
        num_entries = 10

    # Hoisted out of the per-URL loop: compile once, not per entry.
    tag_re = re.compile(r"<[^>]+>")
    ws_re = re.compile(r"\s+")

    def _get_text(entry) -> str:
        """Return the richest available text field of a feed entry."""
        if "content" in entry and entry.content:
            for c in entry.content:
                if c.get("value"):
                    return c["value"]
        if entry.get("summary"):
            return entry["summary"]
        if entry.get("description"):
            return entry["description"]
        return ""

    news_dict = {}
    for url in urls:
        try:
            feed = feedparser.parse(url, request_headers={
                "User-Agent": "Mozilla/5.0 (CryptoNewsBot; +https://example.com)"
            })
            # feedparser sets `bozo` on malformed feeds; entries may
            # still be usable, so only warn and keep going.
            if getattr(feed, "bozo", 0):
                print("[warn] bozo_exception:", getattr(feed, "bozo_exception", None))
            for e in feed.entries[:num_entries]:
                title = e.get("title", "").strip()
                if not title:
                    # Untitled entries would all collide on the "" key; skip.
                    continue
                raw_html = _get_text(e)
                plain = tag_re.sub(" ", html.unescape(raw_html))
                plain = ws_re.sub(" ", plain).strip()
                news_dict[title] = {
                    "link": e.get("link", ""),
                    "published": e.get("published", ""),
                    "description": plain,
                    "source": url,
                }
        except Exception as exc:
            # Narrowed from a bare `except:` — still best-effort per
            # feed, but no longer swallows KeyboardInterrupt/SystemExit
            # and the failure is visible.
            print(f"[warn] failed to fetch {url}: {exc}")

    # Guarantee a stable schema even when every feed failed or was empty;
    # the original returned a frame with only a "title" column in that case.
    columns = ["title", "link", "published", "description", "source"]
    if not news_dict:
        return pd.DataFrame(columns=columns)
    news_df = pd.DataFrame(news_dict).T
    news_df = news_df.reset_index().rename(columns={"index": "title"})
    return news_df[columns]