Spaces:
Sleeping
Sleeping
Update crypto_news_scraper.py
Browse files
- crypto_news_scraper.py +2 -9
crypto_news_scraper.py
CHANGED
|
@@ -5,7 +5,7 @@ import re
|
|
| 5 |
|
| 6 |
def scrape_crypto_news(urls: list = None, num_entries: int = 10) -> pd.DataFrame:
|
| 7 |
if urls is None:
|
| 8 |
-
urls = ["https://www.coindesk.com/arc/outboundfeeds/rss/","https://cointelegraph.com/rss","https://cryptopotato.com/feed/","https://decrypt.co/feed"]
|
| 9 |
if num_entries is None:
|
| 10 |
num_entries = 10
|
| 11 |
news_dict = {}
|
|
@@ -39,17 +39,10 @@ def scrape_crypto_news(urls: list = None, num_entries: int = 10) -> pd.DataFrame
|
|
| 39 |
plain = re.sub(r"<[^>]+>", " ", html.unescape(raw_html))
|
| 40 |
plain = re.sub(r"\s+", " ", plain).strip()
|
| 41 |
pub = e.get("published", "")
|
| 42 |
-
news_dict[title] = {"link": link, "published": pub, "description": plain, "source": url}
|
| 43 |
|
| 44 |
except:
|
| 45 |
pass
|
| 46 |
news_df = pd.DataFrame(news_dict).T
|
| 47 |
news_df = news_df.reset_index().rename(columns={"index":"title"})
|
| 48 |
-
# Add duplicate filtering
|
| 49 |
-
news_df = news_df.drop_duplicates(subset=['title', 'link'])
|
| 50 |
-
|
| 51 |
-
# Sort by publication date
|
| 52 |
-
news_df['published'] = pd.to_datetime(news_df['published'], errors='coerce')
|
| 53 |
-
news_df = news_df.sort_values('published', ascending=False)[['title','published','description']]
|
| 54 |
-
news_df['description'] = news_df['description'].str.slice(0, 150) # limit description length
|
| 55 |
return news_df
|
|
|
|
| 5 |
|
| 6 |
def scrape_crypto_news(urls: list = None, num_entries: int = 10) -> pd.DataFrame:
|
| 7 |
if urls is None:
|
| 8 |
+
urls = ["https://www.coindesk.com/arc/outboundfeeds/rss/","https://cointelegraph.com/rss","https://cryptopotato.com/feed/","https://decrypt.co/feed"]
|
| 9 |
if num_entries is None:
|
| 10 |
num_entries = 10
|
| 11 |
news_dict = {}
|
|
|
|
| 39 |
plain = re.sub(r"<[^>]+>", " ", html.unescape(raw_html))
|
| 40 |
plain = re.sub(r"\s+", " ", plain).strip()
|
| 41 |
pub = e.get("published", "")
|
| 42 |
+
news_dict[title] = {"link": link, "published": pub, "description": plain[:180], "source": url}
|
| 43 |
|
| 44 |
except:
|
| 45 |
pass
|
| 46 |
news_df = pd.DataFrame(news_dict).T
|
| 47 |
news_df = news_df.reset_index().rename(columns={"index":"title"})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
return news_df
|