samsonleegh committed on
Commit
46f2ca6
·
verified ·
1 Parent(s): 9c59536

Update crypto_news_scraper.py

Browse files
Files changed (1) hide show
  1. crypto_news_scraper.py +2 -9
crypto_news_scraper.py CHANGED
@@ -5,7 +5,7 @@ import re
5
 
6
  def scrape_crypto_news(urls: list = None, num_entries: int = 10) -> pd.DataFrame:
7
  if urls is None:
8
- urls = ["https://www.coindesk.com/arc/outboundfeeds/rss/","https://cointelegraph.com/rss","https://cryptopotato.com/feed/","https://decrypt.co/feed"] #"https://cryptonews.com/news/feed/"]
9
  if num_entries is None:
10
  num_entries = 10
11
  news_dict = {}
@@ -39,17 +39,10 @@ def scrape_crypto_news(urls: list = None, num_entries: int = 10) -> pd.DataFrame
39
  plain = re.sub(r"<[^>]+>", " ", html.unescape(raw_html))
40
  plain = re.sub(r"\s+", " ", plain).strip()
41
  pub = e.get("published", "")
42
- news_dict[title] = {"link": link, "published": pub, "description": plain, "source": url}
43
 
44
  except:
45
  pass
46
  news_df = pd.DataFrame(news_dict).T
47
  news_df = news_df.reset_index().rename(columns={"index":"title"})
48
- # Add duplicate filtering
49
- news_df = news_df.drop_duplicates(subset=['title', 'link'])
50
-
51
- # Sort by publication date
52
- news_df['published'] = pd.to_datetime(news_df['published'], errors='coerce')
53
- news_df = news_df.sort_values('published', ascending=False)[['title','published','description']]
54
- news_df['description'] = news_df['description'].str.slice(0, 150) # limit description length
55
  return news_df
 
5
 
6
  def scrape_crypto_news(urls: list = None, num_entries: int = 10) -> pd.DataFrame:
7
  if urls is None:
8
+ urls = ["https://www.coindesk.com/arc/outboundfeeds/rss/","https://cointelegraph.com/rss","https://cryptopotato.com/feed/","https://decrypt.co/feed"]
9
  if num_entries is None:
10
  num_entries = 10
11
  news_dict = {}
 
39
  plain = re.sub(r"<[^>]+>", " ", html.unescape(raw_html))
40
  plain = re.sub(r"\s+", " ", plain).strip()
41
  pub = e.get("published", "")
42
+ news_dict[title] = {"link": link, "published": pub, "description": plain[:180], "source": url}
43
 
44
  except:
45
  pass
46
  news_df = pd.DataFrame(news_dict).T
47
  news_df = news_df.reset_index().rename(columns={"index":"title"})
 
 
 
 
 
 
 
48
  return news_df