"""Aggregate recent finance/economy headlines from a fixed list of RSS feeds.

The public entry point is :func:`rss_parser`, which parses every feed listed
in ``ls_rss_urls`` and returns cleaned entries sorted newest first.
"""

from datetime import datetime, timezone
import html
import re

from bs4 import BeautifulSoup
from dateutil import parser as dateutil_parser
import feedparser
import humanize

# Maximum number of entries kept from any single feed.
MAX_ENTRIES_EACH_RSS = 5
# Maximum number of entries returned by rss_parser() across all feeds.
MAX_TOTAL_ENTRIES = 200

# Feeds to aggregate. Each item carries the feed URL plus a display domain and
# a topic label; all three values are copied onto every entry of that feed.
ls_rss_urls = [
    {
        "meta_url": "https://vneconomy.vn/chung-khoan.rss",
        "meta_domain": "vneconomy.vn (Hội Khoa học Kinh tế Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://cafebiz.vn/rss/chung-khoan.rss",
        "meta_domain": "cafebiz.vn",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://vietstock.vn/830/chung-khoan/co-phieu.rss",
        "meta_domain": "vietstock.vn",
        "meta_label": "cổ phiếu",
    },
    {
        "meta_url": "https://vietstock.vn/739/chung-khoan/giao-dich-noi-bo.rss",
        "meta_domain": "vietstock.vn",
        "meta_label": "giao dịch nội bộ",
    },
    {
        "meta_url": "https://nhandan.vn/rss/chungkhoan-1191.rss",
        "meta_domain": "nhandan.vn (Đảng Cộng sản Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://thanhnien.vn/rss/kinh-te/chung-khoan.rss",
        "meta_domain": "thanhnien.vn (Hội Liên hiệp Thanh niên Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://www.sggp.org.vn/rss/kinhte-taichinhchuungkhoan-44.rss",
        "meta_domain": "sggp.org.vn (Đảng bộ Đảng Cộng sản Việt Nam TP.HCM)",
        "meta_label": "kinh tế, tài chính, chứng khoán",
    },
    {
        "meta_url": "https://antt.vn/rss/chung-khoan.rss",
        "meta_domain": "antt.vn",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://nld.com.vn/rss/kinh-te/tai-chinh-chung-khoan.rss",
        "meta_domain": "nld.com.vn (Thành ủy TP.HCM)",
        "meta_label": "tài chính, chứng khoán",
    },
    {
        "meta_url": "https://soha.vn/rss/kinh-doanh/chung-khoan.rss",
        "meta_domain": "soha.vn",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://tienphong.vn/rss/tai-chinh-chung-khoan-105.rss",
        "meta_domain": "tienphong.vn (Đoàn TNCS Hồ Chí Minh)",
        "meta_label": "tài chính, chứng khoán",
    },
    {
        "meta_url": "https://www.nguoiduatin.vn/rss/kinh-te/tai-chinh-ngan-hang.rss",
        "meta_domain": "nguoiduatin.vn (Hội Luật gia Việt Nam)",
        "meta_label": "kinh tế, tài chính, ngân hàng",
    },
    {
        "meta_url": "https://bnews.vn/rss/chung-khoan-33.rss",
        "meta_domain": "bnews.vn (Thông tấn xã Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://vnexpress.net/rss/kinh-doanh.rss",
        "meta_domain": "vnexpress.net (Bộ Khoa học và Công nghệ)",
        "meta_label": "kinh doanh",
    },
    {
        "meta_url": "https://vtv.vn/rss/kinh-te.rss",
        "meta_domain": "vtv.vn (Đài Truyền hình Việt Nam)",
        "meta_label": "kinh tế",
    },
    {
        "meta_url": "https://rss.nytimes.com/services/xml/rss/nyt/Economy.xml",
        "meta_domain": "nytimes.com (The New York Times - US)",
        "meta_label": "economy",
    },
    {
        "meta_url": "https://feeds.content.dowjones.io/public/rss/socialeconomyfeed",
        "meta_domain": "wsj.com (The Wall Street Journal - US)",
        "meta_label": "economy",
    },
    {
        "meta_url": "https://abcnews.go.com/abcnews/moneyheadlines",
        "meta_domain": "abcnews.com (ABC News - US)",
        "meta_label": "money",
    },
    {
        "meta_url": "https://www.wired.com/feed/category/business/latest/rss",
        "meta_domain": "wired.com (WIRED - US)",
        "meta_label": "business",
    },
    {
        "meta_url": "https://www.theguardian.com/uk/business/rss",
        "meta_domain": "theguardian.com (The Guardian - UK)",
        "meta_label": "business",
    },
    {
        "meta_url": "https://feeds.npr.org/1017/rss.xml",
        "meta_domain": "npr.org (National Public Radio - US)",
        "meta_label": "economy",
    },
    {
        "meta_url": "https://www.economywatch.com/feed",
        "meta_domain": "economywatch.com (Economy Watch - UK)",
        "meta_label": "economy",
    },
]

# Matches a numeric character reference that lost its leading "&" (e.g.
# "#233;"). The lookbehind leaves well-formed references such as "&#233;"
# untouched; without it they would be rewritten to "&&#233;" and decode to
# "&é" instead of "é".
_BARE_NUMERIC_ENTITY = re.compile(r"(?<!&)#(\d+);")


def normalize_time(dt):
    """Return *dt* as a timezone-aware datetime in UTC.

    A naive datetime is assumed to already be in UTC and is only tagged with
    the UTC zone; an aware datetime is converted to UTC.
    """
    if dt.tzinfo is None:
        return dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)


def humanize_ago(dt):
    """Return a capitalized, human-readable age of *dt* relative to now.

    The difference between the current UTC time and *dt* is rendered with
    ``humanize.naturaltime``.

    NOTE: a naive *dt* is interpreted by ``astimezone()`` as system local
    time, unlike ``normalize_time()``, which treats naive values as UTC.
    Pass an aware (or already normalized) datetime to avoid the mismatch.
    """
    now = datetime.now(timezone.utc)
    dt_utc = dt.astimezone(timezone.utc)
    return humanize.naturaltime(now - dt_utc).capitalize()


def clean_text(str_html):
    """Return the plain text of an HTML fragment.

    Numeric character references missing their leading ``&`` are repaired,
    HTML entities are decoded, ``<img>`` tags are dropped and all remaining
    markup is stripped. Text nodes are joined with single spaces.

    Args:
        str_html: HTML (or plain-text) string; ``None`` or ``""`` is allowed.

    Returns:
        The cleaned text, or ``""`` for empty/``None`` input.
    """
    if not str_html:
        return ""
    repaired = _BARE_NUMERIC_ENTITY.sub(r"&#\1;", str_html)
    soup = BeautifulSoup(html.unescape(repaired), "html.parser")
    for img in soup.find_all("img"):
        img.decompose()
    return soup.get_text(separator=" ", strip=True)


def _clean_entry(entry, feed):
    """Build the normalized dict for one feed entry, or return None.

    ``None`` is returned (and a warning line printed) when the entry lacks a
    title, link or publication date, or when the date cannot be parsed, so a
    single malformed entry cannot abort the whole aggregation.
    """
    try:
        raw_title = entry["title"]
        link = entry["link"]
        # Parse the date once; it feeds both "time" and "time_ago".
        published = normalize_time(dateutil_parser.parse(entry["published"]))
    except (KeyError, TypeError, ValueError, OverflowError) as exc:
        print(f"⚠️ skipped entry from {feed['meta_url']}: {exc!r}")
        return None

    return {
        "title": clean_text(raw_title),
        "link": link,
        # Treat a missing summary as empty rather than failing the entry.
        "summary": clean_text(entry.get("summary", "")),
        "time": published,
        "time_ago": humanize_ago(published),
        "meta_url": feed["meta_url"],
        "meta_domain": feed["meta_domain"],
        "meta_label": feed["meta_label"],
    }


def rss_parser():
    """Parse every configured feed and return cleaned entries, newest first.

    For each feed in ``ls_rss_urls`` the first ``MAX_ENTRIES_EACH_RSS``
    entries are kept. A status line is printed per feed (✅ when the feed
    yielded entries, ❌ when it yielded none or parsing raised). Feeds that
    fail and entries that are malformed are skipped rather than aborting
    the run.

    Returns:
        A list of at most ``MAX_TOTAL_ENTRIES`` dicts with the keys
        ``title``, ``link``, ``summary``, ``time`` (aware UTC datetime),
        ``time_ago``, ``meta_url``, ``meta_domain`` and ``meta_label``,
        sorted by ``time`` in descending order.
    """
    cleaned = []
    for feed in ls_rss_urls:
        url = feed["meta_url"]
        try:
            entries = feedparser.parse(url).entries
        except Exception as exc:
            # Best-effort aggregation: report the failing feed and move on.
            print(f"❌ {url} > {exc!r}")
            continue

        print(f"{'✅' if len(entries) > 0 else '❌'} {url} > {len(entries)}")

        for entry in entries[:MAX_ENTRIES_EACH_RSS]:
            item = _clean_entry(entry, feed)
            if item is not None:
                cleaned.append(item)

    cleaned.sort(key=lambda item: item["time"], reverse=True)
    return cleaned[:MAX_TOTAL_ENTRIES]