Spaces:
Running
Running
# Maximum number of entries kept from each individual RSS feed.
MAX_ENTRIES_EACH_RSS = 5
# Maximum number of entries returned overall once all feeds are merged.
MAX_TOTAL_ENTRIES = 200
| from dateutil import parser as dateutil_parser | |
| from datetime import datetime, timezone | |
| from bs4 import BeautifulSoup | |
| import feedparser | |
| import humanize | |
| import html | |
| import re | |
# RSS feeds to aggregate. Each item carries:
#   meta_url    - address of the RSS/Atom feed,
#   meta_domain - display name of the source (some include the publishing
#                 organisation or country in parentheses),
#   meta_label  - topic label of the feed.
ls_rss_urls = [
    {
        "meta_url": "https://vneconomy.vn/chung-khoan.rss",
        "meta_domain": "vneconomy.vn (Hội Khoa học Kinh tế Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://cafebiz.vn/rss/chung-khoan.rss",
        "meta_domain": "cafebiz.vn",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://vietstock.vn/830/chung-khoan/co-phieu.rss",
        "meta_domain": "vietstock.vn",
        "meta_label": "cổ phiếu",
    },
    {
        "meta_url": "https://vietstock.vn/739/chung-khoan/giao-dich-noi-bo.rss",
        "meta_domain": "vietstock.vn",
        "meta_label": "giao dịch nội bộ",
    },
    {
        "meta_url": "https://nhandan.vn/rss/chungkhoan-1191.rss",
        "meta_domain": "nhandan.vn (Đảng Cộng sản Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://thanhnien.vn/rss/kinh-te/chung-khoan.rss",
        "meta_domain": "thanhnien.vn (Hội Liên hiệp Thanh niên Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://www.sggp.org.vn/rss/kinhte-taichinhchuungkhoan-44.rss",
        "meta_domain": "sggp.org.vn (Đảng bộ Đảng Cộng sản Việt Nam TP.HCM)",
        "meta_label": "kinh tế, tài chính, chứng khoán",
    },
    {
        "meta_url": "https://antt.vn/rss/chung-khoan.rss",
        "meta_domain": "antt.vn",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://nld.com.vn/rss/kinh-te/tai-chinh-chung-khoan.rss",
        "meta_domain": "nld.com.vn (Thành ủy TP.HCM)",
        "meta_label": "tài chính, chứng khoán",
    },
    {
        "meta_url": "https://soha.vn/rss/kinh-doanh/chung-khoan.rss",
        "meta_domain": "soha.vn",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://tienphong.vn/rss/tai-chinh-chung-khoan-105.rss",
        "meta_domain": "tienphong.vn (Đoàn TNCS Hồ Chí Minh)",
        "meta_label": "tài chính, chứng khoán",
    },
    {
        "meta_url": "https://www.nguoiduatin.vn/rss/kinh-te/tai-chinh-ngan-hang.rss",
        "meta_domain": "nguoiduatin.vn (Hội Luật gia Việt Nam)",
        "meta_label": "kinh tế, tài chính, ngân hàng",
    },
    {
        "meta_url": "https://bnews.vn/rss/chung-khoan-33.rss",
        "meta_domain": "bnews.vn (Thông tấn xã Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://vnexpress.net/rss/kinh-doanh.rss",
        "meta_domain": "vnexpress.net (Bộ Khoa học và Công nghệ)",
        "meta_label": "kinh doanh",
    },
    {
        "meta_url": "https://vtv.vn/rss/kinh-te.rss",
        "meta_domain": "vtv.vn (Đài Truyền hình Việt Nam)",
        "meta_label": "kinh tế",
    },
    {
        "meta_url": "https://rss.nytimes.com/services/xml/rss/nyt/Economy.xml",
        "meta_domain": "nytimes.com (The New York Times - US)",
        "meta_label": "economy",
    },
    {
        "meta_url": "https://feeds.content.dowjones.io/public/rss/socialeconomyfeed",
        "meta_domain": "wsj.com (The Wall Street Journal - US)",
        "meta_label": "economy",
    },
    {
        "meta_url": "https://abcnews.go.com/abcnews/moneyheadlines",
        "meta_domain": "abcnews.com (ABC News - US)",
        "meta_label": "money",
    },
    {
        "meta_url": "https://www.wired.com/feed/category/business/latest/rss",
        "meta_domain": "wired.com (WIRED - US)",
        "meta_label": "business",
    },
    {
        "meta_url": "https://www.theguardian.com/uk/business/rss",
        "meta_domain": "theguardian.com (The Guardian - UK)",
        "meta_label": "business",
    },
    {
        "meta_url": "https://feeds.npr.org/1017/rss.xml",
        "meta_domain": "npr.org (National Public Radio - US)",
        "meta_label": "economy",
    },
    {
        "meta_url": "https://www.economywatch.com/feed",
        "meta_domain": "economywatch.com (Economy Watch - UK)",
        "meta_label": "economy",
    },
]
def normalize_time(dt):
    """Return ``dt`` as a timezone-aware datetime expressed in UTC.

    Args:
        dt: A ``datetime``; it may be naive or timezone-aware.

    Returns:
        A timezone-aware ``datetime`` in UTC. A naive input is taken to
        already be in UTC and is only tagged with the UTC timezone; an aware
        input is converted to UTC.
    """
    if dt.tzinfo is None:
        # No timezone information: label the value as UTC without shifting it.
        return dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)
def humanize_ago(dt):
    """Describe how long ago ``dt`` was, relative to the current time.

    Args:
        dt: A ``datetime``. A naive value is interpreted as UTC.

    Returns:
        The string produced by ``humanize.naturaltime`` for the elapsed time,
        with its first letter capitalized.
    """
    # Treat naive datetimes as UTC, the same way normalize_time() does.
    # Calling astimezone() on a naive value would instead assume the
    # machine's local timezone and skew the result on non-UTC hosts.
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    now = datetime.now(timezone.utc)
    elapsed = now - dt.astimezone(timezone.utc)
    return humanize.naturaltime(elapsed).capitalize()
def clean_text(str_html):
    """Convert an HTML fragment into plain text.

    Numeric character references that lost their leading ``&`` (for example
    ``#8220;``) are repaired, HTML entities are unescaped, ``<img>`` tags are
    dropped and all remaining markup is stripped.

    Args:
        str_html: HTML (or plain) text. ``None`` and the empty string are
            accepted.

    Returns:
        The text content with tags removed; text fragments are stripped and
        joined with single spaces. Returns ``""`` for empty input.
    """
    if not str_html:
        return ""
    # Restore the missing leading "&" of numeric character references.
    # The lookbehind skips references that are already intact; without it
    # "&#8220;" would turn into "&&#8220;" and leave a stray "&" behind.
    str_html = html.unescape(re.sub(r'(?<!&)#(\d+);', r'&#\1;', str_html))
    soup = BeautifulSoup(str_html, "html.parser")
    # Remove all img tags
    for img in soup.find_all("img"):
        img.decompose()
    # Remove all remaining HTML tags, keeping only the text
    return soup.get_text(separator=' ', strip=True)
def _build_entry(ent, myrss):
    """Turn one raw feed entry into a cleaned dict, or None if it has no usable date."""
    published_text = ent.get("published")
    if not published_text:
        return None
    try:
        # Parse once and reuse the result for both "time" and "time_ago".
        published = normalize_time(dateutil_parser.parse(published_text))
    except (ValueError, OverflowError):
        # Unparseable date: the entry cannot be placed in the timeline.
        return None
    return {
        "title": clean_text(ent.get("title", "")),
        "link": ent.get("link", ""),
        "summary": clean_text(ent.get("summary", "")),
        "time": published,
        "time_ago": humanize_ago(published),
        "meta_url": myrss["meta_url"],
        "meta_domain": myrss["meta_domain"],
        "meta_label": myrss["meta_label"],
    }


def rss_parser():
    """Fetch every feed in ``ls_rss_urls`` and return the newest entries.

    At most ``MAX_ENTRIES_EACH_RSS`` entries are taken from each feed. A feed
    that fails to load is reported and skipped, and an entry without a
    parseable ``published`` date is dropped, so a single bad feed or entry
    does not abort the whole aggregation. One status line per feed is
    printed.

    Returns:
        A list of at most ``MAX_TOTAL_ENTRIES`` dicts sorted newest first.
        Each dict has the keys ``title``, ``link``, ``summary``, ``time``
        (timezone-aware UTC datetime), ``time_ago``, ``meta_url``,
        ``meta_domain`` and ``meta_label``.
    """
    all_entries_clean = []
    for myrss in ls_rss_urls:
        try:
            new_entries = feedparser.parse(myrss["meta_url"]).entries
        except Exception as exc:
            # Best effort: keep going with the other feeds, but report the
            # failure instead of swallowing it silently.
            print(f"❌ {myrss['meta_url']} > {exc!r}")
            continue
        print(f"{'✅' if len(new_entries)>0 else '❌'} {myrss['meta_url']} > {len(new_entries)}")
        for ent in new_entries[:MAX_ENTRIES_EACH_RSS]:
            cleaned = _build_entry(ent, myrss)
            if cleaned is not None:
                all_entries_clean.append(cleaned)
    # Newest first; the sort is stable, so ties keep their feed order.
    all_entries_clean.sort(key=lambda x: x["time"], reverse=True)
    return all_entries_clean[:MAX_TOTAL_ENTRIES]