# NEWS / _rss_parser.py
# baobuiquang's picture
# initial commit
# b5a1c35 verified
# raw
# history blame
# 7.3 kB
# Upper bound on how many entries rss_parser() keeps from each individual feed.
MAX_ENTRIES_EACH_RSS = 5
# Upper bound on how many entries rss_parser() returns in total.
MAX_TOTAL_ENTRIES = 200
from dateutil import parser as dateutil_parser
from datetime import datetime, timezone
from bs4 import BeautifulSoup
import feedparser
import humanize
import html
import re
# Feed registry consumed by rss_parser(). Each item has three string keys:
#   meta_url    - feed address handed to feedparser.parse()
#   meta_domain - source name; copied unchanged onto every entry of that feed
#   meta_label  - topic label; copied unchanged onto every entry of that feed
ls_rss_urls = [
{
"meta_url": "https://vneconomy.vn/chung-khoan.rss",
"meta_domain": "vneconomy.vn (Hội Khoa học Kinh tế Việt Nam)",
"meta_label": "chứng khoán",
},
{
"meta_url": "https://cafebiz.vn/rss/chung-khoan.rss",
"meta_domain": "cafebiz.vn",
"meta_label": "chứng khoán",
},
{
"meta_url": "https://vietstock.vn/830/chung-khoan/co-phieu.rss",
"meta_domain": "vietstock.vn",
"meta_label": "cổ phiếu",
},
{
"meta_url": "https://vietstock.vn/739/chung-khoan/giao-dich-noi-bo.rss",
"meta_domain": "vietstock.vn",
"meta_label": "giao dịch nội bộ",
},
{
"meta_url": "https://nhandan.vn/rss/chungkhoan-1191.rss",
"meta_domain": "nhandan.vn (Đảng Cộng sản Việt Nam)",
"meta_label": "chứng khoán",
},
{
"meta_url": "https://thanhnien.vn/rss/kinh-te/chung-khoan.rss",
"meta_domain": "thanhnien.vn (Hội Liên hiệp Thanh niên Việt Nam)",
"meta_label": "chứng khoán",
},
{
"meta_url": "https://www.sggp.org.vn/rss/kinhte-taichinhchuungkhoan-44.rss",
"meta_domain": "sggp.org.vn (Đảng bộ Đảng Cộng sản Việt Nam TP.HCM)",
"meta_label": "kinh tế, tài chính, chứng khoán",
},
{
"meta_url": "https://antt.vn/rss/chung-khoan.rss",
"meta_domain": "antt.vn",
"meta_label": "chứng khoán",
},
{
"meta_url": "https://nld.com.vn/rss/kinh-te/tai-chinh-chung-khoan.rss",
"meta_domain": "nld.com.vn (Thành ủy TP.HCM)",
"meta_label": "tài chính, chứng khoán",
},
{
"meta_url": "https://soha.vn/rss/kinh-doanh/chung-khoan.rss",
"meta_domain": "soha.vn",
"meta_label": "chứng khoán",
},
{
"meta_url": "https://tienphong.vn/rss/tai-chinh-chung-khoan-105.rss",
"meta_domain": "tienphong.vn (Đoàn TNCS Hồ Chí Minh)",
"meta_label": "tài chính, chứng khoán",
},
{
"meta_url": "https://www.nguoiduatin.vn/rss/kinh-te/tai-chinh-ngan-hang.rss",
"meta_domain": "nguoiduatin.vn (Hội Luật gia Việt Nam)",
"meta_label": "kinh tế, tài chính, ngân hàng",
},
{
"meta_url": "https://bnews.vn/rss/chung-khoan-33.rss",
"meta_domain": "bnews.vn (Thông tấn xã Việt Nam)",
"meta_label": "chứng khoán",
},
{
"meta_url": "https://vnexpress.net/rss/kinh-doanh.rss",
"meta_domain": "vnexpress.net (Bộ Khoa học và Công nghệ)",
"meta_label": "kinh doanh",
},
{
"meta_url": "https://vtv.vn/rss/kinh-te.rss",
"meta_domain": "vtv.vn (Đài Truyền hình Việt Nam)",
"meta_label": "kinh tế",
},
{
"meta_url": "https://rss.nytimes.com/services/xml/rss/nyt/Economy.xml",
"meta_domain": "nytimes.com (The New York Times - US)",
"meta_label": "economy",
},
{
"meta_url": "https://feeds.content.dowjones.io/public/rss/socialeconomyfeed",
"meta_domain": "wsj.com (The Wall Street Journal - US)",
"meta_label": "economy",
},
{
"meta_url": "https://abcnews.go.com/abcnews/moneyheadlines",
"meta_domain": "abcnews.com (ABC News - US)",
"meta_label": "money",
},
{
"meta_url": "https://www.wired.com/feed/category/business/latest/rss",
"meta_domain": "wired.com (WIRED - US)",
"meta_label": "business",
},
{
"meta_url": "https://www.theguardian.com/uk/business/rss",
"meta_domain": "theguardian.com (The Guardian - UK)",
"meta_label": "business",
},
{
"meta_url": "https://feeds.npr.org/1017/rss.xml",
"meta_domain": "npr.org (National Public Radio - US)",
"meta_label": "economy",
},
{
"meta_url": "https://www.economywatch.com/feed",
"meta_domain": "economywatch.com (Economy Watch - UK)",
"meta_label": "economy",
},
]
def normalize_time(dt):
    """Return *dt* as a timezone-aware datetime expressed in UTC.

    A naive datetime (``tzinfo`` is ``None``) is labelled as UTC without
    shifting its clock fields; an aware datetime is converted to UTC.
    """
    is_naive = dt.tzinfo is None
    return dt.replace(tzinfo=timezone.utc) if is_naive else dt.astimezone(timezone.utc)
def humanize_ago(dt):
    """Return a capitalized, human-readable description of how long ago *dt* was.

    The elapsed time is measured from the current UTC time and rendered by
    ``humanize.naturaltime``.
    """
    elapsed = datetime.now(timezone.utc) - dt.astimezone(timezone.utc)
    phrase = humanize.naturaltime(elapsed)
    return phrase.capitalize()
def clean_text(str_html):
    """Convert an HTML fragment from a feed into plain text.

    Args:
        str_html: Raw title/summary markup. ``None`` or an empty string is
            accepted and yields ``""``.

    Returns:
        The text content with character references decoded, all tags
        removed, and the text pieces stripped and joined by single spaces.
    """
    if not str_html:
        return ""
    # Restore the leading "&" on numeric character references that lost it
    # (e.g. "#225;"). The lookbehind leaves already well-formed "&#225;"
    # untouched; without it they became "&&#225;" and decoded with a stray
    # "&" in front of the character.
    str_html = html.unescape(re.sub(r'(?<!&)#(\d+);', r'&#\1;', str_html))
    soup = BeautifulSoup(str_html, "html.parser")
    # Drop images explicitly before extracting the text.
    for img in soup.find_all("img"):
        img.decompose()
    # Remove all remaining HTML tags, keeping only their text.
    return soup.get_text(separator=' ', strip=True)
def rss_parser():
    """Fetch every feed in ``ls_rss_urls`` and return the newest entries.

    At most ``MAX_ENTRIES_EACH_RSS`` entries are kept per feed. Each kept
    entry is tagged with the feed's ``meta_url``, ``meta_domain`` and
    ``meta_label``, then reduced to a plain dict. Feeds that fail to load and
    entries that lack a required field or carry an unparseable date are
    reported on stdout and skipped, so one bad source cannot abort the run.

    Returns:
        A list of at most ``MAX_TOTAL_ENTRIES`` dicts, newest first, with the
        keys ``title``, ``link``, ``summary``, ``time`` (the value returned by
        ``normalize_time``), ``time_ago``, ``meta_url``, ``meta_domain`` and
        ``meta_label``.
    """
    # ---------- Collect raw entries from every feed
    all_entries = []
    for myrss in ls_rss_urls:
        try:
            new_entries = feedparser.parse(myrss["meta_url"]).entries
        except Exception as exc:
            # Best effort: a broken feed is reported and the rest continue.
            print(f"❌ {myrss['meta_url']} > {exc!r}")
            continue
        print(f"{'✅' if len(new_entries)>0 else '❌'} {myrss['meta_url']} > {len(new_entries)}")
        # Only the entries that are actually kept need the feed metadata.
        for ent in new_entries[:MAX_ENTRIES_EACH_RSS]:
            ent["meta_url"] = myrss["meta_url"]
            ent["meta_domain"] = myrss["meta_domain"]
            ent["meta_label"] = myrss["meta_label"]
            all_entries.append(ent)
    # ---------- Reduce each entry to the fields the caller needs
    all_entries_clean = []
    for e in all_entries:
        try:
            # Parse the publication date once and reuse it for both fields.
            published_at = normalize_time(dateutil_parser.parse(e["published"]))
            all_entries_clean.append({
                "title": clean_text(e["title"]),
                "link": e["link"],
                # A missing summary is treated as empty text.
                "summary": clean_text(e.get("summary", "")),
                "time": published_at,
                "time_ago": humanize_ago(published_at),
                "meta_url": e["meta_url"],
                "meta_domain": e["meta_domain"],
                "meta_label": e["meta_label"],
            })
        except (KeyError, TypeError, ValueError, OverflowError) as exc:
            # Missing required field or unparseable date: skip this entry only.
            print(f"❌ {e.get('meta_url')} > skipped entry: {exc!r}")
    # ---------- Newest first, capped at the global limit
    all_entries_sorted = sorted(all_entries_clean, key=lambda x: x["time"], reverse=True)
    return all_entries_sorted[:MAX_TOTAL_ENTRIES]