Spaces:

onelevelstudio
/

NEWS

Running

App Files Files

NEWS / _rss_parser.py

baobuiquang

initial commit

b5a1c35 verified 2 days ago

raw

history blame

7.3 kB

	# Number of leading entries rss_parser() keeps from each individual feed.
	MAX_ENTRIES_EACH_RSS = 5
	# Upper bound on the length of the merged, sorted list rss_parser() returns.
	MAX_TOTAL_ENTRIES = 200

	from dateutil import parser as dateutil_parser
	from datetime import datetime, timezone
	from bs4 import BeautifulSoup
	import feedparser
	import humanize
	import html
	import re

	# Feeds read by rss_parser(). Each item has three string fields, all of
	# which rss_parser() copies onto every entry taken from that feed:
	#   meta_url    - address passed to feedparser.parse()
	#   meta_domain - source site, with the publishing organisation in
	#                 parentheses where one is given
	#   meta_label  - topic label of the feed
	ls_rss_urls = [
	{
	"meta_url": "https://vneconomy.vn/chung-khoan.rss",
	"meta_domain": "vneconomy.vn (Hội Khoa học Kinh tế Việt Nam)",
	"meta_label": "chứng khoán",
	},
	{
	"meta_url": "https://cafebiz.vn/rss/chung-khoan.rss",
	"meta_domain": "cafebiz.vn",
	"meta_label": "chứng khoán",
	},
	{
	"meta_url": "https://vietstock.vn/830/chung-khoan/co-phieu.rss",
	"meta_domain": "vietstock.vn",
	"meta_label": "cổ phiếu",
	},
	{
	"meta_url": "https://vietstock.vn/739/chung-khoan/giao-dich-noi-bo.rss",
	"meta_domain": "vietstock.vn",
	"meta_label": "giao dịch nội bộ",
	},
	{
	"meta_url": "https://nhandan.vn/rss/chungkhoan-1191.rss",
	"meta_domain": "nhandan.vn (Đảng Cộng sản Việt Nam)",
	"meta_label": "chứng khoán",
	},
	{
	"meta_url": "https://thanhnien.vn/rss/kinh-te/chung-khoan.rss",
	"meta_domain": "thanhnien.vn (Hội Liên hiệp Thanh niên Việt Nam)",
	"meta_label": "chứng khoán",
	},
	{
	"meta_url": "https://www.sggp.org.vn/rss/kinhte-taichinhchuungkhoan-44.rss",
	"meta_domain": "sggp.org.vn (Đảng bộ Đảng Cộng sản Việt Nam TP.HCM)",
	"meta_label": "kinh tế, tài chính, chứng khoán",
	},
	{
	"meta_url": "https://antt.vn/rss/chung-khoan.rss",
	"meta_domain": "antt.vn",
	"meta_label": "chứng khoán",
	},
	{
	"meta_url": "https://nld.com.vn/rss/kinh-te/tai-chinh-chung-khoan.rss",
	"meta_domain": "nld.com.vn (Thành ủy TP.HCM)",
	"meta_label": "tài chính, chứng khoán",
	},
	{
	"meta_url": "https://soha.vn/rss/kinh-doanh/chung-khoan.rss",
	"meta_domain": "soha.vn",
	"meta_label": "chứng khoán",
	},
	{
	"meta_url": "https://tienphong.vn/rss/tai-chinh-chung-khoan-105.rss",
	"meta_domain": "tienphong.vn (Đoàn TNCS Hồ Chí Minh)",
	"meta_label": "tài chính, chứng khoán",
	},
	{
	"meta_url": "https://www.nguoiduatin.vn/rss/kinh-te/tai-chinh-ngan-hang.rss",
	"meta_domain": "nguoiduatin.vn (Hội Luật gia Việt Nam)",
	"meta_label": "kinh tế, tài chính, ngân hàng",
	},
	{
	"meta_url": "https://bnews.vn/rss/chung-khoan-33.rss",
	"meta_domain": "bnews.vn (Thông tấn xã Việt Nam)",
	"meta_label": "chứng khoán",
	},
	{
	"meta_url": "https://vnexpress.net/rss/kinh-doanh.rss",
	"meta_domain": "vnexpress.net (Bộ Khoa học và Công nghệ)",
	"meta_label": "kinh doanh",
	},
	{
	"meta_url": "https://vtv.vn/rss/kinh-te.rss",
	"meta_domain": "vtv.vn (Đài Truyền hình Việt Nam)",
	"meta_label": "kinh tế",
	},
	{
	"meta_url": "https://rss.nytimes.com/services/xml/rss/nyt/Economy.xml",
	"meta_domain": "nytimes.com (The New York Times - US)",
	"meta_label": "economy",
	},
	{
	"meta_url": "https://feeds.content.dowjones.io/public/rss/socialeconomyfeed",
	"meta_domain": "wsj.com (The Wall Street Journal - US)",
	"meta_label": "economy",
	},
	{
	"meta_url": "https://abcnews.go.com/abcnews/moneyheadlines",
	"meta_domain": "abcnews.com (ABC News - US)",
	"meta_label": "money",
	},
	{
	"meta_url": "https://www.wired.com/feed/category/business/latest/rss",
	"meta_domain": "wired.com (WIRED - US)",
	"meta_label": "business",
	},
	{
	"meta_url": "https://www.theguardian.com/uk/business/rss",
	"meta_domain": "theguardian.com (The Guardian - UK)",
	"meta_label": "business",
	},
	{
	"meta_url": "https://feeds.npr.org/1017/rss.xml",
	"meta_domain": "npr.org (National Public Radio - US)",
	"meta_label": "economy",
	},
	{
	"meta_url": "https://www.economywatch.com/feed",
	"meta_domain": "economywatch.com (Economy Watch - UK)",
	"meta_label": "economy",
	},
	]

	def normalize_time(dt):
	    """Return *dt* as a timezone-aware datetime expressed in UTC.

	    A naive datetime is labelled as UTC without shifting its clock fields;
	    an aware datetime is converted to the equivalent UTC instant.
	    """
	    is_naive = dt.tzinfo is None
	    return dt.replace(tzinfo=timezone.utc) if is_naive else dt.astimezone(timezone.utc)

	def humanize_ago(dt):
	    """Describe how long ago *dt* was, as a short human-readable phrase.

	    Args:
	        dt: A ``datetime``. A naive value is taken to be UTC, matching how
	            normalize_time() in this module treats naive values.

	    Returns:
	        The text produced by ``humanize.naturaltime`` for the time elapsed
	        between *dt* and the current UTC time, passed through
	        ``str.capitalize()``.
	    """
	    # astimezone() on a naive datetime would assume the machine's local
	    # zone, so the result would depend on where the code runs. Label naive
	    # values as UTC instead.
	    if dt.tzinfo is None:
	        dt_utc = dt.replace(tzinfo=timezone.utc)
	    else:
	        dt_utc = dt.astimezone(timezone.utc)
	    now = datetime.now(timezone.utc)
	    return humanize.naturaltime(now - dt_utc).capitalize()

	def clean_text(str_html):
	    """Return *str_html* as plain text: entities decoded, markup removed.

	    Args:
	        str_html: HTML (or plain) text, e.g. a feed title or summary.
	            ``None`` or an empty string yields ``""``.

	    Returns:
	        The text content with HTML entities unescaped, ``<img>`` elements
	        dropped, all remaining tags stripped, and text fragments joined by
	        single spaces.
	    """
	    if not str_html:
	        return ""
	    # Repair numeric entities that lost their leading "&" ("#39;" instead of
	    # "&#39;"). The lookbehind leaves already well-formed entities alone;
	    # without it "&#39;" became "&&#39;" and left a stray "&" in the output.
	    str_html = html.unescape(re.sub(r'(?<!&)#(\d+);', r'&#\1;', str_html))
	    soup = BeautifulSoup(str_html, "html.parser")
	    # Remove all img tags
	    for img in soup.find_all("img"):
	        img.decompose()
	    # Remove all remaining HTML tags, keeping only their text
	    return soup.get_text(separator=' ', strip=True)

	def _clean_entry(entry):
	    """Flatten one raw feed entry into the dict shape rss_parser() returns.

	    Returns None when the entry lacks a title, link or publication date, or
	    when that date cannot be parsed, so the caller can skip it instead of
	    aborting the whole aggregation.
	    """
	    try:
	        title = entry["title"]
	        link = entry["link"]
	        published_raw = entry["published"]
	    except KeyError:
	        return None
	    try:
	        # Parse once; both "time" and "time_ago" derive from this value.
	        published = normalize_time(dateutil_parser.parse(published_raw))
	    except (ValueError, OverflowError, TypeError):
	        return None
	    return {
	        "title": clean_text(title),
	        "link": link,
	        # Not every feed item has a summary; fall back to empty text.
	        "summary": clean_text(entry.get("summary", "")),
	        "time": published,
	        "time_ago": humanize_ago(published),
	        "meta_url": entry["meta_url"],
	        "meta_domain": entry["meta_domain"],
	        "meta_label": entry["meta_label"],
	    }

	def rss_parser():
	    """Fetch every feed in ls_rss_urls and return the newest entries.

	    For each feed, the first MAX_ENTRIES_EACH_RSS entries are kept and
	    tagged with the feed's meta_url / meta_domain / meta_label. A status
	    line per feed is printed. Feeds that fail to load and entries that
	    cannot be cleaned are skipped.

	    Returns:
	        A list of at most MAX_TOTAL_ENTRIES dicts, newest first, each with
	        the keys "title", "link", "summary", "time" (UTC datetime),
	        "time_ago", "meta_url", "meta_domain" and "meta_label".
	    """
	    all_entries = []
	    for myrss in ls_rss_urls:
	        try:
	            new_entries = feedparser.parse(myrss["meta_url"]).entries
	        except Exception as exc:
	            # Best effort: one broken feed must not stop the others, but
	            # report it rather than failing silently.
	            print(f"❌ {myrss['meta_url']} > {exc}")
	            continue
	        kept = new_entries[:MAX_ENTRIES_EACH_RSS]
	        for ent in kept:
	            ent["meta_url"] = myrss["meta_url"]
	            ent["meta_domain"] = myrss["meta_domain"]
	            ent["meta_label"] = myrss["meta_label"]
	        all_entries += kept
	        print(f"{'✅' if len(new_entries)>0 else '❌'} {myrss['meta_url']} > {len(new_entries)}")
	    # ----------
	    cleaned = (_clean_entry(e) for e in all_entries)
	    all_entries_clean = [c for c in cleaned if c is not None]
	    # ----------
	    all_entries_sorted = sorted(all_entries_clean, key=lambda x: x["time"], reverse=True)
	    # ----------
	    return all_entries_sorted[:MAX_TOTAL_ENTRIES]