File size: 7,298 Bytes
b5a1c35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# Cap on how many entries are taken from each individual RSS feed.
MAX_ENTRIES_EACH_RSS = 5
# Cap on the total number of entries returned by rss_parser().
MAX_TOTAL_ENTRIES = 200

from dateutil import parser as dateutil_parser
from datetime import datetime, timezone
from bs4 import BeautifulSoup
import feedparser
import humanize
import html
import re

# RSS feeds to aggregate. Each record carries three fields that are copied
# verbatim onto every entry parsed from that feed:
#   meta_url    - the feed URL itself (also used to fetch it)
#   meta_domain - site domain, with the owning organization in parentheses
#                 where known
#   meta_label  - topic label of the feed (Vietnamese or English)
# Mostly Vietnamese stock/finance feeds, followed by a few international
# economy/business feeds.
ls_rss_urls = [
    {
        "meta_url": "https://vneconomy.vn/chung-khoan.rss",
        "meta_domain": "vneconomy.vn (Hội Khoa học Kinh tế Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://cafebiz.vn/rss/chung-khoan.rss",
        "meta_domain": "cafebiz.vn",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://vietstock.vn/830/chung-khoan/co-phieu.rss",
        "meta_domain": "vietstock.vn",
        "meta_label": "cổ phiếu",
    },
    {
        "meta_url": "https://vietstock.vn/739/chung-khoan/giao-dich-noi-bo.rss",
        "meta_domain": "vietstock.vn",
        "meta_label": "giao dịch nội bộ",
    },
    {
        "meta_url": "https://nhandan.vn/rss/chungkhoan-1191.rss",
        "meta_domain": "nhandan.vn (Đảng Cộng sản Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://thanhnien.vn/rss/kinh-te/chung-khoan.rss",
        "meta_domain": "thanhnien.vn (Hội Liên hiệp Thanh niên Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://www.sggp.org.vn/rss/kinhte-taichinhchuungkhoan-44.rss",
        "meta_domain": "sggp.org.vn (Đảng bộ Đảng Cộng sản Việt Nam TP.HCM)",
        "meta_label": "kinh tế, tài chính, chứng khoán",
    },
    {
        "meta_url": "https://antt.vn/rss/chung-khoan.rss",
        "meta_domain": "antt.vn",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://nld.com.vn/rss/kinh-te/tai-chinh-chung-khoan.rss",
        "meta_domain": "nld.com.vn (Thành ủy TP.HCM)",
        "meta_label": "tài chính, chứng khoán",
    },
    {
        "meta_url": "https://soha.vn/rss/kinh-doanh/chung-khoan.rss",
        "meta_domain": "soha.vn",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://tienphong.vn/rss/tai-chinh-chung-khoan-105.rss",
        "meta_domain": "tienphong.vn (Đoàn TNCS Hồ Chí Minh)",
        "meta_label": "tài chính, chứng khoán",
    },
    {
        "meta_url": "https://www.nguoiduatin.vn/rss/kinh-te/tai-chinh-ngan-hang.rss",
        "meta_domain": "nguoiduatin.vn (Hội Luật gia Việt Nam)",
        "meta_label": "kinh tế, tài chính, ngân hàng",
    },
    {
        "meta_url": "https://bnews.vn/rss/chung-khoan-33.rss",
        "meta_domain": "bnews.vn (Thông tấn xã Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://vnexpress.net/rss/kinh-doanh.rss",
        "meta_domain": "vnexpress.net (Bộ Khoa học và Công nghệ)",
        "meta_label": "kinh doanh",
    },
    {
        "meta_url": "https://vtv.vn/rss/kinh-te.rss",
        "meta_domain": "vtv.vn (Đài Truyền hình Việt Nam)",
        "meta_label": "kinh tế",
    },
    {
        "meta_url": "https://rss.nytimes.com/services/xml/rss/nyt/Economy.xml",
        "meta_domain": "nytimes.com (The New York Times - US)",
        "meta_label": "economy",
    },
    {
        "meta_url": "https://feeds.content.dowjones.io/public/rss/socialeconomyfeed",
        "meta_domain": "wsj.com (The Wall Street Journal - US)",
        "meta_label": "economy",
    },
    {
        "meta_url": "https://abcnews.go.com/abcnews/moneyheadlines",
        "meta_domain": "abcnews.com (ABC News - US)",
        "meta_label": "money",
    },
    {
        "meta_url": "https://www.wired.com/feed/category/business/latest/rss",
        "meta_domain": "wired.com (WIRED - US)",
        "meta_label": "business",
    },
    {
        "meta_url": "https://www.theguardian.com/uk/business/rss",
        "meta_domain": "theguardian.com (The Guardian - UK)",
        "meta_label": "business",
    },
    {
        "meta_url": "https://feeds.npr.org/1017/rss.xml",
        "meta_domain": "npr.org (National Public Radio - US)",
        "meta_label": "economy",
    },
    {
        "meta_url": "https://www.economywatch.com/feed",
        "meta_domain": "economywatch.com (Economy Watch - UK)",
        "meta_label": "economy",
    },
]

def normalize_time(dt):
    """Return *dt* as a timezone-aware datetime expressed in UTC.

    Aware datetimes are converted; naive ones are assumed to already be
    UTC and simply get the UTC tzinfo attached.
    """
    if dt.tzinfo is not None:
        return dt.astimezone(timezone.utc)
    return dt.replace(tzinfo=timezone.utc)

def humanize_ago(dt):
    """Return a capitalized, human-readable age for *dt*, e.g. "2 hours ago"."""
    elapsed = datetime.now(timezone.utc) - dt.astimezone(timezone.utc)
    return humanize.naturaltime(elapsed).capitalize()

def clean_text(str_html):
    """Strip markup from an RSS title/summary fragment and return plain text.

    Steps:
      1. Repair numeric character references that lost their leading ``&``
         (some feeds emit ``#8217;`` instead of ``&#8217;``), then unescape
         all HTML entities.
      2. Remove every ``<img>`` tag.
      3. Collapse the remaining markup to space-separated, stripped text.
    """
    # Re-attach the missing '&' so html.unescape can resolve e.g. "#8217;".
    str_html = html.unescape(re.sub(r'#(\d+);', r'&#\1;', str_html))
    soup = BeautifulSoup(str_html, "html.parser")
    for img in soup.find_all("img"):
        img.decompose()
    # Fix: the original assigned `str_html = str(soup)` here and then
    # immediately overwrote it with get_text() — that dead line is removed.
    return soup.get_text(separator=' ', strip=True)

def rss_parser():
    """Fetch every feed in ``ls_rss_urls`` and return cleaned, sorted entries.

    Returns a list of dicts, newest first, capped at MAX_TOTAL_ENTRIES,
    taking at most MAX_ENTRIES_EACH_RSS entries from any single feed.
    Each dict has: ``title``, ``link``, ``summary`` (plain text),
    ``time`` (aware UTC datetime), ``time_ago`` (human-readable), and the
    source feed's ``meta_url`` / ``meta_domain`` / ``meta_label``.

    Note: performs network I/O (feedparser fetches each URL) and prints a
    one-line status per feed.
    """
    # ---------- fetch ----------
    all_entries = []
    for myrss in ls_rss_urls:
        try:
            feed_entries = feedparser.parse(myrss["meta_url"]).entries
            # Tag only the entries we actually keep (the original tagged
            # all of them and then discarded everything past the cap).
            for ent in feed_entries[:MAX_ENTRIES_EACH_RSS]:
                ent["meta_url"] = myrss["meta_url"]
                ent["meta_domain"] = myrss["meta_domain"]
                ent["meta_label"] = myrss["meta_label"]
                all_entries.append(ent)
            print(f"{'✅' if len(feed_entries)>0 else '❌'} {myrss['meta_url']} > {len(feed_entries)}")
        except Exception as exc:
            # A single broken feed must not abort the whole run, but don't
            # swallow the failure silently (was a bare `except: pass`).
            print(f"❌ {myrss['meta_url']} > error: {exc}")
    # ---------- clean ----------
    all_entries_clean = []
    for e in all_entries:
        try:
            # Parse the timestamp once (the original parsed it twice).
            entry_time = normalize_time(dateutil_parser.parse(e["published"]))
        except (KeyError, ValueError, OverflowError):
            # No usable timestamp -> the entry cannot be sorted; skip it
            # instead of crashing the whole aggregation.
            continue
        all_entries_clean.append({
            "title": clean_text(e.get("title", "")),
            "link": e.get("link", ""),
            "summary": clean_text(e.get("summary", "")),
            "time": entry_time,
            "time_ago": humanize_ago(entry_time),
            "meta_url": e["meta_url"],
            "meta_domain": e["meta_domain"],
            "meta_label": e["meta_label"],
        })
    # ---------- sort & cap ----------
    all_entries_clean.sort(key=lambda x: x["time"], reverse=True)
    return all_entries_clean[:MAX_TOTAL_ENTRIES]