File size: 7,298 Bytes
b5a1c35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# Cap on how many entries are taken from each individual RSS feed.
MAX_ENTRIES_EACH_RSS = 5
# Cap on the total number of entries returned by rss_parser().
MAX_TOTAL_ENTRIES = 200

from dateutil import parser as dateutil_parser
from datetime import datetime, timezone
from bs4 import BeautifulSoup
import feedparser
import humanize
import html
import re

# RSS feeds to aggregate. Each record carries three fields that are copied
# verbatim onto every entry parsed from that feed:
#   meta_url    - the feed URL itself (also used to fetch it)
#   meta_domain - site domain, with the owning organization in parentheses
#                 where known
#   meta_label  - topic label of the feed (Vietnamese or English)
# Mostly Vietnamese stock/finance feeds, followed by a few international
# economy/business feeds.
ls_rss_urls = [
    {
        "meta_url": "https://vneconomy.vn/chung-khoan.rss",
        "meta_domain": "vneconomy.vn (Hội Khoa học Kinh tế Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://cafebiz.vn/rss/chung-khoan.rss",
        "meta_domain": "cafebiz.vn",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://vietstock.vn/830/chung-khoan/co-phieu.rss",
        "meta_domain": "vietstock.vn",
        "meta_label": "cổ phiếu",
    },
    {
        "meta_url": "https://vietstock.vn/739/chung-khoan/giao-dich-noi-bo.rss",
        "meta_domain": "vietstock.vn",
        "meta_label": "giao dịch nội bộ",
    },
    {
        "meta_url": "https://nhandan.vn/rss/chungkhoan-1191.rss",
        "meta_domain": "nhandan.vn (Đảng Cộng sản Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://thanhnien.vn/rss/kinh-te/chung-khoan.rss",
        "meta_domain": "thanhnien.vn (Hội Liên hiệp Thanh niên Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://www.sggp.org.vn/rss/kinhte-taichinhchuungkhoan-44.rss",
        "meta_domain": "sggp.org.vn (Đảng bộ Đảng Cộng sản Việt Nam TP.HCM)",
        "meta_label": "kinh tế, tài chính, chứng khoán",
    },
    {
        "meta_url": "https://antt.vn/rss/chung-khoan.rss",
        "meta_domain": "antt.vn",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://nld.com.vn/rss/kinh-te/tai-chinh-chung-khoan.rss",
        "meta_domain": "nld.com.vn (Thành ủy TP.HCM)",
        "meta_label": "tài chính, chứng khoán",
    },
    {
        "meta_url": "https://soha.vn/rss/kinh-doanh/chung-khoan.rss",
        "meta_domain": "soha.vn",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://tienphong.vn/rss/tai-chinh-chung-khoan-105.rss",
        "meta_domain": "tienphong.vn (Đoàn TNCS Hồ Chí Minh)",
        "meta_label": "tài chính, chứng khoán",
    },
    {
        "meta_url": "https://www.nguoiduatin.vn/rss/kinh-te/tai-chinh-ngan-hang.rss",
        "meta_domain": "nguoiduatin.vn (Hội Luật gia Việt Nam)",
        "meta_label": "kinh tế, tài chính, ngân hàng",
    },
    {
        "meta_url": "https://bnews.vn/rss/chung-khoan-33.rss",
        "meta_domain": "bnews.vn (Thông tấn xã Việt Nam)",
        "meta_label": "chứng khoán",
    },
    {
        "meta_url": "https://vnexpress.net/rss/kinh-doanh.rss",
        "meta_domain": "vnexpress.net (Bộ Khoa học và Công nghệ)",
        "meta_label": "kinh doanh",
    },
    {
        "meta_url": "https://vtv.vn/rss/kinh-te.rss",
        "meta_domain": "vtv.vn (Đài Truyền hình Việt Nam)",
        "meta_label": "kinh tế",
    },
    {
        "meta_url": "https://rss.nytimes.com/services/xml/rss/nyt/Economy.xml",
        "meta_domain": "nytimes.com (The New York Times - US)",
        "meta_label": "economy",
    },
    {
        "meta_url": "https://feeds.content.dowjones.io/public/rss/socialeconomyfeed",
        "meta_domain": "wsj.com (The Wall Street Journal - US)",
        "meta_label": "economy",
    },
    {
        "meta_url": "https://abcnews.go.com/abcnews/moneyheadlines",
        "meta_domain": "abcnews.com (ABC News - US)",
        "meta_label": "money",
    },
    {
        "meta_url": "https://www.wired.com/feed/category/business/latest/rss",
        "meta_domain": "wired.com (WIRED - US)",
        "meta_label": "business",
    },
    {
        "meta_url": "https://www.theguardian.com/uk/business/rss",
        "meta_domain": "theguardian.com (The Guardian - UK)",
        "meta_label": "business",
    },
    {
        "meta_url": "https://feeds.npr.org/1017/rss.xml",
        "meta_domain": "npr.org (National Public Radio - US)",
        "meta_label": "economy",
    },
    {
        "meta_url": "https://www.economywatch.com/feed",
        "meta_domain": "economywatch.com (Economy Watch - UK)",
        "meta_label": "economy",
    },
]

def normalize_time(dt):
    """Return *dt* as a timezone-aware datetime expressed in UTC.

    Aware datetimes are converted; naive ones are assumed to already be
    UTC and simply get the UTC tzinfo attached.
    """
    if dt.tzinfo is not None:
        return dt.astimezone(timezone.utc)
    return dt.replace(tzinfo=timezone.utc)

def humanize_ago(dt):
    """Return a capitalized, human-readable age for *dt*, e.g. "2 hours ago"."""
    elapsed = datetime.now(timezone.utc) - dt.astimezone(timezone.utc)
    return humanize.naturaltime(elapsed).capitalize()

def clean_text(str_html):
    """Strip markup from an RSS title/summary fragment and return plain text.

    Steps:
      1. Repair numeric character references that lost their leading ``&``
         (some feeds emit ``#8217;`` instead of ``&#8217;``), then unescape
         all HTML entities.
      2. Remove every ``<img>`` tag.
      3. Collapse the remaining markup to space-separated, stripped text.
    """
    # Re-attach the missing '&' so html.unescape can resolve e.g. "#8217;".
    str_html = html.unescape(re.sub(r'#(\d+);', r'&#\1;', str_html))
    soup = BeautifulSoup(str_html, "html.parser")
    for img in soup.find_all("img"):
        img.decompose()
    # Fix: the original assigned `str_html = str(soup)` here and then
    # immediately overwrote it with get_text() — that dead line is removed.
    return soup.get_text(separator=' ', strip=True)

def rss_parser():
    """Fetch every feed in ``ls_rss_urls`` and return cleaned, sorted entries.

    Returns a list of dicts, newest first, capped at MAX_TOTAL_ENTRIES,
    taking at most MAX_ENTRIES_EACH_RSS entries from any single feed.
    Each dict has: ``title``, ``link``, ``summary`` (plain text),
    ``time`` (aware UTC datetime), ``time_ago`` (human-readable), and the
    source feed's ``meta_url`` / ``meta_domain`` / ``meta_label``.

    Note: performs network I/O (feedparser fetches each URL) and prints a
    one-line status per feed.
    """
    # ---------- fetch ----------
    all_entries = []
    for myrss in ls_rss_urls:
        try:
            feed_entries = feedparser.parse(myrss["meta_url"]).entries
            # Tag only the entries we actually keep (the original tagged
            # all of them and then discarded everything past the cap).
            for ent in feed_entries[:MAX_ENTRIES_EACH_RSS]:
                ent["meta_url"] = myrss["meta_url"]
                ent["meta_domain"] = myrss["meta_domain"]
                ent["meta_label"] = myrss["meta_label"]
                all_entries.append(ent)
            print(f"{'✅' if len(feed_entries)>0 else '❌'} {myrss['meta_url']} > {len(feed_entries)}")
        except Exception as exc:
            # A single broken feed must not abort the whole run, but don't
            # swallow the failure silently (was a bare `except: pass`).
            print(f"❌ {myrss['meta_url']} > error: {exc}")
    # ---------- clean ----------
    all_entries_clean = []
    for e in all_entries:
        try:
            # Parse the timestamp once (the original parsed it twice).
            entry_time = normalize_time(dateutil_parser.parse(e["published"]))
        except (KeyError, ValueError, OverflowError):
            # No usable timestamp -> the entry cannot be sorted; skip it
            # instead of crashing the whole aggregation.
            continue
        all_entries_clean.append({
            "title": clean_text(e.get("title", "")),
            "link": e.get("link", ""),
            "summary": clean_text(e.get("summary", "")),
            "time": entry_time,
            "time_ago": humanize_ago(entry_time),
            "meta_url": e["meta_url"],
            "meta_domain": e["meta_domain"],
            "meta_label": e["meta_label"],
        })
    # ---------- sort & cap ----------
    all_entries_clean.sort(key=lambda x: x["time"], reverse=True)
    return all_entries_clean[:MAX_TOTAL_ENTRIES]