baobuiquang committed on
Commit
b5a1c35
·
verified ·
1 Parent(s): 073c296

initial commit

Browse files
Files changed (3) hide show
  1. _rss_parser.py +197 -0
  2. app.py +183 -0
  3. requirements.txt +0 -0
_rss_parser.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MAX_ENTRIES_EACH_RSS = 5  # Per-feed cap: rss_parser() keeps at most this many entries from each feed.
2
+ MAX_TOTAL_ENTRIES = 200  # Overall cap: rss_parser() returns at most this many entries after sorting.
3
+
4
+ from dateutil import parser as dateutil_parser
5
+ from datetime import datetime, timezone
6
+ from bs4 import BeautifulSoup
7
+ import feedparser
8
+ import humanize
9
+ import html
10
+ import re
11
+
12
# Feed catalogue, one row per RSS source: (feed URL, publisher shown in the UI, topic label).
_FEED_FIELDS = ("meta_url", "meta_domain", "meta_label")
_FEED_ROWS = (
    ("https://vneconomy.vn/chung-khoan.rss", "vneconomy.vn (Hội Khoa học Kinh tế Việt Nam)", "chứng khoán"),
    ("https://cafebiz.vn/rss/chung-khoan.rss", "cafebiz.vn", "chứng khoán"),
    ("https://vietstock.vn/830/chung-khoan/co-phieu.rss", "vietstock.vn", "cổ phiếu"),
    ("https://vietstock.vn/739/chung-khoan/giao-dich-noi-bo.rss", "vietstock.vn", "giao dịch nội bộ"),
    ("https://nhandan.vn/rss/chungkhoan-1191.rss", "nhandan.vn (Đảng Cộng sản Việt Nam)", "chứng khoán"),
    ("https://thanhnien.vn/rss/kinh-te/chung-khoan.rss", "thanhnien.vn (Hội Liên hiệp Thanh niên Việt Nam)", "chứng khoán"),
    ("https://www.sggp.org.vn/rss/kinhte-taichinhchuungkhoan-44.rss", "sggp.org.vn (Đảng bộ Đảng Cộng sản Việt Nam TP.HCM)", "kinh tế, tài chính, chứng khoán"),
    ("https://antt.vn/rss/chung-khoan.rss", "antt.vn", "chứng khoán"),
    ("https://nld.com.vn/rss/kinh-te/tai-chinh-chung-khoan.rss", "nld.com.vn (Thành ủy TP.HCM)", "tài chính, chứng khoán"),
    ("https://soha.vn/rss/kinh-doanh/chung-khoan.rss", "soha.vn", "chứng khoán"),
    ("https://tienphong.vn/rss/tai-chinh-chung-khoan-105.rss", "tienphong.vn (Đoàn TNCS Hồ Chí Minh)", "tài chính, chứng khoán"),
    ("https://www.nguoiduatin.vn/rss/kinh-te/tai-chinh-ngan-hang.rss", "nguoiduatin.vn (Hội Luật gia Việt Nam)", "kinh tế, tài chính, ngân hàng"),
    ("https://bnews.vn/rss/chung-khoan-33.rss", "bnews.vn (Thông tấn xã Việt Nam)", "chứng khoán"),
    ("https://vnexpress.net/rss/kinh-doanh.rss", "vnexpress.net (Bộ Khoa học và Công nghệ)", "kinh doanh"),
    ("https://vtv.vn/rss/kinh-te.rss", "vtv.vn (Đài Truyền hình Việt Nam)", "kinh tế"),
    ("https://rss.nytimes.com/services/xml/rss/nyt/Economy.xml", "nytimes.com (The New York Times - US)", "economy"),
    ("https://feeds.content.dowjones.io/public/rss/socialeconomyfeed", "wsj.com (The Wall Street Journal - US)", "economy"),
    ("https://abcnews.go.com/abcnews/moneyheadlines", "abcnews.com (ABC News - US)", "money"),
    ("https://www.wired.com/feed/category/business/latest/rss", "wired.com (WIRED - US)", "business"),
    ("https://www.theguardian.com/uk/business/rss", "theguardian.com (The Guardian - UK)", "business"),
    ("https://feeds.npr.org/1017/rss.xml", "npr.org (National Public Radio - US)", "economy"),
    ("https://www.economywatch.com/feed", "economywatch.com (Economy Watch - UK)", "economy"),
)

# Same list-of-dicts shape as before: each feed is a dict with the three meta_* keys.
ls_rss_urls = [dict(zip(_FEED_FIELDS, row)) for row in _FEED_ROWS]
124
+
125
def normalize_time(dt):
    """Return *dt* as a timezone-aware datetime expressed in UTC.

    A naive datetime (``tzinfo`` is None) is taken to already be in UTC and is
    only tagged with the UTC zone; an aware datetime is converted to UTC.
    """
    is_naive = dt.tzinfo is None
    return dt.replace(tzinfo=timezone.utc) if is_naive else dt.astimezone(timezone.utc)
129
+
130
def humanize_ago(dt):
    """Return a capitalized, human-readable age for *dt* (e.g. "3 hours ago").

    Args:
        dt: A datetime. A naive value is treated as UTC, matching
            normalize_time(); previously ``astimezone()`` silently interpreted
            naive values as the machine's local time, skewing the result by
            the local UTC offset.

    Returns:
        The string produced by ``humanize.naturaltime`` for the elapsed time
        between *dt* and now, with its first letter capitalized.
    """
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    # Subtracting two aware datetimes is zone-independent, so no conversion is needed.
    elapsed = datetime.now(timezone.utc) - dt
    return humanize.naturaltime(elapsed).capitalize()
134
+
135
def clean_text(str_html):
    """Convert an HTML fragment from a feed into plain text.

    Args:
        str_html: Raw title/summary markup. ``None`` or an empty string
            yields an empty string instead of raising.

    Returns:
        The text content with entities decoded and every tag removed, text
        nodes joined by single spaces and surrounding whitespace stripped.
    """
    if not str_html:
        return ""
    # Some feeds emit numeric character references without the leading "&"
    # ("#8220;"). Restore it, but leave well-formed "&#8220;" alone: without
    # the lookbehind those became "&&#8220;" and decoded with a stray "&".
    repaired = re.sub(r'(?<!&)#(\d+);', r'&#\1;', str_html)
    soup = BeautifulSoup(html.unescape(repaired), "html.parser")
    # get_text() discards all markup, <img> included, so no separate
    # image-stripping pass is required.
    return soup.get_text(separator=' ', strip=True)
147
+
148
def rss_parser():
    """Fetch every feed in ``ls_rss_urls`` and return the newest entries.

    For each feed, at most ``MAX_ENTRIES_EACH_RSS`` entries are kept and tagged
    with the feed's ``meta_url`` / ``meta_domain`` / ``meta_label``. Entries are
    then reduced to plain dicts, sorted newest first, and truncated to
    ``MAX_TOTAL_ENTRIES``.

    A feed that fails to load is reported on stdout and skipped. An entry
    without a parsable publication time is skipped, because it cannot be
    ordered; previously one such entry aborted the whole aggregation.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``summary``,
        ``time`` (aware UTC datetime), ``time_ago``, ``meta_url``,
        ``meta_domain`` and ``meta_label``.
    """
    all_entries = []
    for myrss in ls_rss_urls:
        try:
            new_entries = feedparser.parse(myrss["meta_url"]).entries
        except Exception as err:  # best effort: one broken feed must not stop the rest
            print(f"❌ {myrss['meta_url']} > {err!r}")
            continue
        # Only tag the entries that are actually kept.
        kept = new_entries[:MAX_ENTRIES_EACH_RSS]
        for ent in kept:
            ent["meta_url"] = myrss["meta_url"]
            ent["meta_domain"] = myrss["meta_domain"]
            ent["meta_label"] = myrss["meta_label"]
        all_entries += kept
        print(f"{'✅' if len(new_entries)>0 else '❌'} {myrss['meta_url']} > {len(new_entries)}")

    all_entries_clean = []
    for e in all_entries:
        # Fall back to the "updated" field when a feed omits "published".
        published_text = e.get("published") or e.get("updated")
        try:
            published_at = normalize_time(dateutil_parser.parse(published_text))
        except (TypeError, ValueError, OverflowError):
            # Missing or unparsable timestamp.
            continue
        all_entries_clean.append({
            "title": clean_text(e.get("title", "")),
            "link": e.get("link", ""),
            "summary": clean_text(e.get("summary", "")),
            "time": published_at,
            "time_ago": humanize_ago(published_at),
            "meta_url": e["meta_url"],
            "meta_domain": e["meta_domain"],
            "meta_label": e["meta_label"],
        })

    all_entries_sorted = sorted(all_entries_clean, key=lambda x: x["time"], reverse=True)
    return all_entries_sorted[:MAX_TOTAL_ENTRIES]
app.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from _rss_parser import rss_parser
3
+
4
+ # ====================================================================================================
5
+ # ====================================================================================================
6
+ # ====================================================================================================
7
+
8
def fn_display_all_news(entries=None):
    """Render news entries as the HTML fragment shown in the page body.

    Args:
        entries: Optional iterable of entry dicts with the keys ``time_ago``,
            ``meta_domain``, ``link``, ``title`` and ``summary``. When omitted
            (the default, and how the UI calls it) the entries are fetched
            with ``rss_parser()``.

    Returns:
        A string: one ``<div id='all-news-items'>`` wrapper containing a
        ``<div class='news-item'>`` per entry.

    All entry text originates from third-party feeds, so every interpolated
    value is HTML-escaped, the ``href`` attribute is quoted, and only
    http(s) links are emitted (anything else becomes ``#``).
    """
    from html import escape  # local import: the app module does not import html at file level

    if entries is None:
        entries = rss_parser()

    parts = ["<div id='all-news-items'>"]
    for e in entries:
        link = str(e.get('link') or '')
        if not link.lower().startswith(("http://", "https://")):
            link = "#"  # refuse javascript:, data: and other non-web schemes
        info = f"{escape(str(e['time_ago']))} • {escape(str(e['meta_domain']))}"
        title = escape(str(e['title']))
        # Truncate first, then escape, so an entity is never cut in half.
        summary = escape(str(e['summary'])[:500])
        parts.append(f"""
            <div class='news-item'>
                <div class='news-info'>
                    {info}
                </div>
                <a target='_blank' rel='noopener noreferrer' href='{escape(link, quote=True)}'>
                    <p class='news-title'>
                        {title}
                    </p>
                </a>
                <div class='news-summary'>
                    {summary}
                </div>
            </div>
        """)
    parts.append("</div>")
    return "".join(parts)
29
+
30
+ # ====================================================================================================
31
+ # ====================================================================================================
32
+ # ====================================================================================================
33
+
34
+ theme = gr.themes.Base(  # Gradio theme object; handed to demo.launch(theme=...) on the last line of this file.
35
+ primary_hue="neutral",
36
+ secondary_hue="neutral",
37
+ neutral_hue="neutral",
38
+ text_size="lg",
39
+ font=[gr.themes.GoogleFont('Inter')],
40
+ font_mono=[gr.themes.GoogleFont('Manufacturing Consent')],  # NOTE(review): presumably the face used by the gr.Code title widget - confirm.
41
+ )
42
+ head = """
43
+ <link rel="icon" href="https://cdn.jsdelivr.net/gh/OneLevelStudio/CORE/STATIC/1LV_LOGO_DARK.png">
44
+ """  # Extra markup (a favicon link) handed to demo.launch(head=...).
45
+ # * { -ms-overflow-style: none; scrollbar-width: none; }
46
+ # *::-webkit-scrollbar { display: none; }
47
# Custom stylesheet handed to demo.launch(css=...).
# Layout breakpoint: widths up to 1000px are "mobile", anything wider is "desktop".
# The .mobile-only rule therefore hides from 1001px; with the previous
# min-width of 1000px both helper classes were hidden at exactly 1000px.
css = """
#huggingface-space-header { display: none !important; }
footer { display: none !important; }
body {
    overflow: hidden !important;
}
main {
    padding: 0 !important;
    max-width: 100% !important;
}
textarea {
    padding-top: 5px !important;
    padding-bottom: 6px !important;
}
.row, .column {
    gap: 0 !important;
}
/* ---------- Scrollbar ---------- */
::-webkit-scrollbar {
    background: transparent;
    width: 8px;
    border-radius: 999px;
}
::-webkit-scrollbar-track {
    background: transparent;
    border-radius: 999px;
}
::-webkit-scrollbar-thumb {
    background: hsla(0, 0%, 50%, 0.5);
    border-radius: 999px;
}
::-webkit-scrollbar-thumb:hover {
    background: hsla(0, 0%, 50%, 0.9);
}
/* ---------- Desktop/Mobile Only ---------- */
.desktop-only {
    display: block;
}
@media only screen and (max-width: 1000px) {
    .desktop-only {
        display: none;
    }
}
.mobile-only {
    display: block;
}
@media only screen and (min-width: 1001px) {
    .mobile-only {
        display: none;
    }
}
/* ---------- ---------- */
#all-news-items {
    display: flex;
    flex-direction: column;
    height: 100svh;
    overflow-y: scroll;
    border-left: solid 1px hsla(0, 0%, 50%, 0.0);
    border-right: solid 1px hsla(0, 0%, 50%, 0.0);
    padding: 64px 33svw;
    gap: 64px;
}
@media only screen and (max-width: 1000px) {
    #all-news-items {
        padding: 64px 16px;
    }
}
.news-item {
    border-radius: 8px;
    background: hsla(0, 0%, 100%, 0.00);
    border: solid 1px hsla(0, 0%, 100%, 0.00);
    padding: 0px;
}
.news-item a {
    padding: 0 !important;
    text-align: left !important;
}
.news-item a .news-title {
    font-size: 20px !important;
    font-weight: 600 !important;
    line-height: 1.3 !important;
    margin: 0 !important;
    color: white !important;
}
.news-info, .news-info * {
    font-size: 14px !important;
    color: grey !important;
}
.news-summary, .news-summary * {
    font-size: 14px !important;
    color: grey !important;
    margin: 0 !important;
    line-height: 1.5 !important;
    text-decoration: none !important;
    text-align: justify;
}
.news-info {
    margin-bottom: 4px !important;
}
.news-summary {
    margin-top: 8px !important;
}
/* ---------- ---------- */
#title-the-news {
    border: none !important;
    background: transparent !important;
}
#title-the-news .top-panel,
#title-the-news .cm-gutters {
    display: none !important;
}
#title-the-news .cm-line {
    font-size: 32px;
    text-align: center;
}
"""
163
+
164
+ # ====================================================================================================
165
+ # ====================================================================================================
166
+ # ====================================================================================================
167
+
168
+ with gr.Blocks(title="The News") as demo:  # Build the page; `demo` is launched on the last line of this block.
169
+ with gr.Row():
170
+ # with gr.Column(scale=1):
171
+ # gr.Markdown()
172
+ with gr.Column(scale=1):
173
+ gr.Code("The News", container=False, show_label=False, show_line_numbers=False, elem_id="title-the-news")  # Page title; its elem_id is the hook for the #title-the-news CSS rules.
174
+ display_all_news = gr.HTML("<div style='text-align: center; margin-top: 37svh;'>🕷 Spiders are crawling the web 🕷</div>")  # Starts with placeholder markup; it is the output component of the load handler below.
175
+ # with gr.Column(scale=1):
176
+ # gr.Markdown()
177
+ demo.load(  # Registers fn_display_all_news as the load-event handler; its return value goes to display_all_news.
178
+ fn=lambda: fn_display_all_news(),
179
+ inputs=[],
180
+ outputs=[display_all_news],
181
+ show_progress="full",
182
+ )
183
+ demo.launch(theme=theme, head=head, css=css)
requirements.txt ADDED
Binary file (120 Bytes). View file