baobuiquang commited on
Commit
81867b7
·
verified ·
1 Parent(s): 175cb45

Create _spider.py

Browse files
Files changed (1) hide show
  1. _spider.py +60 -0
_spider.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dateutil import parser as dateutil_parser
2
+ import feedparser
3
+
# RSS feeds of Vietnamese stock-market ("chung khoan") news sections from
# assorted publishers; consumed by rss_spider(), which fetches and merges
# all of them. Order does not matter — entries are re-sorted by date.
ls_rss_urls = [
    "https://vneconomy.vn/chung-khoan.rss",
    "https://cafebiz.vn/rss/chung-khoan.rss",
    "https://vietstock.vn/830/chung-khoan/co-phieu.rss",
    "https://vietstock.vn/739/chung-khoan/giao-dich-noi-bo.rss",
    "https://thitruongtaichinh.kinhtedothi.vn/rss/chung-khoan-182.rss",
    "https://nhandan.vn/rss/chungkhoan-1191.rss",
    "https://thanhnien.vn/rss/kinh-te/chung-khoan.rss",
    "https://www.sggp.org.vn/rss/kinhte-taichinhchuungkhoan-44.rss",
    "https://antt.vn/rss/chung-khoan.rss",
    "https://nganhangvietnam.vn/rss/chung-khoan.rss",
    "https://nld.com.vn/rss/kinh-te/tai-chinh-chung-khoan.rss",
    "https://soha.vn/rss/kinh-doanh/chung-khoan.rss",
    "https://tienphong.vn/rss/tai-chinh-chung-khoan-105.rss",
    "https://www.nguoiduatin.vn/rss/kinh-te/tai-chinh-ngan-hang.rss",
    "https://bnews.vn/rss/chung-khoan-33.rss",
]
21
+
def rss_spider():
    """Fetch every feed in ``ls_rss_urls`` and return the newest entries.

    Each feed is fetched with ``feedparser``; a one-line health report is
    printed per feed (✅ = entries returned, ❌ = empty or unreachable).
    Entries are normalized to plain dicts, sorted newest-first, and capped.

    Returns:
        list[dict]: at most 100 dicts, newest first, each with keys
        ``title``, ``link``, ``summary`` and ``time`` (a ``datetime``
        parsed from the entry's ``published`` field).
    """
    # ---------- fetch ----------
    all_entries = []
    for rss_url in ls_rss_urls:
        rss_res = feedparser.parse(rss_url)
        all_entries += rss_res.entries
        print(f"{'✅' if len(rss_res.entries)>0 else '❌'} {rss_url} > {len(rss_res.entries)}")
    # ---------- normalize ----------
    all_entries_clean = []
    for e in all_entries:
        # A single malformed entry (missing field, unparseable date) must
        # not abort the whole crawl — skip it and keep the rest.
        try:
            all_entries_clean.append({
                "title": e["title"],
                "link": e["link"],
                "summary": e["summary"],
                "time": dateutil_parser.parse(e["published"]),
            })
        except (KeyError, ValueError, OverflowError):
            continue
    # ---------- sort ----------
    # Different publishers emit a mix of timezone-aware and naive dates;
    # comparing those datetimes directly raises TypeError. Keying on the
    # POSIX timestamp makes them comparable (naive values are interpreted
    # in local time — assumed close enough for ordering; TODO confirm).
    all_entries_sorted = sorted(
        all_entries_clean,
        key=lambda x: x["time"].timestamp(),
        reverse=True,
    )
    # ---------- cap ----------
    return all_entries_sorted[:100]