Spaces:
Running
Running
Create _spider.py
Browse files- _spider.py +60 -0
_spider.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from datetime import timezone

from dateutil import parser as dateutil_parser
import feedparser
# RSS feeds covering Vietnamese stock-market news ("chung khoan"),
# aggregated and deduplicated-by-recency in rss_spider() below.
ls_rss_urls = [
    "https://vneconomy.vn/chung-khoan.rss",
    "https://cafebiz.vn/rss/chung-khoan.rss",
    "https://vietstock.vn/830/chung-khoan/co-phieu.rss",
    "https://vietstock.vn/739/chung-khoan/giao-dich-noi-bo.rss",
    "https://thitruongtaichinh.kinhtedothi.vn/rss/chung-khoan-182.rss",
    "https://nhandan.vn/rss/chungkhoan-1191.rss",
    "https://thanhnien.vn/rss/kinh-te/chung-khoan.rss",
    "https://www.sggp.org.vn/rss/kinhte-taichinhchuungkhoan-44.rss",
    "https://antt.vn/rss/chung-khoan.rss",
    "https://nganhangvietnam.vn/rss/chung-khoan.rss",
    "https://nld.com.vn/rss/kinh-te/tai-chinh-chung-khoan.rss",
    "https://soha.vn/rss/kinh-doanh/chung-khoan.rss",
    "https://tienphong.vn/rss/tai-chinh-chung-khoan-105.rss",
    "https://www.nguoiduatin.vn/rss/kinh-te/tai-chinh-ngan-hang.rss",
    "https://bnews.vn/rss/chung-khoan-33.rss",
]
def rss_spider(limit=100):
    """Fetch every feed in ``ls_rss_urls`` and return the newest entries.

    Entries that lack a ``published`` field, or whose date cannot be
    parsed, are skipped instead of aborting the whole crawl (the original
    ``e["published"]`` indexing raised ``KeyError``/``ValueError`` on the
    first malformed entry — see the abandoned try/except it left behind).

    Args:
        limit: maximum number of entries to return. Defaults to 100,
            matching the previously hard-coded cap.

    Returns:
        list[dict]: entries with keys ``title``, ``link``, ``summary`` and
        ``time`` (a timezone-aware ``datetime``), sorted newest first.
    """
    # ---------- fetch ----------
    all_entries = []
    for rss_url in ls_rss_urls:
        rss_res = feedparser.parse(rss_url)
        all_entries += rss_res.entries
        # Per-feed health indicator on stdout: some of these URLs go stale.
        print(f"{'✅' if len(rss_res.entries)>0 else '❌'} {rss_url} > {len(rss_res.entries)}")
    # ---------- clean ----------
    all_entries_clean = []
    for e in all_entries:
        try:
            published = dateutil_parser.parse(e["published"])
        except (KeyError, ValueError, OverflowError):
            # No usable timestamp for this entry -> drop it; it cannot be
            # ordered against the others anyway.
            continue
        if published.tzinfo is None:
            # sorted() raises TypeError when naive and aware datetimes are
            # mixed; assume UTC for naive ones (TODO confirm per feed).
            published = published.replace(tzinfo=timezone.utc)
        all_entries_clean.append({
            "title": e.get("title", ""),
            "link": e.get("link", ""),
            "summary": e.get("summary", ""),
            "time": published,
        })
    # ---------- sort ----------
    all_entries_sorted = sorted(all_entries_clean, key=lambda x: x["time"], reverse=True)
    return all_entries_sorted[:limit]