# SPD / _spider.py
# Retrieved from the Hugging Face file view: uploaded by baobuiquang,
# verified commit 81867b7 ("Create _spider.py"), file size 2.45 kB.
from datetime import timedelta, timezone

import feedparser
from dateutil import parser as dateutil_parser
# RSS feed URLs that rss_spider() fetches, in this order.
ls_rss_urls = [
    "https://vneconomy.vn/chung-khoan.rss",
    "https://cafebiz.vn/rss/chung-khoan.rss",
    "https://vietstock.vn/830/chung-khoan/co-phieu.rss",
    "https://vietstock.vn/739/chung-khoan/giao-dich-noi-bo.rss",
    "https://thitruongtaichinh.kinhtedothi.vn/rss/chung-khoan-182.rss",
    "https://nhandan.vn/rss/chungkhoan-1191.rss",
    "https://thanhnien.vn/rss/kinh-te/chung-khoan.rss",
    "https://www.sggp.org.vn/rss/kinhte-taichinhchuungkhoan-44.rss",
    "https://antt.vn/rss/chung-khoan.rss",
    "https://nganhangvietnam.vn/rss/chung-khoan.rss",
    "https://nld.com.vn/rss/kinh-te/tai-chinh-chung-khoan.rss",
    "https://soha.vn/rss/kinh-doanh/chung-khoan.rss",
    "https://tienphong.vn/rss/tai-chinh-chung-khoan-105.rss",
    "https://www.nguoiduatin.vn/rss/kinh-te/tai-chinh-ngan-hang.rss",
    "https://bnews.vn/rss/chung-khoan-33.rss",
]
# NOTE(review): timestamps that carry no UTC offset are assumed to be Vietnam
# local time (UTC+7) because the default feeds are all on .vn domains --
# confirm. This is used only to order entries; returned values are untouched.
_ASSUMED_NAIVE_TZ = timezone(timedelta(hours=7))


def _clean_entry(entry):
    """Flatten one feedparser entry into the spider's dict, or return None.

    None is returned when the entry has no title, link, or published field,
    or when the published text cannot be parsed as a date, because such an
    entry can be neither displayed nor sorted. A missing summary becomes "".
    """
    title = entry.get("title")
    link = entry.get("link")
    published = entry.get("published")
    if title is None or link is None or published is None:
        return None
    try:
        parsed_time = dateutil_parser.parse(published)
    except (TypeError, ValueError, OverflowError):
        # dateutil's ParserError is a ValueError subclass.
        return None
    return {
        "title": title,
        "link": link,
        "summary": entry.get("summary", ""),
        "time": parsed_time,
    }


def _sort_moment(item):
    """Return item["time"] as an offset-aware datetime usable as a sort key.

    Python refuses to compare offset-naive with offset-aware datetimes, and
    different feeds may publish either form, so naive values are tagged with
    the assumed feed timezone for comparison purposes only.
    """
    moment = item["time"]
    if moment.utcoffset() is None:
        return moment.replace(tzinfo=_ASSUMED_NAIVE_TZ)
    return moment


def rss_spider(rss_urls=None, *, limit=100):
    """Fetch RSS feeds and return their newest entries, most recent first.

    Args:
        rss_urls: Iterable of feed URLs to fetch. Defaults to the module-level
            ``ls_rss_urls`` list.
        limit: Maximum number of entries to return (default 100). ``None``
            returns every usable entry.

    Returns:
        A list of dicts with the keys ``"title"``, ``"link"``, ``"summary"``
        and ``"time"`` (the ``datetime`` parsed from the entry's ``published``
        text, exactly as dateutil produced it), sorted by time descending.

    Side effects:
        Prints one status line per feed, plus one line when entries had to be
        skipped because they lacked a title, link, or parseable date.
    """
    if rss_urls is None:
        rss_urls = ls_rss_urls

    # ---------- fetch
    all_entries = []
    for rss_url in rss_urls:
        fetched = feedparser.parse(rss_url).entries
        all_entries.extend(fetched)
        status = "✅" if fetched else "❌"
        print(f"{status} {rss_url} > {len(fetched)}")

    # ---------- clean
    # Only title/link/summary/published are kept. The earlier version of this
    # function also listed these entry keys as available but unused:
    # published_parsed, title_detail, id, links, guidislink, summary_detail.
    all_entries_clean = []
    skipped = 0
    for entry in all_entries:
        cleaned = _clean_entry(entry)
        if cleaned is None:
            skipped += 1
        else:
            all_entries_clean.append(cleaned)
    if skipped:
        print(f"Skipped {skipped} entries lacking a title, link, or parseable published date")

    # ---------- sort (newest first) and truncate
    all_entries_sorted = sorted(all_entries_clean, key=_sort_moment, reverse=True)
    return all_entries_sorted[:limit]