# Daily_Paper_Reader / src / crawler.py
# Author: elfsong
# Commit ea972e7: Add .gitignore, update requirements, and implement
# Hugging Face paper crawler and reader
"""
Hugging Face Daily Papers Crawler
Usage:
python hf_paper_crawler.py # 默认爬取昨天的论文
python hf_paper_crawler.py --date 2026-03-10 # 指定日期
python hf_paper_crawler.py --date 2026-03-10 --output papers.json
"""
import argparse
import json
import ssl
from datetime import datetime, timedelta
from urllib.error import HTTPError, URLError
from urllib.request import urlopen, Request
# Workaround for the common SSL-certificate issue with Python on macOS:
# prefer certifi's CA bundle when available.
SSL_CTX = ssl.create_default_context()
try:
import certifi
SSL_CTX.load_verify_locations(certifi.where())
except ImportError:
# Fallback when certifi is missing: disable TLS verification entirely.
# NOTE(review): this is insecure (no hostname/cert checks) — presumably an
# accepted trade-off for this best-effort crawler; confirm it is intended.
SSL_CTX.check_hostname = False
SSL_CTX.verify_mode = ssl.CERT_NONE
# Hugging Face daily-papers API endpoint and paper-page URL prefix.
HF_API_URL = "https://huggingface.co/api/daily_papers"
HF_PAPER_URL = "https://huggingface.co/papers"
# arXiv abstract/PDF URL prefixes (paper ids appear to be arXiv ids — see
# how fetch_daily_papers builds arxiv_url/pdf_url from paper_id).
ARXIV_ABS_URL = "https://arxiv.org/abs"
ARXIV_PDF_URL = "https://arxiv.org/pdf"
def fetch_daily_papers(date: str) -> list[dict]:
    """Fetch the paper list for a given date from the Hugging Face API.

    Args:
        date: Date string in ``YYYY-MM-DD`` format.

    Returns:
        A list of paper dicts (title, ids, URLs, authors, summary, upvotes,
        publication timestamp) sorted by upvotes descending; an empty list
        when the request fails.
    """
    url = f"{HF_API_URL}?date={date}"
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        with urlopen(req, timeout=30, context=SSL_CTX) as resp:
            data = json.loads(resp.read().decode())
    except HTTPError as e:
        print(f"API 请求失败: {e.code} {e.reason}")
        return []
    except URLError as e:
        # Network-level failure (DNS, refused connection, timeout, TLS):
        # previously uncaught, which crashed the script instead of
        # degrading gracefully like the HTTP-error path above.
        print(f"API 请求失败: {e.reason}")
        return []

    papers = []
    for item in data:
        paper = item.get("paper", {})
        paper_id = paper.get("id", "")
        authors = [a.get("name", "") for a in paper.get("authors", [])]
        papers.append({
            "title": paper.get("title", ""),
            "paper_id": paper_id,
            "hf_url": f"{HF_PAPER_URL}/{paper_id}",
            "arxiv_url": f"{ARXIV_ABS_URL}/{paper_id}",
            "pdf_url": f"{ARXIV_PDF_URL}/{paper_id}",
            "authors": authors,
            "summary": paper.get("summary", ""),
            # Reuse the local `paper` dict; the original re-fetched
            # item.get("paper", {}) here for no reason.
            "upvotes": paper.get("upvotes", 0),
            "published_at": paper.get("publishedAt", ""),
        })
    # Sort by upvotes, most-upvoted first.
    papers.sort(key=lambda p: p["upvotes"], reverse=True)
    return papers
def print_papers(papers: list[dict]):
    """Pretty-print the paper list to stdout, one numbered entry per paper."""
    rule = "=" * 80
    print(f"\n{rule}")
    print(f"共 {len(papers)} 篇论文")
    print(f"{rule}\n")
    for rank, entry in enumerate(papers, start=1):
        names = entry["authors"]
        byline = ", ".join(names[:5])
        if len(names) > 5:
            # Truncated author list: show the first five plus a total count.
            byline += f" 等 {len(names)} 人"
        print(f"[{rank:2d}] 👍 {entry['upvotes']:3d} {entry['title']}")
        print(f" 作者: {byline}")
        print(f" 链接: {entry['hf_url']}")
        print(f" PDF: {entry['pdf_url']}")
        print()
def main():
    """CLI entry point: parse args, crawl one day's papers, print, save JSON."""
    default_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
    parser = argparse.ArgumentParser(description="Hugging Face 每日论文爬虫")
    parser.add_argument("--date", type=str, default=default_date, help="日期 (YYYY-MM-DD)")
    parser.add_argument("--output", "-o", type=str, help="保存为 JSON 文件")
    args = parser.parse_args()

    # Reject malformed dates before hitting the network.
    try:
        datetime.strptime(args.date, "%Y-%m-%d")
    except ValueError:
        print(f"日期格式错误: {args.date},请使用 YYYY-MM-DD")
        return

    print(f"正在获取 {args.date} 的 Hugging Face 论文...")
    papers = fetch_daily_papers(args.date)
    if not papers:
        print("未找到论文。")
        return

    print_papers(papers)

    # Persist results; default file name embeds the requested date.
    destination = args.output or f"hf_papers_{args.date}.json"
    with open(destination, "w", encoding="utf-8") as f:
        json.dump(papers, f, ensure_ascii=False, indent=2)
    print(f"已保存到 {destination}")


if __name__ == "__main__":
    main()