File size: 3,723 Bytes
ea972e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
Hugging Face Daily Papers Crawler
Usage:
    python hf_paper_crawler.py                          # 默认爬取昨天的论文
    python hf_paper_crawler.py --date 2026-03-10        # 指定日期
    python hf_paper_crawler.py --date 2026-03-10 --output papers.json
"""

import argparse
import json
import ssl
from datetime import datetime, timedelta
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

# Workaround for the common missing-root-certificates problem on macOS Python:
# prefer certifi's CA bundle when it is installed.
SSL_CTX = ssl.create_default_context()
try:
    import certifi
    SSL_CTX.load_verify_locations(certifi.where())
except ImportError:
    # NOTE(review): without certifi this disables certificate verification
    # entirely (CERT_NONE), leaving requests open to MITM. Acceptable for a
    # personal crawler, but worth confirming before wider use.
    SSL_CTX.check_hostname = False
    SSL_CTX.verify_mode = ssl.CERT_NONE


# Endpoint for the daily-papers JSON API and base URLs used to build
# per-paper links (HF page, arXiv abstract, arXiv PDF).
HF_API_URL = "https://huggingface.co/api/daily_papers"
HF_PAPER_URL = "https://huggingface.co/papers"
ARXIV_ABS_URL = "https://arxiv.org/abs"
ARXIV_PDF_URL = "https://arxiv.org/pdf"


def fetch_daily_papers(date: str) -> list[dict]:
    """Fetch the papers listed for *date* from the Hugging Face daily-papers API.

    Args:
        date: Date string in ``YYYY-MM-DD`` format.

    Returns:
        A list of paper dicts (title, ids, links, authors, summary, upvotes,
        publication timestamp), sorted by upvotes in descending order.
        Returns an empty list if the request fails.
    """
    url = f"{HF_API_URL}?date={date}"
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        with urlopen(req, timeout=30, context=SSL_CTX) as resp:
            data = json.loads(resp.read().decode())
    except HTTPError as e:
        print(f"API 请求失败: {e.code} {e.reason}")
        return []
    except URLError as e:
        # Network-level failure (DNS, refused connection, timeout): previously
        # uncaught, which crashed the script instead of degrading gracefully.
        print(f"API 请求失败: {e.reason}")
        return []

    papers = []
    for item in data:
        paper = item.get("paper", {})
        paper_id = paper.get("id", "")
        authors = [a.get("name", "") for a in paper.get("authors", [])]
        papers.append({
            "title": paper.get("title", ""),
            "paper_id": paper_id,
            "hf_url": f"{HF_PAPER_URL}/{paper_id}",
            "arxiv_url": f"{ARXIV_ABS_URL}/{paper_id}",
            "pdf_url": f"{ARXIV_PDF_URL}/{paper_id}",
            "authors": authors,
            "summary": paper.get("summary", ""),
            # Reuse the already-extracted dict instead of re-reading item["paper"].
            "upvotes": paper.get("upvotes", 0),
            "published_at": paper.get("publishedAt", ""),
        })

    # Sort by upvotes, highest first.
    papers.sort(key=lambda x: x["upvotes"], reverse=True)
    return papers


def print_papers(papers: list[dict]):
    """Pretty-print the paper list (title, authors, links) to stdout."""
    separator = "=" * 80
    print(f"\n{separator}")
    print(f"共 {len(papers)} 篇论文")
    print(f"{separator}\n")
    for idx, paper in enumerate(papers, start=1):
        names = paper["authors"]
        # Show at most five authors; summarize the rest with a total count.
        authors_str = ", ".join(names[:5])
        if len(names) > 5:
            authors_str += f" 等 {len(names)} 人"
        print(f"[{idx:2d}] 👍 {paper['upvotes']:3d}  {paper['title']}")
        print(f"     作者: {authors_str}")
        print(f"     链接: {paper['hf_url']}")
        print(f"     PDF:  {paper['pdf_url']}")
        print()


def main():
    """CLI entry point: parse arguments, fetch, display, and save papers."""
    default_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
    parser = argparse.ArgumentParser(description="Hugging Face 每日论文爬虫")
    parser.add_argument("--date", type=str, default=default_date, help="日期 (YYYY-MM-DD)")
    parser.add_argument("--output", "-o", type=str, help="保存为 JSON 文件")
    args = parser.parse_args()

    # Reject anything that is not a valid YYYY-MM-DD date.
    try:
        datetime.strptime(args.date, "%Y-%m-%d")
    except ValueError:
        print(f"日期格式错误: {args.date},请使用 YYYY-MM-DD")
        return

    print(f"正在获取 {args.date} 的 Hugging Face 论文...")
    papers = fetch_daily_papers(args.date)
    if not papers:
        print("未找到论文。")
        return

    print_papers(papers)

    # Persist to JSON, defaulting the filename to the requested date.
    output_path = args.output if args.output else f"hf_papers_{args.date}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(papers, f, ensure_ascii=False, indent=2)
    print(f"已保存到 {output_path}")


if __name__ == "__main__":
    main()