""" Hugging Face Daily Papers Crawler Usage: python hf_paper_crawler.py # 默认爬取昨天的论文 python hf_paper_crawler.py --date 2026-03-10 # 指定日期 python hf_paper_crawler.py --date 2026-03-10 --output papers.json """ import argparse import json import ssl from datetime import datetime, timedelta from urllib.request import urlopen, Request from urllib.error import HTTPError # macOS Python 常见 SSL 证书问题的 workaround SSL_CTX = ssl.create_default_context() try: import certifi SSL_CTX.load_verify_locations(certifi.where()) except ImportError: SSL_CTX.check_hostname = False SSL_CTX.verify_mode = ssl.CERT_NONE HF_API_URL = "https://huggingface.co/api/daily_papers" HF_PAPER_URL = "https://huggingface.co/papers" ARXIV_ABS_URL = "https://arxiv.org/abs" ARXIV_PDF_URL = "https://arxiv.org/pdf" def fetch_daily_papers(date: str) -> list[dict]: """通过 HuggingFace API 获取指定日期的论文列表""" url = f"{HF_API_URL}?date={date}" req = Request(url, headers={"User-Agent": "Mozilla/5.0"}) try: with urlopen(req, timeout=30, context=SSL_CTX) as resp: data = json.loads(resp.read().decode()) except HTTPError as e: print(f"API 请求失败: {e.code} {e.reason}") return [] papers = [] for item in data: paper = item.get("paper", {}) paper_id = paper.get("id", "") authors = [a.get("name", "") for a in paper.get("authors", [])] papers.append({ "title": paper.get("title", ""), "paper_id": paper_id, "hf_url": f"{HF_PAPER_URL}/{paper_id}", "arxiv_url": f"{ARXIV_ABS_URL}/{paper_id}", "pdf_url": f"{ARXIV_PDF_URL}/{paper_id}", "authors": authors, "summary": paper.get("summary", ""), "upvotes": item.get("paper", {}).get("upvotes", 0), "published_at": paper.get("publishedAt", ""), }) # 按 upvotes 降序排列 papers.sort(key=lambda x: x["upvotes"], reverse=True) return papers def print_papers(papers: list[dict]): """格式化打印论文列表""" print(f"\n{'='*80}") print(f"共 {len(papers)} 篇论文") print(f"{'='*80}\n") for i, p in enumerate(papers, 1): authors_str = ", ".join(p["authors"][:5]) if len(p["authors"]) > 5: authors_str += f" 等 {len(p['authors'])} 人" print(f"[{i:2d}] 👍 {p['upvotes']:3d} {p['title']}") print(f" 作者: {authors_str}") print(f" 链接: {p['hf_url']}") print(f" PDF: {p['pdf_url']}") print() def main(): parser = argparse.ArgumentParser(description="Hugging Face 每日论文爬虫") yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d") parser.add_argument("--date", type=str, default=yesterday, help="日期 (YYYY-MM-DD)") parser.add_argument("--output", "-o", type=str, help="保存为 JSON 文件") args = parser.parse_args() # 校验日期格式 try: datetime.strptime(args.date, "%Y-%m-%d") except ValueError: print(f"日期格式错误: {args.date},请使用 YYYY-MM-DD") return print(f"正在获取 {args.date} 的 Hugging Face 论文...") papers = fetch_daily_papers(args.date) if not papers: print("未找到论文。") return print_papers(papers) # 保存 JSON output_path = args.output or f"hf_papers_{args.date}.json" with open(output_path, "w", encoding="utf-8") as f: json.dump(papers, f, ensure_ascii=False, indent=2) print(f"已保存到 {output_path}") if __name__ == "__main__": main()