# NOTE(review): the lines below are residue from a Hugging Face Spaces web-page
# capture (space status / owner / last commit message), kept as comments so the
# file remains valid Python:
#   Spaces: Running — elfsong
#   commit: "Add .gitignore, update requirements, and implement Hugging Face paper crawler and reader"
"""
Hugging Face Daily Papers Crawler.

Usage:
    python hf_paper_crawler.py                                  # crawl yesterday's papers (default)
    python hf_paper_crawler.py --date 2026-03-10                # crawl a specific date
    python hf_paper_crawler.py --date 2026-03-10 --output papers.json
"""
import argparse
import json
import ssl
from datetime import datetime, timedelta
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
# Workaround for the CA-certificate problem common on macOS Python builds:
# prefer certifi's bundle when it is installed.
SSL_CTX = ssl.create_default_context()
try:
    import certifi
except ImportError:
    # NOTE(security): without certifi this falls back to disabling certificate
    # verification entirely, which allows man-in-the-middle attacks. Acceptable
    # for a best-effort scraper, but consider `pip install certifi` instead.
    SSL_CTX.check_hostname = False
    SSL_CTX.verify_mode = ssl.CERT_NONE
else:
    SSL_CTX.load_verify_locations(certifi.where())
# Hugging Face daily-papers JSON API; takes a ?date=YYYY-MM-DD query parameter.
HF_API_URL = "https://huggingface.co/api/daily_papers"
# Human-readable paper page on Hugging Face, suffixed with the paper id.
HF_PAPER_URL = "https://huggingface.co/papers"
# arXiv abstract / PDF base URLs — the crawler assumes HF paper ids are arXiv ids.
ARXIV_ABS_URL = "https://arxiv.org/abs"
ARXIV_PDF_URL = "https://arxiv.org/pdf"
def fetch_daily_papers(date: str) -> list[dict]:
    """Fetch the Hugging Face daily-papers list for *date* (YYYY-MM-DD).

    Returns a list of normalized paper dicts sorted by upvotes (descending).
    Returns an empty list on any HTTP or network error instead of raising.
    """
    url = f"{HF_API_URL}?date={date}"
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        with urlopen(req, timeout=30, context=SSL_CTX) as resp:
            data = json.loads(resp.read().decode())
    except HTTPError as e:
        print(f"API 请求失败: {e.code} {e.reason}")
        return []
    except URLError as e:
        # DNS failure, refused connection, timeout, TLS error, ... —
        # previously these crashed the script while HTTPError was handled.
        print(f"API 请求失败: {e.reason}")
        return []

    papers = []
    for item in data:
        paper = item.get("paper", {})
        paper_id = paper.get("id", "")
        authors = [a.get("name", "") for a in paper.get("authors", [])]
        papers.append({
            "title": paper.get("title", ""),
            "paper_id": paper_id,
            "hf_url": f"{HF_PAPER_URL}/{paper_id}",
            "arxiv_url": f"{ARXIV_ABS_URL}/{paper_id}",
            "pdf_url": f"{ARXIV_PDF_URL}/{paper_id}",
            "authors": authors,
            "summary": paper.get("summary", ""),
            # Treat a missing or null upvotes field as 0 so the sort below
            # never compares None with int (TypeError).
            "upvotes": paper.get("upvotes", 0) or 0,
            "published_at": paper.get("publishedAt", ""),
        })
    # Most-upvoted papers first.
    papers.sort(key=lambda x: x["upvotes"], reverse=True)
    return papers
def print_papers(papers: list[dict]):
    """Pretty-print a list of normalized paper records to stdout."""
    rule = "=" * 80
    print(f"\n{rule}")
    print(f"共 {len(papers)} 篇论文")
    print(f"{rule}\n")
    for rank, entry in enumerate(papers, 1):
        authors = entry["authors"]
        # Show at most five authors; summarize the remainder as a count.
        byline = ", ".join(authors[:5])
        if len(authors) > 5:
            byline += f" 等 {len(authors)} 人"
        print(f"[{rank:2d}] 👍 {entry['upvotes']:3d} {entry['title']}")
        print(f"    作者: {byline}")
        print(f"    链接: {entry['hf_url']}")
        print(f"    PDF: {entry['pdf_url']}")
        print()
def main():
    """CLI entry point: parse arguments, fetch, print, and dump results to JSON."""
    arg_parser = argparse.ArgumentParser(description="Hugging Face 每日论文爬虫")
    default_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
    arg_parser.add_argument("--date", type=str, default=default_date, help="日期 (YYYY-MM-DD)")
    arg_parser.add_argument("--output", "-o", type=str, help="保存为 JSON 文件")
    opts = arg_parser.parse_args()

    # Reject anything that does not parse as a YYYY-MM-DD date.
    try:
        datetime.strptime(opts.date, "%Y-%m-%d")
    except ValueError:
        print(f"日期格式错误: {opts.date},请使用 YYYY-MM-DD")
        return

    print(f"正在获取 {opts.date} 的 Hugging Face 论文...")
    results = fetch_daily_papers(opts.date)
    if not results:
        print("未找到论文。")
        return

    print_papers(results)

    # Persist the full record list as JSON (default name derived from the date).
    target = opts.output if opts.output else f"hf_papers_{opts.date}.json"
    with open(target, "w", encoding="utf-8") as fh:
        json.dump(results, fh, ensure_ascii=False, indent=2)
    print(f"已保存到 {target}")


if __name__ == "__main__":
    main()