# Daily_Paper_Reader / src / crawler.py
# Author: elfsong
# Commit ea972e7: Add .gitignore, update requirements, and implement
# Hugging Face paper crawler and reader
"""
Hugging Face Daily Papers Crawler
Usage:
python hf_paper_crawler.py # 默认爬取昨天的论文
python hf_paper_crawler.py --date 2026-03-10 # 指定日期
python hf_paper_crawler.py --date 2026-03-10 --output papers.json
"""
import argparse
import json
import ssl
from datetime import datetime, timedelta
from urllib.error import HTTPError, URLError
from urllib.request import urlopen, Request
# Workaround for the common SSL-certificate issue with Python on macOS:
# prefer certifi's CA bundle when available.
SSL_CTX = ssl.create_default_context()
try:
import certifi
SSL_CTX.load_verify_locations(certifi.where())
except ImportError:
# Fallback when certifi is missing: disable TLS verification entirely.
# NOTE(review): this is insecure (no hostname/cert checks) — presumably an
# accepted trade-off for this best-effort crawler; confirm it is intended.
SSL_CTX.check_hostname = False
SSL_CTX.verify_mode = ssl.CERT_NONE
# Hugging Face daily-papers API endpoint and paper-page URL prefix.
HF_API_URL = "https://huggingface.co/api/daily_papers"
HF_PAPER_URL = "https://huggingface.co/papers"
# arXiv abstract/PDF URL prefixes (paper ids appear to be arXiv ids — see
# how fetch_daily_papers builds arxiv_url/pdf_url from paper_id).
ARXIV_ABS_URL = "https://arxiv.org/abs"
ARXIV_PDF_URL = "https://arxiv.org/pdf"
def fetch_daily_papers(date: str) -> list[dict]:
    """Fetch the paper list for a given date from the Hugging Face API.

    Args:
        date: Date string in ``YYYY-MM-DD`` format.

    Returns:
        A list of paper dicts (title, ids, URLs, authors, summary, upvotes,
        publication timestamp) sorted by upvotes descending; an empty list
        when the request fails.
    """
    url = f"{HF_API_URL}?date={date}"
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        with urlopen(req, timeout=30, context=SSL_CTX) as resp:
            data = json.loads(resp.read().decode())
    except HTTPError as e:
        print(f"API 请求失败: {e.code} {e.reason}")
        return []
    except URLError as e:
        # Network-level failure (DNS, refused connection, timeout, TLS):
        # previously uncaught, which crashed the script instead of
        # degrading gracefully like the HTTP-error path above.
        print(f"API 请求失败: {e.reason}")
        return []

    papers = []
    for item in data:
        paper = item.get("paper", {})
        paper_id = paper.get("id", "")
        authors = [a.get("name", "") for a in paper.get("authors", [])]
        papers.append({
            "title": paper.get("title", ""),
            "paper_id": paper_id,
            "hf_url": f"{HF_PAPER_URL}/{paper_id}",
            "arxiv_url": f"{ARXIV_ABS_URL}/{paper_id}",
            "pdf_url": f"{ARXIV_PDF_URL}/{paper_id}",
            "authors": authors,
            "summary": paper.get("summary", ""),
            # Reuse the local `paper` dict; the original re-fetched
            # item.get("paper", {}) here for no reason.
            "upvotes": paper.get("upvotes", 0),
            "published_at": paper.get("publishedAt", ""),
        })
    # Sort by upvotes, most-upvoted first.
    papers.sort(key=lambda p: p["upvotes"], reverse=True)
    return papers
def print_papers(papers: list[dict]):
    """Pretty-print the paper list to stdout, one numbered entry per paper."""
    rule = "=" * 80
    print(f"\n{rule}")
    print(f"共 {len(papers)} 篇论文")
    print(f"{rule}\n")
    for rank, entry in enumerate(papers, start=1):
        names = entry["authors"]
        byline = ", ".join(names[:5])
        if len(names) > 5:
            # Truncated author list: show the first five plus a total count.
            byline += f" 等 {len(names)} 人"
        print(f"[{rank:2d}] 👍 {entry['upvotes']:3d} {entry['title']}")
        print(f" 作者: {byline}")
        print(f" 链接: {entry['hf_url']}")
        print(f" PDF: {entry['pdf_url']}")
        print()
def main():
    """CLI entry point: parse args, crawl one day's papers, print, save JSON."""
    default_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
    parser = argparse.ArgumentParser(description="Hugging Face 每日论文爬虫")
    parser.add_argument("--date", type=str, default=default_date, help="日期 (YYYY-MM-DD)")
    parser.add_argument("--output", "-o", type=str, help="保存为 JSON 文件")
    args = parser.parse_args()

    # Reject malformed dates before hitting the network.
    try:
        datetime.strptime(args.date, "%Y-%m-%d")
    except ValueError:
        print(f"日期格式错误: {args.date},请使用 YYYY-MM-DD")
        return

    print(f"正在获取 {args.date} 的 Hugging Face 论文...")
    papers = fetch_daily_papers(args.date)
    if not papers:
        print("未找到论文。")
        return

    print_papers(papers)

    # Persist results; default file name embeds the requested date.
    destination = args.output or f"hf_papers_{args.date}.json"
    with open(destination, "w", encoding="utf-8") as f:
        json.dump(papers, f, ensure_ascii=False, indent=2)
    print(f"已保存到 {destination}")


if __name__ == "__main__":
    main()