Spaces:

smokxy
/

PaperFlux

Running

App Files Files Community

PaperFlux / tests /api_test.py

Vector73

Add scheduler for fetching papers and storing in db.

4a5a5c6 11 months ago

raw

history blame contribute delete

2.28 kB

	import aiohttp
	import asyncio
	import os
	from datetime import datetime

	# Hugging Face endpoint that fetch_papers() queries for the daily-papers listing.
	API_URL = "https://huggingface.co/api/daily_papers"
	# arXiv PDF URL template; "{id}" is filled in with a paper id via str.format().
	PDF_BASE_URL = "https://arxiv.org/pdf/{id}.pdf"
	# Directory (relative to the current working directory) that PDFs are written to.
	DOWNLOAD_DIR = "papers"


	async def fetch_papers(session):
	    """Request API_URL and return the decoded JSON body.

	    Args:
	        session: HTTP client session whose ``get(url)`` is an async context
	            manager yielding a response with ``status`` and ``json()``.

	    Returns:
	        Whatever ``response.json()`` decodes from the response body.

	    Raises:
	        Exception: If the response status is anything other than 200.
	    """
	    async with session.get(API_URL) as resp:
	        # Guard clause: bail out on any non-200 status before decoding.
	        if resp.status != 200:
	            raise Exception(f"API request failed: {resp.status}")
	        payload = await resp.json()
	        return payload


	async def download_pdf(session, paper_entry):
	    """Download one paper's PDF and store it under DOWNLOAD_DIR.

	    The file is named ``<today's date>_<paper id>.pdf`` with any "/" in the
	    id replaced by "_". Failures are reported on stdout and signalled via the
	    return value; this function does not raise for ordinary errors.

	    Args:
	        session: HTTP client session whose ``get(url)`` is an async context
	            manager yielding a response with ``status`` and ``read()``.
	        paper_entry: Mapping holding the paper id at
	            ``paper_entry["paper"]["id"]``.

	    Returns:
	        ``(paper_id, True)`` if the PDF was fetched with HTTP 200 and written
	        to disk, otherwise ``(paper_id, False)``. ``paper_id`` is ``None``
	        when the entry is malformed and the id could not be read.
	    """
	    # Bind before the try block: the handler below reports paper_id, and
	    # without this a malformed entry would raise UnboundLocalError from
	    # inside the handler instead of returning a failure tuple.
	    paper_id = None
	    try:
	        paper_id = paper_entry["paper"]["id"]
	        pdf_url = PDF_BASE_URL.format(id=paper_id)
	        # Keep "/" from the id out of the filename so it stays a single file.
	        clean_id = paper_id.replace("/", "_")
	        filename = f"{datetime.now().date()}_{clean_id}.pdf"
	        filepath = os.path.join(DOWNLOAD_DIR, filename)

	        async with session.get(pdf_url) as response:
	            if response.status != 200:
	                return (paper_id, False)
	            content = await response.read()

	        # The body is fully read, so the response can be released before
	        # the file is written.
	        with open(filepath, "wb") as f:
	            f.write(content)
	        return (paper_id, True)
	    except Exception as e:
	        # Deliberately best-effort: one failed download must not abort the
	        # other downloads gathered alongside it.
	        print(f"Error downloading {paper_id}: {e}")
	        return (paper_id, False)


	# Module-level side effect: create the download directory up front so that
	# download_pdf() can open files inside it; exist_ok=True makes this a no-op
	# when the directory is already there.
	os.makedirs(DOWNLOAD_DIR, exist_ok=True)


	def _print_paper_details(index, paper_entry):
	    """Print one API entry's details, tolerating missing or null fields."""
	    # "or {}" / "or []" also cover keys that are present but null.
	    paper = paper_entry.get("paper") or {}
	    author_names = [author.get("name") for author in paper.get("authors") or []]
	    # Skip authors without a name: str.join() raises TypeError on None.
	    authors = ", ".join(name for name in author_names if name is not None)
	    # A missing summary prints as empty text instead of failing on None[:200].
	    summary = paper.get("summary") or ""

	    print(f"\nPaper {index}:")
	    print(f"ID: {paper.get('id')}")
	    print(f"Title: {paper.get('title')}")
	    print(f"Authors: {authors}")
	    print(f"Published: {paper.get('publishedAt')}")
	    print(f"Summary: {summary[:200]}...")
	    print(f"PDF URL: {PDF_BASE_URL.format(id=paper.get('id'))}")


	async def main():
	    """Fetch the paper listing, print each entry, then download every PDF.

	    Opens a single ``aiohttp.ClientSession`` that is shared by the listing
	    request and all downloads, and finishes by printing how many downloads
	    reported success.
	    """
	    async with aiohttp.ClientSession() as session:
	        papers = await fetch_papers(session)
	        print(f"Found {len(papers)} papers")

	        print(f"\nFound {len(papers)} papers:")
	        for i, paper_entry in enumerate(papers, 1):
	            _print_paper_details(i, paper_entry)

	        # Run all downloads concurrently and wait for every one to finish.
	        tasks = [download_pdf(session, paper_entry) for paper_entry in papers]
	        results = await asyncio.gather(*tasks)

	        # Each result is a (paper_id, succeeded) tuple from download_pdf().
	        successful = sum(1 for _, status in results if status)
	        print(f"Downloaded {successful}/{len(papers)} papers successfully")


	# Script entry point: run main() on a fresh event loop only when this file
	# is executed directly, not when it is imported.
	if __name__ == "__main__":
	    asyncio.run(main())