PaperFlux / tests /api_test.py
Vector73's picture
Add scheduler for fetching papers and storing in db.
4a5a5c6
import aiohttp
import asyncio
import os
from datetime import datetime
# Hugging Face endpoint that fetch_papers() queries for the daily papers list.
API_URL = "https://huggingface.co/api/daily_papers"
# Template for a paper's arXiv PDF URL; "{id}" is replaced with the paper id.
PDF_BASE_URL = "https://arxiv.org/pdf/{id}.pdf"
# Directory (relative to the current working directory) that PDFs are saved in.
DOWNLOAD_DIR = "papers"
async def fetch_papers(session):
    """Request the daily papers listing and return the decoded JSON body.

    Args:
        session: An open HTTP client session whose ``get(url)`` is usable as
            an async context manager (an ``aiohttp.ClientSession`` here).

    Returns:
        Whatever ``response.json()`` yields for the ``API_URL`` response.

    Raises:
        Exception: If the server answers with any status other than 200.
    """
    async with session.get(API_URL) as resp:
        # Guard clause: reject every non-200 answer before touching the body.
        if resp.status != 200:
            raise Exception(f"API request failed: {resp.status}")
        payload = await resp.json()
        return payload
async def download_pdf(session, paper_entry):
    """Download the arXiv PDF for one daily-papers entry into DOWNLOAD_DIR.

    Failures are reported on stdout and signalled through the return value
    rather than raised, so one bad entry cannot abort a batch of downloads.

    Args:
        session: An open HTTP client session whose ``get(url)`` is usable as
            an async context manager (an ``aiohttp.ClientSession`` here).
        paper_entry: One item of the API response; the paper identifier is
            read from ``paper_entry["paper"]["id"]``.

    Returns:
        A ``(paper_id, success)`` tuple. ``paper_id`` is ``None`` when the
        entry is malformed and no identifier could be read. ``success`` is
        ``True`` only when the PDF request returned HTTP 200 and the file
        was written.
    """
    # Bind the name before the try block: if reading the id itself fails,
    # the except handler below must still be able to reference paper_id
    # (previously this raised UnboundLocalError from inside the handler).
    paper_id = None
    try:
        paper_id = paper_entry["paper"]["id"]
        pdf_url = PDF_BASE_URL.format(id=paper_id)
        # A "/" in the id would be treated as a path separator in the filename.
        clean_id = paper_id.replace("/", "_")
        filename = f"{datetime.now().date()}_{clean_id}.pdf"
        filepath = os.path.join(DOWNLOAD_DIR, filename)

        async with session.get(pdf_url) as response:
            if response.status != 200:
                return (paper_id, False)
            content = await response.read()

        # The body is fully read, so the response is released before the
        # (blocking) disk write happens.
        with open(filepath, "wb") as f:
            f.write(content)
        return (paper_id, True)
    except Exception as e:
        # Deliberately broad: this is a best-effort download and any failure
        # (bad entry, network error, disk error) is reported, not raised.
        print(f"Error downloading {paper_id}: {e}")
        return (paper_id, False)
# Runs at import time: make sure the download directory exists before any
# PDF is written; exist_ok=True makes this a no-op when it is already there.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
async def main():
    """Fetch the daily papers, print details of each, then download the PDFs.

    A single ``aiohttp.ClientSession`` is shared by the listing request and
    all PDF downloads. The downloads are started together and awaited with
    ``asyncio.gather``; a final line reports how many succeeded.

    Missing or null fields in an entry (paper, authors, author name,
    summary) are tolerated when printing instead of raising ``TypeError``.
    """
    async with aiohttp.ClientSession() as session:
        papers = await fetch_papers(session)

        # Printed once, as the header of the listing below.
        print(f"\nFound {len(papers)} papers:")
        for i, paper_entry in enumerate(papers, 1):
            # "or" also covers keys that are present but null.
            paper = paper_entry.get("paper") or {}
            # Skip authors without a name: str.join rejects None items.
            author_names = [
                author["name"]
                for author in paper.get("authors") or []
                if author.get("name")
            ]
            # A missing summary becomes "" so the slice below cannot fail.
            summary = paper.get("summary") or ""

            print(f"\nPaper {i}:")
            print(f"ID: {paper.get('id')}")
            print(f"Title: {paper.get('title')}")
            print(f"Authors: {', '.join(author_names)}")
            print(f"Published: {paper.get('publishedAt')}")
            print(f"Summary: {summary[:200]}...")
            print(f"PDF URL: {PDF_BASE_URL.format(id=paper.get('id'))}")

        # download_pdf reports failures via its return value, so gather
        # yields one (paper_id, success) tuple per entry.
        tasks = [download_pdf(session, entry) for entry in papers]
        results = await asyncio.gather(*tasks)

        successful = sum(1 for _, status in results if status)
        print(f"Downloaded {successful}/{len(papers)} papers successfully")
# Script entry point: run the async main() to completion on a new event loop
# only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())