import asyncio
import os
from datetime import datetime

import aiohttp

API_URL = "https://huggingface.co/api/daily_papers"
PDF_BASE_URL = "https://arxiv.org/pdf/{id}.pdf"
DOWNLOAD_DIR = "papers"


async def fetch_papers(session):
    """Fetch the list of daily papers from the Hugging Face API."""
    async with session.get(API_URL) as response:
        if response.status == 200:
            return await response.json()
        raise Exception(f"API request failed: {response.status}")


async def download_pdf(session, paper_entry):
    """Download one paper's PDF from arXiv and return (paper_id, success)."""
    # Resolve the ID up front so the except block can always report it.
    paper_id = paper_entry.get("paper", {}).get("id", "unknown")
    try:
        pdf_url = PDF_BASE_URL.format(id=paper_id)
        # Replace "/" so legacy arXiv IDs (e.g. "cs/9901001") form valid filenames.
        clean_id = paper_id.replace("/", "_")
        filename = f"{datetime.now().date()}_{clean_id}.pdf"
        filepath = os.path.join(DOWNLOAD_DIR, filename)

        async with session.get(pdf_url) as response:
            if response.status == 200:
                content = await response.read()
                with open(filepath, "wb") as f:
                    f.write(content)
                return (paper_id, True)
            return (paper_id, False)
    except Exception as e:
        print(f"Error downloading {paper_id}: {e}")
        return (paper_id, False)


async def main():
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    async with aiohttp.ClientSession() as session:
        papers = await fetch_papers(session)
        print(f"Found {len(papers)} papers:")

        for i, paper_entry in enumerate(papers, 1):
            paper = paper_entry.get("paper", {})
            authors = ", ".join(
                author.get("name", "") for author in paper.get("authors", [])
            )
            print(f"\nPaper {i}:")
            print(f"ID: {paper.get('id')}")
            print(f"Title: {paper.get('title')}")
            print(f"Authors: {authors}")
            print(f"Published: {paper.get('publishedAt')}")
            print(f"Summary: {(paper.get('summary') or '')[:200]}...")
            print(f"PDF URL: {PDF_BASE_URL.format(id=paper.get('id'))}")

        # Download all PDFs concurrently.
        tasks = [download_pdf(session, paper_entry) for paper_entry in papers]
        results = await asyncio.gather(*tasks)

        successful = sum(1 for _, status in results if status)
        print(f"Downloaded {successful}/{len(papers)} papers successfully")


if __name__ == "__main__":
    asyncio.run(main())