File size: 2,279 Bytes
1d07116
 
 
 
 
 
 
 
 
4a5a5c6
1d07116
4a5a5c6
 
 
 
 
1d07116
 
4a5a5c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d07116
4a5a5c6
 
1d07116
 
4a5a5c6
1d07116
 
 
 
4a5a5c6
1d07116
 
 
 
 
 
4a5a5c6
 
 
1d07116
 
 
 
 
 
4a5a5c6
1d07116
 
 
4a5a5c6
1d07116
4a5a5c6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import aiohttp
import asyncio
import os
from datetime import datetime

# Endpoint requested by fetch_papers; main iterates its decoded JSON body as
# paper entries.
API_URL = "https://huggingface.co/api/daily_papers"
# Template for a paper's PDF location; "{id}" is filled with the entry's
# ["paper"]["id"] value.
PDF_BASE_URL = "https://arxiv.org/pdf/{id}.pdf"
# Directory (relative to the current working directory) that receives the
# downloaded PDF files.
DOWNLOAD_DIR = "papers"


async def fetch_papers(session):
    """Request the daily-papers listing and return its decoded JSON body.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with a ``status`` attribute and an awaitable
            ``json()`` method (``main`` passes an ``aiohttp.ClientSession``).

    Returns:
        Whatever ``response.json()`` decodes to when the status is 200.

    Raises:
        Exception: If the API answers with any status other than 200.
    """
    async with session.get(API_URL) as resp:
        # Guard clause: anything but a 200 is treated as a hard failure.
        if resp.status != 200:
            raise Exception(f"API request failed: {resp.status}")
        return await resp.json()


async def download_pdf(session, paper_entry):
    """Download the PDF of one paper entry into ``DOWNLOAD_DIR``.

    The function is best-effort: any exception is reported on stdout and
    converted into a failed result instead of propagating, so one bad entry
    cannot abort the other downloads gathered alongside it.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``status`` and an awaitable ``read()``
            (``main`` passes an ``aiohttp.ClientSession``).
        paper_entry: Mapping expected to hold the paper id under
            ``["paper"]["id"]``.

    Returns:
        ``(paper_id, True)`` when the PDF was fetched with status 200 and
        written to disk, otherwise ``(paper_id, False)``. ``paper_id`` is
        ``None`` when the entry has no readable ``["paper"]["id"]``.
    """
    # Bound before the try block so the error path can always report and
    # return it; a malformed entry used to raise UnboundLocalError from
    # inside the except handler.
    paper_id = None
    try:
        paper_id = paper_entry["paper"]["id"]
        pdf_url = PDF_BASE_URL.format(id=paper_id)
        # A "/" in the id would be read as a directory separator in the
        # file name, so it is replaced.
        clean_id = paper_id.replace("/", "_")
        filename = f"{datetime.now().date()}_{clean_id}.pdf"
        filepath = os.path.join(DOWNLOAD_DIR, filename)

        async with session.get(pdf_url) as response:
            if response.status != 200:
                return (paper_id, False)
            content = await response.read()
            with open(filepath, "wb") as f:
                f.write(content)
            return (paper_id, True)
    except Exception as e:
        print(f"Error downloading {paper_id}: {e}")
        return (paper_id, False)


# Runs at import time: make sure the target directory exists before any
# download_pdf call opens a file inside it. exist_ok=True keeps repeated runs
# from failing when the directory is already there.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)


def _print_paper_details(index, paper_entry):
    """Print the metadata of one daily-papers entry to stdout.

    Missing or null fields are tolerated: an absent summary prints as an
    empty string and authors without a name are left out, instead of raising
    ``TypeError`` while slicing or joining ``None``.

    Args:
        index: 1-based position of the entry in the listing.
        paper_entry: Mapping that may hold the paper metadata under "paper".
    """
    # `or {}` also covers an explicit null stored under "paper", which the
    # default argument of dict.get would not replace.
    paper = paper_entry.get("paper") or {}
    names = [author.get("name") for author in paper.get("authors") or []]
    authors = ", ".join(name for name in names if name)
    summary = paper.get("summary") or ""

    print(f"\nPaper {index}:")
    print(f"ID: {paper.get('id')}")
    print(f"Title: {paper.get('title')}")
    print(f"Authors: {authors}")
    print(f"Published: {paper.get('publishedAt')}")
    print(f"Summary: {summary[:200]}...")
    print(f"PDF URL: {PDF_BASE_URL.format(id=paper.get('id'))}")


async def main():
    """Fetch the daily-papers listing, print it, and download every PDF.

    One ``aiohttp.ClientSession`` is shared by the listing request and all
    downloads. The downloads are awaited together through ``asyncio.gather``
    and the number of successful ones is reported at the end.

    Raises:
        Exception: Propagated from ``fetch_papers`` when the listing request
            does not return status 200.
    """
    async with aiohttp.ClientSession() as session:
        papers = await fetch_papers(session)
        print(f"Found {len(papers)} papers")

        print(f"\nFound {len(papers)} papers:")
        for i, paper_entry in enumerate(papers, 1):
            _print_paper_details(i, paper_entry)

        # download_pdf reports its own failures as (id, False), so gather
        # collects one result tuple per paper.
        tasks = [download_pdf(session, paper) for paper in papers]
        results = await asyncio.gather(*tasks)

        successful = sum(1 for _, status in results if status)
        print(f"Downloaded {successful}/{len(papers)} papers successfully")


# Script entry point: run the main coroutine on a fresh event loop only when
# the file is executed directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())