File size: 1,288 Bytes
b5fafa1
 
f1368c4
 
b5fafa1
d708554
b5fafa1
b1060b0
f1368c4
 
 
 
 
b5fafa1
 
f1368c4
 
 
 
 
 
 
 
 
b1060b0
b5fafa1
e040f4f
 
f1368c4
 
 
 
 
 
 
e040f4f
b5fafa1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from pathlib import Path

import aiofiles
import crawl4ai
import httpx
import pytest


async def crawl4ai_extract_markdown_of_url_async(url: str) -> str:
    """Fetch *url* with crawl4ai and return the page content as markdown."""
    async with crawl4ai.AsyncWebCrawler() as crawler:
        crawl_result = await crawler.arun(url=url)
        markdown = crawl_result.markdown
    return markdown


async def download_pdf_async(url: str, output_path: Path) -> Path:
    """Download the file at *url* and write its bytes to *output_path*.

    Args:
        url: Direct URL of the file to download (redirects are followed).
        output_path: Destination path for the downloaded bytes.

    Returns:
        The *output_path* that was written (fixes the previous ``-> str``
        annotation, which did not match the ``Path`` actually returned).

    Raises:
        httpx.HTTPStatusError: If the server responds with a 4xx/5xx status.
        httpx.TimeoutException: If the request exceeds the configured timeout.
    """
    # 30 s total budget, but fail fast (10 s) if the connection can't be made.
    timeout = httpx.Timeout(30.0, connect=10.0)
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        response = await client.get(url)
        response.raise_for_status()
    # The body is fully buffered by client.get(), so it is safe to write
    # after the client context has closed.
    async with aiofiles.open(output_path, "wb") as f:
        await f.write(response.content)
    return output_path


async def download_pdf_or_arxiv_pdf_async(url: str, output_path: Path) -> str:
    """Download a PDF from arXiv by converting the abstract URL to PDF URL. Works also for non arXiv URLs."""
    # Extract the arXiv ID from the URL
    if "/abs/" in url:
        arxiv_id = url.split("/abs/")[1].rstrip("/")
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    else:
        # If it's already a PDF URL, use it as is
        pdf_url = url

    return await download_pdf_async(pdf_url, output_path)