Spaces:
Runtime error
Runtime error
File size: 1,288 Bytes
b5fafa1 f1368c4 b5fafa1 d708554 b5fafa1 b1060b0 f1368c4 b5fafa1 f1368c4 b1060b0 b5fafa1 e040f4f f1368c4 e040f4f b5fafa1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
from pathlib import Path
import aiofiles
import crawl4ai
import httpx
import pytest
async def crawl4ai_extract_markdown_of_url_async(url: str) -> str:
    """Crawl *url* with crawl4ai and return the markdown of the crawl result.

    Args:
        url: Address of the page to crawl.

    Returns:
        The ``markdown`` attribute of the result produced by
        ``AsyncWebCrawler.arun``.
    """
    # The crawler is an async context manager; leaving the block tears it down.
    web_crawler_cm = crawl4ai.AsyncWebCrawler()
    async with web_crawler_cm as web_crawler:
        crawl_result = await web_crawler.arun(url=url)
        # Read the attribute while the crawler is still open.
        page_markdown = crawl_result.markdown
    return page_markdown
async def download_pdf_async(url: str, output_path: Path) -> Path:
    """Download the file at *url* and write it to *output_path*.

    The request follows redirects and uses a 30 second timeout (10 seconds
    for establishing the connection). The whole response body is held in
    memory before it is written to disk.

    Args:
        url: Address of the file to download.
        output_path: Destination file. Missing parent directories are
            created; an existing file is overwritten.

    Returns:
        ``output_path``, unchanged, so callers can chain on the saved file.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        httpx.HTTPError: For other transport-level failures (timeouts,
            connection errors, ...).
    """
    timeout = httpx.Timeout(30.0, connect=10.0)
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        response = await client.get(url)
        response.raise_for_status()

    # ``client.get`` has already read the full body, so the payload stays
    # available after the client (and its connection) has been closed.
    payload = response.content

    # Create the target directory only after a successful download, so a
    # failed request leaves no empty directories behind and a fresh output
    # location does not make the write fail.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    async with aiofiles.open(output_path, "wb") as f:
        await f.write(payload)
    return output_path
def _resolve_pdf_url(url: str) -> str:
    """Return the arXiv PDF URL for an arXiv abstract URL, else *url* unchanged."""
    from urllib.parse import urlsplit

    try:
        # Tolerate scheme-less input such as "arxiv.org/abs/2301.00001":
        # without a leading "//" urlsplit would treat the host as a path.
        parts = urlsplit(url if "://" in url else f"//{url}")
    except ValueError:
        # Malformed URL: leave it alone and let the HTTP client report it.
        return url

    host = (parts.hostname or "").lower()
    is_arxiv_host = host == "arxiv.org" or host.endswith(".arxiv.org")
    abs_prefix = "/abs/"
    if not (is_arxiv_host and parts.path.startswith(abs_prefix)):
        return url

    # Use only the path component, so a query string or fragment
    # ("?context=cs", "#section") never ends up inside the identifier.
    # Old-style identifiers such as "hep-th/9901001" keep their inner slash.
    arxiv_id = parts.path[len(abs_prefix):].rstrip("/")
    if not arxiv_id:
        return url
    return f"https://arxiv.org/pdf/{arxiv_id}.pdf"


async def download_pdf_or_arxiv_pdf_async(url: str, output_path: Path) -> Path:
    """Download a PDF, converting an arXiv abstract URL to its PDF URL first.

    An arXiv abstract URL (``https://arxiv.org/abs/<id>``, optionally with a
    version suffix, trailing slash, query string or fragment) is rewritten to
    ``https://arxiv.org/pdf/<id>.pdf``. Every other URL, including non-arXiv
    URLs that merely contain ``/abs/`` in their path, is downloaded as is.

    Args:
        url: arXiv abstract URL or direct URL of the file to download.
        output_path: Destination file for the download.

    Returns:
        Whatever ``download_pdf_async`` returns for the resolved URL (the
        destination path it was given).

    Raises:
        Propagates any error raised by ``download_pdf_async``.
    """
    pdf_url = _resolve_pdf_url(url)
    return await download_pdf_async(pdf_url, output_path)
|