File size: 1,288 Bytes
b5fafa1
 
f1368c4
 
b5fafa1
d708554
b5fafa1
b1060b0
f1368c4
 
 
 
 
b5fafa1
 
f1368c4
 
 
 
 
 
 
 
 
b1060b0
b5fafa1
e040f4f
 
f1368c4
 
 
 
 
 
 
e040f4f
b5fafa1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from pathlib import Path

import aiofiles
import crawl4ai
import httpx
import pytest


async def crawl4ai_extract_markdown_of_url_async(url: str) -> str:
    """Fetch *url* with crawl4ai and return the page content as markdown."""
    async with crawl4ai.AsyncWebCrawler() as crawler:
        crawl_result = await crawler.arun(url=url)
        markdown = crawl_result.markdown
    return markdown


async def download_pdf_async(url: str, output_path: Path) -> Path:
    """Download the file at *url* and write its bytes to *output_path*.

    Args:
        url: Direct URL of the file to download (redirects are followed).
        output_path: Destination path for the downloaded bytes.

    Returns:
        The *output_path* that was written (fixes the previous ``-> str``
        annotation, which did not match the ``Path`` actually returned).

    Raises:
        httpx.HTTPStatusError: If the server responds with a 4xx/5xx status.
        httpx.TimeoutException: If the request exceeds the configured timeout.
    """
    # 30 s total budget, but fail fast (10 s) if the connection can't be made.
    timeout = httpx.Timeout(30.0, connect=10.0)
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        response = await client.get(url)
        response.raise_for_status()
    # The body is fully buffered by client.get(), so it is safe to write
    # after the client context has closed.
    async with aiofiles.open(output_path, "wb") as f:
        await f.write(response.content)
    return output_path


async def download_pdf_or_arxiv_pdf_async(url: str, output_path: Path) -> str:
    """Download a PDF from arXiv by converting the abstract URL to PDF URL. Works also for non arXiv URLs."""
    # Extract the arXiv ID from the URL
    if "/abs/" in url:
        arxiv_id = url.split("/abs/")[1].rstrip("/")
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    else:
        # If it's already a PDF URL, use it as is
        pdf_url = url

    return await download_pdf_async(pdf_url, output_path)