File size: 1,181 Bytes
f1368c4
b5fafa1
f1368c4
 
 
e040f4f
f1368c4
b5fafa1
 
f1368c4
4fbfc2b
f1368c4
 
 
 
 
 
 
 
 
 
 
 
 
 
b5fafa1
f1368c4
 
 
 
 
e040f4f
f1368c4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import pytest
from deepengineer.common_path import DATA_DIR
from deepengineer.webcrawler.async_crawl import (
    crawl4ai_extract_markdown_of_url_async,
    download_pdf_async,
    download_pdf_or_arxiv_pdf_async,
)
from deepengineer.webcrawler.testing import ARXIV_URL, URL_PDF, URL_WIKIPEDIA


@pytest.mark.playwright
@pytest.mark.asyncio
async def test_crawl4ai_extract_markdown_of_url_async():
    """crawl4ai should render the Wikipedia test page into markdown text.

    Requires a Playwright browser (hence the ``playwright`` marker); checks
    that the result is a string containing a known phrase from the page.
    """
    result = await crawl4ai_extract_markdown_of_url_async(URL_WIKIPEDIA)
    assert isinstance(result, str)
    assert "Graphite-moderated reactor" in result

@pytest.mark.asyncio
async def test_download_pdf_async():
    """download_pdf_async should save the PDF at the requested path and
    return that same path.

    Uses a test-specific filename (instead of a name shared with other
    tests) so concurrent or reordered test runs cannot collide on the same
    file, and removes the artifact afterwards so DATA_DIR stays clean.
    """
    output_path = DATA_DIR / "temp_download_pdf.pdf"
    # Start from a clean slate so the existence assertion below is meaningful.
    output_path.unlink(missing_ok=True)
    try:
        pdf_path = await download_pdf_async(URL_PDF, output_path=output_path)
        assert pdf_path == output_path
        assert output_path.exists()
    finally:
        # Always clean up the downloaded artifact, even on assertion failure.
        output_path.unlink(missing_ok=True)


@pytest.mark.asyncio
async def test_arxiv_download_pdf_async():
    """download_pdf_or_arxiv_pdf_async should resolve an arXiv URL to its
    PDF, save it at the requested path, and return that same path.

    Uses a test-specific filename (instead of a name shared with other
    tests) so concurrent or reordered test runs cannot collide on the same
    file, and removes the artifact afterwards so DATA_DIR stays clean.
    """
    output_path = DATA_DIR / "temp_arxiv_pdf.pdf"
    # Start from a clean slate so the existence assertions are meaningful.
    output_path.unlink(missing_ok=True)
    assert not output_path.exists()
    try:
        pdf_path = await download_pdf_or_arxiv_pdf_async(
            ARXIV_URL, output_path=output_path
        )
        assert pdf_path == output_path
        assert output_path.exists()
    finally:
        # Always clean up the downloaded artifact, even on assertion failure.
        output_path.unlink(missing_ok=True)