import asyncio

from mistralai import OCRResponse

from deepengineer.common_path import DATA_DIR
from deepengineer.webcrawler.async_crawl import (
    crawl4ai_extract_markdown_of_url_async,
    download_pdf_or_arxiv_pdf_async,
)
from deepengineer.webcrawler.pdf_utils import (
    convert_pdf_to_markdown_async,
    convert_raw_markdown_to_ocr_response,
)
from deepengineer.webcrawler.utils import sanitize_filename


class DataBase:
    def __init__(self):
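        # In-memory cache mapping preprocessed URLs to their OCR responses.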
        self.urls_to_markdown: dict[str, OCRResponse] = {}

    @staticmethod
    def preprocess_url(url: str) -> str:
        """Preprocess the url to make it a valid url."""
        if "arxiv.org/abs/" in url:
            return url.replace("arxiv.org/abs/", "arxiv.org/pdf/")
        else:
            return url

    def crawl_url(self, url: str) -> OCRResponse:
        """Crawl the URL: download and OCR it if it is a PDF, otherwise extract its markdown; cache and return the result."""
        url = self.preprocess_url(url)
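        # Heuristic: any URL containing "pdf" is treated as a direct PDF link.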
        if "pdf" in url:
            output_path = (DATA_DIR / sanitize_filename(url)).with_suffix(".pdf")
            pdf_path = asyncio.run(
                download_pdf_or_arxiv_pdf_async(url, output_path=output_path)
            )
            ocr_response = asyncio.run(convert_pdf_to_markdown_async(pdf_path))
        else:
            markdown = asyncio.run(crawl4ai_extract_markdown_of_url_async(url))
            ocr_response = convert_raw_markdown_to_ocr_response(markdown)
        self.urls_to_markdown[url] = ocr_response
        return ocr_response

    def get_markdown_of_url(self, url: str) -> OCRResponse:
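        """Return the cached OCR response for the URL, crawling it on a cache miss."""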
        url = self.preprocess_url(url)
        if url in self.urls_to_markdown:
            return self.urls_to_markdown[url]
        else:
            return self.crawl_url(url)
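

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: assumes DATA_DIR
    # is writable and the network is reachable; the arXiv URL is only an example.
    db = DataBase()
    ocr = db.get_markdown_of_url("https://arxiv.org/abs/1706.03762")
    # A second lookup for the same URL is served from the in-memory cache.
    ocr_cached = db.get_markdown_of_url("https://arxiv.org/abs/1706.03762")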