Spaces:
Runtime error
Runtime error
File size: 1,759 Bytes
e040f4f b5fafa1 e040f4f b5fafa1 e040f4f b5fafa1 e040f4f b5fafa1 e040f4f b5fafa1 e040f4f b5fafa1 e040f4f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import asyncio
from mistralai import OCRResponse
from deepengineer.common_path import DATA_DIR
from deepengineer.webcrawler.async_crawl import (
crawl4ai_extract_markdown_of_url_async,
download_pdf_or_arxiv_pdf_async,
)
from deepengineer.webcrawler.pdf_utils import (
convert_pdf_to_markdown_async,
convert_raw_markdown_to_ocr_response,
)
from deepengineer.webcrawler.utils import sanitize_filename
class DataBase:
    """In-memory cache of crawled documents, keyed by URL.

    Entries are stored under the *preprocessed* URL (see ``preprocess_url``),
    so an arXiv abstract link and the matching PDF link share one entry.
    """

    def __init__(self):
        # Preprocessed URL -> response produced by ``crawl_url``.
        self.urls_to_markdown: dict[str, OCRResponse] = {}

    @staticmethod
    def preprocess_url(url: str) -> str:
        """Return ``url`` with an arXiv abstract link rewritten to its PDF link.

        URLs that do not contain ``arxiv.org/abs/`` are returned unchanged.
        """
        # ``str.replace`` leaves the string untouched when the substring is
        # absent, so no explicit membership test is needed.
        return url.replace("arxiv.org/abs/", "arxiv.org/pdf/")

    def crawl_url(self, url: str) -> OCRResponse:
        """Crawl ``url``, store the result in the cache, and return it.

        A URL containing ``"pdf"`` is downloaded to a ``.pdf`` file under
        ``DATA_DIR`` and converted with ``convert_pdf_to_markdown_async``.
        Any other URL is crawled with ``crawl4ai_extract_markdown_of_url_async``
        and the markdown is passed to ``convert_raw_markdown_to_ocr_response``.

        This method always crawls, overwriting any existing cache entry; use
        ``get_markdown_of_url`` to reuse a cached response.

        Raises:
            RuntimeError: from ``asyncio.run`` when called while an event
                loop is already running in the current thread.
        """
        url = self.preprocess_url(url)
        # NOTE(review): this is a plain substring test, so any URL that merely
        # contains "pdf" (e.g. in a path segment) takes the PDF branch --
        # confirm that the downloader copes with non-PDF targets.
        if "pdf" in url:
            output_path = (DATA_DIR / sanitize_filename(url)).with_suffix(".pdf")
            pdf_path = asyncio.run(
                download_pdf_or_arxiv_pdf_async(url, output_path=output_path)
            )
            ocr_response = asyncio.run(convert_pdf_to_markdown_async(pdf_path))
        else:
            markdown = asyncio.run(crawl4ai_extract_markdown_of_url_async(url))
            ocr_response = convert_raw_markdown_to_ocr_response(markdown)
        self.urls_to_markdown[url] = ocr_response
        return ocr_response

    def get_markdown_of_url(self, url: str) -> OCRResponse:
        """Return the cached response for ``url``, crawling it on a cache miss."""
        url = self.preprocess_url(url)
        if url in self.urls_to_markdown:
            return self.urls_to_markdown[url]
        # Cache miss: ``crawl_url`` stores the new response before returning.
        return self.crawl_url(url)
|