Spaces:
Runtime error
Runtime error
| import asyncio | |
| from mistralai import OCRResponse | |
| from deepengineer.common_path import DATA_DIR | |
| from deepengineer.webcrawler.async_crawl import ( | |
| crawl4ai_extract_markdown_of_url_async, | |
| download_pdf_or_arxiv_pdf_async, | |
| ) | |
| from deepengineer.webcrawler.pdf_utils import ( | |
| convert_pdf_to_markdown_async, | |
| convert_raw_markdown_to_ocr_response, | |
| ) | |
| from deepengineer.webcrawler.utils import sanitize_filename | |
class DataBase:
    """In-memory cache mapping URLs to their crawled content.

    Keys are URLs after :meth:`preprocess_url` normalisation; values are the
    ``OCRResponse`` objects produced for those URLs. Nothing is persisted
    by this class itself: the cache lives only as long as the instance.
    """

    def __init__(self):
        # Normalised URL -> crawl result.
        self.urls_to_markdown: dict[str, OCRResponse] = {}

    @staticmethod
    def preprocess_url(url: str) -> str:
        """Return *url* with arXiv abstract links rewritten to PDF links.

        ``arxiv.org/abs/<id>`` becomes ``arxiv.org/pdf/<id>``; any other URL
        is returned unchanged. The rewrite is idempotent, so applying it more
        than once is harmless.

        This is a static method because it needs no instance state; it can be
        called either as ``DataBase.preprocess_url(url)`` or through an
        instance (``self.preprocess_url(url)``).
        """
        # str.replace is a no-op when the substring is absent, so no
        # explicit membership test is required.
        return url.replace("arxiv.org/abs/", "arxiv.org/pdf/")

    def crawl_url(self, url: str) -> OCRResponse:
        """Fetch *url*, convert it, cache the result and return it.

        The URL is normalised first. If the normalised URL contains the
        substring ``"pdf"`` it is treated as a PDF: the file is downloaded
        to ``DATA_DIR`` (file name derived via ``sanitize_filename`` with a
        ``.pdf`` suffix) and passed to ``convert_pdf_to_markdown_async``.
        Otherwise the page is crawled with
        ``crawl4ai_extract_markdown_of_url_async`` and the raw markdown is
        wrapped by ``convert_raw_markdown_to_ocr_response``.

        The result always overwrites any existing cache entry for the URL.

        Note: this method drives coroutines with ``asyncio.run`` and so must
        be called from synchronous code (``asyncio.run`` raises
        ``RuntimeError`` when an event loop is already running in the
        current thread).
        """
        url = self.preprocess_url(url)
        # Substring heuristic: matches ".pdf" links as well as arXiv
        # "/pdf/<id>" links that carry no file extension.
        if "pdf" in url:
            output_path = (DATA_DIR / sanitize_filename(url)).with_suffix(".pdf")
            pdf_path = asyncio.run(
                download_pdf_or_arxiv_pdf_async(url, output_path=output_path)
            )
            ocr_response = asyncio.run(convert_pdf_to_markdown_async(pdf_path))
        else:
            markdown = asyncio.run(crawl4ai_extract_markdown_of_url_async(url))
            ocr_response = convert_raw_markdown_to_ocr_response(markdown)
        self.urls_to_markdown[url] = ocr_response
        return ocr_response

    def get_markdown_of_url(self, url: str) -> OCRResponse:
        """Return the cached result for *url*, crawling it on a cache miss.

        The lookup uses the normalised form of *url*, so an arXiv ``/abs/``
        link and the corresponding ``/pdf/`` link share one cache entry.
        """
        url = self.preprocess_url(url)
        if url not in self.urls_to_markdown:
            # crawl_url stores the result in the cache before returning it.
            return self.crawl_url(url)
        return self.urls_to_markdown[url]