Spaces:
Runtime error
Runtime error
| import pytest | |
| from deepengineer.common_path import DATA_DIR | |
| from deepengineer.webcrawler.pdf_utils import ( | |
| convert_ocr_response_to_markdown, | |
| convert_pdf_to_markdown_async, | |
| find_in_markdown, | |
| get_markdown_by_page_numbers, | |
| get_table_of_contents_per_page_markdown, | |
| ) | |
| from mistralai import OCRResponse | |
def load_mock_ocr_response() -> OCRResponse:
    """Load the recorded OCR response fixture from ``DATA_DIR``.

    Reads ``report_thermal_neutron.json`` and validates its contents into an
    ``OCRResponse`` via ``OCRResponse.model_validate_json``.

    Returns:
        The ``OCRResponse`` parsed from the JSON fixture.
    """
    # Pin the encoding: without it, decoding depends on the platform locale
    # and a JSON fixture with non-ASCII text can fail to load.
    with open(DATA_DIR / "report_thermal_neutron.json", encoding="utf-8") as f:
        return OCRResponse.model_validate_json(f.read())
async def test_convert_pdf_to_markdown_async():
    """Convert the sample PDF and check the OCR response and its markdown.

    The test requires the sample PDF to exist under ``DATA_DIR``, awaits
    ``convert_pdf_to_markdown_async`` on it, renders the response with
    ``convert_ocr_response_to_markdown``, and then checks the response type,
    the page count, and a known phrase in the rendered markdown.
    """
    # NOTE(review): no asyncio marker is visible on this coroutine test;
    # presumably the project's pytest configuration runs it — confirm.
    sample_pdf = DATA_DIR / "report_thermal_neutron.pdf"
    assert sample_pdf.exists()

    response = await convert_pdf_to_markdown_async(sample_pdf)
    rendered = convert_ocr_response_to_markdown(response)

    assert isinstance(response, OCRResponse)
    assert len(response.pages) == 16
    assert "where each cylinder represent" in rendered
def test_table_of_contents_per_page_pdf():
    """Check that the per-page table of contents lists the References entry."""
    toc = get_table_of_contents_per_page_markdown(load_mock_ocr_response())
    assert "References - Page 15" in toc
def test_find_in_pdf():
    """Check that searching the mock response for a known phrase yields ``[7]``."""
    needle = "where each cylinder represent"
    matches = find_in_markdown(load_mock_ocr_response(), needle)
    assert matches == [7]
def test_get_markdown_by_page_numbers():
    """Check the markdown returned for pages 7 and 15 of the mock response.

    The output must contain both page labels, the "References" text, and a
    known phrase from the document.
    """
    rendered = get_markdown_by_page_numbers(load_mock_ocr_response(), [7, 15])

    # Checked in the same order as before, so the first failure is unchanged.
    expected_snippets = (
        "Page 7",
        "Page 15",
        "References",
        "where each cylinder represent",
    )
    for snippet in expected_snippets:
        assert snippet in rendered