from io import BytesIO
from pathlib import Path

import numpy as np
import pytest
import requests

from doctr import io

# Upper bound (seconds) on the sample-image download, so that a stalled
# connection fails the test instead of hanging the whole run.
_DOWNLOAD_TIMEOUT = 30


def _check_doc_content(doc_tensors, num_pages):
    """Assert that a decoded document is ``num_pages`` uint8 numpy pages.

    Args:
        doc_tensors: sized iterable of decoded pages.
        num_pages: number of pages the document is expected to contain.
    """
    assert len(doc_tensors) == num_pages
    assert all(isinstance(page, np.ndarray) for page in doc_tensors)
    assert all(page.dtype == np.uint8 for page in doc_tensors)


def test_read_pdf(mock_pdf):
    """Check ``io.read_pdf`` on str, ``Path`` and bytes inputs, and on bad inputs.

    ``mock_pdf`` is a fixture defined outside this file; it is used here as a
    path to a PDF that is asserted to decode into 2 pages.
    """
    # From a plain path
    doc = io.read_pdf(mock_pdf)
    _check_doc_content(doc, 2)

    # Test with Path
    doc = io.read_pdf(Path(mock_pdf))
    _check_doc_content(doc, 2)

    # From the raw file content
    with open(mock_pdf, "rb") as f:
        doc = io.read_pdf(f.read())
    _check_doc_content(doc, 2)

    # Wrong input type
    with pytest.raises(TypeError):
        _ = io.read_pdf(123)

    # Wrong path
    with pytest.raises(FileNotFoundError):
        _ = io.read_pdf("my_imaginary_file.pdf")


def test_read_img_as_numpy(tmpdir_factory, mock_pdf):
    """Check ``io.read_img_as_numpy`` error handling, decoding and resizing.

    A sample JPEG is downloaded into a temporary directory, then decoded both
    from its path and from its raw bytes.
    """
    # Wrong input type
    with pytest.raises(TypeError):
        _ = io.read_img_as_numpy(123)

    # Non-existing file
    with pytest.raises(FileNotFoundError):
        io.read_img_as_numpy("my_imaginary_file.jpg")

    # Invalid image
    with pytest.raises(ValueError):
        io.read_img_as_numpy(str(mock_pdf))

    # From path
    url = "https://doctr-static.mindee.com/models?id=v0.2.1/Grace_Hopper.jpg&src=0"
    response = requests.get(url, timeout=_DOWNLOAD_TIMEOUT)
    # Fail here on an HTTP error rather than later, when an error page saved
    # as ".jpg" would surface as a confusing decoding or shape failure.
    response.raise_for_status()
    file = BytesIO(response.content)
    tmp_path = str(tmpdir_factory.mktemp("data").join("mock_img_file.jpg"))
    with open(tmp_path, "wb") as f:
        f.write(file.getbuffer())

    # Path & stream
    with open(tmp_path, "rb") as f:
        page_stream = io.read_img_as_numpy(f.read())

    for page in (io.read_img_as_numpy(tmp_path), page_stream):
        # Data type
        assert isinstance(page, np.ndarray)
        assert page.dtype == np.uint8
        # Shape
        assert page.shape == (606, 517, 3)

    # RGB: reversing the channel axis of the BGR output must give back the
    # default (RGB) output. The stream-decoded page is named explicitly
    # instead of relying on the loop variable outliving the loop.
    bgr_page = io.read_img_as_numpy(tmp_path, rgb_output=False)
    assert np.all(page_stream == bgr_page[..., ::-1])

    # Resize
    target_size = (200, 150)
    resized_page = io.read_img_as_numpy(tmp_path, target_size)
    assert resized_page.shape[:2] == target_size


def test_read_html():
    """Check that ``io.read_html`` returns bytes for a URL.

    NOTE(review): presumably needs network access to reach the URL - confirm.
    """
    url = "https://www.google.com"
    pdf_stream = io.read_html(url)
    assert isinstance(pdf_stream, bytes)


def test_document_file(mock_pdf, mock_image_stream):
    """Check the ``io.DocumentFile`` constructors for images, PDF and URL.

    ``mock_pdf`` and ``mock_image_stream`` are fixtures defined outside this
    file; the image fixture is asserted to decode into a single page.
    """
    pages = io.DocumentFile.from_images(mock_image_stream)
    _check_doc_content(pages, 1)

    assert isinstance(io.DocumentFile.from_pdf(mock_pdf), list)
    assert isinstance(io.DocumentFile.from_url("https://www.google.com"), list)


def test_pdf(mock_pdf):
    """Check that ``io.DocumentFile.from_pdf`` yields the expected page images."""
    pages = io.DocumentFile.from_pdf(mock_pdf)

    # As images
    num_pages = 2
    _check_doc_content(pages, num_pages)