Spaces:

MikeTrizna
/

doctr_demo_fork

Running

File size: 2,709 Bytes

f3270e6

from io import BytesIO
from pathlib import Path

import numpy as np
import pytest
import requests

from doctr import io


def _check_doc_content(doc_tensors, num_pages):
    """Assert that a decoded document has the expected page count and page format.

    Args:
        doc_tensors: sized collection of decoded pages.
        num_pages: number of pages the document is expected to contain.

    Raises:
        AssertionError: if the page count differs from ``num_pages``, or if any
            page is not a ``np.ndarray`` of dtype ``np.uint8``.
    """
    assert len(doc_tensors) == num_pages
    for page in doc_tensors:
        # Each page must be a uint8 NumPy array
        assert isinstance(page, np.ndarray)
        assert page.dtype == np.uint8


def test_read_pdf(mock_pdf):
    """Check ``io.read_pdf`` on supported input kinds and on invalid inputs.

    Args:
        mock_pdf: fixture defined outside this file; used here as a filesystem
            path to a PDF that is expected to decode into 2 pages.
    """
    with open(mock_pdf, "rb") as pdf_file:
        raw_pdf = pdf_file.read()

    # Same document passed as the fixture value, as a Path and as raw bytes
    for source in (mock_pdf, Path(mock_pdf), raw_pdf):
        _check_doc_content(io.read_pdf(source), 2)

    # Wrong input type
    with pytest.raises(TypeError):
        io.read_pdf(123)

    # Wrong path
    with pytest.raises(FileNotFoundError):
        io.read_pdf("my_imaginary_file.pdf")


def test_read_img_as_numpy(tmpdir_factory, mock_pdf):
    """Check ``io.read_img_as_numpy`` on invalid inputs and on a downloaded image.

    The sample image is fetched over HTTP, so this test needs network access.

    Args:
        tmpdir_factory: pytest fixture used to create the temporary directory
            that receives the downloaded image.
        mock_pdf: fixture defined outside this file; its string form is used
            here as a path to a file that is not a valid image.
    """
    # Wrong input type
    with pytest.raises(TypeError):
        io.read_img_as_numpy(123)

    # Non-existing file
    with pytest.raises(FileNotFoundError):
        io.read_img_as_numpy("my_imaginary_file.jpg")

    # Invalid image
    with pytest.raises(ValueError):
        io.read_img_as_numpy(str(mock_pdf))

    # Download the sample image; the timeout keeps the test from hanging forever
    # and the status check fails fast instead of saving an HTTP error page.
    url = "https://doctr-static.mindee.com/models?id=v0.2.1/Grace_Hopper.jpg&src=0"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    tmp_path = str(tmpdir_factory.mktemp("data").join("mock_img_file.jpg"))
    with open(tmp_path, "wb") as f:
        f.write(response.content)

    # Path & stream
    with open(tmp_path, "rb") as f:
        page_stream = io.read_img_as_numpy(f.read())
    page_from_path = io.read_img_as_numpy(tmp_path)

    for page in (page_from_path, page_stream):
        # Data type
        assert isinstance(page, np.ndarray)
        assert page.dtype == np.uint8
        # Shape
        assert page.shape == (606, 517, 3)

    # RGB: with rgb_output=False the last axis must be the default output reversed.
    # The reference is named explicitly rather than relying on the loop variable
    # still being bound after the loop.
    bgr_page = io.read_img_as_numpy(tmp_path, rgb_output=False)
    assert np.all(page_stream == bgr_page[..., ::-1])

    # Resize
    target_size = (200, 150)
    resized_page = io.read_img_as_numpy(tmp_path, target_size)
    assert resized_page.shape[:2] == target_size


def test_read_html():
    """Check that ``io.read_html`` returns ``bytes`` for a URL.

    The call is made with a public URL, so this test needs network access.
    """
    assert isinstance(io.read_html("https://www.google.com"), bytes)


def test_document_file(mock_pdf, mock_image_stream):
    """Exercise the three ``io.DocumentFile`` constructors used in this file.

    Args:
        mock_pdf: fixture defined outside this file, passed to ``from_pdf``.
        mock_image_stream: fixture defined outside this file, passed to
            ``from_images``; expected to decode into a single page.
    """
    # From images
    image_pages = io.DocumentFile.from_images(mock_image_stream)
    _check_doc_content(image_pages, 1)

    # From a PDF
    pdf_pages = io.DocumentFile.from_pdf(mock_pdf)
    assert isinstance(pdf_pages, list)

    # From a URL
    url_pages = io.DocumentFile.from_url("https://www.google.com")
    assert isinstance(url_pages, list)


def test_pdf(mock_pdf):
    """Check that ``io.DocumentFile.from_pdf`` yields the expected pages.

    Args:
        mock_pdf: fixture defined outside this file; expected to decode into
            2 pages.
    """
    # As images
    expected_page_count = 2
    rendered_pages = io.DocumentFile.from_pdf(mock_pdf)
    _check_doc_content(rendered_pages, expected_page_count)