Spaces:

XQ
/

Dokumentassistent

Sleeping

File size: 5,385 Bytes

31a2688

"""Tests for src.ingestion.pdf_parser."""

import os
import tempfile

import fitz  # PyMuPDF
import pytest

from src.ingestion.pdf_parser import PDFParser


@pytest.fixture
def parser() -> PDFParser:
    """Build the ``PDFParser`` under test, constructed with no arguments."""
    pdf_parser = PDFParser()
    return pdf_parser


def _create_pdf(tmp_dir: str, filename: str, pages: list[str]) -> str:
    """Helper: create a PDF with one page per entry in *pages*.

    Args:
        tmp_dir: Directory in which the file is written.
        filename: Name of the PDF file to create inside ``tmp_dir``.
        pages: Text for each page, in order. One page is added per entry and
            its text is inserted at point (72, 72).

    Returns:
        The path of the saved PDF (``tmp_dir`` joined with ``filename``).
    """
    path = os.path.join(tmp_dir, filename)
    doc = fitz.open()
    try:
        for text in pages:
            page = doc.new_page()
            page.insert_text((72, 72), text)
        doc.save(path)
    finally:
        # Close the document even when building or saving it fails, so a
        # broken fixture does not leave the document open for later tests.
        doc.close()
    return path


class TestPDFParser:
    """Exercise ``PDFParser.parse`` on valid, empty, broken and multi-page input."""

    # ---- Parsing a well-formed PDF ----

    def test_parse_valid_pdf(self, parser: PDFParser) -> None:
        """A one-page PDF yields one record with text, page number and source."""
        with tempfile.TemporaryDirectory() as workdir:
            pdf_path = _create_pdf(workdir, "valid.pdf", ["Hello world"])
            records = parser.parse(pdf_path)

            assert len(records) == 1
            only_record = records[0]
            assert "Hello world" in only_record["text"]
            assert only_record["page_number"] == 1
            assert only_record["source"] == "valid.pdf"

    # ---- Empty files ----

    def test_parse_empty_pdf(self, parser: PDFParser) -> None:
        """A PDF whose single page carries no text parses to an empty list."""
        with tempfile.TemporaryDirectory() as workdir:
            pdf_path = _create_pdf(workdir, "empty.pdf", [""])
            records = parser.parse(pdf_path)

            assert records == []

    def test_parse_whitespace_only_pdf(self, parser: PDFParser) -> None:
        """A PDF whose only text is whitespace parses to an empty list."""
        blank_text = "   \n\t  "
        with tempfile.TemporaryDirectory() as workdir:
            pdf_path = _create_pdf(workdir, "whitespace.pdf", [blank_text])
            records = parser.parse(pdf_path)

            assert records == []

    # ---- Corrupted and invalid files ----

    def test_parse_corrupted_file_raises(self, parser: PDFParser) -> None:
        """Bytes that are not a PDF raise ValueError ("Failed to open PDF")."""
        with tempfile.TemporaryDirectory() as workdir:
            bogus_path = os.path.join(workdir, "corrupted.pdf")
            # The file has a .pdf name but its content is plain bytes.
            with open(bogus_path, "wb") as fh:
                fh.write(b"not a real pdf content")

            with pytest.raises(ValueError, match="Failed to open PDF"):
                parser.parse(bogus_path)

    def test_parse_nonexistent_file(self, parser: PDFParser) -> None:
        """A path that does not exist raises FileNotFoundError."""
        missing_path = "/nonexistent/path/to/file.pdf"
        with pytest.raises(FileNotFoundError):
            parser.parse(missing_path)

    def test_parse_non_pdf_extension(self, parser: PDFParser) -> None:
        """An existing file with a .txt suffix raises ValueError ("not a PDF")."""
        with tempfile.NamedTemporaryFile(suffix=".txt") as text_file:
            wrong_type_path = text_file.name
            with pytest.raises(ValueError, match="not a PDF"):
                parser.parse(wrong_type_path)

    # ---- Multi-page documents ----

    def test_parse_multipage_pdf(self, parser: PDFParser) -> None:
        """Every page of a three-page PDF is returned with a 1-based number."""
        expected_texts = [
            "Page one content",
            "Page two content",
            "Page three content",
        ]
        with tempfile.TemporaryDirectory() as workdir:
            pdf_path = _create_pdf(workdir, "multi.pdf", expected_texts)
            records = parser.parse(pdf_path)

            assert len(records) == 3
            numbered = enumerate(zip(records, expected_texts), start=1)
            for page_number, (record, expected_text) in numbered:
                assert record["page_number"] == page_number
                assert expected_text in record["text"]
                assert record["source"] == "multi.pdf"

    def test_parse_multipage_skips_blank_pages(self, parser: PDFParser) -> None:
        """A blank middle page is dropped; the others keep their page numbers."""
        with tempfile.TemporaryDirectory() as workdir:
            pdf_path = _create_pdf(
                workdir, "gaps.pdf", ["Content", "", "More content"]
            )
            records = parser.parse(pdf_path)

            assert len(records) == 2
            first_record, second_record = records
            assert first_record["page_number"] == 1
            assert second_record["page_number"] == 3

    # ---- Batch parsing of a directory ----

    def test_parse_directory_batch(self, parser: PDFParser) -> None:
        """Parsing each .pdf in a directory collects the pages of all files."""
        # NOTE(review): the batching is done by this test (listdir + filter +
        # one parse() call per file), not by a directory method on the parser.
        with tempfile.TemporaryDirectory() as workdir:
            _create_pdf(workdir, "a.pdf", ["File A"])
            _create_pdf(workdir, "b.pdf", ["File B page 1", "File B page 2"])
            # A non-PDF file that the suffix filter below must leave out.
            with open(os.path.join(workdir, "readme.txt"), "w") as fh:
                fh.write("ignore me")

            collected: list[dict[str, str | int]] = []
            for name in sorted(os.listdir(workdir)):
                if not name.lower().endswith(".pdf"):
                    continue
                collected.extend(parser.parse(os.path.join(workdir, name)))

            assert len(collected) == 3
            found_sources = {record["source"] for record in collected}
            assert found_sources == {"a.pdf", "b.pdf"}

    def test_parse_empty_directory(self, parser: PDFParser) -> None:
        """An empty directory contains no .pdf files to parse."""
        # NOTE(review): the parser fixture is requested but never called here;
        # only the filename filter is exercised.
        with tempfile.TemporaryDirectory() as workdir:
            entries = os.listdir(workdir)
            pdf_names = [name for name in entries if name.lower().endswith(".pdf")]
            assert pdf_names == []