File size: 5,385 Bytes
31a2688
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""Tests for src.ingestion.pdf_parser."""

import os
import tempfile

import fitz  # PyMuPDF
import pytest

from src.ingestion.pdf_parser import PDFParser


@pytest.fixture
def parser() -> PDFParser:
    """Build the ``PDFParser`` under test, constructed with no arguments."""
    instance = PDFParser()
    return instance


def _create_pdf(tmp_dir: str, filename: str, pages: list[str]) -> str:
    """Helper: write a PDF with one page per entry of ``pages``.

    Args:
        tmp_dir: Directory in which the file is created.
        filename: Name of the PDF file inside ``tmp_dir``.
        pages: Text for each page, in page order. A page is added for every
            entry, including empty strings.

    Returns:
        The path of the saved PDF (``tmp_dir`` joined with ``filename``).
    """
    path = os.path.join(tmp_dir, filename)
    doc = fitz.open()
    try:
        for text in pages:
            page = doc.new_page()
            # (72, 72) is the insertion point handed to PyMuPDF.
            page.insert_text((72, 72), text)
        doc.save(path)
    finally:
        # Close the document even if building or saving it raised, so a
        # failing test does not leave the PyMuPDF handle open.
        doc.close()
    return path


class TestPDFParser:
    """Behavioural tests for ``PDFParser.parse``."""

    # ---- Valid PDF parsing ----

    def test_parse_valid_pdf(self, parser: PDFParser) -> None:
        """A one-page PDF yields one record with text, page number and source."""
        with tempfile.TemporaryDirectory() as workdir:
            pages = parser.parse(_create_pdf(workdir, "valid.pdf", ["Hello world"]))

            assert len(pages) == 1
            first = pages[0]
            assert "Hello world" in first["text"]
            assert first["page_number"] == 1
            assert first["source"] == "valid.pdf"

    # ---- Empty file ----

    def test_parse_empty_pdf(self, parser: PDFParser) -> None:
        """A PDF whose only page has no text parses to an empty list."""
        with tempfile.TemporaryDirectory() as workdir:
            pdf_path = _create_pdf(workdir, "empty.pdf", [""])
            pages = parser.parse(pdf_path)

            assert pages == []

    def test_parse_whitespace_only_pdf(self, parser: PDFParser) -> None:
        """A PDF whose only page text is whitespace parses to an empty list."""
        with tempfile.TemporaryDirectory() as workdir:
            pdf_path = _create_pdf(workdir, "whitespace.pdf", ["   \n\t  "])
            pages = parser.parse(pdf_path)

            assert pages == []

    # ---- Corrupted file ----

    def test_parse_corrupted_file_raises(self, parser: PDFParser) -> None:
        """A ``.pdf`` file holding non-PDF bytes raises ValueError."""
        with tempfile.TemporaryDirectory() as workdir:
            bogus_path = os.path.join(workdir, "corrupted.pdf")
            with open(bogus_path, "wb") as handle:
                handle.write(b"not a real pdf content")

            with pytest.raises(ValueError, match="Failed to open PDF"):
                parser.parse(bogus_path)

    def test_parse_nonexistent_file(self, parser: PDFParser) -> None:
        """A path that does not exist raises FileNotFoundError."""
        missing_path = "/nonexistent/path/to/file.pdf"
        with pytest.raises(FileNotFoundError):
            parser.parse(missing_path)

    def test_parse_non_pdf_extension(self, parser: PDFParser) -> None:
        """An existing file with a ``.txt`` suffix raises ValueError."""
        with tempfile.NamedTemporaryFile(suffix=".txt") as text_file, pytest.raises(
            ValueError, match="not a PDF"
        ):
            parser.parse(text_file.name)

    # ---- Multi-page documents ----

    def test_parse_multipage_pdf(self, parser: PDFParser) -> None:
        """Every page of a three-page PDF is returned with 1-based numbering."""
        page_texts = ["Page one content", "Page two content", "Page three content"]
        with tempfile.TemporaryDirectory() as workdir:
            pages = parser.parse(_create_pdf(workdir, "multi.pdf", page_texts))

            assert len(pages) == 3
            # The length check above makes zip() cover every returned page.
            for number, (expected, record) in enumerate(
                zip(page_texts, pages), start=1
            ):
                assert record["page_number"] == number
                assert expected in record["text"]
                assert record["source"] == "multi.pdf"

    def test_parse_multipage_skips_blank_pages(self, parser: PDFParser) -> None:
        """A blank middle page is dropped; the others keep their page numbers."""
        with tempfile.TemporaryDirectory() as workdir:
            pdf_path = _create_pdf(workdir, "gaps.pdf", ["Content", "", "More content"])
            pages = parser.parse(pdf_path)

            assert len(pages) == 2
            before_gap, after_gap = pages[0], pages[1]
            assert before_gap["page_number"] == 1
            assert after_gap["page_number"] == 3

    # ---- Directory batch parsing ----

    def test_parse_directory_batch(self, parser: PDFParser) -> None:
        """Parse each PDF in a directory listing, one ``parse`` call per file."""
        with tempfile.TemporaryDirectory() as workdir:
            _create_pdf(workdir, "a.pdf", ["File A"])
            _create_pdf(workdir, "b.pdf", ["File B page 1", "File B page 2"])
            # A non-PDF file is present and must be left out of the listing.
            with open(os.path.join(workdir, "readme.txt"), "w") as handle:
                handle.write("ignore me")

            pdf_names = [
                name for name in os.listdir(workdir) if name.lower().endswith(".pdf")
            ]
            pdf_names.sort()
            all_pages: list[dict[str, str | int]] = [
                record
                for name in pdf_names
                for record in parser.parse(os.path.join(workdir, name))
            ]

            assert len(all_pages) == 3
            seen_sources = {record["source"] for record in all_pages}
            assert seen_sources == {"a.pdf", "b.pdf"}

    def test_parse_empty_directory(self, parser: PDFParser) -> None:
        """An empty directory lists no PDF files; no parser method is called."""
        with tempfile.TemporaryDirectory() as workdir:
            pdf_names = [
                name for name in os.listdir(workdir) if name.lower().endswith(".pdf")
            ]
            assert pdf_names == []