Dokumentassistent / tests /ingestion /test_pdf_parser.py
XQ
init
31a2688
raw
history blame
5.39 kB
"""Tests for src.ingestion.pdf_parser."""
import os
import tempfile
import fitz # PyMuPDF
import pytest
from src.ingestion.pdf_parser import PDFParser
@pytest.fixture
def parser() -> PDFParser:
"""Return a PDFParser instance."""
return PDFParser()
def _create_pdf(tmp_dir: str, filename: str, pages: list[str]) -> str:
"""Helper: create a PDF with the given page texts and return its path."""
path = os.path.join(tmp_dir, filename)
doc = fitz.open()
for text in pages:
page = doc.new_page()
page.insert_text((72, 72), text)
doc.save(path)
doc.close()
return path
class TestPDFParser:
"""Tests for the PDFParser class."""
# ---- 正常 PDF 解析 ----
def test_parse_valid_pdf(self, parser: PDFParser) -> None:
"""Test parsing a valid single-page PDF returns correct structure."""
with tempfile.TemporaryDirectory() as tmp_dir:
path = _create_pdf(tmp_dir, "valid.pdf", ["Hello world"])
result = parser.parse(path)
assert len(result) == 1
assert "Hello world" in result[0]["text"]
assert result[0]["page_number"] == 1
assert result[0]["source"] == "valid.pdf"
# ---- 空文件 ----
def test_parse_empty_pdf(self, parser: PDFParser) -> None:
"""Test parsing a PDF with no text returns an empty list."""
with tempfile.TemporaryDirectory() as tmp_dir:
path = _create_pdf(tmp_dir, "empty.pdf", [""])
result = parser.parse(path)
assert result == []
def test_parse_whitespace_only_pdf(self, parser: PDFParser) -> None:
"""Test parsing a PDF with only whitespace text returns an empty list."""
with tempfile.TemporaryDirectory() as tmp_dir:
path = _create_pdf(tmp_dir, "whitespace.pdf", [" \n\t "])
result = parser.parse(path)
assert result == []
# ---- 损坏文件 ----
def test_parse_corrupted_file_raises(self, parser: PDFParser) -> None:
"""Test that a corrupted file raises ValueError."""
with tempfile.TemporaryDirectory() as tmp_dir:
path = os.path.join(tmp_dir, "corrupted.pdf")
with open(path, "wb") as f:
f.write(b"not a real pdf content")
with pytest.raises(ValueError, match="Failed to open PDF"):
parser.parse(path)
def test_parse_nonexistent_file(self, parser: PDFParser) -> None:
"""Test that parsing a missing file raises FileNotFoundError."""
with pytest.raises(FileNotFoundError):
parser.parse("/nonexistent/path/to/file.pdf")
def test_parse_non_pdf_extension(self, parser: PDFParser) -> None:
"""Test that a non-.pdf file raises ValueError."""
with tempfile.NamedTemporaryFile(suffix=".txt") as tmp:
with pytest.raises(ValueError, match="not a PDF"):
parser.parse(tmp.name)
# ---- 多页文档 ----
def test_parse_multipage_pdf(self, parser: PDFParser) -> None:
"""Test parsing a multi-page PDF returns all pages with correct indices."""
page_texts = ["Page one content", "Page two content", "Page three content"]
with tempfile.TemporaryDirectory() as tmp_dir:
path = _create_pdf(tmp_dir, "multi.pdf", page_texts)
result = parser.parse(path)
assert len(result) == 3
for i, page_data in enumerate(result):
assert page_data["page_number"] == i + 1
assert page_texts[i] in page_data["text"]
assert page_data["source"] == "multi.pdf"
def test_parse_multipage_skips_blank_pages(self, parser: PDFParser) -> None:
"""Test that blank pages in a multi-page PDF are skipped."""
with tempfile.TemporaryDirectory() as tmp_dir:
path = _create_pdf(tmp_dir, "gaps.pdf", ["Content", "", "More content"])
result = parser.parse(path)
assert len(result) == 2
assert result[0]["page_number"] == 1
assert result[1]["page_number"] == 3
# ---- 目录批量解析 ----
def test_parse_directory_batch(self, parser: PDFParser) -> None:
"""Test parsing all PDFs in a directory."""
with tempfile.TemporaryDirectory() as tmp_dir:
_create_pdf(tmp_dir, "a.pdf", ["File A"])
_create_pdf(tmp_dir, "b.pdf", ["File B page 1", "File B page 2"])
# non-PDF file should be ignored
with open(os.path.join(tmp_dir, "readme.txt"), "w") as f:
f.write("ignore me")
pdf_files = sorted(
f for f in os.listdir(tmp_dir) if f.lower().endswith(".pdf")
)
all_pages: list[dict[str, str | int]] = []
for pdf_file in pdf_files:
all_pages.extend(parser.parse(os.path.join(tmp_dir, pdf_file)))
assert len(all_pages) == 3
sources = {p["source"] for p in all_pages}
assert sources == {"a.pdf", "b.pdf"}
def test_parse_empty_directory(self, parser: PDFParser) -> None:
"""Test parsing an empty directory yields no results."""
with tempfile.TemporaryDirectory() as tmp_dir:
pdf_files = [
f for f in os.listdir(tmp_dir) if f.lower().endswith(".pdf")
]
assert pdf_files == []