# Scrapling/tests/spiders/test_checkpoint.py
# Karim Shoair
# test: add tests for the spiders system
# f37031b
"""Tests for the CheckpointManager and CheckpointData classes."""
import pickle
import tempfile
from pathlib import Path
import pytest
import anyio
from scrapling.spiders.request import Request
from scrapling.spiders.checkpoint import CheckpointData, CheckpointManager
class TestCheckpointData:
    """Checks on the CheckpointData container: defaults, explicit values, pickling."""

    def test_default_values(self):
        """A CheckpointData built with no arguments holds an empty list and an empty set."""
        empty = CheckpointData()

        assert empty.requests == []
        assert empty.seen == set()

    def test_with_requests_and_seen(self):
        """Requests and seen entries passed to the constructor are stored as given."""
        first = Request("https://example.com/1", priority=10)
        second = Request("https://example.com/2", priority=5)

        checkpoint = CheckpointData(
            requests=[first, second],
            seen={"url1", "url2", "url3"},
        )

        assert len(checkpoint.requests) == 2
        assert checkpoint.requests[0].url == "https://example.com/1"
        assert checkpoint.seen == {"url1", "url2", "url3"}

    def test_pickle_roundtrip(self):
        """Dumping and re-loading a CheckpointData with pickle keeps its contents."""
        source = CheckpointData(
            requests=[Request("https://example.com", priority=5)],
            seen={"fingerprint1", "fingerprint2"},
        )

        # Serialize and immediately deserialize in one step.
        clone = pickle.loads(pickle.dumps(source))

        assert len(clone.requests) == 1
        assert clone.requests[0].url == "https://example.com"
        assert clone.seen == {"fingerprint1", "fingerprint2"}
class TestCheckpointManagerInit:
    """Test CheckpointManager initialization.

    Expected path strings are produced with ``str(Path(...))`` rather than
    hard-coded POSIX text, so the comparisons use the platform's native
    separator and do not fail spuriously on Windows.
    """

    def test_init_with_string_path(self):
        """Test initialization with string path and the default interval."""
        manager = CheckpointManager("/tmp/test_crawl")

        # Compare against the platform-native rendering of the same path.
        assert str(manager.crawldir) == str(Path("/tmp/test_crawl"))
        assert manager.interval == 300.0

    def test_init_with_pathlib_path(self):
        """Test initialization with pathlib.Path."""
        path = Path("/tmp/test_crawl")
        manager = CheckpointManager(path)

        assert str(manager.crawldir) == str(path)

    def test_init_with_custom_interval(self):
        """Test initialization with custom interval."""
        manager = CheckpointManager("/tmp/test", interval=60.0)

        assert manager.interval == 60.0

    def test_init_with_zero_interval(self):
        """Test initialization with zero interval (disable periodic checkpoints)."""
        manager = CheckpointManager("/tmp/test", interval=0)

        assert manager.interval == 0

    def test_init_with_negative_interval_raises(self):
        """Test that negative interval raises ValueError."""
        with pytest.raises(ValueError, match="greater than 0"):
            CheckpointManager("/tmp/test", interval=-1)

    def test_init_with_invalid_interval_type_raises(self):
        """Test that invalid interval type raises TypeError."""
        with pytest.raises(TypeError, match="integer or float"):
            CheckpointManager("/tmp/test", interval="invalid")  # type: ignore

    def test_checkpoint_file_path(self):
        """Test that the checkpoint file is ``checkpoint.pkl`` inside the crawl directory."""
        manager = CheckpointManager("/tmp/test_crawl")

        # Join with pathlib so the expected separator matches the platform.
        expected_path = str(Path("/tmp/test_crawl") / "checkpoint.pkl")
        assert str(manager._checkpoint_path) == expected_path
class TestCheckpointManagerOperations:
    """Exercise CheckpointManager's save, load, has_checkpoint and cleanup calls."""

    @pytest.fixture
    def temp_dir(self):
        """Yield a throwaway directory as a ``Path`` for the duration of a test."""
        with tempfile.TemporaryDirectory() as scratch:
            yield Path(scratch)

    @pytest.mark.asyncio
    async def test_has_checkpoint_false_when_no_file(self, temp_dir: Path):
        """has_checkpoint() is False when nothing has been saved."""
        checkpoints = CheckpointManager(temp_dir / "crawl")

        assert (await checkpoints.has_checkpoint()) is False

    @pytest.mark.asyncio
    async def test_save_creates_checkpoint_file(self, temp_dir: Path):
        """save() leaves a ``checkpoint.pkl`` file in the crawl directory."""
        target_dir = temp_dir / "crawl"
        checkpoints = CheckpointManager(target_dir)

        await checkpoints.save(
            CheckpointData(
                requests=[Request("https://example.com")],
                seen={"fp1", "fp2"},
            )
        )

        assert (target_dir / "checkpoint.pkl").exists()

    @pytest.mark.asyncio
    async def test_save_creates_directory_if_not_exists(self, temp_dir: Path):
        """save() creates a missing, nested crawl directory."""
        target_dir = temp_dir / "nested" / "crawl" / "dir"
        checkpoints = CheckpointManager(target_dir)

        await checkpoints.save(CheckpointData())

        assert target_dir.exists()

    @pytest.mark.asyncio
    async def test_has_checkpoint_true_after_save(self, temp_dir: Path):
        """has_checkpoint() is True once save() has run."""
        checkpoints = CheckpointManager(temp_dir / "crawl")

        await checkpoints.save(CheckpointData())

        assert (await checkpoints.has_checkpoint()) is True

    @pytest.mark.asyncio
    async def test_load_returns_none_when_no_checkpoint(self, temp_dir: Path):
        """load() gives back None when nothing has been saved."""
        checkpoints = CheckpointManager(temp_dir / "crawl")

        assert (await checkpoints.load()) is None

    @pytest.mark.asyncio
    async def test_save_and_load_roundtrip(self, temp_dir: Path):
        """Data written by save() comes back from load() with the same contents."""
        checkpoints = CheckpointManager(temp_dir / "crawl")
        pending = [
            Request("https://example.com/1", priority=10),
            Request("https://example.com/2", priority=5),
        ]

        await checkpoints.save(CheckpointData(requests=pending, seen={"fp1", "fp2", "fp3"}))
        restored = await checkpoints.load()

        assert restored is not None
        assert len(restored.requests) == 2
        head = restored.requests[0]
        assert head.url == "https://example.com/1"
        assert head.priority == 10
        assert restored.seen == {"fp1", "fp2", "fp3"}

    @pytest.mark.asyncio
    async def test_save_is_atomic(self, temp_dir: Path):
        """After save(), no ``checkpoint.tmp`` remains and ``checkpoint.pkl`` exists."""
        target_dir = temp_dir / "crawl"
        checkpoints = CheckpointManager(target_dir)

        await checkpoints.save(CheckpointData(requests=[Request("https://example.com")]))

        # A leftover temp file would mean the write was not finalized.
        assert not (target_dir / "checkpoint.tmp").exists()
        # The final checkpoint file must be in place.
        assert (target_dir / "checkpoint.pkl").exists()

    @pytest.mark.asyncio
    async def test_cleanup_removes_checkpoint_file(self, temp_dir: Path):
        """cleanup() deletes a previously saved checkpoint file."""
        target_dir = temp_dir / "crawl"
        saved_file = target_dir / "checkpoint.pkl"
        checkpoints = CheckpointManager(target_dir)

        # Precondition: a checkpoint is on disk.
        await checkpoints.save(CheckpointData())
        assert saved_file.exists()

        # cleanup() should take it away again.
        await checkpoints.cleanup()
        assert not saved_file.exists()

    @pytest.mark.asyncio
    async def test_cleanup_no_error_when_no_file(self, temp_dir: Path):
        """cleanup() completes without raising when no checkpoint file exists."""
        checkpoints = CheckpointManager(temp_dir / "crawl")

        # The test passes as long as this call does not raise.
        await checkpoints.cleanup()

    @pytest.mark.asyncio
    async def test_load_returns_none_on_corrupt_file(self, temp_dir: Path):
        """load() gives back None when the checkpoint file holds non-pickle bytes."""
        target_dir = temp_dir / "crawl"
        target_dir.mkdir(parents=True)
        (target_dir / "checkpoint.pkl").write_bytes(b"not valid pickle data")

        checkpoints = CheckpointManager(target_dir)

        assert (await checkpoints.load()) is None

    @pytest.mark.asyncio
    async def test_multiple_saves_overwrite(self, temp_dir: Path):
        """A second save() replaces what the first save() stored."""
        checkpoints = CheckpointManager(temp_dir / "crawl")

        # Initial checkpoint.
        earlier = CheckpointData(
            requests=[Request("https://example.com/1")],
            seen={"fp1"},
        )
        await checkpoints.save(earlier)

        # Replacement checkpoint.
        later = CheckpointData(
            requests=[Request("https://example.com/2"), Request("https://example.com/3")],
            seen={"fp2", "fp3"},
        )
        await checkpoints.save(later)

        # Only the replacement should be visible on load.
        current = await checkpoints.load()
        assert current is not None
        assert len(current.requests) == 2
        assert current.requests[0].url == "https://example.com/2"
        assert current.seen == {"fp2", "fp3"}
class TestCheckpointManagerEdgeCases:
    """Boundary scenarios for CheckpointManager: empty, large and metadata-rich data."""

    @pytest.fixture
    def temp_dir(self):
        """Yield a throwaway directory as a ``Path`` for the duration of a test."""
        with tempfile.TemporaryDirectory() as scratch:
            yield Path(scratch)

    @pytest.mark.asyncio
    async def test_save_empty_checkpoint(self, temp_dir: Path):
        """An explicitly empty checkpoint is saved and loaded back as empty."""
        checkpoints = CheckpointManager(temp_dir / "crawl")

        await checkpoints.save(CheckpointData(requests=[], seen=set()))
        restored = await checkpoints.load()

        assert restored is not None
        assert restored.requests == []
        assert restored.seen == set()

    @pytest.mark.asyncio
    async def test_save_large_checkpoint(self, temp_dir: Path):
        """A checkpoint with 1000 requests and 2000 seen entries keeps its sizes."""
        checkpoints = CheckpointManager(temp_dir / "crawl")

        # 1000 requests with priorities cycling through 0..9.
        queue = [
            Request(f"https://example.com/{index}", priority=index % 10)
            for index in range(1000)
        ]
        fingerprints = {f"fp_{index}" for index in range(2000)}

        await checkpoints.save(CheckpointData(requests=queue, seen=fingerprints))
        restored = await checkpoints.load()

        assert restored is not None
        assert len(restored.requests) == 1000
        assert len(restored.seen) == 2000

    @pytest.mark.asyncio
    async def test_requests_preserve_metadata(self, temp_dir: Path):
        """Request attributes and extra session kwargs are intact after save/load."""
        checkpoints = CheckpointManager(temp_dir / "crawl")
        detailed = Request(
            url="https://example.com",
            sid="my_session",
            priority=42,
            dont_filter=True,
            meta={"item_id": 123, "page": 5},
            proxy="http://proxy:8080",
        )

        await checkpoints.save(CheckpointData(requests=[detailed], seen=set()))
        snapshot = await checkpoints.load()

        assert snapshot is not None
        revived = snapshot.requests[0]
        assert revived.url == "https://example.com"
        assert revived.sid == "my_session"
        assert revived.priority == 42
        assert revived.dont_filter is True
        assert revived.meta == {"item_id": 123, "page": 5}
        assert revived._session_kwargs == {"proxy": "http://proxy:8080"}