Karim shoair commited on
Commit ·
f37031b
1
Parent(s): fab7a59
test: add tests for the spiders system
Browse files

It was generated by Opus via Claude Code. It looks very good per my review and instructions, but I will have another look later.
- tests/spiders/__init__.py +0 -0
- tests/spiders/test_checkpoint.py +341 -0
- tests/spiders/test_request.py +363 -0
- tests/spiders/test_result.py +327 -0
- tests/spiders/test_scheduler.py +390 -0
- tests/spiders/test_session.py +352 -0
- tests/spiders/test_spider.py +574 -0
tests/spiders/__init__.py
ADDED
|
File without changes
|
tests/spiders/test_checkpoint.py
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the CheckpointManager and CheckpointData classes."""
|
| 2 |
+
|
| 3 |
+
import pickle
|
| 4 |
+
import tempfile
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
import anyio
|
| 9 |
+
|
| 10 |
+
from scrapling.spiders.request import Request
|
| 11 |
+
from scrapling.spiders.checkpoint import CheckpointData, CheckpointManager
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class TestCheckpointData:
    """Unit tests for the CheckpointData dataclass."""

    def test_default_values(self):
        """A freshly constructed CheckpointData holds no requests and no seen URLs."""
        data = CheckpointData()

        assert data.requests == []
        assert data.seen == set()

    def test_with_requests_and_seen(self):
        """CheckpointData stores the requests and seen fingerprints it is given."""
        pending = [
            Request("https://example.com/1", priority=10),
            Request("https://example.com/2", priority=5),
        ]
        data = CheckpointData(requests=pending, seen={"url1", "url2", "url3"})

        assert len(data.requests) == 2
        assert data.requests[0].url == "https://example.com/1"
        assert data.seen == {"url1", "url2", "url3"}

    def test_pickle_roundtrip(self):
        """CheckpointData survives a pickle dump/load cycle intact."""
        data = CheckpointData(
            requests=[Request("https://example.com", priority=5)],
            seen={"fingerprint1", "fingerprint2"},
        )

        restored = pickle.loads(pickle.dumps(data))

        assert len(restored.requests) == 1
        assert restored.requests[0].url == "https://example.com"
        assert restored.seen == {"fingerprint1", "fingerprint2"}
class TestCheckpointManagerInit:
    """Unit tests for CheckpointManager construction and argument validation."""

    def test_init_with_string_path(self):
        """A plain string path is accepted and the default interval is 300s."""
        manager = CheckpointManager("/tmp/test_crawl")

        assert str(manager.crawldir) == "/tmp/test_crawl"
        assert manager.interval == 300.0

    def test_init_with_pathlib_path(self):
        """A pathlib.Path is accepted just like a string path."""
        manager = CheckpointManager(Path("/tmp/test_crawl"))

        assert str(manager.crawldir) == "/tmp/test_crawl"

    def test_init_with_custom_interval(self):
        """A caller-supplied interval overrides the default."""
        manager = CheckpointManager("/tmp/test", interval=60.0)
        assert manager.interval == 60.0

    def test_init_with_zero_interval(self):
        """An interval of zero disables periodic checkpoints and is allowed."""
        manager = CheckpointManager("/tmp/test", interval=0)
        assert manager.interval == 0

    def test_init_with_negative_interval_raises(self):
        """A negative interval is rejected with ValueError."""
        with pytest.raises(ValueError, match="greater than 0"):
            CheckpointManager("/tmp/test", interval=-1)

    def test_init_with_invalid_interval_type_raises(self):
        """A non-numeric interval is rejected with TypeError."""
        with pytest.raises(TypeError, match="integer or float"):
            CheckpointManager("/tmp/test", interval="invalid")  # type: ignore

    def test_checkpoint_file_path(self):
        """The checkpoint file lives at <crawldir>/checkpoint.pkl."""
        manager = CheckpointManager("/tmp/test_crawl")

        assert str(manager._checkpoint_path) == "/tmp/test_crawl/checkpoint.pkl"
class TestCheckpointManagerOperations:
    """Test CheckpointManager save/load/cleanup operations.

    The ``temp_dir`` fixture delegates to pytest's built-in ``tmp_path``
    fixture instead of hand-rolling ``tempfile.TemporaryDirectory`` —
    pytest already provides per-test isolated directories with cleanup.
    """

    @pytest.fixture
    def temp_dir(self, tmp_path: Path) -> Path:
        """Provide an isolated temporary directory for each test."""
        return tmp_path

    @pytest.mark.asyncio
    async def test_has_checkpoint_false_when_no_file(self, temp_dir: Path):
        """Test has_checkpoint returns False when no checkpoint exists."""
        manager = CheckpointManager(temp_dir / "crawl")

        result = await manager.has_checkpoint()

        assert result is False

    @pytest.mark.asyncio
    async def test_save_creates_checkpoint_file(self, temp_dir: Path):
        """Test that save creates the checkpoint file."""
        crawl_dir = temp_dir / "crawl"
        manager = CheckpointManager(crawl_dir)

        data = CheckpointData(
            requests=[Request("https://example.com")],
            seen={"fp1", "fp2"},
        )

        await manager.save(data)

        checkpoint_path = crawl_dir / "checkpoint.pkl"
        assert checkpoint_path.exists()

    @pytest.mark.asyncio
    async def test_save_creates_directory_if_not_exists(self, temp_dir: Path):
        """Test that save creates the directory if it doesn't exist."""
        crawl_dir = temp_dir / "nested" / "crawl" / "dir"
        manager = CheckpointManager(crawl_dir)

        data = CheckpointData()
        await manager.save(data)

        assert crawl_dir.exists()

    @pytest.mark.asyncio
    async def test_has_checkpoint_true_after_save(self, temp_dir: Path):
        """Test has_checkpoint returns True after saving."""
        manager = CheckpointManager(temp_dir / "crawl")

        data = CheckpointData()
        await manager.save(data)

        result = await manager.has_checkpoint()
        assert result is True

    @pytest.mark.asyncio
    async def test_load_returns_none_when_no_checkpoint(self, temp_dir: Path):
        """Test load returns None when no checkpoint exists."""
        manager = CheckpointManager(temp_dir / "crawl")

        result = await manager.load()

        assert result is None

    @pytest.mark.asyncio
    async def test_save_and_load_roundtrip(self, temp_dir: Path):
        """Test saving and loading checkpoint data."""
        manager = CheckpointManager(temp_dir / "crawl")

        original_data = CheckpointData(
            requests=[
                Request("https://example.com/1", priority=10),
                Request("https://example.com/2", priority=5),
            ],
            seen={"fp1", "fp2", "fp3"},
        )

        await manager.save(original_data)
        loaded_data = await manager.load()

        assert loaded_data is not None
        assert len(loaded_data.requests) == 2
        assert loaded_data.requests[0].url == "https://example.com/1"
        assert loaded_data.requests[0].priority == 10
        assert loaded_data.seen == {"fp1", "fp2", "fp3"}

    @pytest.mark.asyncio
    async def test_save_is_atomic(self, temp_dir: Path):
        """Test that save uses atomic write (temp file + rename)."""
        crawl_dir = temp_dir / "crawl"
        manager = CheckpointManager(crawl_dir)

        data = CheckpointData(requests=[Request("https://example.com")])
        await manager.save(data)

        # Temp file should not exist after successful save
        temp_path = crawl_dir / "checkpoint.tmp"
        assert not temp_path.exists()

        # Checkpoint file should exist
        checkpoint_path = crawl_dir / "checkpoint.pkl"
        assert checkpoint_path.exists()

    @pytest.mark.asyncio
    async def test_cleanup_removes_checkpoint_file(self, temp_dir: Path):
        """Test that cleanup removes the checkpoint file."""
        crawl_dir = temp_dir / "crawl"
        manager = CheckpointManager(crawl_dir)

        # Save a checkpoint first
        data = CheckpointData()
        await manager.save(data)

        checkpoint_path = crawl_dir / "checkpoint.pkl"
        assert checkpoint_path.exists()

        # Cleanup should remove it
        await manager.cleanup()

        assert not checkpoint_path.exists()

    @pytest.mark.asyncio
    async def test_cleanup_no_error_when_no_file(self, temp_dir: Path):
        """Test that cleanup doesn't raise error when no file exists."""
        manager = CheckpointManager(temp_dir / "crawl")

        # Should not raise
        await manager.cleanup()

    @pytest.mark.asyncio
    async def test_load_returns_none_on_corrupt_file(self, temp_dir: Path):
        """Test load returns None when checkpoint file is corrupt."""
        crawl_dir = temp_dir / "crawl"
        crawl_dir.mkdir(parents=True)

        checkpoint_path = crawl_dir / "checkpoint.pkl"
        checkpoint_path.write_bytes(b"not valid pickle data")

        manager = CheckpointManager(crawl_dir)

        result = await manager.load()

        assert result is None

    @pytest.mark.asyncio
    async def test_multiple_saves_overwrite(self, temp_dir: Path):
        """Test that multiple saves overwrite the checkpoint."""
        manager = CheckpointManager(temp_dir / "crawl")

        # First save
        data1 = CheckpointData(
            requests=[Request("https://example.com/1")],
            seen={"fp1"},
        )
        await manager.save(data1)

        # Second save
        data2 = CheckpointData(
            requests=[Request("https://example.com/2"), Request("https://example.com/3")],
            seen={"fp2", "fp3"},
        )
        await manager.save(data2)

        # Load should return the second save
        loaded = await manager.load()

        assert loaded is not None
        assert len(loaded.requests) == 2
        assert loaded.requests[0].url == "https://example.com/2"
        assert loaded.seen == {"fp2", "fp3"}
class TestCheckpointManagerEdgeCases:
    """Test edge cases for CheckpointManager.

    The ``temp_dir`` fixture delegates to pytest's built-in ``tmp_path``
    fixture instead of hand-rolling ``tempfile.TemporaryDirectory``.
    """

    @pytest.fixture
    def temp_dir(self, tmp_path: Path) -> Path:
        """Provide an isolated temporary directory for each test."""
        return tmp_path

    @pytest.mark.asyncio
    async def test_save_empty_checkpoint(self, temp_dir: Path):
        """Test saving empty checkpoint data."""
        manager = CheckpointManager(temp_dir / "crawl")

        data = CheckpointData(requests=[], seen=set())
        await manager.save(data)

        loaded = await manager.load()

        assert loaded is not None
        assert loaded.requests == []
        assert loaded.seen == set()

    @pytest.mark.asyncio
    async def test_save_large_checkpoint(self, temp_dir: Path):
        """Test saving checkpoint with many requests."""
        manager = CheckpointManager(temp_dir / "crawl")

        # Create 1000 requests
        requests = [
            Request(f"https://example.com/{i}", priority=i % 10)
            for i in range(1000)
        ]
        seen = {f"fp_{i}" for i in range(2000)}

        data = CheckpointData(requests=requests, seen=seen)
        await manager.save(data)

        loaded = await manager.load()

        assert loaded is not None
        assert len(loaded.requests) == 1000
        assert len(loaded.seen) == 2000

    @pytest.mark.asyncio
    async def test_requests_preserve_metadata(self, temp_dir: Path):
        """Test that request metadata is preserved through checkpoint."""
        manager = CheckpointManager(temp_dir / "crawl")

        original_request = Request(
            url="https://example.com",
            sid="my_session",
            priority=42,
            dont_filter=True,
            meta={"item_id": 123, "page": 5},
            proxy="http://proxy:8080",
        )

        data = CheckpointData(requests=[original_request], seen=set())
        await manager.save(data)

        loaded = await manager.load()

        assert loaded is not None
        restored = loaded.requests[0]

        assert restored.url == "https://example.com"
        assert restored.sid == "my_session"
        assert restored.priority == 42
        assert restored.dont_filter is True
        assert restored.meta == {"item_id": 123, "page": 5}
        assert restored._session_kwargs == {"proxy": "http://proxy:8080"}
tests/spiders/test_request.py
ADDED
|
@@ -0,0 +1,363 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the Request class."""
|
| 2 |
+
|
| 3 |
+
import pickle
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
|
| 7 |
+
from scrapling.spiders.request import Request
|
| 8 |
+
from scrapling.core._types import Any, Dict, AsyncGenerator
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class TestRequestCreation:
    """Unit tests covering Request construction and default attribute values."""

    def test_basic_request_creation(self):
        """Constructing with only a URL yields all documented defaults."""
        request = Request("https://example.com")

        assert request.url == "https://example.com"
        assert request.sid == ""
        assert request.callback is None
        assert request.priority == 0
        assert request.dont_filter is False
        assert request.meta == {}
        assert request._retry_count == 0
        assert request._session_kwargs == {}

    def test_request_with_all_parameters(self):
        """Every constructor argument lands on the matching attribute."""

        async def my_callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield {"test": "data"}

        request = Request(
            url="https://example.com/page",
            sid="my_session",
            callback=my_callback,
            priority=10,
            dont_filter=True,
            meta={"key": "value"},
            _retry_count=2,
            proxy="http://proxy:8080",
            timeout=30,
        )

        assert request.url == "https://example.com/page"
        assert request.sid == "my_session"
        assert request.callback == my_callback
        assert request.priority == 10
        assert request.dont_filter is True
        assert request.meta == {"key": "value"}
        assert request._retry_count == 2
        assert request._session_kwargs == {"proxy": "http://proxy:8080", "timeout": 30}

    def test_request_meta_default_is_empty_dict(self):
        """Each instance gets its own meta dict — no shared mutable default."""
        first = Request("https://example.com")
        second = Request("https://example.com")

        first.meta["key"] = "value"

        assert first.meta == {"key": "value"}
        assert second.meta == {}
class TestRequestProperties:
    """Unit tests for Request's derived properties (domain, fingerprint)."""

    def test_domain_extraction(self):
        """The domain property is the URL's netloc, path/query excluded."""
        request = Request("https://www.example.com/path/page.html?query=1")
        assert request.domain == "www.example.com"

    def test_domain_with_port(self):
        """A port number stays part of the domain."""
        request = Request("http://localhost:8080/api")
        assert request.domain == "localhost:8080"

    def test_domain_with_subdomain(self):
        """Nested subdomains are kept verbatim."""
        request = Request("https://api.v2.example.com/endpoint")
        assert request.domain == "api.v2.example.com"

    def test_fingerprint_includes_session_and_url(self):
        """The fingerprint is '<sid>:<url>'."""
        request = Request("https://example.com", sid="session1")
        assert request._fp == "session1:https://example.com"

    def test_fingerprint_empty_session(self):
        """With no session ID the fingerprint starts with a bare colon."""
        request = Request("https://example.com")
        assert request._fp == ":https://example.com"
class TestRequestCopy:
    """Unit tests for Request.copy()."""

    def test_copy_creates_independent_request(self):
        """copy() duplicates every field but yields a distinct object with its own meta."""

        async def callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield None

        original = Request(
            url="https://example.com",
            sid="session",
            callback=callback,
            priority=5,
            dont_filter=True,
            meta={"original": True},
            _retry_count=1,
            proxy="http://proxy:8080",
        )
        duplicate = original.copy()

        # Field-by-field equality with the source request
        assert duplicate.url == original.url
        assert duplicate.sid == original.sid
        assert duplicate.callback == original.callback
        assert duplicate.priority == original.priority
        assert duplicate.dont_filter == original.dont_filter
        assert duplicate.meta == original.meta
        assert duplicate._retry_count == original._retry_count
        assert duplicate._session_kwargs == original._session_kwargs

        # ...but distinct identity, and meta must not be shared
        assert duplicate is not original
        assert duplicate.meta is not original.meta

    def test_copy_meta_is_independent(self):
        """Mutating the copy's meta leaves the original's meta untouched."""
        original = Request("https://example.com", meta={"key": "original"})
        duplicate = original.copy()

        duplicate.meta["key"] = "modified"
        duplicate.meta["new_key"] = "new_value"

        assert original.meta == {"key": "original"}
        assert duplicate.meta == {"key": "modified", "new_key": "new_value"}
class TestRequestComparison:
    """Test Request comparison operators.

    Ordering is by ``priority``; equality is by fingerprint (sid + url).
    """

    def test_priority_less_than(self):
        """Test less than comparison by priority."""
        low_priority = Request("https://example.com/1", priority=1)
        high_priority = Request("https://example.com/2", priority=10)

        assert low_priority < high_priority
        assert not high_priority < low_priority

    def test_priority_greater_than(self):
        """Test greater than comparison by priority."""
        low_priority = Request("https://example.com/1", priority=1)
        high_priority = Request("https://example.com/2", priority=10)

        assert high_priority > low_priority
        assert not low_priority > high_priority

    def test_equality_by_fingerprint(self):
        """Test equality comparison by fingerprint."""
        r1 = Request("https://example.com", sid="session1")
        r2 = Request("https://example.com", sid="session1")
        r3 = Request("https://example.com", sid="session2")

        assert r1 == r2
        assert r1 != r3

    def test_equality_different_priorities_same_fingerprint(self):
        """Test requests with same fingerprint are equal despite different priorities."""
        r1 = Request("https://example.com", sid="s1", priority=1)
        r2 = Request("https://example.com", sid="s1", priority=100)

        assert r1 == r2  # Same fingerprint means equal

    def test_comparison_with_non_request(self):
        """Test comparison with non-Request types returns NotImplemented."""
        request = Request("https://example.com")

        # NotImplemented is a singleton sentinel — test with `is`, not `==`
        # (its use in boolean/equality contexts is deprecated).
        assert request.__lt__("not a request") is NotImplemented
        assert request.__gt__("not a request") is NotImplemented
        assert request.__eq__("not a request") is NotImplemented
class TestRequestStringRepresentation:
    """Unit tests for Request.__str__ and Request.__repr__."""

    def test_str_returns_url(self):
        """str(request) is simply the request URL."""
        request = Request("https://example.com/page")
        assert str(request) == "https://example.com/page"

    def test_repr_without_callback(self):
        """repr() mentions the class, URL, priority, and callback=None when unset."""
        rendered = repr(Request("https://example.com", priority=5))

        assert "Request" in rendered
        assert "https://example.com" in rendered
        assert "priority=5" in rendered
        assert "callback=None" in rendered

    def test_repr_with_callback(self):
        """repr() shows the callback's function name when one is set."""

        async def my_custom_callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield None

        rendered = repr(Request("https://example.com", callback=my_custom_callback))

        assert "callback=my_custom_callback" in rendered
class TestRequestPickling:
    """Unit tests for Request serialization, as used by the checkpoint system."""

    def test_pickle_without_callback(self):
        """All plain attributes round-trip through pickle; callback stays None."""
        original = Request(
            url="https://example.com",
            sid="session",
            priority=5,
            meta={"key": "value"},
        )

        restored = pickle.loads(pickle.dumps(original))

        assert restored.url == original.url
        assert restored.sid == original.sid
        assert restored.priority == original.priority
        assert restored.meta == original.meta
        assert restored.callback is None

    def test_pickle_with_callback_stores_name(self):
        """__getstate__ swaps the (unpicklable) callback for its name."""

        async def parse_page(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield {"data": "test"}

        original = Request("https://example.com", callback=parse_page)
        state = original.__getstate__()

        assert state["_callback_name"] == "parse_page"
        assert state["callback"] is None

    def test_pickle_with_none_callback(self):
        """__getstate__ records None for both callback and its name when unset."""
        state = Request("https://example.com", callback=None).__getstate__()

        assert state["_callback_name"] is None
        assert state["callback"] is None

    def test_setstate_stores_callback_name(self):
        """__setstate__ keeps _callback_name around for later restoration."""
        request = Request("https://example.com")
        state = {
            "url": "https://example.com",
            "sid": "",
            "callback": None,
            "priority": 0,
            "dont_filter": False,
            "meta": {},
            "_retry_count": 0,
            "_session_kwargs": {},
            "_callback_name": "custom_parse",
        }

        request.__setstate__(state)

        assert hasattr(request, "_callback_name")
        assert request._callback_name == "custom_parse"

    def test_pickle_roundtrip_preserves_session_kwargs(self):
        """Extra session kwargs (proxy, timeout, headers) survive pickling."""
        original = Request(
            "https://example.com",
            proxy="http://proxy:8080",
            timeout=30,
            headers={"User-Agent": "test"},
        )

        restored = pickle.loads(pickle.dumps(original))

        assert restored._session_kwargs == {
            "proxy": "http://proxy:8080",
            "timeout": 30,
            "headers": {"User-Agent": "test"},
        }
class TestRequestRestoreCallback:
    """Unit tests for re-binding a pickled request's callback to a spider."""

    def test_restore_callback_from_spider(self):
        """A stored callback name is resolved to the matching spider method."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

            async def parse_detail(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield {"detail": True}

        spider = MockSpider()
        request = Request("https://example.com")
        request._callback_name = "parse_detail"

        request._restore_callback(spider)  # type: ignore[arg-type]

        assert request.callback == spider.parse_detail
        assert not hasattr(request, "_callback_name")

    def test_restore_callback_falls_back_to_parse(self):
        """An unknown callback name falls back to the spider's parse method."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        spider = MockSpider()
        request = Request("https://example.com")
        request._callback_name = "nonexistent_method"

        request._restore_callback(spider)  # type: ignore[arg-type]

        assert request.callback == spider.parse
        assert not hasattr(request, "_callback_name")

    def test_restore_callback_with_none_name(self):
        """A None callback name is tolerated and the attribute is cleaned up."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        spider = MockSpider()
        request = Request("https://example.com")
        request._callback_name = None

        request._restore_callback(spider)  # type: ignore[arg-type]

        assert not hasattr(request, "_callback_name")

    def test_restore_callback_without_callback_name_attr(self):
        """Restoring when _callback_name was never set must not raise."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        spider = MockSpider()
        request = Request("https://example.com")

        # No _callback_name set on purpose — the call must be a no-op.
        request._restore_callback(spider)  # type: ignore[arg-type]
tests/spiders/test_result.py
ADDED
|
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the result module (ItemList, CrawlStats, CrawlResult)."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import tempfile
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
|
| 9 |
+
from scrapling.spiders.result import ItemList, CrawlStats, CrawlResult
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class TestItemList:
    """Tests for ItemList: list semantics plus JSON/JSONL export helpers."""

    def test_itemlist_is_list(self):
        """ItemList must be a subclass of the built-in list."""
        collected = ItemList()

        assert isinstance(collected, list)

    def test_itemlist_basic_operations(self):
        """Standard list operations (append, len, indexing) behave normally."""
        collected = ItemList()

        collected.append({"id": 1})
        collected.append({"id": 2})

        assert len(collected) == 2
        assert collected[0] == {"id": 1}

    def test_to_json_creates_file(self):
        """to_json serializes the items as a JSON array on disk."""
        collected = ItemList()
        collected.append({"name": "test", "value": 123})
        collected.append({"name": "test2", "value": 456})

        with tempfile.TemporaryDirectory() as workdir:
            target = Path(workdir) / "output.json"
            collected.to_json(target)

            assert target.exists()

            data = json.loads(target.read_text())
            assert len(data) == 2
            assert data[0]["name"] == "test"

    def test_to_json_creates_parent_directory(self):
        """to_json creates any missing parent directories for the target path."""
        collected = ItemList()
        collected.append({"data": "test"})

        with tempfile.TemporaryDirectory() as workdir:
            target = Path(workdir) / "nested" / "dirs" / "output.json"
            collected.to_json(target)

            assert target.exists()

    def test_to_json_with_indent(self):
        """to_json pretty-prints when indentation is requested."""
        collected = ItemList()
        collected.append({"key": "value"})

        with tempfile.TemporaryDirectory() as workdir:
            target = Path(workdir) / "output.json"
            collected.to_json(target, indent=True)

            # Pretty-printed JSON spans multiple lines.
            assert "\n" in target.read_text()

    def test_to_jsonl_creates_file(self):
        """to_jsonl writes one JSON document per line."""
        collected = ItemList()
        collected.append({"id": 1, "name": "first"})
        collected.append({"id": 2, "name": "second"})
        collected.append({"id": 3, "name": "third"})

        with tempfile.TemporaryDirectory() as workdir:
            target = Path(workdir) / "output.jsonl"
            collected.to_jsonl(target)

            assert target.exists()

            rows = target.read_text().strip().split("\n")
            assert len(rows) == 3

            # Every line must decode independently as JSON.
            for row in rows:
                record = json.loads(row)
                assert "id" in record
                assert "name" in record

    def test_to_jsonl_one_object_per_line(self):
        """JSONL lines map one-to-one onto the stored items, in order."""
        collected = ItemList()
        collected.append({"line": 1})
        collected.append({"line": 2})

        with tempfile.TemporaryDirectory() as workdir:
            target = Path(workdir) / "output.jsonl"
            collected.to_jsonl(target)

            rows = target.read_text().strip().split("\n")

            assert json.loads(rows[0])["line"] == 1
            assert json.loads(rows[1])["line"] == 2
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
class TestCrawlStats:
    """Tests for the CrawlStats dataclass and its counters."""

    def test_default_values(self):
        """A fresh CrawlStats starts with all counters zeroed/empty."""
        cs = CrawlStats()

        assert cs.requests_count == 0
        assert cs.concurrent_requests == 0
        assert cs.failed_requests_count == 0
        assert cs.response_bytes == 0
        assert cs.items_scraped == 0
        assert cs.items_dropped == 0
        assert cs.start_time == 0.0
        assert cs.end_time == 0.0
        assert cs.custom_stats == {}
        assert cs.response_status_count == {}
        assert cs.proxies == []

    def test_elapsed_seconds(self):
        """elapsed_seconds is the difference between end and start times."""
        cs = CrawlStats(start_time=100.0, end_time=150.0)

        assert cs.elapsed_seconds == 50.0

    def test_requests_per_second(self):
        """requests_per_second divides the request count by elapsed time."""
        cs = CrawlStats(
            requests_count=100,
            start_time=0.0,
            end_time=10.0,
        )

        assert cs.requests_per_second == 10.0

    def test_requests_per_second_zero_elapsed(self):
        """Zero elapsed time yields 0.0 instead of dividing by zero."""
        cs = CrawlStats(
            requests_count=100,
            start_time=0.0,
            end_time=0.0,
        )

        assert cs.requests_per_second == 0.0

    def test_increment_status(self):
        """increment_status tallies per-status-code counters."""
        cs = CrawlStats()

        cs.increment_status(200)
        cs.increment_status(200)
        cs.increment_status(404)

        assert cs.response_status_count == {"status_200": 2, "status_404": 1}

    def test_increment_response_bytes(self):
        """increment_response_bytes tracks totals overall and per domain."""
        cs = CrawlStats()

        cs.increment_response_bytes("example.com", 1000)
        cs.increment_response_bytes("example.com", 500)
        cs.increment_response_bytes("other.com", 2000)

        assert cs.response_bytes == 3500
        assert cs.domains_response_bytes == {
            "example.com": 1500,
            "other.com": 2000,
        }

    def test_increment_requests_count(self):
        """increment_requests_count tracks totals overall and per session."""
        cs = CrawlStats()

        cs.increment_requests_count("session1")
        cs.increment_requests_count("session1")
        cs.increment_requests_count("session2")

        assert cs.requests_count == 3
        assert cs.sessions_requests_count == {"session1": 2, "session2": 1}

    def test_to_dict(self):
        """to_dict exposes both raw counters and derived metrics."""
        cs = CrawlStats(
            items_scraped=10,
            items_dropped=2,
            requests_count=15,
            start_time=0.0,
            end_time=5.0,
        )
        cs.increment_status(200)

        snapshot = cs.to_dict()

        assert snapshot["items_scraped"] == 10
        assert snapshot["items_dropped"] == 2
        assert snapshot["requests_count"] == 15
        assert snapshot["elapsed_seconds"] == 5.0
        assert snapshot["requests_per_second"] == 3.0
        assert snapshot["response_status_count"] == {"status_200": 1}

    def test_custom_stats(self):
        """custom_stats is a free-form dict surfaced through to_dict."""
        cs = CrawlStats()
        cs.custom_stats["my_metric"] = 42
        cs.custom_stats["another"] = "value"

        assert cs.custom_stats["my_metric"] == 42
        assert cs.to_dict()["custom_stats"]["my_metric"] == 42
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
class TestCrawlResult:
    """Tests for the CrawlResult dataclass."""

    def test_basic_creation(self):
        """CrawlResult wires together stats and items; paused defaults to False."""
        cs = CrawlStats(items_scraped=5)
        scraped = ItemList()
        scraped.extend([{"id": i} for i in range(5)])

        outcome = CrawlResult(stats=cs, items=scraped)

        assert outcome.stats.items_scraped == 5
        assert len(outcome.items) == 5
        assert outcome.paused is False

    def test_completed_property_true_when_not_paused(self):
        """A crawl that was not paused reports itself as completed."""
        outcome = CrawlResult(
            stats=CrawlStats(),
            items=ItemList(),
            paused=False,
        )

        assert outcome.completed is True

    def test_completed_property_false_when_paused(self):
        """A paused crawl is not completed."""
        outcome = CrawlResult(
            stats=CrawlStats(),
            items=ItemList(),
            paused=True,
        )

        assert outcome.completed is False

    def test_len_returns_item_count(self):
        """len(result) delegates to the item list."""
        scraped = ItemList()
        scraped.extend([{"id": i} for i in range(10)])

        outcome = CrawlResult(stats=CrawlStats(), items=scraped)

        assert len(outcome) == 10

    def test_iter_yields_items(self):
        """Iterating a CrawlResult yields the scraped items in order."""
        scraped = ItemList()
        scraped.extend([{"id": 1}, {"id": 2}, {"id": 3}])

        outcome = CrawlResult(stats=CrawlStats(), items=scraped)

        assert list(outcome) == [{"id": 1}, {"id": 2}, {"id": 3}]

    def test_result_with_stats(self):
        """Stats attached to a result keep their derived metrics."""
        cs = CrawlStats(
            requests_count=100,
            items_scraped=50,
            failed_requests_count=5,
            start_time=0.0,
            end_time=10.0,
        )

        outcome = CrawlResult(stats=cs, items=ItemList())

        assert outcome.stats.requests_count == 100
        assert outcome.stats.items_scraped == 50
        assert outcome.stats.requests_per_second == 10.0
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
class TestCrawlResultIntegration:
    """End-to-end exercise of CrawlStats + ItemList + CrawlResult together."""

    def test_full_workflow(self):
        """Simulate a small crawl and verify the aggregated result."""
        cs = CrawlStats(start_time=1000.0)

        # Ten successful requests against one domain.
        for _ in range(10):
            cs.increment_requests_count("default")
            cs.increment_status(200)
            cs.increment_response_bytes("example.com", 5000)

        # A couple of failures and one block.
        cs.failed_requests_count = 2
        cs.blocked_requests_count = 1

        # Eight items scraped during the run.
        scraped = ItemList()
        for i in range(8):
            scraped.append({"product_id": i, "name": f"Product {i}"})
            cs.items_scraped += 1

        # Close out the crawl window.
        cs.end_time = 1005.0

        outcome = CrawlResult(stats=cs, items=scraped, paused=False)

        assert outcome.completed is True
        assert len(outcome) == 8
        assert outcome.stats.requests_count == 10
        assert outcome.stats.requests_per_second == 2.0
        assert outcome.stats.response_bytes == 50000
|
tests/spiders/test_scheduler.py
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the Scheduler class."""
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
|
| 5 |
+
from scrapling.spiders.request import Request
|
| 6 |
+
from scrapling.spiders.scheduler import Scheduler
|
| 7 |
+
from scrapling.spiders.checkpoint import CheckpointData
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class TestSchedulerInit:
    """Tests for Scheduler construction."""

    def test_scheduler_starts_empty(self):
        """A new scheduler has no pending requests."""
        sched = Scheduler()

        assert len(sched) == 0
        assert sched.is_empty is True
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class TestSchedulerEnqueue:
    """Tests for adding requests to the Scheduler."""

    @pytest.mark.asyncio
    async def test_enqueue_single_request(self):
        """A single request is accepted and counted."""
        sched = Scheduler()
        req = Request("https://example.com")

        accepted = await sched.enqueue(req)

        assert accepted is True
        assert len(sched) == 1
        assert sched.is_empty is False

    @pytest.mark.asyncio
    async def test_enqueue_multiple_requests(self):
        """Several distinct requests all land in the queue."""
        sched = Scheduler()

        for idx in range(5):
            await sched.enqueue(Request(f"https://example.com/{idx}"))

        assert len(sched) == 5

    @pytest.mark.asyncio
    async def test_enqueue_duplicate_filtered(self):
        """Requests with identical fingerprints are dropped by default."""
        sched = Scheduler()

        first = Request("https://example.com", sid="s1")
        second = Request("https://example.com", sid="s1")  # Same fingerprint

        first_accepted = await sched.enqueue(first)
        second_accepted = await sched.enqueue(second)

        assert first_accepted is True
        assert second_accepted is False  # Duplicate filtered
        assert len(sched) == 1

    @pytest.mark.asyncio
    async def test_enqueue_duplicate_allowed_with_dont_filter(self):
        """dont_filter=True bypasses fingerprint deduplication."""
        sched = Scheduler()

        first = Request("https://example.com", sid="s1")
        second = Request("https://example.com", sid="s1", dont_filter=True)

        first_accepted = await sched.enqueue(first)
        second_accepted = await sched.enqueue(second)

        assert first_accepted is True
        assert second_accepted is True
        assert len(sched) == 2

    @pytest.mark.asyncio
    async def test_enqueue_different_sessions_not_duplicate(self):
        """The session id is part of the fingerprint, so the same URL in two sessions is not a duplicate."""
        sched = Scheduler()

        first = Request("https://example.com", sid="session1")
        second = Request("https://example.com", sid="session2")

        first_accepted = await sched.enqueue(first)
        second_accepted = await sched.enqueue(second)

        assert first_accepted is True
        assert second_accepted is True
        assert len(sched) == 2
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class TestSchedulerDequeue:
    """Tests for removing requests from the Scheduler."""

    @pytest.mark.asyncio
    async def test_dequeue_returns_request(self):
        """Dequeue hands back the request that was enqueued."""
        sched = Scheduler()
        queued = Request("https://example.com")

        await sched.enqueue(queued)
        popped = await sched.dequeue()

        assert popped.url == queued.url

    @pytest.mark.asyncio
    async def test_dequeue_respects_priority_order(self):
        """Requests come out highest-priority first, regardless of insert order."""
        sched = Scheduler()

        await sched.enqueue(Request("https://example.com/low", priority=1))
        await sched.enqueue(Request("https://example.com/high", priority=10))
        await sched.enqueue(Request("https://example.com/medium", priority=5))

        assert (await sched.dequeue()).url == "https://example.com/high"
        assert (await sched.dequeue()).url == "https://example.com/medium"
        assert (await sched.dequeue()).url == "https://example.com/low"

    @pytest.mark.asyncio
    async def test_dequeue_fifo_for_same_priority(self):
        """Equal-priority requests preserve insertion (FIFO) order."""
        sched = Scheduler()

        for idx in range(3):
            await sched.enqueue(Request(f"https://example.com/{idx}", priority=5))

        assert (await sched.dequeue()).url == "https://example.com/0"
        assert (await sched.dequeue()).url == "https://example.com/1"
        assert (await sched.dequeue()).url == "https://example.com/2"

    @pytest.mark.asyncio
    async def test_dequeue_updates_length(self):
        """Each dequeue shrinks the queue until it is empty."""
        sched = Scheduler()

        await sched.enqueue(Request("https://example.com/1"))
        await sched.enqueue(Request("https://example.com/2"))

        assert len(sched) == 2

        await sched.dequeue()
        assert len(sched) == 1

        await sched.dequeue()
        assert len(sched) == 0
        assert sched.is_empty is True
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
class TestSchedulerSnapshot:
    """Tests for snapshotting the Scheduler state for checkpoints."""

    @pytest.mark.asyncio
    async def test_snapshot_empty_scheduler(self):
        """An empty scheduler snapshots to an empty list and empty seen set."""
        sched = Scheduler()

        pending, seen = sched.snapshot()

        assert pending == []
        assert seen == set()

    @pytest.mark.asyncio
    async def test_snapshot_captures_pending_requests(self):
        """Snapshot lists every pending request, ordered by priority."""
        sched = Scheduler()

        await sched.enqueue(Request("https://example.com/1", priority=5))
        await sched.enqueue(Request("https://example.com/2", priority=10))
        await sched.enqueue(Request("https://example.com/3", priority=1))

        pending, seen = sched.snapshot()

        assert len(pending) == 3
        # Sorted highest priority first (queue stores negated priorities).
        assert pending[0].url == "https://example.com/2"  # priority 10
        assert pending[1].url == "https://example.com/1"  # priority 5
        assert pending[2].url == "https://example.com/3"  # priority 1

    @pytest.mark.asyncio
    async def test_snapshot_captures_seen_set(self):
        """Snapshot includes the sid:url fingerprints of seen requests."""
        sched = Scheduler()

        await sched.enqueue(Request("https://example.com/1", sid="s1"))
        await sched.enqueue(Request("https://example.com/2", sid="s1"))

        pending, seen = sched.snapshot()

        assert len(seen) == 2
        assert "s1:https://example.com/1" in seen
        assert "s1:https://example.com/2" in seen

    @pytest.mark.asyncio
    async def test_snapshot_returns_copies(self):
        """Mutating a snapshot must not change the scheduler's own state."""
        sched = Scheduler()

        await sched.enqueue(Request("https://example.com"))

        pending, seen = sched.snapshot()

        # Tamper with the snapshot copies.
        pending.append(Request("https://modified.com"))
        seen.add("new_fingerprint")

        fresh_pending, fresh_seen = sched.snapshot()

        assert len(fresh_pending) == 1
        assert "new_fingerprint" not in fresh_seen

    @pytest.mark.asyncio
    async def test_snapshot_excludes_dequeued_requests(self):
        """Snapshot contains only still-pending requests, but seen keeps all."""
        sched = Scheduler()

        await sched.enqueue(Request("https://example.com/1"))
        await sched.enqueue(Request("https://example.com/2"))
        await sched.enqueue(Request("https://example.com/3"))

        # Remove one request from the queue.
        await sched.dequeue()

        pending, seen = sched.snapshot()

        # Two remain pending...
        assert len(pending) == 2
        # ...but the seen set still tracks all three for deduplication.
        assert len(seen) == 3
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
class TestSchedulerRestore:
    """Test Scheduler restore functionality from checkpoint.

    Defect fixed vs. the original: ``test_restore_seen_set`` constructed a
    Request, assigned ``request.sid = ""`` and left comments about building a
    matching fingerprint, but never used any of it — dead code left over from
    an unfinished idea. The unused locals and stale comments are removed; the
    assertion on the restored seen set is unchanged.
    """

    @pytest.mark.asyncio
    async def test_restore_requests(self):
        """Restoring checkpoint data re-populates the pending queue."""
        scheduler = Scheduler()

        checkpoint_requests = [
            Request("https://example.com/1", priority=10),
            Request("https://example.com/2", priority=5),
        ]
        checkpoint_seen = {"fp1", "fp2", "fp3"}

        data = CheckpointData(requests=checkpoint_requests, seen=checkpoint_seen)

        scheduler.restore(data)

        assert len(scheduler) == 2

    @pytest.mark.asyncio
    async def test_restore_seen_set(self):
        """Restore re-installs the seen fingerprints used for deduplication."""
        scheduler = Scheduler()

        data = CheckpointData(
            requests=[],
            seen={"fp1", "fp2"},
        )

        scheduler.restore(data)

        # The snapshot must reflect exactly the restored fingerprints.
        _, seen = scheduler.snapshot()
        assert seen == {"fp1", "fp2"}

    @pytest.mark.asyncio
    async def test_restore_maintains_priority_order(self):
        """Restored requests dequeue in priority order."""
        scheduler = Scheduler()

        # Requests are stored pre-sorted by priority in the checkpoint.
        checkpoint_requests = [
            Request("https://example.com/high", priority=10),
            Request("https://example.com/low", priority=1),
        ]

        data = CheckpointData(requests=checkpoint_requests, seen=set())
        scheduler.restore(data)

        # Dequeue should return high priority first.
        first = await scheduler.dequeue()
        assert first.url == "https://example.com/high"

        second = await scheduler.dequeue()
        assert second.url == "https://example.com/low"

    @pytest.mark.asyncio
    async def test_restore_empty_checkpoint(self):
        """Restoring an empty checkpoint leaves the scheduler empty."""
        scheduler = Scheduler()

        data = CheckpointData(requests=[], seen=set())
        scheduler.restore(data)

        assert len(scheduler) == 0
        assert scheduler.is_empty is True
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
class TestSchedulerIntegration:
    """Round-trip tests combining Scheduler snapshot and restore."""

    @pytest.mark.asyncio
    async def test_snapshot_and_restore_roundtrip(self):
        """A restored scheduler replays the same requests as the original."""
        source = Scheduler()

        await source.enqueue(Request("https://example.com/1", sid="s1", priority=10))
        await source.enqueue(Request("https://example.com/2", sid="s1", priority=5))
        await source.enqueue(Request("https://example.com/3", sid="s2", priority=7))

        # Capture state into checkpoint data.
        pending, seen = source.snapshot()
        data = CheckpointData(requests=pending, seen=seen)

        # Rebuild a second scheduler from that checkpoint.
        clone = Scheduler()
        clone.restore(data)

        assert len(clone) == len(source)

        # Both schedulers must produce identical dequeue sequences.
        for _ in range(3):
            expected = await source.dequeue()
            actual = await clone.dequeue()
            assert expected.url == actual.url
            assert expected.priority == actual.priority

    @pytest.mark.asyncio
    async def test_partial_processing_then_checkpoint(self):
        """After partial processing, snapshot reflects pending vs seen counts."""
        sched = Scheduler()

        for idx in range(5):
            await sched.enqueue(Request(f"https://example.com/{idx}"))

        # Consume two of the five.
        await sched.dequeue()
        await sched.dequeue()

        pending, seen = sched.snapshot()

        # Three remain pending; all five stay in the seen set.
        assert len(pending) == 3
        assert len(seen) == 5

    @pytest.mark.asyncio
    async def test_deduplication_after_restore(self):
        """The restored seen set still filters duplicate requests."""
        sched = Scheduler()

        await sched.enqueue(Request("https://example.com", sid="s1"))

        pending, seen = sched.snapshot()
        data = CheckpointData(requests=pending, seen=seen)

        fresh = Scheduler()
        fresh.restore(data)

        # Re-enqueueing the same fingerprint must be rejected.
        accepted = await fresh.enqueue(Request("https://example.com", sid="s1"))

        assert accepted is False  # Duplicate filtered based on restored seen set
|
tests/spiders/test_session.py
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the SessionManager class."""
|
| 2 |
+
|
| 3 |
+
from scrapling.core._types import Any
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
from scrapling.spiders.session import SessionManager
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class MockSession:  # type: ignore[type-arg]
    """Network-free stand-in for a real session object.

    Tracks three lifecycle flags so tests can assert whether the session
    was ever started, is currently alive, or has been closed.
    """

    def __init__(self, name: str = "mock"):
        self.name = name
        self._is_alive = False  # True only between __aenter__ and __aexit__
        self._started = False   # set once __aenter__ has run
        self._closed = False    # set once __aexit__ has run

    async def __aenter__(self):
        self._started = True
        self._is_alive = True
        return self

    async def __aexit__(self, *args):
        self._closed = True
        self._is_alive = False

    async def fetch(self, url: str, **kwargs):
        # Intentionally a no-op: these tests never touch the network.
        pass
| 30 |
+
|
| 31 |
+
class TestSessionManagerInit:
    """Tests for SessionManager construction defaults."""

    def test_manager_starts_empty(self):
        """A freshly built manager holds zero sessions."""
        assert len(SessionManager()) == 0

    def test_manager_no_default_session_when_empty(self):
        """Reading default_session_id raises RuntimeError while empty."""
        empty = SessionManager()
        with pytest.raises(RuntimeError, match="No sessions registered"):
            _ = empty.default_session_id
|
| 48 |
+
class TestSessionManagerAdd:
    """Tests for registering sessions with SessionManager."""

    def test_add_single_session(self):
        """A single registered session is tracked by its ID."""
        mgr = SessionManager()
        mgr.add("test", MockSession())

        assert len(mgr) == 1
        assert "test" in mgr
        assert mgr.session_ids == ["test"]

    def test_first_session_becomes_default(self):
        """The first registration implicitly becomes the default."""
        mgr = SessionManager()
        mgr.add("first", MockSession())

        assert mgr.default_session_id == "first"

    def test_add_multiple_sessions(self):
        """Several sessions can be registered side by side."""
        mgr = SessionManager()
        mgr.add("session1", MockSession("s1"))
        mgr.add("session2", MockSession("s2"))
        mgr.add("session3", MockSession("s3"))

        assert len(mgr) == 3
        for ident in ("session1", "session2", "session3"):
            assert ident in mgr

    def test_explicit_default_session(self):
        """default=True overrides the implicit first-added default."""
        mgr = SessionManager()
        mgr.add("first", MockSession())
        mgr.add("second", MockSession(), default=True)

        assert mgr.default_session_id == "second"

    def test_add_duplicate_id_raises(self):
        """Registering the same ID twice raises ValueError."""
        mgr = SessionManager()
        mgr.add("test", MockSession())

        with pytest.raises(ValueError, match="already registered"):
            mgr.add("test", MockSession())

    def test_add_returns_self_for_chaining(self):
        """add returns the manager itself to allow chaining."""
        mgr = SessionManager()
        assert mgr.add("test", MockSession()) is mgr

    def test_method_chaining(self):
        """The fluent interface registers every chained session."""
        mgr = SessionManager()
        mgr.add("s1", MockSession()).add("s2", MockSession()).add("s3", MockSession())

        assert len(mgr) == 3

    def test_add_lazy_session(self):
        """lazy=True records the session in the lazy set."""
        mgr = SessionManager()
        mgr.add("lazy", MockSession(), lazy=True)

        assert "lazy" in mgr
        assert "lazy" in mgr._lazy_sessions
|
| 127 |
+
class TestSessionManagerRemove:
    """Tests for removing and popping sessions."""

    def test_remove_session(self):
        """remove drops the session entirely."""
        mgr = SessionManager()
        mgr.add("test", MockSession())

        mgr.remove("test")

        assert "test" not in mgr
        assert len(mgr) == 0

    def test_remove_nonexistent_raises(self):
        """Removing an unknown ID raises KeyError."""
        with pytest.raises(KeyError, match="not found"):
            SessionManager().remove("nonexistent")

    def test_pop_returns_session(self):
        """pop removes the session and hands back the exact object."""
        mgr = SessionManager()
        registered = MockSession("original")
        mgr.add("test", registered)

        assert mgr.pop("test") is registered
        assert "test" not in mgr

    def test_remove_default_updates_default(self):
        """Removing the default promotes another session to default."""
        mgr = SessionManager()
        mgr.add("first", MockSession())
        mgr.add("second", MockSession())
        assert mgr.default_session_id == "first"

        mgr.remove("first")

        assert mgr.default_session_id == "second"

    def test_remove_lazy_session_cleans_up(self):
        """Removing a lazy session also clears its lazy marker."""
        mgr = SessionManager()
        mgr.add("lazy", MockSession(), lazy=True)

        mgr.remove("lazy")

        assert "lazy" not in mgr._lazy_sessions
|
| 180 |
+
class TestSessionManagerGet:
    """Tests for looking sessions up by ID."""

    def test_get_existing_session(self):
        """get returns the exact registered object."""
        mgr = SessionManager()
        registered = MockSession("test")
        mgr.add("test", registered)

        assert mgr.get("test") is registered

    def test_get_nonexistent_raises_with_available(self):
        """Looking up a missing ID lists the available IDs in the error."""
        mgr = SessionManager()
        mgr.add("session1", MockSession())
        mgr.add("session2", MockSession())

        with pytest.raises(KeyError, match="Available:"):
            mgr.get("nonexistent")
|
| 203 |
+
class TestSessionManagerContains:
    """Tests for membership checks on SessionManager."""

    def test_contains_existing(self):
        """`in` finds a registered session ID."""
        mgr = SessionManager()
        mgr.add("test", MockSession())

        assert "test" in mgr

    def test_not_contains_missing(self):
        """`in` rejects an unregistered session ID."""
        mgr = SessionManager()
        mgr.add("test", MockSession())

        assert "other" not in mgr
|
| 221 |
+
class TestSessionManagerAsyncContext:
    """Tests for SessionManager's async start/close lifecycle."""

    @pytest.mark.asyncio
    async def test_start_activates_sessions(self):
        """start() opens every eagerly-registered session."""
        mgr = SessionManager()
        sess = MockSession()
        mgr.add("test", sess)

        await mgr.start()

        assert sess._is_alive is True
        assert mgr._started is True

    @pytest.mark.asyncio
    async def test_start_skips_lazy_sessions(self):
        """start() leaves lazily-registered sessions untouched."""
        mgr = SessionManager()
        eager = MockSession("eager")
        deferred = MockSession("lazy")
        mgr.add("eager", eager)
        mgr.add("lazy", deferred, lazy=True)

        await mgr.start()

        assert eager._is_alive is True
        assert deferred._is_alive is False

    @pytest.mark.asyncio
    async def test_close_deactivates_sessions(self):
        """close() shuts down every open session."""
        mgr = SessionManager()
        sess = MockSession()
        mgr.add("test", sess)

        await mgr.start()
        assert sess._is_alive is True

        await mgr.close()
        assert sess._is_alive is False
        assert mgr._started is False

    @pytest.mark.asyncio
    async def test_async_context_manager(self):
        """`async with` starts sessions on entry and closes them on exit."""
        mgr = SessionManager()
        sess = MockSession()
        mgr.add("test", sess)

        async with mgr:
            assert sess._is_alive is True

        assert sess._is_alive is False

    @pytest.mark.asyncio
    async def test_start_idempotent(self):
        """Calling start() twice neither raises nor double-starts."""
        mgr = SessionManager()
        sess = MockSession()
        mgr.add("test", sess)

        await mgr.start()
        await mgr.start()  # second call must be a safe no-op

        assert sess._started is True
|
| 290 |
+
class TestSessionManagerProperties:
    """Tests for SessionManager's simple accessors."""

    def test_session_ids_returns_list(self):
        """session_ids is a list containing every registered ID."""
        mgr = SessionManager()
        for ident in ("a", "b", "c"):
            mgr.add(ident, MockSession())

        ids = mgr.session_ids

        assert isinstance(ids, list)
        assert set(ids) == {"a", "b", "c"}

    def test_len_returns_session_count(self):
        """len() tracks the number of registered sessions."""
        mgr = SessionManager()
        assert len(mgr) == 0

        mgr.add("s1", MockSession())
        assert len(mgr) == 1

        mgr.add("s2", MockSession())
        assert len(mgr) == 2
+
|
| 318 |
+
class TestSessionManagerIntegration:
    """End-to-end scenarios combining several SessionManager features."""

    def test_realistic_setup(self):
        """A mixed eager/lazy registration behaves as configured."""
        mgr = SessionManager()
        mgr.add("default", MockSession("default"))
        mgr.add("backup", MockSession("backup"))
        mgr.add("lazy_special", MockSession("special"), lazy=True)

        assert len(mgr) == 3
        assert mgr.default_session_id == "default"
        assert "lazy_special" in mgr._lazy_sessions

    @pytest.mark.asyncio
    async def test_lifecycle_management(self):
        """Sessions are inactive before start, active after, inactive after close."""
        mgr = SessionManager()
        mocks = [MockSession(f"s{idx}") for idx in range(3)]
        for idx, mock in enumerate(mocks):
            mgr.add(f"session{idx}", mock)

        # Before start: nothing is active.
        assert not any(mock._is_alive for mock in mocks)

        # After start: everything is active.
        await mgr.start()
        assert all(mock._is_alive for mock in mocks)

        # After close: everything is inactive again.
        await mgr.close()
        assert not any(mock._is_alive for mock in mocks)
tests/spiders/test_spider.py
ADDED
|
@@ -0,0 +1,574 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the Spider class and related components."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
import tempfile
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
|
| 9 |
+
from scrapling.spiders.spider import Spider, SessionConfigurationError, LogCounterHandler, BLOCKED_CODES
|
| 10 |
+
from scrapling.spiders.request import Request
|
| 11 |
+
from scrapling.spiders.session import SessionManager
|
| 12 |
+
from scrapling.spiders.result import CrawlStats
|
| 13 |
+
from scrapling.core._types import Any, Dict, AsyncGenerator
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class TestLogCounterHandler:
    """Tests for LogCounterHandler's per-level message counting."""

    @staticmethod
    def _record(level: int) -> logging.LogRecord:
        """Build a minimal LogRecord at the given severity level."""
        return logging.LogRecord(
            name="test",
            level=level,
            pathname="",
            lineno=0,
            msg="test",
            args=(),
            exc_info=None,
        )

    def test_initial_counts_are_zero(self):
        """All level counters start at zero."""
        counts = LogCounterHandler().get_counts()
        for level_name in ("debug", "info", "warning", "error", "critical"):
            assert counts[level_name] == 0

    def test_counts_debug_messages(self):
        """Two emitted debug records bump the debug counter to 2."""
        handler = LogCounterHandler()
        handler.emit(self._record(logging.DEBUG))
        handler.emit(self._record(logging.DEBUG))

        assert handler.get_counts()["debug"] == 2

    def test_counts_info_messages(self):
        """An info record increments the info counter."""
        handler = LogCounterHandler()
        handler.emit(self._record(logging.INFO))

        assert handler.get_counts()["info"] == 1

    def test_counts_warning_messages(self):
        """A warning record increments the warning counter."""
        handler = LogCounterHandler()
        handler.emit(self._record(logging.WARNING))

        assert handler.get_counts()["warning"] == 1

    def test_counts_error_messages(self):
        """An error record increments the error counter."""
        handler = LogCounterHandler()
        handler.emit(self._record(logging.ERROR))

        assert handler.get_counts()["error"] == 1

    def test_counts_critical_messages(self):
        """A critical record increments the critical counter."""
        handler = LogCounterHandler()
        handler.emit(self._record(logging.CRITICAL))

        assert handler.get_counts()["critical"] == 1

    def test_counts_multiple_levels(self):
        """Counters stay independent across mixed-level records."""
        handler = LogCounterHandler()
        mixed_levels = (
            logging.DEBUG,
            logging.DEBUG,
            logging.INFO,
            logging.WARNING,
            logging.ERROR,
            logging.ERROR,
            logging.ERROR,
            logging.CRITICAL,
        )
        for level in mixed_levels:
            handler.emit(self._record(level))

        counts = handler.get_counts()
        assert counts["debug"] == 2
        assert counts["info"] == 1
        assert counts["warning"] == 1
        assert counts["error"] == 3
        assert counts["critical"] == 1
+
|
| 151 |
+
class TestBlockedCodes:
    """Tests for the BLOCKED_CODES status-code set."""

    def test_blocked_codes_contains_expected_values(self):
        """Every known auth/rate-limit/server-failure code is present."""
        # 401 Unauthorized, 403 Forbidden, 407 Proxy Auth Required,
        # 429 Too Many Requests, 444 nginx connection-closed,
        # 500/502/503/504 server-side failures.
        for code in (401, 403, 407, 429, 444, 500, 502, 503, 504):
            assert code in BLOCKED_CODES

    def test_blocked_codes_does_not_contain_success(self):
        """Success and redirect codes are never treated as blocks."""
        for code in (200, 201, 204, 301, 302):
            assert code not in BLOCKED_CODES
|
| 175 |
+
class ConcreteSpider(Spider):
    """Concrete spider implementation for testing.

    Minimal Spider subclass used as a shared fixture: it declares the
    required ``name``/``start_urls`` attributes and a ``parse`` that
    yields one item per response.
    """

    name = "test_spider"
    start_urls = ["https://example.com"]

    async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
        # Echo the response back as a single scraped item.
        yield {"url": str(response)}
|
| 185 |
+
class TestSpiderInit:
    """Tests for Spider construction and basic attributes."""

    def test_spider_requires_name(self):
        """Instantiating a spider without a name raises ValueError."""

        class NoNameSpider(Spider):
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        with pytest.raises(ValueError, match="must have a name"):
            NoNameSpider()

    def test_spider_initializes_logger(self):
        """Each spider gets a logger namespaced under its name."""
        spider = ConcreteSpider()

        assert spider.logger is not None
        assert spider.logger.name == "scrapling.spiders.test_spider"

    def test_spider_logger_has_log_counter(self):
        """The spider attaches a LogCounterHandler to its logger."""
        counter = ConcreteSpider()._log_counter

        assert counter is not None
        assert isinstance(counter, LogCounterHandler)

    def test_spider_with_crawldir(self):
        """A crawldir argument is stored as a pathlib.Path."""
        with tempfile.TemporaryDirectory() as workdir:
            assert ConcreteSpider(crawldir=workdir).crawldir == Path(workdir)

    def test_spider_without_crawldir(self):
        """Without a crawldir argument, the attribute stays None."""
        assert ConcreteSpider().crawldir is None

    def test_spider_custom_interval(self):
        """A custom checkpoint interval is honored."""
        assert ConcreteSpider(interval=60.0)._interval == 60.0

    def test_spider_default_interval(self):
        """The checkpoint interval defaults to 300 seconds."""
        assert ConcreteSpider()._interval == 300.0

    def test_spider_repr(self):
        """repr includes both the class name and the spider name."""
        text = repr(ConcreteSpider())

        assert "ConcreteSpider" in text
        assert "test_spider" in text
|
| 247 |
+
class TestSpiderClassAttributes:
    """Tests pinning Spider's class-level defaults."""

    def test_default_concurrent_requests(self):
        """concurrent_requests defaults to 16."""
        assert ConcreteSpider.concurrent_requests == 16

    def test_default_concurrent_requests_per_domain(self):
        """Per-domain concurrency defaults to 0, i.e. disabled."""
        assert ConcreteSpider.concurrent_requests_per_domain == 0

    def test_default_download_delay(self):
        """download_delay defaults to zero seconds."""
        assert ConcreteSpider.download_delay == 0.0

    def test_default_max_blocked_retries(self):
        """max_blocked_retries defaults to 3."""
        assert ConcreteSpider.max_blocked_retries == 3

    def test_default_logging_level(self):
        """logging_level defaults to DEBUG."""
        assert ConcreteSpider.logging_level == logging.DEBUG

    def test_default_allowed_domains_empty(self):
        """allowed_domains defaults to an empty set."""
        assert ConcreteSpider.allowed_domains == set()
|
| 275 |
+
class TestSpiderSessionConfiguration:
    """Tests for the session-configuration hook on Spider."""

    def test_default_configure_sessions(self):
        """The default hook registers at least one session."""
        assert len(ConcreteSpider()._session_manager) > 0

    def test_configure_sessions_error_raises_custom_exception(self):
        """Exceptions inside configure_sessions surface as SessionConfigurationError."""

        class BadSessionSpider(Spider):
            name = "bad_spider"

            def configure_sessions(self, manager: SessionManager) -> None:
                raise RuntimeError("Configuration failed!")

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        with pytest.raises(SessionConfigurationError, match="Configuration failed"):
            BadSessionSpider()

    def test_configure_sessions_no_sessions_raises(self):
        """Leaving the manager empty raises SessionConfigurationError."""

        class NoSessionSpider(Spider):
            name = "no_session_spider"

            def configure_sessions(self, manager: SessionManager) -> None:
                # Deliberately register nothing.
                pass

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        with pytest.raises(SessionConfigurationError, match="did not add any sessions"):
            NoSessionSpider()
|
| 315 |
+
class TestSpiderStartRequests:
    """Tests for Spider.start_requests behavior."""

    @pytest.mark.asyncio
    async def test_start_requests_yields_from_start_urls(self):
        """Each entry in start_urls becomes exactly one Request, in order."""

        class MultiUrlSpider(Spider):
            name = "multi_url"
            start_urls = [
                "https://example.com/1",
                "https://example.com/2",
                "https://example.com/3",
            ]

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        produced = [req async for req in MultiUrlSpider().start_requests()]

        assert len(produced) == 3
        assert [req.url for req in produced] == [
            "https://example.com/1",
            "https://example.com/2",
            "https://example.com/3",
        ]

    @pytest.mark.asyncio
    async def test_start_requests_no_urls_raises(self):
        """An empty start_urls makes start_requests raise RuntimeError."""

        class NoUrlSpider(Spider):
            name = "no_url"
            start_urls = []

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        spider = NoUrlSpider()

        with pytest.raises(RuntimeError, match="no starting point"):
            async for _ in spider.start_requests():
                pass

    @pytest.mark.asyncio
    async def test_start_requests_uses_default_session(self):
        """Generated requests carry the session manager's default session ID."""
        spider = ConcreteSpider()
        produced = [req async for req in spider.start_requests()]

        assert produced[0].sid == spider._session_manager.default_session_id
|
| 369 |
+
class TestSpiderHooks:
    """Test Spider lifecycle hooks."""

    @pytest.mark.asyncio
    async def test_on_start_default(self):
        """Test default on_start doesn't raise."""
        spider = ConcreteSpider()

        # Both the fresh-start and resume paths must be safe no-ops.
        for resuming in (False, True):
            await spider.on_start(resuming=resuming)

    @pytest.mark.asyncio
    async def test_on_close_default(self):
        """Test default on_close doesn't raise."""
        spider = ConcreteSpider()

        # Default shutdown hook is a no-op.
        await spider.on_close()

    @pytest.mark.asyncio
    async def test_on_error_default(self):
        """Test default on_error logs the error."""
        spider = ConcreteSpider()
        failed_request = Request("https://example.com")

        # Default error hook must swallow (log) the error, not re-raise it.
        await spider.on_error(failed_request, ValueError("test error"))

    @pytest.mark.asyncio
    async def test_on_scraped_item_default_returns_item(self):
        """Test default on_scraped_item returns the item unchanged."""
        spider = ConcreteSpider()
        payload = {"key": "value", "nested": {"a": 1}}

        assert await spider.on_scraped_item(payload) == payload

    @pytest.mark.asyncio
    async def test_is_blocked_default_checks_status_codes(self):
        """Test default is_blocked checks blocked status codes."""

        class FakeResponse:
            def __init__(self, status: int):
                self.status = status

        spider = ConcreteSpider()

        # Statuses conventionally used for blocking are flagged...
        for blocked_status in (403, 429, 503):
            assert await spider.is_blocked(FakeResponse(blocked_status)) is True

        # ...while ordinary success/not-found responses are not.
        for ok_status in (200, 404):
            assert await spider.is_blocked(FakeResponse(ok_status)) is False

    @pytest.mark.asyncio
    async def test_retry_blocked_request_default_returns_request(self):
        """Test default retry_blocked_request returns the request unchanged."""

        class FakeResponse:
            status = 429

        spider = ConcreteSpider()
        original = Request("https://example.com", priority=5)

        # The default implementation hands back the very same request object.
        assert await spider.retry_blocked_request(original, FakeResponse()) is original
|
| 441 |
+
|
| 442 |
+
|
| 443 |
+
class TestSpiderPause:
    """Test Spider pause functionality."""

    def test_pause_without_engine_raises(self):
        """Test that pause without active engine raises RuntimeError."""
        spider = ConcreteSpider()

        # Pausing is only meaningful while a crawl engine is running.
        with pytest.raises(RuntimeError, match="no crawl engine started"):
            spider.pause()
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
class TestSpiderStats:
    """Test Spider stats property."""

    def test_stats_without_engine_raises(self):
        """Test that accessing stats without active crawl raises."""
        spider = ConcreteSpider()

        # Stats only exist during an active crawl; reading the property
        # beforehand must fail loudly.
        with pytest.raises(RuntimeError, match="No active crawl"):
            _ = spider.stats
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
class TestSpiderCustomization:
    """Test Spider customization patterns."""

    def test_custom_concurrent_requests(self):
        """Test spider with custom concurrent_requests."""

        class CustomSpider(Spider):
            name = "custom"
            concurrent_requests = 32
            start_urls = ["https://example.com"]

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        # The class-level override survives instantiation unchanged.
        assert CustomSpider().concurrent_requests == 32

    def test_custom_allowed_domains(self):
        """Test spider with allowed_domains."""

        class DomainSpider(Spider):
            name = "domain_spider"
            start_urls = ["https://example.com"]
            allowed_domains = {"example.com", "api.example.com"}

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        spider = DomainSpider()
        # Both the apex domain and the subdomain must be retained.
        assert {"example.com", "api.example.com"} <= spider.allowed_domains

    def test_custom_download_delay(self):
        """Test spider with download delay."""

        class SlowSpider(Spider):
            name = "slow"
            download_delay = 1.5
            start_urls = ["https://example.com"]

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        assert SlowSpider().download_delay == 1.5
|
| 510 |
+
|
| 511 |
+
|
| 512 |
+
class TestSpiderLogging:
    """Test Spider logging configuration."""

    def test_custom_logging_level(self):
        """Test spider with custom logging level."""

        class QuietSpider(Spider):
            name = "quiet"
            logging_level = logging.WARNING
            start_urls = ["https://example.com"]

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        spider = QuietSpider()
        assert spider.logger.level == logging.WARNING

    def test_log_file_creates_handler(self):
        """Test spider with log file creates file handler."""
        with tempfile.TemporaryDirectory() as tmpdir:
            log_path = Path(tmpdir) / "spider.log"

            class FileLogSpider(Spider):
                name = "file_log"
                log_file = str(log_path)
                start_urls = ["https://example.com"]

                async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                    yield None

            spider = FileLogSpider()

            # Should have exactly one file handler for the configured path.
            file_handlers = [
                h for h in spider.logger.handlers if isinstance(h, logging.FileHandler)
            ]
            assert len(file_handlers) == 1

            # Clean up: detach AND close each handler. Loggers are
            # process-global (keyed by name), so a closed handler left
            # attached would make later logging through this logger raise,
            # and an open file handle would block TemporaryDirectory
            # cleanup on Windows.
            for h in file_handlers:
                spider.logger.removeHandler(h)
                h.close()

    def test_logger_does_not_propagate(self):
        """Test that spider logger does not propagate to parent."""
        spider = ConcreteSpider()

        assert spider.logger.propagate is False
|
| 559 |
+
|
| 560 |
+
|
| 561 |
+
class TestSessionConfigurationError:
    """Test SessionConfigurationError exception."""

    def test_exception_message(self):
        """Test that exception preserves message."""
        err = SessionConfigurationError("Custom error message")

        # str() on the exception must round-trip the original message.
        assert str(err) == "Custom error message"

    def test_exception_is_exception(self):
        """Test that it's a proper exception."""
        assert isinstance(SessionConfigurationError("test"), Exception)
|