Karim shoair committed on
Commit
f37031b
·
1 Parent(s): fab7a59

test: add tests for the spiders system

Browse files

This was generated by Opus via Claude Code. It looks very good based on my review and the instructions I gave, but I will take another look later.

tests/spiders/__init__.py ADDED
File without changes
tests/spiders/test_checkpoint.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the CheckpointManager and CheckpointData classes."""
2
+
3
+ import pickle
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+ import anyio
9
+
10
+ from scrapling.spiders.request import Request
11
+ from scrapling.spiders.checkpoint import CheckpointData, CheckpointManager
12
+
13
+
14
class TestCheckpointData:
    """Unit tests for the CheckpointData dataclass."""

    def test_default_values(self):
        """A freshly constructed CheckpointData starts out empty."""
        checkpoint = CheckpointData()

        assert checkpoint.requests == []
        assert checkpoint.seen == set()

    def test_with_requests_and_seen(self):
        """Requests and seen fingerprints are stored exactly as given."""
        pending = [
            Request("https://example.com/1", priority=10),
            Request("https://example.com/2", priority=5),
        ]
        fingerprints = {"url1", "url2", "url3"}

        checkpoint = CheckpointData(requests=pending, seen=fingerprints)

        assert len(checkpoint.requests) == 2
        assert checkpoint.requests[0].url == "https://example.com/1"
        assert checkpoint.seen == {"url1", "url2", "url3"}

    def test_pickle_roundtrip(self):
        """A pickle round-trip preserves both the requests and the seen set."""
        checkpoint = CheckpointData(
            requests=[Request("https://example.com", priority=5)],
            seen={"fingerprint1", "fingerprint2"},
        )

        restored = pickle.loads(pickle.dumps(checkpoint))

        assert len(restored.requests) == 1
        assert restored.requests[0].url == "https://example.com"
        assert restored.seen == {"fingerprint1", "fingerprint2"}
50
+
51
+
52
class TestCheckpointManagerInit:
    """Test CheckpointManager initialization.

    Path assertions are normalized through ``pathlib.Path`` on both sides
    so the tests also pass on Windows, where ``str(Path(...))`` renders
    backslash separators instead of the hard-coded POSIX ``/``.
    """

    def test_init_with_string_path(self):
        """Test initialization with a string path."""
        manager = CheckpointManager("/tmp/test_crawl")

        # Compare Path objects, not raw strings: separator-agnostic.
        assert Path(str(manager.crawldir)) == Path("/tmp/test_crawl")
        assert manager.interval == 300.0

    def test_init_with_pathlib_path(self):
        """Test initialization with a pathlib.Path."""
        path = Path("/tmp/test_crawl")
        manager = CheckpointManager(path)

        assert Path(str(manager.crawldir)) == Path("/tmp/test_crawl")

    def test_init_with_custom_interval(self):
        """Test initialization with a custom interval."""
        manager = CheckpointManager("/tmp/test", interval=60.0)
        assert manager.interval == 60.0

    def test_init_with_zero_interval(self):
        """Test initialization with zero interval (disables periodic checkpoints)."""
        manager = CheckpointManager("/tmp/test", interval=0)
        assert manager.interval == 0

    def test_init_with_negative_interval_raises(self):
        """Test that a negative interval raises ValueError."""
        with pytest.raises(ValueError, match="greater than 0"):
            CheckpointManager("/tmp/test", interval=-1)

    def test_init_with_invalid_interval_type_raises(self):
        """Test that an invalid interval type raises TypeError."""
        with pytest.raises(TypeError, match="integer or float"):
            CheckpointManager("/tmp/test", interval="invalid")  # type: ignore

    def test_checkpoint_file_path(self):
        """Test that the checkpoint file path is correctly constructed."""
        manager = CheckpointManager("/tmp/test_crawl")

        # Build the expectation with pathlib so the separator matches the OS.
        expected_path = Path("/tmp/test_crawl") / "checkpoint.pkl"
        assert Path(str(manager._checkpoint_path)) == expected_path
95
+
96
+
97
class TestCheckpointManagerOperations:
    """Test CheckpointManager save/load/cleanup operations."""

    @pytest.fixture
    def temp_dir(self):
        """Create a temporary directory for testing."""
        # Yield (not return) so the directory lives for the test's duration
        # and is removed automatically afterwards.
        with tempfile.TemporaryDirectory() as tmpdir:
            yield Path(tmpdir)

    @pytest.mark.asyncio
    async def test_has_checkpoint_false_when_no_file(self, temp_dir: Path):
        """Test has_checkpoint returns False when no checkpoint exists."""
        manager = CheckpointManager(temp_dir / "crawl")

        result = await manager.has_checkpoint()

        assert result is False

    @pytest.mark.asyncio
    async def test_save_creates_checkpoint_file(self, temp_dir: Path):
        """Test that save creates the checkpoint file."""
        crawl_dir = temp_dir / "crawl"
        manager = CheckpointManager(crawl_dir)

        data = CheckpointData(
            requests=[Request("https://example.com")],
            seen={"fp1", "fp2"},
        )

        await manager.save(data)

        # The on-disk filename is fixed to "checkpoint.pkl" inside crawldir.
        checkpoint_path = crawl_dir / "checkpoint.pkl"
        assert checkpoint_path.exists()

    @pytest.mark.asyncio
    async def test_save_creates_directory_if_not_exists(self, temp_dir: Path):
        """Test that save creates the directory if it doesn't exist."""
        # Several levels deep: save must mkdir(parents=True) equivalent.
        crawl_dir = temp_dir / "nested" / "crawl" / "dir"
        manager = CheckpointManager(crawl_dir)

        data = CheckpointData()
        await manager.save(data)

        assert crawl_dir.exists()

    @pytest.mark.asyncio
    async def test_has_checkpoint_true_after_save(self, temp_dir: Path):
        """Test has_checkpoint returns True after saving."""
        manager = CheckpointManager(temp_dir / "crawl")

        data = CheckpointData()
        await manager.save(data)

        result = await manager.has_checkpoint()
        assert result is True

    @pytest.mark.asyncio
    async def test_load_returns_none_when_no_checkpoint(self, temp_dir: Path):
        """Test load returns None when no checkpoint exists."""
        manager = CheckpointManager(temp_dir / "crawl")

        result = await manager.load()

        assert result is None

    @pytest.mark.asyncio
    async def test_save_and_load_roundtrip(self, temp_dir: Path):
        """Test saving and loading checkpoint data."""
        manager = CheckpointManager(temp_dir / "crawl")

        original_data = CheckpointData(
            requests=[
                Request("https://example.com/1", priority=10),
                Request("https://example.com/2", priority=5),
            ],
            seen={"fp1", "fp2", "fp3"},
        )

        await manager.save(original_data)
        loaded_data = await manager.load()

        # Requests, their order, priorities, and the seen set must survive.
        assert loaded_data is not None
        assert len(loaded_data.requests) == 2
        assert loaded_data.requests[0].url == "https://example.com/1"
        assert loaded_data.requests[0].priority == 10
        assert loaded_data.seen == {"fp1", "fp2", "fp3"}

    @pytest.mark.asyncio
    async def test_save_is_atomic(self, temp_dir: Path):
        """Test that save uses atomic write (temp file + rename)."""
        crawl_dir = temp_dir / "crawl"
        manager = CheckpointManager(crawl_dir)

        data = CheckpointData(requests=[Request("https://example.com")])
        await manager.save(data)

        # Temp file should not exist after successful save
        # NOTE(review): assumes the implementation names its scratch file
        # "checkpoint.tmp" — an implementation detail this test pins.
        temp_path = crawl_dir / "checkpoint.tmp"
        assert not temp_path.exists()

        # Checkpoint file should exist
        checkpoint_path = crawl_dir / "checkpoint.pkl"
        assert checkpoint_path.exists()

    @pytest.mark.asyncio
    async def test_cleanup_removes_checkpoint_file(self, temp_dir: Path):
        """Test that cleanup removes the checkpoint file."""
        crawl_dir = temp_dir / "crawl"
        manager = CheckpointManager(crawl_dir)

        # Save a checkpoint first
        data = CheckpointData()
        await manager.save(data)

        checkpoint_path = crawl_dir / "checkpoint.pkl"
        assert checkpoint_path.exists()

        # Cleanup should remove it
        await manager.cleanup()

        assert not checkpoint_path.exists()

    @pytest.mark.asyncio
    async def test_cleanup_no_error_when_no_file(self, temp_dir: Path):
        """Test that cleanup doesn't raise error when no file exists."""
        manager = CheckpointManager(temp_dir / "crawl")

        # Should not raise
        await manager.cleanup()

    @pytest.mark.asyncio
    async def test_load_returns_none_on_corrupt_file(self, temp_dir: Path):
        """Test load returns None when checkpoint file is corrupt."""
        crawl_dir = temp_dir / "crawl"
        crawl_dir.mkdir(parents=True)

        # Write garbage where the pickle is expected; load must not raise.
        checkpoint_path = crawl_dir / "checkpoint.pkl"
        checkpoint_path.write_bytes(b"not valid pickle data")

        manager = CheckpointManager(crawl_dir)

        result = await manager.load()

        assert result is None

    @pytest.mark.asyncio
    async def test_multiple_saves_overwrite(self, temp_dir: Path):
        """Test that multiple saves overwrite the checkpoint."""
        manager = CheckpointManager(temp_dir / "crawl")

        # First save
        data1 = CheckpointData(
            requests=[Request("https://example.com/1")],
            seen={"fp1"},
        )
        await manager.save(data1)

        # Second save
        data2 = CheckpointData(
            requests=[Request("https://example.com/2"), Request("https://example.com/3")],
            seen={"fp2", "fp3"},
        )
        await manager.save(data2)

        # Load should return the second save
        loaded = await manager.load()

        assert loaded is not None
        assert len(loaded.requests) == 2
        assert loaded.requests[0].url == "https://example.com/2"
        assert loaded.seen == {"fp2", "fp3"}
268
+
269
+
270
class TestCheckpointManagerEdgeCases:
    """Edge-case coverage for CheckpointManager."""

    @pytest.fixture
    def temp_dir(self):
        """Yield a throwaway directory that is cleaned up after the test."""
        with tempfile.TemporaryDirectory() as tmpdir:
            yield Path(tmpdir)

    @pytest.mark.asyncio
    async def test_save_empty_checkpoint(self, temp_dir: Path):
        """An entirely empty checkpoint survives a save/load round-trip."""
        manager = CheckpointManager(temp_dir / "crawl")

        await manager.save(CheckpointData(requests=[], seen=set()))
        loaded = await manager.load()

        assert loaded is not None
        assert loaded.requests == []
        assert loaded.seen == set()

    @pytest.mark.asyncio
    async def test_save_large_checkpoint(self, temp_dir: Path):
        """A checkpoint holding many requests round-trips intact."""
        manager = CheckpointManager(temp_dir / "crawl")

        # 1000 pending requests plus 2000 seen fingerprints.
        pending = [
            Request(f"https://example.com/{i}", priority=i % 10)
            for i in range(1000)
        ]
        fingerprints = {f"fp_{i}" for i in range(2000)}

        await manager.save(CheckpointData(requests=pending, seen=fingerprints))
        loaded = await manager.load()

        assert loaded is not None
        assert len(loaded.requests) == 1000
        assert len(loaded.seen) == 2000

    @pytest.mark.asyncio
    async def test_requests_preserve_metadata(self, temp_dir: Path):
        """Every request attribute survives the checkpoint round-trip."""
        manager = CheckpointManager(temp_dir / "crawl")

        source_request = Request(
            url="https://example.com",
            sid="my_session",
            priority=42,
            dont_filter=True,
            meta={"item_id": 123, "page": 5},
            proxy="http://proxy:8080",
        )

        await manager.save(CheckpointData(requests=[source_request], seen=set()))
        loaded = await manager.load()

        assert loaded is not None
        restored = loaded.requests[0]

        assert restored.url == "https://example.com"
        assert restored.sid == "my_session"
        assert restored.priority == 42
        assert restored.dont_filter is True
        assert restored.meta == {"item_id": 123, "page": 5}
        assert restored._session_kwargs == {"proxy": "http://proxy:8080"}
tests/spiders/test_request.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the Request class."""
2
+
3
+ import pickle
4
+
5
+ import pytest
6
+
7
+ from scrapling.spiders.request import Request
8
+ from scrapling.core._types import Any, Dict, AsyncGenerator
9
+
10
+
11
class TestRequestCreation:
    """Tests covering Request construction and default attribute values."""

    def test_basic_request_creation(self):
        """A URL-only request gets neutral defaults everywhere else."""
        request = Request("https://example.com")

        assert request.url == "https://example.com"
        assert request.sid == ""
        assert request.callback is None
        assert request.priority == 0
        assert request.dont_filter is False
        assert request.meta == {}
        assert request._retry_count == 0
        assert request._session_kwargs == {}

    def test_request_with_all_parameters(self):
        """Every constructor argument lands on the matching attribute."""

        async def my_callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield {"test": "data"}

        request = Request(
            url="https://example.com/page",
            sid="my_session",
            callback=my_callback,
            priority=10,
            dont_filter=True,
            meta={"key": "value"},
            _retry_count=2,
            proxy="http://proxy:8080",
            timeout=30,
        )

        assert request.url == "https://example.com/page"
        assert request.sid == "my_session"
        assert request.callback == my_callback
        assert request.priority == 10
        assert request.dont_filter is True
        assert request.meta == {"key": "value"}
        assert request._retry_count == 2
        # Extra keyword arguments are collected into the session kwargs.
        assert request._session_kwargs == {"proxy": "http://proxy:8080", "timeout": 30}

    def test_request_meta_default_is_empty_dict(self):
        """Default meta dicts are per-instance, never a shared reference."""
        first = Request("https://example.com")
        second = Request("https://example.com")

        first.meta["key"] = "value"

        assert first.meta == {"key": "value"}
        assert second.meta == {}
63
+
64
+
65
class TestRequestProperties:
    """Tests for Request's derived properties (domain and fingerprint)."""

    def test_domain_extraction(self):
        """The domain property is the URL's netloc, path and query dropped."""
        assert Request("https://www.example.com/path/page.html?query=1").domain == "www.example.com"

    def test_domain_with_port(self):
        """A port number stays attached to the domain."""
        assert Request("http://localhost:8080/api").domain == "localhost:8080"

    def test_domain_with_subdomain(self):
        """Subdomains are kept verbatim."""
        assert Request("https://api.v2.example.com/endpoint").domain == "api.v2.example.com"

    def test_fingerprint_includes_session_and_url(self):
        """The fingerprint is the session id and URL joined by a colon."""
        assert Request("https://example.com", sid="session1")._fp == "session1:https://example.com"

    def test_fingerprint_empty_session(self):
        """An empty session id leaves a leading colon in the fingerprint."""
        assert Request("https://example.com")._fp == ":https://example.com"
92
+
93
+
94
class TestRequestCopy:
    """Tests for Request.copy()."""

    def test_copy_creates_independent_request(self):
        """copy() duplicates every field into a brand-new object."""

        async def callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield None

        source = Request(
            url="https://example.com",
            sid="session",
            callback=callback,
            priority=5,
            dont_filter=True,
            meta={"original": True},
            _retry_count=1,
            proxy="http://proxy:8080",
        )

        clone = source.copy()

        # Field-by-field equality against the source request.
        assert clone.url == source.url
        assert clone.sid == source.sid
        assert clone.callback == source.callback
        assert clone.priority == source.priority
        assert clone.dont_filter == source.dont_filter
        assert clone.meta == source.meta
        assert clone._retry_count == source._retry_count
        assert clone._session_kwargs == source._session_kwargs

        # Distinct identities: the meta dict must be copied, not shared.
        assert clone is not source
        assert clone.meta is not source.meta

    def test_copy_meta_is_independent(self):
        """Mutating the copy's meta leaves the original untouched."""
        source = Request("https://example.com", meta={"key": "original"})
        clone = source.copy()

        clone.meta["key"] = "modified"
        clone.meta["new_key"] = "new_value"

        assert source.meta == {"key": "original"}
        assert clone.meta == {"key": "modified", "new_key": "new_value"}
140
+
141
+
142
class TestRequestComparison:
    """Test Request comparison operators."""

    def test_priority_less_than(self):
        """Test less than comparison by priority."""
        low_priority = Request("https://example.com/1", priority=1)
        high_priority = Request("https://example.com/2", priority=10)

        assert low_priority < high_priority
        assert not high_priority < low_priority

    def test_priority_greater_than(self):
        """Test greater than comparison by priority."""
        low_priority = Request("https://example.com/1", priority=1)
        high_priority = Request("https://example.com/2", priority=10)

        assert high_priority > low_priority
        assert not low_priority > high_priority

    def test_equality_by_fingerprint(self):
        """Test equality comparison by fingerprint."""
        r1 = Request("https://example.com", sid="session1")
        r2 = Request("https://example.com", sid="session1")
        r3 = Request("https://example.com", sid="session2")

        assert r1 == r2
        assert r1 != r3

    def test_equality_different_priorities_same_fingerprint(self):
        """Test requests with same fingerprint are equal despite different priorities."""
        r1 = Request("https://example.com", sid="s1", priority=1)
        r2 = Request("https://example.com", sid="s1", priority=100)

        assert r1 == r2  # Same fingerprint means equal

    def test_comparison_with_non_request(self):
        """Dunder comparisons with non-Request types return NotImplemented."""
        request = Request("https://example.com")

        # NotImplemented is a singleton: check it by identity with `is`,
        # not `==` — equality only works by accident of object identity.
        assert request.__lt__("not a request") is NotImplemented
        assert request.__gt__("not a request") is NotImplemented
        assert request.__eq__("not a request") is NotImplemented
184
+
185
+
186
class TestRequestStringRepresentation:
    """Tests for __str__ and __repr__ of Request."""

    def test_str_returns_url(self):
        """str() of a request is simply its URL."""
        assert str(Request("https://example.com/page")) == "https://example.com/page"

    def test_repr_without_callback(self):
        """repr() mentions the class, URL, priority, and a None callback."""
        rendered = repr(Request("https://example.com", priority=5))

        for expected in ("Request", "https://example.com", "priority=5", "callback=None"):
            assert expected in rendered

    def test_repr_with_callback(self):
        """repr() shows the callback's function name when one is set."""

        async def my_custom_callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield None

        rendered = repr(Request("https://example.com", callback=my_custom_callback))

        assert "callback=my_custom_callback" in rendered
214
+
215
+
216
class TestRequestPickling:
    """Tests for Request serialization used by checkpointing."""

    def test_pickle_without_callback(self):
        """A callback-less request survives pickling unchanged."""
        source = Request(
            url="https://example.com",
            sid="session",
            priority=5,
            meta={"key": "value"},
        )

        restored = pickle.loads(pickle.dumps(source))

        assert restored.url == source.url
        assert restored.sid == source.sid
        assert restored.priority == source.priority
        assert restored.meta == source.meta
        assert restored.callback is None

    def test_pickle_with_callback_stores_name(self):
        """__getstate__ swaps the callback object for its name."""

        async def parse_page(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield {"data": "test"}

        source = Request("https://example.com", callback=parse_page)

        # Callables don't pickle; the state keeps only the name.
        state = source.__getstate__()
        assert state["_callback_name"] == "parse_page"
        assert state["callback"] is None

    def test_pickle_with_none_callback(self):
        """With no callback, the stored callback name is None too."""
        state = Request("https://example.com", callback=None).__getstate__()

        assert state["_callback_name"] is None
        assert state["callback"] is None

    def test_setstate_stores_callback_name(self):
        """__setstate__ keeps the _callback_name for later restoration."""
        request = Request("https://example.com")
        state = {
            "url": "https://example.com",
            "sid": "",
            "callback": None,
            "priority": 0,
            "dont_filter": False,
            "meta": {},
            "_retry_count": 0,
            "_session_kwargs": {},
            "_callback_name": "custom_parse",
        }

        request.__setstate__(state)

        assert hasattr(request, "_callback_name")
        assert request._callback_name == "custom_parse"

    def test_pickle_roundtrip_preserves_session_kwargs(self):
        """Session kwargs round-trip through pickle intact."""
        source = Request(
            "https://example.com",
            proxy="http://proxy:8080",
            timeout=30,
            headers={"User-Agent": "test"},
        )

        restored = pickle.loads(pickle.dumps(source))

        assert restored._session_kwargs == {
            "proxy": "http://proxy:8080",
            "timeout": 30,
            "headers": {"User-Agent": "test"},
        }
295
+
296
+
297
class TestRequestRestoreCallback:
    """Test callback restoration from spider."""

    def test_restore_callback_from_spider(self):
        """Test restoring callback from spider instance."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

            async def parse_detail(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield {"detail": True}

        spider = MockSpider()
        request = Request("https://example.com")
        # Simulate post-unpickle state: only the callback's name is kept.
        request._callback_name = "parse_detail"

        request._restore_callback(spider)  # type: ignore[arg-type]

        assert request.callback == spider.parse_detail
        # The temporary name attribute must be removed after restoration.
        assert not hasattr(request, "_callback_name")

    def test_restore_callback_falls_back_to_parse(self):
        """Test that missing callback falls back to spider.parse."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        spider = MockSpider()
        request = Request("https://example.com")
        # The spider has no method by this name, so parse is the fallback.
        request._callback_name = "nonexistent_method"

        request._restore_callback(spider)  # type: ignore[arg-type]

        assert request.callback == spider.parse
        assert not hasattr(request, "_callback_name")

    def test_restore_callback_with_none_name(self):
        """Test restore callback when _callback_name is None."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        spider = MockSpider()
        request = Request("https://example.com")
        request._callback_name = None

        request._restore_callback(spider)  # type: ignore[arg-type]

        # Should clean up _callback_name attribute
        assert not hasattr(request, "_callback_name")

    def test_restore_callback_without_callback_name_attr(self):
        """Test restore callback when _callback_name attribute doesn't exist."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        spider = MockSpider()
        request = Request("https://example.com")
        # Don't set _callback_name

        # Should not raise an error
        request._restore_callback(spider)  # type: ignore[arg-type]
tests/spiders/test_result.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the result module (ItemList, CrawlStats, CrawlResult)."""
2
+
3
+ import json
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+ from scrapling.spiders.result import ItemList, CrawlStats, CrawlResult
10
+
11
+
12
class TestItemList:
    """Tests for ItemList and its JSON export helpers."""

    def test_itemlist_is_list(self):
        """ItemList inherits from the builtin list."""
        assert isinstance(ItemList(), list)

    def test_itemlist_basic_operations(self):
        """Standard list operations behave as expected."""
        items = ItemList()
        items.append({"id": 1})
        items.append({"id": 2})

        assert len(items) == 2
        assert items[0] == {"id": 1}

    def test_to_json_creates_file(self):
        """to_json writes the items out as a JSON array."""
        items = ItemList()
        items.append({"name": "test", "value": 123})
        items.append({"name": "test2", "value": 456})

        with tempfile.TemporaryDirectory() as tmpdir:
            target = Path(tmpdir) / "output.json"
            items.to_json(target)

            assert target.exists()

            exported = json.loads(target.read_text())
            assert len(exported) == 2
            assert exported[0]["name"] == "test"

    def test_to_json_creates_parent_directory(self):
        """to_json builds any missing parent directories."""
        items = ItemList()
        items.append({"data": "test"})

        with tempfile.TemporaryDirectory() as tmpdir:
            target = Path(tmpdir) / "nested" / "dirs" / "output.json"
            items.to_json(target)

            assert target.exists()

    def test_to_json_with_indent(self):
        """Indented output is spread over multiple lines."""
        items = ItemList()
        items.append({"key": "value"})

        with tempfile.TemporaryDirectory() as tmpdir:
            target = Path(tmpdir) / "output.json"
            items.to_json(target, indent=True)

            # Pretty-printed JSON necessarily contains newlines.
            assert "\n" in target.read_text()

    def test_to_jsonl_creates_file(self):
        """to_jsonl writes one JSON document per line."""
        items = ItemList()
        items.append({"id": 1, "name": "first"})
        items.append({"id": 2, "name": "second"})
        items.append({"id": 3, "name": "third"})

        with tempfile.TemporaryDirectory() as tmpdir:
            target = Path(tmpdir) / "output.jsonl"
            items.to_jsonl(target)

            assert target.exists()

            lines = target.read_text().strip().split("\n")
            assert len(lines) == 3

            # Each line must parse as standalone JSON carrying both keys.
            for line in lines:
                record = json.loads(line)
                assert "id" in record
                assert "name" in record

    def test_to_jsonl_one_object_per_line(self):
        """Objects come out one per line, in insertion order."""
        items = ItemList()
        items.append({"line": 1})
        items.append({"line": 2})

        with tempfile.TemporaryDirectory() as tmpdir:
            target = Path(tmpdir) / "output.jsonl"
            items.to_jsonl(target)

            lines = target.read_text().strip().split("\n")

            assert json.loads(lines[0])["line"] == 1
            assert json.loads(lines[1])["line"] == 2
107
+
108
+
109
class TestCrawlStats:
    """Test CrawlStats dataclass."""

    def test_default_values(self):
        """Test CrawlStats default values."""
        stats = CrawlStats()

        assert stats.requests_count == 0
        assert stats.concurrent_requests == 0
        assert stats.failed_requests_count == 0
        assert stats.response_bytes == 0
        assert stats.items_scraped == 0
        assert stats.items_dropped == 0
        assert stats.start_time == 0.0
        assert stats.end_time == 0.0
        assert stats.custom_stats == {}
        assert stats.response_status_count == {}
        assert stats.proxies == []

    def test_elapsed_seconds(self):
        """Test elapsed_seconds property."""
        stats = CrawlStats(start_time=100.0, end_time=150.0)

        assert stats.elapsed_seconds == 50.0

    def test_requests_per_second(self):
        """Test requests_per_second calculation."""
        stats = CrawlStats(
            requests_count=100,
            start_time=0.0,
            end_time=10.0,
        )

        assert stats.requests_per_second == 10.0

    def test_requests_per_second_zero_elapsed(self):
        """Test requests_per_second when elapsed is zero."""
        # Guards against a ZeroDivisionError in the property.
        stats = CrawlStats(
            requests_count=100,
            start_time=0.0,
            end_time=0.0,
        )

        assert stats.requests_per_second == 0.0

    def test_increment_status(self):
        """Test increment_status method."""
        stats = CrawlStats()

        stats.increment_status(200)
        stats.increment_status(200)
        stats.increment_status(404)

        # Status codes are keyed as "status_<code>" strings.
        assert stats.response_status_count == {"status_200": 2, "status_404": 1}

    def test_increment_response_bytes(self):
        """Test increment_response_bytes method."""
        stats = CrawlStats()

        stats.increment_response_bytes("example.com", 1000)
        stats.increment_response_bytes("example.com", 500)
        stats.increment_response_bytes("other.com", 2000)

        # Bytes accumulate both globally and per domain.
        assert stats.response_bytes == 3500
        assert stats.domains_response_bytes == {
            "example.com": 1500,
            "other.com": 2000,
        }

    def test_increment_requests_count(self):
        """Test increment_requests_count method."""
        stats = CrawlStats()

        stats.increment_requests_count("session1")
        stats.increment_requests_count("session1")
        stats.increment_requests_count("session2")

        # Counts accumulate both globally and per session.
        assert stats.requests_count == 3
        assert stats.sessions_requests_count == {"session1": 2, "session2": 1}

    def test_to_dict(self):
        """Test to_dict method returns all stats."""
        stats = CrawlStats(
            items_scraped=10,
            items_dropped=2,
            requests_count=15,
            start_time=0.0,
            end_time=5.0,
        )
        stats.increment_status(200)

        result = stats.to_dict()

        # The dict includes derived values (elapsed, rate), not just fields.
        assert result["items_scraped"] == 10
        assert result["items_dropped"] == 2
        assert result["requests_count"] == 15
        assert result["elapsed_seconds"] == 5.0
        assert result["requests_per_second"] == 3.0
        assert result["response_status_count"] == {"status_200": 1}

    def test_custom_stats(self):
        """Test custom_stats can be used."""
        stats = CrawlStats()
        stats.custom_stats["my_metric"] = 42
        stats.custom_stats["another"] = "value"

        assert stats.custom_stats["my_metric"] == 42
        assert stats.to_dict()["custom_stats"]["my_metric"] == 42
217
+
218
+
219
class TestCrawlResult:
    """Test CrawlResult dataclass."""

    def test_basic_creation(self):
        """A result wraps its stats and items, and is not paused by default."""
        collected = ItemList()
        collected.extend([{"id": i} for i in range(5)])

        result = CrawlResult(stats=CrawlStats(items_scraped=5), items=collected)

        assert result.stats.items_scraped == 5
        assert len(result.items) == 5
        assert result.paused is False

    def test_completed_property_true_when_not_paused(self):
        """completed mirrors the inverse of the paused flag."""
        result = CrawlResult(stats=CrawlStats(), items=ItemList(), paused=False)
        assert result.completed is True

    def test_completed_property_false_when_paused(self):
        """A paused crawl is reported as not completed."""
        result = CrawlResult(stats=CrawlStats(), items=ItemList(), paused=True)
        assert result.completed is False

    def test_len_returns_item_count(self):
        """len(result) delegates to the item list."""
        collected = ItemList()
        collected.extend([{"id": i} for i in range(10)])
        assert len(CrawlResult(stats=CrawlStats(), items=collected)) == 10

    def test_iter_yields_items(self):
        """Iterating a result walks the scraped items in order."""
        collected = ItemList()
        collected.extend([{"id": 1}, {"id": 2}, {"id": 3}])

        result = CrawlResult(stats=CrawlStats(), items=collected)

        assert [item for item in result] == [{"id": 1}, {"id": 2}, {"id": 3}]

    def test_result_with_stats(self):
        """Populated stats (including derived metrics) stay accessible on the result."""
        crawl_stats = CrawlStats(
            requests_count=100,
            items_scraped=50,
            failed_requests_count=5,
            start_time=0.0,
            end_time=10.0,
        )

        result = CrawlResult(stats=crawl_stats, items=ItemList())

        assert result.stats.requests_count == 100
        assert result.stats.items_scraped == 50
        assert result.stats.requests_per_second == 10.0
290
+
291
+
292
class TestCrawlResultIntegration:
    """Integration tests for result classes."""

    def test_full_workflow(self):
        """Drive CrawlStats, ItemList, and CrawlResult together like a real crawl."""
        stats = CrawlStats(start_time=1000.0)

        # Ten successful 5 KB responses against a single domain.
        for _ in range(10):
            stats.increment_requests_count("default")
            stats.increment_status(200)
            stats.increment_response_bytes("example.com", 5000)

        # A couple of failures on top.
        stats.failed_requests_count = 2
        stats.blocked_requests_count = 1

        # Eight items scraped.
        items = ItemList()
        for i in range(8):
            items.append({"product_id": i, "name": f"Product {i}"})
            stats.items_scraped += 1

        # The crawl ends five seconds after it started.
        stats.end_time = 1005.0

        result = CrawlResult(stats=stats, items=items, paused=False)

        assert result.completed is True
        assert len(result) == 8
        assert result.stats.requests_count == 10
        # 10 requests over 5 seconds.
        assert result.stats.requests_per_second == 2.0
        assert result.stats.response_bytes == 50000
tests/spiders/test_scheduler.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the Scheduler class."""
2
+
3
+ import pytest
4
+
5
+ from scrapling.spiders.request import Request
6
+ from scrapling.spiders.scheduler import Scheduler
7
+ from scrapling.spiders.checkpoint import CheckpointData
8
+
9
+
10
class TestSchedulerInit:
    """Test Scheduler initialization."""

    def test_scheduler_starts_empty(self):
        """A freshly constructed scheduler has no queued requests."""
        fresh = Scheduler()
        assert len(fresh) == 0
        assert fresh.is_empty is True
19
+
20
+
21
class TestSchedulerEnqueue:
    """Test Scheduler enqueue functionality."""

    @pytest.mark.asyncio
    async def test_enqueue_single_request(self):
        """Enqueueing one request grows the queue and reports acceptance."""
        queue = Scheduler()

        accepted = await queue.enqueue(Request("https://example.com"))

        assert accepted is True
        assert len(queue) == 1
        assert queue.is_empty is False

    @pytest.mark.asyncio
    async def test_enqueue_multiple_requests(self):
        """Each distinct URL enqueued adds exactly one entry."""
        queue = Scheduler()

        for index in range(5):
            await queue.enqueue(Request(f"https://example.com/{index}"))

        assert len(queue) == 5

    @pytest.mark.asyncio
    async def test_enqueue_duplicate_filtered(self):
        """Two requests with the same fingerprint collapse to one entry."""
        queue = Scheduler()

        first = await queue.enqueue(Request("https://example.com", sid="s1"))
        second = await queue.enqueue(Request("https://example.com", sid="s1"))

        assert first is True
        assert second is False  # Duplicate filtered
        assert len(queue) == 1

    @pytest.mark.asyncio
    async def test_enqueue_duplicate_allowed_with_dont_filter(self):
        """dont_filter=True bypasses the duplicate filter."""
        queue = Scheduler()

        first = await queue.enqueue(Request("https://example.com", sid="s1"))
        second = await queue.enqueue(
            Request("https://example.com", sid="s1", dont_filter=True)
        )

        assert first is True
        assert second is True
        assert len(queue) == 2

    @pytest.mark.asyncio
    async def test_enqueue_different_sessions_not_duplicate(self):
        """The session id is part of the fingerprint, so same URL + new session is new."""
        queue = Scheduler()

        first = await queue.enqueue(Request("https://example.com", sid="session1"))
        second = await queue.enqueue(Request("https://example.com", sid="session2"))

        assert first is True
        assert second is True
        assert len(queue) == 2
91
+
92
+
93
class TestSchedulerDequeue:
    """Test Scheduler dequeue functionality."""

    @pytest.mark.asyncio
    async def test_dequeue_returns_request(self):
        """Dequeue hands back the request that was enqueued."""
        queue = Scheduler()
        original = Request("https://example.com")

        await queue.enqueue(original)

        assert (await queue.dequeue()).url == original.url

    @pytest.mark.asyncio
    async def test_dequeue_respects_priority_order(self):
        """Higher-priority requests come out first regardless of insertion order."""
        queue = Scheduler()
        await queue.enqueue(Request("https://example.com/low", priority=1))
        await queue.enqueue(Request("https://example.com/high", priority=10))
        await queue.enqueue(Request("https://example.com/medium", priority=5))

        drained = [(await queue.dequeue()).url for _ in range(3)]

        assert drained == [
            "https://example.com/high",
            "https://example.com/medium",
            "https://example.com/low",
        ]

    @pytest.mark.asyncio
    async def test_dequeue_fifo_for_same_priority(self):
        """Equal-priority requests preserve insertion (FIFO) order."""
        queue = Scheduler()
        for index in range(3):
            await queue.enqueue(Request(f"https://example.com/{index}", priority=5))

        drained = [(await queue.dequeue()).url for _ in range(3)]

        assert drained == [
            "https://example.com/0",
            "https://example.com/1",
            "https://example.com/2",
        ]

    @pytest.mark.asyncio
    async def test_dequeue_updates_length(self):
        """Every dequeue shrinks the queue until it is empty."""
        queue = Scheduler()
        await queue.enqueue(Request("https://example.com/1"))
        await queue.enqueue(Request("https://example.com/2"))
        assert len(queue) == 2

        await queue.dequeue()
        assert len(queue) == 1

        await queue.dequeue()
        assert len(queue) == 0
        assert queue.is_empty is True
164
+
165
+
166
class TestSchedulerSnapshot:
    """Test Scheduler snapshot functionality for checkpointing."""

    @pytest.mark.asyncio
    async def test_snapshot_empty_scheduler(self):
        """Snapshotting an empty scheduler yields an empty queue and seen set."""
        pending, fingerprints = Scheduler().snapshot()

        assert pending == []
        assert fingerprints == set()

    @pytest.mark.asyncio
    async def test_snapshot_captures_pending_requests(self):
        """Snapshot lists every pending request, highest priority first."""
        queue = Scheduler()
        await queue.enqueue(Request("https://example.com/1", priority=5))
        await queue.enqueue(Request("https://example.com/2", priority=10))
        await queue.enqueue(Request("https://example.com/3", priority=1))

        pending, _ = queue.snapshot()

        assert [req.url for req in pending] == [
            "https://example.com/2",  # priority 10
            "https://example.com/1",  # priority 5
            "https://example.com/3",  # priority 1
        ]

    @pytest.mark.asyncio
    async def test_snapshot_captures_seen_set(self):
        """Snapshot includes the "sid:url" fingerprints of everything enqueued."""
        queue = Scheduler()
        await queue.enqueue(Request("https://example.com/1", sid="s1"))
        await queue.enqueue(Request("https://example.com/2", sid="s1"))

        _, fingerprints = queue.snapshot()

        assert fingerprints == {
            "s1:https://example.com/1",
            "s1:https://example.com/2",
        }

    @pytest.mark.asyncio
    async def test_snapshot_returns_copies(self):
        """Mutating a snapshot must not leak back into the scheduler."""
        queue = Scheduler()
        await queue.enqueue(Request("https://example.com"))

        pending, fingerprints = queue.snapshot()
        pending.append(Request("https://modified.com"))
        fingerprints.add("new_fingerprint")

        fresh_pending, fresh_seen = queue.snapshot()

        assert len(fresh_pending) == 1
        assert "new_fingerprint" not in fresh_seen

    @pytest.mark.asyncio
    async def test_snapshot_excludes_dequeued_requests(self):
        """Dequeued requests leave the snapshot queue but stay in the seen set."""
        queue = Scheduler()
        await queue.enqueue(Request("https://example.com/1"))
        await queue.enqueue(Request("https://example.com/2"))
        await queue.enqueue(Request("https://example.com/3"))

        await queue.dequeue()

        pending, fingerprints = queue.snapshot()

        assert len(pending) == 2
        assert len(fingerprints) == 3  # Dedup history keeps all three
246
+
247
+
248
class TestSchedulerRestore:
    """Test Scheduler restore functionality from checkpoint."""

    @pytest.mark.asyncio
    async def test_restore_requests(self):
        """Restoring checkpoint data re-queues the pending requests."""
        scheduler = Scheduler()

        checkpoint_requests = [
            Request("https://example.com/1", priority=10),
            Request("https://example.com/2", priority=5),
        ]
        data = CheckpointData(requests=checkpoint_requests, seen={"fp1", "fp2", "fp3"})

        scheduler.restore(data)

        assert len(scheduler) == 2

    @pytest.mark.asyncio
    async def test_restore_seen_set(self):
        """Restore repopulates the deduplication fingerprints.

        Note: the original version of this test built a Request and mutated
        its sid without ever enqueueing or asserting on it; that dead code
        has been removed — the snapshot round-trip below is the real check.
        """
        scheduler = Scheduler()

        scheduler.restore(CheckpointData(requests=[], seen={"fp1", "fp2"}))

        # The restored fingerprints must come back unchanged in a snapshot.
        _, seen = scheduler.snapshot()
        assert seen == {"fp1", "fp2"}

    @pytest.mark.asyncio
    async def test_restore_maintains_priority_order(self):
        """Restored requests keep their priority ordering when dequeued."""
        scheduler = Scheduler()

        # Requests should already be sorted by priority in checkpoint
        checkpoint_requests = [
            Request("https://example.com/high", priority=10),
            Request("https://example.com/low", priority=1),
        ]
        scheduler.restore(CheckpointData(requests=checkpoint_requests, seen=set()))

        # Dequeue should return high priority first
        first = await scheduler.dequeue()
        assert first.url == "https://example.com/high"

        second = await scheduler.dequeue()
        assert second.url == "https://example.com/low"

    @pytest.mark.asyncio
    async def test_restore_empty_checkpoint(self):
        """Restoring an empty checkpoint leaves the scheduler empty."""
        scheduler = Scheduler()

        scheduler.restore(CheckpointData(requests=[], seen=set()))

        assert len(scheduler) == 0
        assert scheduler.is_empty is True
321
+
322
+
323
class TestSchedulerIntegration:
    """Integration tests for Scheduler with checkpoint roundtrip."""

    @pytest.mark.asyncio
    async def test_snapshot_and_restore_roundtrip(self):
        """A snapshot restored into a new scheduler reproduces the queue exactly."""
        original = Scheduler()
        await original.enqueue(Request("https://example.com/1", sid="s1", priority=10))
        await original.enqueue(Request("https://example.com/2", sid="s1", priority=5))
        await original.enqueue(Request("https://example.com/3", sid="s2", priority=7))

        pending, fingerprints = original.snapshot()

        restored = Scheduler()
        restored.restore(CheckpointData(requests=pending, seen=fingerprints))

        assert len(restored) == len(original)

        # Both schedulers must drain in the same order with equal priorities.
        for _ in range(3):
            expected = await original.dequeue()
            actual = await restored.dequeue()
            assert actual.url == expected.url
            assert actual.priority == expected.priority

    @pytest.mark.asyncio
    async def test_partial_processing_then_checkpoint(self):
        """After processing some requests, the snapshot splits pending vs. seen."""
        queue = Scheduler()
        for index in range(5):
            await queue.enqueue(Request(f"https://example.com/{index}"))

        # Process two of the five.
        await queue.dequeue()
        await queue.dequeue()

        pending, fingerprints = queue.snapshot()

        assert len(pending) == 3
        assert len(fingerprints) == 5

    @pytest.mark.asyncio
    async def test_deduplication_after_restore(self):
        """The restored seen set keeps filtering previously-enqueued requests."""
        queue = Scheduler()
        await queue.enqueue(Request("https://example.com", sid="s1"))

        pending, fingerprints = queue.snapshot()

        revived = Scheduler()
        revived.restore(CheckpointData(requests=pending, seen=fingerprints))

        accepted = await revived.enqueue(Request("https://example.com", sid="s1"))

        assert accepted is False  # Duplicate filtered based on restored seen set
tests/spiders/test_session.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the SessionManager class."""
2
+
3
+ from scrapling.core._types import Any
4
+ import pytest
5
+
6
+ from scrapling.spiders.session import SessionManager
7
+
8
+
9
class MockSession:
    """Mock session for testing without actual network calls."""

    def __init__(self, name: str = "mock"):
        self.name = name
        # Lifecycle flags inspected by the tests.
        self._is_alive = False
        self._started = False
        self._closed = False

    async def __aenter__(self):
        self._started = True
        self._is_alive = True
        return self

    async def __aexit__(self, *args):
        self._closed = True
        self._is_alive = False

    async def fetch(self, url: str, **kwargs):
        pass
29
+
30
+
31
class TestSessionManagerInit:
    """Test SessionManager initialization."""

    def test_manager_starts_empty(self):
        """A new manager holds no sessions."""
        assert len(SessionManager()) == 0

    def test_manager_no_default_session_when_empty(self):
        """Reading default_session_id with no sessions registered is an error."""
        empty = SessionManager()

        with pytest.raises(RuntimeError, match="No sessions registered"):
            _ = empty.default_session_id
46
+
47
+
48
class TestSessionManagerAdd:
    """Test SessionManager add functionality."""

    def test_add_single_session(self):
        """Adding one session registers it under the given id."""
        registry = SessionManager()
        registry.add("test", MockSession())

        assert len(registry) == 1
        assert "test" in registry
        assert registry.session_ids == ["test"]

    def test_first_session_becomes_default(self):
        """The first registered session is the implicit default."""
        registry = SessionManager()
        registry.add("first", MockSession())

        assert registry.default_session_id == "first"

    def test_add_multiple_sessions(self):
        """Several sessions can be registered under distinct ids."""
        registry = SessionManager()
        for label in ("session1", "session2", "session3"):
            registry.add(label, MockSession(label))

        assert len(registry) == 3
        for label in ("session1", "session2", "session3"):
            assert label in registry

    def test_explicit_default_session(self):
        """default=True overrides the first-added default."""
        registry = SessionManager()
        registry.add("first", MockSession())
        registry.add("second", MockSession(), default=True)

        assert registry.default_session_id == "second"

    def test_add_duplicate_id_raises(self):
        """Re-using a session id is rejected."""
        registry = SessionManager()
        registry.add("test", MockSession())

        with pytest.raises(ValueError, match="already registered"):
            registry.add("test", MockSession())

    def test_add_returns_self_for_chaining(self):
        """add returns the manager itself."""
        registry = SessionManager()

        assert registry.add("test", MockSession()) is registry

    def test_method_chaining(self):
        """add calls can be chained fluently."""
        registry = SessionManager()
        registry.add("s1", MockSession()).add("s2", MockSession()).add("s3", MockSession())

        assert len(registry) == 3

    def test_add_lazy_session(self):
        """lazy=True registers the session and marks it as lazily started."""
        registry = SessionManager()
        registry.add("lazy", MockSession(), lazy=True)

        assert "lazy" in registry
        assert "lazy" in registry._lazy_sessions
125
+
126
+
127
class TestSessionManagerRemove:
    """Test SessionManager remove/pop functionality."""

    def test_remove_session(self):
        """remove drops the session entirely."""
        registry = SessionManager()
        registry.add("test", MockSession())

        registry.remove("test")

        assert "test" not in registry
        assert len(registry) == 0

    def test_remove_nonexistent_raises(self):
        """Removing an unknown id is an error."""
        registry = SessionManager()

        with pytest.raises(KeyError, match="not found"):
            registry.remove("nonexistent")

    def test_pop_returns_session(self):
        """pop removes the session and hands back the same object."""
        registry = SessionManager()
        stored = MockSession("original")
        registry.add("test", stored)

        assert registry.pop("test") is stored
        assert "test" not in registry

    def test_remove_default_updates_default(self):
        """Removing the default promotes another session to default."""
        registry = SessionManager()
        registry.add("first", MockSession())
        registry.add("second", MockSession())
        assert registry.default_session_id == "first"

        registry.remove("first")

        assert registry.default_session_id == "second"

    def test_remove_lazy_session_cleans_up(self):
        """Removing a lazy session also clears its lazy marker."""
        registry = SessionManager()
        registry.add("lazy", MockSession(), lazy=True)

        registry.remove("lazy")

        assert "lazy" not in registry._lazy_sessions
178
+
179
+
180
class TestSessionManagerGet:
    """Test SessionManager get functionality."""

    def test_get_existing_session(self):
        """get returns the exact session object that was registered."""
        registry = SessionManager()
        stored = MockSession("test")
        registry.add("test", stored)

        assert registry.get("test") is stored

    def test_get_nonexistent_raises_with_available(self):
        """A miss raises KeyError that lists the available session ids."""
        registry = SessionManager()
        registry.add("session1", MockSession())
        registry.add("session2", MockSession())

        with pytest.raises(KeyError, match="Available:"):
            registry.get("nonexistent")
201
+
202
+
203
class TestSessionManagerContains:
    """Test SessionManager contains functionality."""

    def test_contains_existing(self):
        """A registered id is reported as contained."""
        registry = SessionManager()
        registry.add("test", MockSession())

        assert "test" in registry

    def test_not_contains_missing(self):
        """An unregistered id is not contained."""
        registry = SessionManager()
        registry.add("test", MockSession())

        assert "other" not in registry
219
+
220
+
221
class TestSessionManagerAsyncContext:
    """Test SessionManager async context manager."""

    @pytest.mark.asyncio
    async def test_start_activates_sessions(self):
        """start enters every eagerly-registered session."""
        registry = SessionManager()
        conn = MockSession()
        registry.add("test", conn)

        await registry.start()

        assert conn._is_alive is True
        assert registry._started is True

    @pytest.mark.asyncio
    async def test_start_skips_lazy_sessions(self):
        """Lazy sessions are left untouched by start."""
        registry = SessionManager()
        eager = MockSession("eager")
        deferred = MockSession("lazy")
        registry.add("eager", eager)
        registry.add("lazy", deferred, lazy=True)

        await registry.start()

        assert eager._is_alive is True
        assert deferred._is_alive is False

    @pytest.mark.asyncio
    async def test_close_deactivates_sessions(self):
        """close exits every live session and resets the started flag."""
        registry = SessionManager()
        conn = MockSession()
        registry.add("test", conn)

        await registry.start()
        assert conn._is_alive is True

        await registry.close()
        assert conn._is_alive is False
        assert registry._started is False

    @pytest.mark.asyncio
    async def test_async_context_manager(self):
        """async-with starts sessions on entry and closes them on exit."""
        registry = SessionManager()
        conn = MockSession()
        registry.add("test", conn)

        async with registry:
            assert conn._is_alive is True

        assert conn._is_alive is False

    @pytest.mark.asyncio
    async def test_start_idempotent(self):
        """Repeated start calls are safe."""
        registry = SessionManager()
        conn = MockSession()
        registry.add("test", conn)

        await registry.start()
        await registry.start()  # Should not raise or double-start

        assert conn._started is True
288
+
289
+
290
class TestSessionManagerProperties:
    """Test SessionManager properties."""

    def test_session_ids_returns_list(self):
        """session_ids yields a list containing every registered id."""
        registry = SessionManager()
        for label in ("a", "b", "c"):
            registry.add(label, MockSession())

        ids = registry.session_ids

        assert isinstance(ids, list)
        assert set(ids) == {"a", "b", "c"}

    def test_len_returns_session_count(self):
        """len tracks the number of registered sessions."""
        registry = SessionManager()
        assert len(registry) == 0

        registry.add("s1", MockSession())
        assert len(registry) == 1

        registry.add("s2", MockSession())
        assert len(registry) == 2
316
+
317
+
318
class TestSessionManagerIntegration:
    """Integration tests for SessionManager."""

    def test_realistic_setup(self):
        """A mixed eager/lazy setup reports the expected state."""
        registry = SessionManager()
        registry.add("default", MockSession("default"))
        registry.add("backup", MockSession("backup"))
        registry.add("lazy_special", MockSession("special"), lazy=True)

        assert len(registry) == 3
        assert registry.default_session_id == "default"
        assert "lazy_special" in registry._lazy_sessions

    @pytest.mark.asyncio
    async def test_lifecycle_management(self):
        """Sessions are inert before start, live after start, dead after close."""
        registry = SessionManager()
        sessions = [MockSession(f"s{i}") for i in range(3)]
        for index, sess in enumerate(sessions):
            registry.add(f"session{index}", sess)

        # Before start - no sessions active
        assert not any(s._is_alive for s in sessions)

        # After start - all active
        await registry.start()
        assert all(s._is_alive for s in sessions)

        # After close - all inactive
        await registry.close()
        assert not any(s._is_alive for s in sessions)
tests/spiders/test_spider.py ADDED
@@ -0,0 +1,574 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the Spider class and related components."""
2
+
3
+ import logging
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+ from scrapling.spiders.spider import Spider, SessionConfigurationError, LogCounterHandler, BLOCKED_CODES
10
+ from scrapling.spiders.request import Request
11
+ from scrapling.spiders.session import SessionManager
12
+ from scrapling.spiders.result import CrawlStats
13
+ from scrapling.core._types import Any, Dict, AsyncGenerator
14
+
15
+
16
class TestLogCounterHandler:
    """Test LogCounterHandler for tracking log counts."""

    @staticmethod
    def _make_record(level: int) -> logging.LogRecord:
        """Build a minimal LogRecord at *level* for feeding into the handler.

        Centralizes the boilerplate that was previously duplicated in every
        per-level test method.
        """
        return logging.LogRecord(
            name="test",
            level=level,
            pathname="",
            lineno=0,
            msg="test",
            args=(),
            exc_info=None,
        )

    def test_initial_counts_are_zero(self):
        """Test that handler starts with zero counts."""
        handler = LogCounterHandler()
        counts = handler.get_counts()

        assert counts["debug"] == 0
        assert counts["info"] == 0
        assert counts["warning"] == 0
        assert counts["error"] == 0
        assert counts["critical"] == 0

    def test_counts_debug_messages(self):
        """Test counting debug level messages."""
        handler = LogCounterHandler()
        record = self._make_record(logging.DEBUG)

        handler.emit(record)
        handler.emit(record)

        assert handler.get_counts()["debug"] == 2

    def test_counts_info_messages(self):
        """Test counting info level messages."""
        handler = LogCounterHandler()

        handler.emit(self._make_record(logging.INFO))

        assert handler.get_counts()["info"] == 1

    def test_counts_warning_messages(self):
        """Test counting warning level messages."""
        handler = LogCounterHandler()

        handler.emit(self._make_record(logging.WARNING))

        assert handler.get_counts()["warning"] == 1

    def test_counts_error_messages(self):
        """Test counting error level messages."""
        handler = LogCounterHandler()

        handler.emit(self._make_record(logging.ERROR))

        assert handler.get_counts()["error"] == 1

    def test_counts_critical_messages(self):
        """Test counting critical level messages."""
        handler = LogCounterHandler()

        handler.emit(self._make_record(logging.CRITICAL))

        assert handler.get_counts()["critical"] == 1

    def test_counts_multiple_levels(self):
        """Test counting messages at different levels."""
        handler = LogCounterHandler()

        levels = [
            logging.DEBUG,
            logging.DEBUG,
            logging.INFO,
            logging.WARNING,
            logging.ERROR,
            logging.ERROR,
            logging.ERROR,
            logging.CRITICAL,
        ]

        for level in levels:
            handler.emit(self._make_record(level))

        counts = handler.get_counts()
        assert counts["debug"] == 2
        assert counts["info"] == 1
        assert counts["warning"] == 1
        assert counts["error"] == 3
        assert counts["critical"] == 1
149
+
150
+
151
class TestBlockedCodes:
    """Sanity checks on the BLOCKED_CODES constant."""

    def test_blocked_codes_contains_expected_values(self):
        """Every status code that signals blocking/throttling must be present."""
        # 401 Unauthorized, 403 Forbidden, 407 Proxy Auth Required,
        # 429 Too Many Requests, 444 nginx connection-closed,
        # 500/502/503/504 server-side failure family.
        for code in (401, 403, 407, 429, 444, 500, 502, 503, 504):
            assert code in BLOCKED_CODES

    def test_blocked_codes_does_not_contain_success(self):
        """Successful and redirect codes must never be treated as blocked."""
        for code in (200, 201, 204, 301, 302):
            assert code not in BLOCKED_CODES
173
+
174
+
175
class ConcreteSpider(Spider):
    """Minimal concrete Spider subclass used as a fixture by the tests below."""

    name = "test_spider"
    start_urls = ["https://example.com"]

    async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
        """Yield a single item recording the response's string form."""
        item = {"url": str(response)}
        yield item
183
+
184
+
185
class TestSpiderInit:
    """Spider construction behaviour."""

    def test_spider_requires_name(self):
        """A Spider subclass without a ``name`` attribute cannot be instantiated."""

        class NoNameSpider(Spider):
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        with pytest.raises(ValueError, match="must have a name"):
            NoNameSpider()

    def test_spider_initializes_logger(self):
        """Constructing a spider wires up a namespaced logger."""
        instance = ConcreteSpider()

        assert instance.logger is not None
        assert instance.logger.name == "scrapling.spiders.test_spider"

    def test_spider_logger_has_log_counter(self):
        """The spider attaches a LogCounterHandler to its logger."""
        instance = ConcreteSpider()

        assert instance._log_counter is not None
        assert isinstance(instance._log_counter, LogCounterHandler)

    def test_spider_with_crawldir(self):
        """A crawldir argument is stored as a Path."""
        with tempfile.TemporaryDirectory() as tmpdir:
            instance = ConcreteSpider(crawldir=tmpdir)

            assert instance.crawldir == Path(tmpdir)

    def test_spider_without_crawldir(self):
        """Omitting crawldir leaves the attribute unset."""
        instance = ConcreteSpider()

        assert instance.crawldir is None

    def test_spider_custom_interval(self):
        """A custom checkpoint interval is honoured."""
        instance = ConcreteSpider(interval=60.0)

        assert instance._interval == 60.0

    def test_spider_default_interval(self):
        """Without an explicit interval, the 5-minute default applies."""
        instance = ConcreteSpider()

        assert instance._interval == 300.0

    def test_spider_repr(self):
        """repr() names both the class and the spider."""
        instance = ConcreteSpider()
        rendered = repr(instance)

        assert "ConcreteSpider" in rendered
        assert "test_spider" in rendered
245
+
246
+
247
class TestSpiderClassAttributes:
    """Defaults declared at the Spider class level."""

    def test_default_concurrent_requests(self):
        """Spiders default to 16 concurrent requests."""
        assert ConcreteSpider.concurrent_requests == 16

    def test_default_concurrent_requests_per_domain(self):
        """Per-domain concurrency is disabled (0) by default."""
        assert ConcreteSpider.concurrent_requests_per_domain == 0

    def test_default_download_delay(self):
        """No delay between downloads unless configured."""
        assert ConcreteSpider.download_delay == 0.0

    def test_default_max_blocked_retries(self):
        """Blocked requests are retried up to 3 times by default."""
        assert ConcreteSpider.max_blocked_retries == 3

    def test_default_logging_level(self):
        """The default logger verbosity is DEBUG."""
        assert ConcreteSpider.logging_level == logging.DEBUG

    def test_default_allowed_domains_empty(self):
        """No domain restriction is applied out of the box."""
        assert ConcreteSpider.allowed_domains == set()
273
+
274
+
275
class TestSpiderSessionConfiguration:
    """Behaviour of Spider session configuration."""

    def test_default_configure_sessions(self):
        """The default configure_sessions registers at least one session."""
        instance = ConcreteSpider()

        assert len(instance._session_manager) > 0

    def test_configure_sessions_error_raises_custom_exception(self):
        """An exception inside configure_sessions surfaces as SessionConfigurationError."""

        class BadSessionSpider(Spider):
            name = "bad_spider"

            def configure_sessions(self, manager: SessionManager) -> None:
                raise RuntimeError("Configuration failed!")

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        with pytest.raises(SessionConfigurationError, match="Configuration failed"):
            BadSessionSpider()

    def test_configure_sessions_no_sessions_raises(self):
        """configure_sessions that registers nothing is rejected."""

        class NoSessionSpider(Spider):
            name = "no_session_spider"

            def configure_sessions(self, manager: SessionManager) -> None:
                pass  # Don't add any sessions

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        with pytest.raises(SessionConfigurationError, match="did not add any sessions"):
            NoSessionSpider()
313
+
314
+
315
class TestSpiderStartRequests:
    """Behaviour of Spider.start_requests."""

    @pytest.mark.asyncio
    async def test_start_requests_yields_from_start_urls(self):
        """Each entry in start_urls produces one Request, in declaration order."""

        class MultiUrlSpider(Spider):
            name = "multi_url"
            start_urls = [
                "https://example.com/1",
                "https://example.com/2",
                "https://example.com/3",
            ]

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        crawler = MultiUrlSpider()
        collected = []
        async for req in crawler.start_requests():
            collected.append(req)

        assert len(collected) == 3
        assert collected[0].url == "https://example.com/1"
        assert collected[1].url == "https://example.com/2"
        assert collected[2].url == "https://example.com/3"

    @pytest.mark.asyncio
    async def test_start_requests_no_urls_raises(self):
        """Iterating start_requests with an empty start_urls raises RuntimeError."""

        class NoUrlSpider(Spider):
            name = "no_url"
            start_urls = []

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        crawler = NoUrlSpider()

        with pytest.raises(RuntimeError, match="no starting point"):
            async for _ in crawler.start_requests():
                pass

    @pytest.mark.asyncio
    async def test_start_requests_uses_default_session(self):
        """Generated requests carry the session manager's default session ID."""
        crawler = ConcreteSpider()
        collected = [req async for req in crawler.start_requests()]

        # Requests with no explicit session fall back to the manager's default.
        expected_sid = crawler._session_manager.default_session_id
        assert collected[0].sid == expected_sid
367
+
368
+
369
class TestSpiderHooks:
    """Default implementations of the Spider lifecycle hooks."""

    @pytest.mark.asyncio
    async def test_on_start_default(self):
        """The default on_start is a no-op for both fresh and resumed crawls."""
        crawler = ConcreteSpider()

        # Should not raise in either mode
        await crawler.on_start(resuming=False)
        await crawler.on_start(resuming=True)

    @pytest.mark.asyncio
    async def test_on_close_default(self):
        """The default on_close is a no-op."""
        crawler = ConcreteSpider()

        # Should not raise
        await crawler.on_close()

    @pytest.mark.asyncio
    async def test_on_error_default(self):
        """The default on_error only logs and never propagates."""
        crawler = ConcreteSpider()
        failing_request = Request("https://example.com")
        failure = ValueError("test error")

        # Should not raise
        await crawler.on_error(failing_request, failure)

    @pytest.mark.asyncio
    async def test_on_scraped_item_default_returns_item(self):
        """The default on_scraped_item passes items through untouched."""
        crawler = ConcreteSpider()
        item = {"key": "value", "nested": {"a": 1}}

        passed_through = await crawler.on_scraped_item(item)

        assert passed_through == item

    @pytest.mark.asyncio
    async def test_is_blocked_default_checks_status_codes(self):
        """The default is_blocked flags only the known blocked status codes."""

        class StubResponse:
            def __init__(self, status: int):
                self.status = status

        crawler = ConcreteSpider()

        # Codes in BLOCKED_CODES are reported as blocked
        for code in (403, 429, 503):
            assert await crawler.is_blocked(StubResponse(code)) is True

        # Ordinary success/not-found codes are not
        for code in (200, 404):
            assert await crawler.is_blocked(StubResponse(code)) is False

    @pytest.mark.asyncio
    async def test_retry_blocked_request_default_returns_request(self):
        """The default retry_blocked_request hands back the same Request object."""

        class StubResponse:
            status = 429

        crawler = ConcreteSpider()
        blocked_request = Request("https://example.com", priority=5)

        retried = await crawler.retry_blocked_request(blocked_request, StubResponse())

        assert retried is blocked_request
441
+
442
+
443
class TestSpiderPause:
    """Spider.pause outside of a running crawl."""

    def test_pause_without_engine_raises(self):
        """Pausing before any crawl engine exists is an error."""
        crawler = ConcreteSpider()

        with pytest.raises(RuntimeError, match="no crawl engine started"):
            crawler.pause()
452
+
453
+
454
class TestSpiderStats:
    """The Spider.stats property outside of a running crawl."""

    def test_stats_without_engine_raises(self):
        """Reading stats before a crawl has started is an error."""
        crawler = ConcreteSpider()

        with pytest.raises(RuntimeError, match="No active crawl"):
            _ = crawler.stats
463
+
464
+
465
class TestSpiderCustomization:
    """Overriding Spider class attributes in subclasses."""

    def test_custom_concurrent_requests(self):
        """A subclass can raise the concurrency limit."""

        class CustomSpider(Spider):
            name = "custom"
            concurrent_requests = 32
            start_urls = ["https://example.com"]

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        crawler = CustomSpider()
        assert crawler.concurrent_requests == 32

    def test_custom_allowed_domains(self):
        """A subclass can restrict crawling to specific domains."""

        class DomainSpider(Spider):
            name = "domain_spider"
            start_urls = ["https://example.com"]
            allowed_domains = {"example.com", "api.example.com"}

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        crawler = DomainSpider()
        for domain in ("example.com", "api.example.com"):
            assert domain in crawler.allowed_domains

    def test_custom_download_delay(self):
        """A subclass can throttle itself with a download delay."""

        class SlowSpider(Spider):
            name = "slow"
            download_delay = 1.5
            start_urls = ["https://example.com"]

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        crawler = SlowSpider()
        assert crawler.download_delay == 1.5
510
+
511
+
512
class TestSpiderLogging:
    """Test Spider logging configuration."""

    def test_custom_logging_level(self):
        """Test spider with custom logging level."""

        class QuietSpider(Spider):
            name = "quiet"
            logging_level = logging.WARNING
            start_urls = ["https://example.com"]

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        spider = QuietSpider()
        assert spider.logger.level == logging.WARNING

    def test_log_file_creates_handler(self):
        """Test spider with log file creates file handler."""
        with tempfile.TemporaryDirectory() as tmpdir:
            log_path = Path(tmpdir) / "spider.log"

            class FileLogSpider(Spider):
                name = "file_log"
                log_file = str(log_path)
                start_urls = ["https://example.com"]

                async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                    yield None

            spider = FileLogSpider()

            # Should have a file handler
            file_handlers = [
                h for h in spider.logger.handlers if isinstance(h, logging.FileHandler)
            ]
            try:
                assert len(file_handlers) == 1
            finally:
                # Always close the handler(s), even when the assertion fails;
                # otherwise the open log file can make TemporaryDirectory
                # cleanup fail on Windows and leak an OS-level file handle.
                for h in file_handlers:
                    h.close()

    def test_logger_does_not_propagate(self):
        """Test that spider logger does not propagate to parent."""
        spider = ConcreteSpider()

        assert spider.logger.propagate is False
559
+
560
+
561
class TestSessionConfigurationError:
    """Behaviour of the SessionConfigurationError exception type."""

    def test_exception_message(self):
        """The constructor argument becomes the exception's string form."""
        exc = SessionConfigurationError("Custom error message")

        assert str(exc) == "Custom error message"

    def test_exception_is_exception(self):
        """SessionConfigurationError participates in the Exception hierarchy."""
        exc = SessionConfigurationError("test")

        assert isinstance(exc, Exception)