Karim shoair committed on
Commit c7d1929 · 1 Parent(s): c923d18

test: add tests for spiders engine

Files changed (1)
  1. tests/spiders/test_engine.py +915 -0
tests/spiders/test_engine.py ADDED
@@ -0,0 +1,915 @@
"""Tests for the CrawlerEngine class."""

import tempfile
from pathlib import Path

import anyio
import pytest

from scrapling.spiders.engine import CrawlerEngine, _dump
from scrapling.spiders.request import Request
from scrapling.spiders.session import SessionManager
from scrapling.spiders.result import CrawlStats, ItemList
from scrapling.spiders.checkpoint import CheckpointData
from scrapling.core._types import Any, Dict, Set, AsyncGenerator


# ---------------------------------------------------------------------------
# Mock helpers
# ---------------------------------------------------------------------------


class MockResponse:
    """Minimal Response stand-in."""

    def __init__(self, status: int = 200, body: bytes = b"ok", url: str = "https://example.com"):
        self.status = status
        self.body = body
        self.url = url
        self.request: Any = None
        self.meta: Dict[str, Any] = {}

    def __str__(self) -> str:
        return self.url


class MockSession:
    """Mock session that returns a canned response."""

    def __init__(self, name: str = "mock", response: MockResponse | None = None):
        self.name = name
        self._is_alive = False
        self._response = response or MockResponse()
        self.fetch_calls: list[dict] = []

    async def __aenter__(self):
        self._is_alive = True
        return self

    async def __aexit__(self, *args):
        self._is_alive = False

    async def fetch(self, url: str, **kwargs):
        self.fetch_calls.append({"url": url, **kwargs})
        resp = MockResponse(status=self._response.status, body=self._response.body, url=url)
        return resp


class ErrorSession(MockSession):
    """Session that raises on fetch."""

    def __init__(self, error: Exception | None = None):
        super().__init__("error")
        self._error = error or RuntimeError("fetch failed")

    async def fetch(self, url: str, **kwargs):
        raise self._error


class MockSpider:
    """Lightweight spider stub for engine tests."""

    def __init__(
        self,
        *,
        concurrent_requests: int = 4,
        concurrent_requests_per_domain: int = 0,
        download_delay: float = 0.0,
        max_blocked_retries: int = 3,
        allowed_domains: Set[str] | None = None,
        fp_include_kwargs: bool = False,
        fp_include_headers: bool = False,
        fp_keep_fragments: bool = False,
        is_blocked_fn=None,
        on_scraped_item_fn=None,
        retry_blocked_request_fn=None,
    ):
        self.concurrent_requests = concurrent_requests
        self.concurrent_requests_per_domain = concurrent_requests_per_domain
        self.download_delay = download_delay
        self.max_blocked_retries = max_blocked_retries
        self.allowed_domains = allowed_domains or set()
        self.fp_include_kwargs = fp_include_kwargs
        self.fp_include_headers = fp_include_headers
        self.fp_keep_fragments = fp_keep_fragments
        self.name = "test_spider"

        # Tracking lists
        self.on_start_calls: list[dict] = []
        self.on_close_calls: int = 0
        self.on_error_calls: list[tuple[Request, Exception]] = []
        self.scraped_items: list[dict] = []
        self.blocked_responses: list = []
        self.retry_requests: list = []

        # Pluggable behaviour
        self._is_blocked_fn = is_blocked_fn
        self._on_scraped_item_fn = on_scraped_item_fn
        self._retry_blocked_request_fn = retry_blocked_request_fn

        # Log counter stub
        self._log_counter = _LogCounterStub()

    async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
        yield {"url": str(response)}

    async def on_start(self, resuming: bool = False) -> None:
        self.on_start_calls.append({"resuming": resuming})

    async def on_close(self) -> None:
        self.on_close_calls += 1

    async def on_error(self, request: Request, error: Exception) -> None:
        self.on_error_calls.append((request, error))

    async def on_scraped_item(self, item: Dict[str, Any]) -> Dict[str, Any] | None:
        if self._on_scraped_item_fn:
            return self._on_scraped_item_fn(item)
        self.scraped_items.append(item)
        return item

    async def is_blocked(self, response) -> bool:
        if self._is_blocked_fn:
            return self._is_blocked_fn(response)
        return False

    async def retry_blocked_request(self, request: Request, response) -> Request:
        self.retry_requests.append(request)
        if self._retry_blocked_request_fn:
            return self._retry_blocked_request_fn(request, response)
        return request

    async def start_requests(self) -> AsyncGenerator[Request, None]:
        yield Request("https://example.com", sid="default")


class _LogCounterStub:
    """Stub for LogCounterHandler."""

    def get_counts(self) -> Dict[str, int]:
        return {"debug": 0, "info": 0, "warning": 0, "error": 0, "critical": 0}


def _make_engine(
    spider: MockSpider | None = None,
    session: MockSession | None = None,
    crawldir: str | None = None,
    interval: float = 300.0,
) -> CrawlerEngine:
    """Create a CrawlerEngine wired to mock objects."""
    spider = spider or MockSpider()
    sm = SessionManager()
    sm.add("default", session or MockSession())
    return CrawlerEngine(spider, sm, crawldir=crawldir, interval=interval)


# ---------------------------------------------------------------------------
# Tests: _dump helper
# ---------------------------------------------------------------------------


class TestDumpHelper:
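    """_dump should serialize flat and nested dicts to JSON text."""
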
    def test_dump_returns_json_string(self):
        result = _dump({"key": "value"})
        assert '"key": "value"' in result

    def test_dump_handles_nested(self):
        result = _dump({"a": {"b": 1}})
        assert '"a"' in result
        assert '"b"' in result


# ---------------------------------------------------------------------------
# Tests: __init__
# ---------------------------------------------------------------------------


class TestCrawlerEngineInit:
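    """Constructor wiring: defaults, checkpoint toggle, limiter, and allowed domains."""
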
    def test_default_initialisation(self):
        engine = _make_engine()

        assert engine._running is False
        assert engine._active_tasks == 0
        assert engine._pause_requested is False
        assert engine._force_stop is False
        assert engine.paused is False
        assert isinstance(engine.stats, CrawlStats)
        assert isinstance(engine.items, ItemList)

    def test_checkpoint_system_disabled_by_default(self):
        engine = _make_engine()
        assert engine._checkpoint_system_enabled is False

    def test_checkpoint_system_enabled_with_crawldir(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            engine = _make_engine(crawldir=tmpdir)
            assert engine._checkpoint_system_enabled is True

    def test_global_limiter_uses_concurrent_requests(self):
        spider = MockSpider(concurrent_requests=8)
        engine = _make_engine(spider=spider)
        assert engine._global_limiter.total_tokens == 8

    def test_allowed_domains_from_spider(self):
        spider = MockSpider(allowed_domains={"example.com", "test.org"})
        engine = _make_engine(spider=spider)
        assert engine._allowed_domains == {"example.com", "test.org"}


# ---------------------------------------------------------------------------
# Tests: _is_domain_allowed
# ---------------------------------------------------------------------------


class TestIsDomainAllowed:
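    """Offsite filtering: exact domains, subdomains, and lookalike names."""
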
    def test_all_allowed_when_empty(self):
        engine = _make_engine()
        request = Request("https://anything.com/page")
        assert engine._is_domain_allowed(request) is True

    def test_exact_domain_match(self):
        spider = MockSpider(allowed_domains={"example.com"})
        engine = _make_engine(spider=spider)

        assert engine._is_domain_allowed(Request("https://example.com/page")) is True
        assert engine._is_domain_allowed(Request("https://other.com/page")) is False

    def test_subdomain_match(self):
        spider = MockSpider(allowed_domains={"example.com"})
        engine = _make_engine(spider=spider)

        assert engine._is_domain_allowed(Request("https://sub.example.com/page")) is True
        assert engine._is_domain_allowed(Request("https://deep.sub.example.com/x")) is True

    def test_partial_name_not_matched(self):
        spider = MockSpider(allowed_domains={"example.com"})
        engine = _make_engine(spider=spider)

        # "notexample.com" should NOT match "example.com"
        assert engine._is_domain_allowed(Request("https://notexample.com/x")) is False

    def test_multiple_allowed_domains(self):
        spider = MockSpider(allowed_domains={"a.com", "b.org"})
        engine = _make_engine(spider=spider)

        assert engine._is_domain_allowed(Request("https://a.com/")) is True
        assert engine._is_domain_allowed(Request("https://b.org/")) is True
        assert engine._is_domain_allowed(Request("https://c.net/")) is False


# ---------------------------------------------------------------------------
# Tests: _rate_limiter
# ---------------------------------------------------------------------------


class TestRateLimiter:
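    """Limiter selection: global by default, cached per-domain limiters when enabled."""
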
    def test_returns_global_limiter_when_per_domain_disabled(self):
        engine = _make_engine()  # concurrent_requests_per_domain=0
        limiter = engine._rate_limiter("example.com")
        assert limiter is engine._global_limiter

    def test_returns_per_domain_limiter_when_enabled(self):
        spider = MockSpider(concurrent_requests_per_domain=2)
        engine = _make_engine(spider=spider)

        limiter = engine._rate_limiter("example.com")
        assert limiter is not engine._global_limiter
        assert limiter.total_tokens == 2

    def test_same_domain_returns_same_limiter(self):
        spider = MockSpider(concurrent_requests_per_domain=2)
        engine = _make_engine(spider=spider)

        l1 = engine._rate_limiter("example.com")
        l2 = engine._rate_limiter("example.com")
        assert l1 is l2

    def test_different_domains_get_different_limiters(self):
        spider = MockSpider(concurrent_requests_per_domain=2)
        engine = _make_engine(spider=spider)

        l1 = engine._rate_limiter("a.com")
        l2 = engine._rate_limiter("b.com")
        assert l1 is not l2


# ---------------------------------------------------------------------------
# Tests: _normalize_request
# ---------------------------------------------------------------------------


class TestNormalizeRequest:
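    """Session id handling: empty sids default to "default", explicit sids survive."""
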
    def test_sets_default_sid_when_empty(self):
        engine = _make_engine()
        request = Request("https://example.com")
        assert request.sid == ""

        engine._normalize_request(request)
        assert request.sid == "default"

    def test_preserves_existing_sid(self):
        engine = _make_engine()
        request = Request("https://example.com", sid="custom")

        engine._normalize_request(request)
        assert request.sid == "custom"


# ---------------------------------------------------------------------------
# Tests: _process_request
# ---------------------------------------------------------------------------


class TestProcessRequest:
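    """Single-request processing: stats, blocked-retry logic, callbacks, and items."""
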
    @pytest.mark.asyncio
    async def test_successful_fetch_updates_stats(self):
        spider = MockSpider()
        session = MockSession(response=MockResponse(status=200, body=b"hello"))
        engine = _make_engine(spider=spider, session=session)

        request = Request("https://example.com", sid="default")
        await engine._process_request(request)

        assert engine.stats.requests_count == 1
        assert engine.stats.response_bytes == 5  # len(b"hello") from MockSession
        assert "status_200" in engine.stats.response_status_count

    @pytest.mark.asyncio
    async def test_failed_fetch_increments_failed_count(self):
        spider = MockSpider()
        sm = SessionManager()
        sm.add("default", ErrorSession())
        engine = CrawlerEngine(spider, sm)

        request = Request("https://example.com", sid="default")
        await engine._process_request(request)

        assert engine.stats.failed_requests_count == 1
        assert len(spider.on_error_calls) == 1

    @pytest.mark.asyncio
    async def test_failed_fetch_does_not_increment_requests_count(self):
        spider = MockSpider()
        sm = SessionManager()
        sm.add("default", ErrorSession())
        engine = CrawlerEngine(spider, sm)

        request = Request("https://example.com", sid="default")
        await engine._process_request(request)

        assert engine.stats.requests_count == 0

    @pytest.mark.asyncio
    async def test_blocked_response_triggers_retry(self):
        spider = MockSpider(is_blocked_fn=lambda r: True, max_blocked_retries=2)
        engine = _make_engine(spider=spider)

        request = Request("https://example.com", sid="default")
        await engine._process_request(request)

        assert engine.stats.blocked_requests_count == 1
        # A retry request should be enqueued
        assert not engine.scheduler.is_empty

    @pytest.mark.asyncio
    async def test_blocked_response_max_retries_exceeded(self):
        spider = MockSpider(is_blocked_fn=lambda r: True, max_blocked_retries=2)
        engine = _make_engine(spider=spider)

        request = Request("https://example.com", sid="default")
        request._retry_count = 2  # Already at max
        await engine._process_request(request)

        assert engine.stats.blocked_requests_count == 1
        # No retry enqueued
        assert engine.scheduler.is_empty

    @pytest.mark.asyncio
    async def test_retry_request_has_dont_filter(self):
        spider = MockSpider(is_blocked_fn=lambda r: True, max_blocked_retries=3)
        engine = _make_engine(spider=spider)

        request = Request("https://example.com", sid="default")
        await engine._process_request(request)

        retry = await engine.scheduler.dequeue()
        assert retry.dont_filter is True
        assert retry._retry_count == 1

    @pytest.mark.asyncio
    async def test_retry_clears_proxy_kwargs(self):
        spider = MockSpider(is_blocked_fn=lambda r: True, max_blocked_retries=3)
        engine = _make_engine(spider=spider)

        request = Request("https://example.com", sid="default", proxy="http://proxy:8080")
        await engine._process_request(request)

        retry = await engine.scheduler.dequeue()
        assert "proxy" not in retry._session_kwargs
        assert "proxies" not in retry._session_kwargs

    @pytest.mark.asyncio
    async def test_callback_yielding_dict_increments_items(self):
        spider = MockSpider()
        engine = _make_engine(spider=spider)

        request = Request("https://example.com", sid="default")
        await engine._process_request(request)

        assert engine.stats.items_scraped == 1
        assert len(engine.items) == 1

    @pytest.mark.asyncio
    async def test_callback_yielding_request_enqueues(self):
        async def callback(response) -> AsyncGenerator:
            yield Request("https://example.com/page2", sid="default")

        spider = MockSpider()
        engine = _make_engine(spider=spider)

        request = Request("https://example.com", sid="default", callback=callback)
        await engine._process_request(request)

        assert not engine.scheduler.is_empty

    @pytest.mark.asyncio
    async def test_callback_yielding_offsite_request_filtered(self):
        async def callback(response) -> AsyncGenerator:
            yield Request("https://other.com/page", sid="default")

        spider = MockSpider(allowed_domains={"example.com"})
        engine = _make_engine(spider=spider)

        request = Request("https://example.com", sid="default", callback=callback)
        await engine._process_request(request)

        assert engine.stats.offsite_requests_count == 1
        assert engine.scheduler.is_empty

    @pytest.mark.asyncio
    async def test_dropped_item_when_on_scraped_item_returns_none(self):
        spider = MockSpider(on_scraped_item_fn=lambda item: None)
        engine = _make_engine(spider=spider)

        request = Request("https://example.com", sid="default")
        await engine._process_request(request)

        assert engine.stats.items_dropped == 1
        assert engine.stats.items_scraped == 0
        assert len(engine.items) == 0

    @pytest.mark.asyncio
    async def test_callback_exception_calls_on_error(self):
        async def bad_callback(response) -> AsyncGenerator:
            raise ValueError("callback boom")
            yield  # unreachable, but makes this function an async generator

        spider = MockSpider()
        engine = _make_engine(spider=spider)

        request = Request("https://example.com", sid="default", callback=bad_callback)
        await engine._process_request(request)

        assert len(spider.on_error_calls) == 1
        assert isinstance(spider.on_error_calls[0][1], ValueError)

    @pytest.mark.asyncio
    async def test_proxy_tracked_in_stats(self):
        spider = MockSpider()
        engine = _make_engine(spider=spider)

        request = Request("https://example.com", sid="default", proxy="http://p:8080")
        await engine._process_request(request)

        assert "http://p:8080" in engine.stats.proxies

    @pytest.mark.asyncio
    async def test_proxies_dict_tracked_in_stats(self):
        spider = MockSpider()
        engine = _make_engine(spider=spider)

        proxies = {"http": "http://p:8080", "https": "https://p:8443"}
        request = Request("https://example.com", sid="default", proxies=proxies)
        await engine._process_request(request)

        assert len(engine.stats.proxies) == 1
        assert engine.stats.proxies[0] == proxies

    @pytest.mark.asyncio
    async def test_uses_parse_when_no_callback(self):
        async def custom_parse(response) -> AsyncGenerator:
            yield {"from": "custom_parse"}

        spider = MockSpider()
        spider.parse = custom_parse  # type: ignore[assignment]
        engine = _make_engine(spider=spider)

        request = Request("https://example.com", sid="default")
        # No callback set → should use spider.parse
        await engine._process_request(request)

        assert engine.stats.items_scraped == 1


# ---------------------------------------------------------------------------
# Tests: _task_wrapper
# ---------------------------------------------------------------------------


class TestTaskWrapper:
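    """The wrapper must always decrement the active-task counter, even on error."""
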
    @pytest.mark.asyncio
    async def test_decrements_active_tasks(self):
        engine = _make_engine()
        engine._active_tasks = 1

        request = Request("https://example.com", sid="default")
        await engine._task_wrapper(request)

        assert engine._active_tasks == 0

    @pytest.mark.asyncio
    async def test_decrements_even_on_error(self):
        spider = MockSpider()
        sm = SessionManager()
        sm.add("default", ErrorSession())
        engine = CrawlerEngine(spider, sm)
        engine._active_tasks = 1

        request = Request("https://example.com", sid="default")
        await engine._task_wrapper(request)

        assert engine._active_tasks == 0


# ---------------------------------------------------------------------------
# Tests: request_pause
# ---------------------------------------------------------------------------


class TestRequestPause:
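    """Pause escalation: the first call requests a pause, the second forces a stop."""
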
    def test_first_call_sets_pause_requested(self):
        engine = _make_engine()

        engine.request_pause()

        assert engine._pause_requested is True
        assert engine._force_stop is False

    def test_second_call_sets_force_stop(self):
        engine = _make_engine()

        engine.request_pause()  # first
        engine.request_pause()  # second

        assert engine._pause_requested is True
        assert engine._force_stop is True

    def test_third_call_after_force_stop_is_noop(self):
        engine = _make_engine()

        engine.request_pause()
        engine.request_pause()
        engine.request_pause()  # should not raise

        assert engine._force_stop is True


# ---------------------------------------------------------------------------
# Tests: checkpoint methods
# ---------------------------------------------------------------------------


class TestCheckpointMethods:
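    """Checkpoint gating, saving to disk, and restore behaviour."""
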
    def test_is_checkpoint_time_false_when_disabled(self):
        engine = _make_engine()  # no crawldir
        assert engine._is_checkpoint_time() is False

    @pytest.mark.asyncio
    async def test_save_checkpoint_creates_file(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            spider = MockSpider()
            engine = _make_engine(spider=spider, crawldir=tmpdir)

            # Enqueue a request so the snapshot has data
            req = Request("https://example.com", sid="default")
            engine._normalize_request(req)
            await engine.scheduler.enqueue(req)

            await engine._save_checkpoint()

            # Verify the checkpoint file exists
            checkpoint_path = Path(tmpdir) / "checkpoint.pkl"
            assert checkpoint_path.exists()

    @pytest.mark.asyncio
    async def test_restore_when_no_checkpoint_returns_false(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            engine = _make_engine(crawldir=tmpdir)
            result = await engine._restore_from_checkpoint()
            assert result is False

    @pytest.mark.asyncio
    async def test_restore_from_checkpoint_raises_when_disabled(self):
        engine = _make_engine()  # no crawldir → checkpoint disabled
        with pytest.raises(RuntimeError):
            await engine._restore_from_checkpoint()


# ---------------------------------------------------------------------------
# Tests: crawl
# ---------------------------------------------------------------------------


class TestCrawl:
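    """Full crawl runs: returned stats, lifecycle hooks, follow-ups, and cleanup."""
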
    @pytest.mark.asyncio
    async def test_basic_crawl_returns_stats(self):
        spider = MockSpider()
        engine = _make_engine(spider=spider)

        stats = await engine.crawl()

        assert isinstance(stats, CrawlStats)
        assert stats.requests_count >= 1
        assert stats.items_scraped >= 1

    @pytest.mark.asyncio
    async def test_crawl_calls_on_start_and_on_close(self):
        spider = MockSpider()
        engine = _make_engine(spider=spider)

        await engine.crawl()

        assert len(spider.on_start_calls) == 1
        assert spider.on_start_calls[0]["resuming"] is False
        assert spider.on_close_calls == 1

    @pytest.mark.asyncio
    async def test_crawl_sets_stats_timing(self):
        spider = MockSpider()
        engine = _make_engine(spider=spider)

        stats = await engine.crawl()

        assert stats.start_time > 0
        assert stats.end_time > 0
        assert stats.end_time >= stats.start_time

    @pytest.mark.asyncio
    async def test_crawl_sets_concurrency_stats(self):
        spider = MockSpider(concurrent_requests=16, concurrent_requests_per_domain=4)
        engine = _make_engine(spider=spider)

        stats = await engine.crawl()

        assert stats.concurrent_requests == 16
        assert stats.concurrent_requests_per_domain == 4

    @pytest.mark.asyncio
    async def test_crawl_processes_multiple_start_urls(self):
        spider = MockSpider()

        urls = ["https://example.com/1", "https://example.com/2", "https://example.com/3"]

        async def multi_start_requests() -> AsyncGenerator[Request, None]:
            for url in urls:
                yield Request(url, sid="default")

        spider.start_requests = multi_start_requests  # type: ignore[assignment]
        engine = _make_engine(spider=spider)

        stats = await engine.crawl()

        assert stats.requests_count == 3
        assert stats.items_scraped == 3

    @pytest.mark.asyncio
    async def test_crawl_follows_yielded_requests(self):
        """Requests yielded from callbacks are scheduled and processed."""
        call_count = 0

        async def parse_with_follow(response) -> AsyncGenerator:
            nonlocal call_count
            call_count += 1
            if call_count == 1:
                yield Request("https://example.com/page2", sid="default")
            yield {"page": str(response)}

        spider = MockSpider()
        spider.parse = parse_with_follow  # type: ignore[assignment]
        engine = _make_engine(spider=spider)

        stats = await engine.crawl()

        assert stats.requests_count == 2
        assert stats.items_scraped == 2

    @pytest.mark.asyncio
    async def test_crawl_with_download_delay(self):
        spider = MockSpider(download_delay=0.01)
        engine = _make_engine(spider=spider)

        stats = await engine.crawl()

        assert stats.download_delay == 0.01
        assert stats.requests_count >= 1

    @pytest.mark.asyncio
    async def test_crawl_filters_offsite_requests(self):
        async def parse_offsite(response) -> AsyncGenerator:
            yield Request("https://other-domain.com/page", sid="default")
            yield {"url": str(response)}

        spider = MockSpider(allowed_domains={"example.com"})
        spider.parse = parse_offsite  # type: ignore[assignment]
        engine = _make_engine(spider=spider)

        stats = await engine.crawl()

        assert stats.offsite_requests_count == 1
        assert stats.requests_count == 1  # Only the initial request

    @pytest.mark.asyncio
    async def test_crawl_cleans_up_checkpoint_on_completion(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            spider = MockSpider()
            engine = _make_engine(spider=spider, crawldir=tmpdir)

            await engine.crawl()

            checkpoint_path = Path(tmpdir) / "checkpoint.pkl"
            assert not checkpoint_path.exists()  # Cleaned up

    @pytest.mark.asyncio
    async def test_crawl_handles_fetch_error_gracefully(self):
        spider = MockSpider()
        sm = SessionManager()
        sm.add("default", ErrorSession())
        engine = CrawlerEngine(spider, sm)

        stats = await engine.crawl()

        assert stats.failed_requests_count == 1
        assert len(spider.on_error_calls) == 1

    @pytest.mark.asyncio
    async def test_crawl_log_levels_populated(self):
        spider = MockSpider()
        engine = _make_engine(spider=spider)

        stats = await engine.crawl()

        assert isinstance(stats.log_levels_counter, dict)

    @pytest.mark.asyncio
    async def test_crawl_resets_state_on_each_run(self):
        spider = MockSpider()
        engine = _make_engine(spider=spider)

        # Run a first crawl
        await engine.crawl()
        assert engine.stats.requests_count >= 1

        # Run a second crawl - stats should reset and items are cleared on each run
        await engine.crawl()
        assert engine.paused is False


# ---------------------------------------------------------------------------
# Tests: items property
# ---------------------------------------------------------------------------


class TestItemsProperty:
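    """The items property exposes scraped items as an ItemList."""
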
    def test_items_returns_item_list(self):
        engine = _make_engine()
        assert isinstance(engine.items, ItemList)

    def test_items_initially_empty(self):
        engine = _make_engine()
        assert len(engine.items) == 0

    @pytest.mark.asyncio
    async def test_items_populated_after_crawl(self):
        engine = _make_engine()
        await engine.crawl()
        assert len(engine.items) >= 1


# ---------------------------------------------------------------------------
# Tests: streaming (__aiter__ / _stream)
# ---------------------------------------------------------------------------


class TestStreaming:
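    """Async iteration over the engine streams items instead of buffering them."""
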
    @pytest.mark.asyncio
    async def test_stream_yields_items(self):
        spider = MockSpider()
        engine = _make_engine(spider=spider)

        items = []
        async for item in engine:
            items.append(item)

        assert len(items) >= 1
        assert isinstance(items[0], dict)

    @pytest.mark.asyncio
    async def test_stream_processes_follow_up_requests(self):
        call_count = 0

        async def parse_with_follow(response) -> AsyncGenerator:
            nonlocal call_count
            call_count += 1
            if call_count == 1:
                yield Request("https://example.com/page2", sid="default")
            yield {"page": call_count}

        spider = MockSpider()
        spider.parse = parse_with_follow  # type: ignore[assignment]
        engine = _make_engine(spider=spider)

        items = []
        async for item in engine:
            items.append(item)

        assert len(items) == 2

    @pytest.mark.asyncio
    async def test_stream_items_not_stored_in_items_list(self):
        """When streaming, items go to the stream, not to engine._items."""
        spider = MockSpider()
        engine = _make_engine(spider=spider)

        items = []
        async for item in engine:
            items.append(item)

        # Items were sent through the stream, not appended to _items
        assert len(items) >= 1
        assert len(engine.items) == 0


# ---------------------------------------------------------------------------
# Tests: pause during crawl
# ---------------------------------------------------------------------------


class TestPauseDuringCrawl:
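    """Pausing mid-crawl, with and without the checkpoint system enabled."""
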
    @pytest.mark.asyncio
    async def test_pause_stops_crawl_gracefully(self):
        processed = 0

        async def slow_parse(response) -> AsyncGenerator:
            nonlocal processed
            processed += 1
            # Yield more requests to keep the crawl going
            if processed <= 2:
                yield Request(f"https://example.com/p{processed + 1}", sid="default")
            yield {"n": processed}

        spider = MockSpider()
        spider.parse = slow_parse  # type: ignore[assignment]
        engine = _make_engine(spider=spider)

        # Request a pause immediately - the engine stops as soon as active tasks complete
        engine._pause_requested = True

        await engine.crawl()
        # Should stop without processing everything
        assert engine._running is False

    @pytest.mark.asyncio
    async def test_pause_with_checkpoint_sets_paused(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            parse_count = 0

            async def parse_and_pause(response) -> AsyncGenerator:
                nonlocal parse_count
                parse_count += 1
                # Request a pause after the first request, but yield follow-ups
                if parse_count == 1:
                    engine.request_pause()
                    yield Request("https://example.com/p2", sid="default")
                yield {"n": parse_count}

            spider = MockSpider()
            spider.parse = parse_and_pause  # type: ignore[assignment]
            engine = _make_engine(spider=spider, crawldir=tmpdir)

            await engine.crawl()

            assert engine.paused is True

    @pytest.mark.asyncio
    async def test_pause_without_checkpoint_does_not_set_paused(self):
        spider = MockSpider()
        engine = _make_engine(spider=spider)

        engine._pause_requested = True

        await engine.crawl()

        assert engine.paused is False