Karim shoair committed on
Commit
7489283
·
1 Parent(s): 47dd985

test: add tests for the new feature

Browse files
tests/fetchers/test_utils.py CHANGED
@@ -4,8 +4,11 @@ from pathlib import Path
4
  from scrapling.engines.toolbelt.custom import StatusText, Response
5
  from scrapling.engines.toolbelt.navigation import (
6
  construct_proxy_dict,
7
- js_bypass_path
 
 
8
  )
 
9
  from scrapling.engines.toolbelt.fingerprints import (
10
  generate_convincing_referer,
11
  get_os_name,
@@ -300,3 +303,150 @@ class TestResponse:
300
 
301
  # Should handle 'bytes' content properly
302
  assert response.status == 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from scrapling.engines.toolbelt.custom import StatusText, Response
5
  from scrapling.engines.toolbelt.navigation import (
6
  construct_proxy_dict,
7
+ create_intercept_handler,
8
+ create_async_intercept_handler,
9
+ js_bypass_path,
10
  )
11
+ from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
12
  from scrapling.engines.toolbelt.fingerprints import (
13
  generate_convincing_referer,
14
  get_os_name,
 
303
 
304
  # Should handle 'bytes' content properly
305
  assert response.status == 200
306
+
307
+
308
+ class _MockRequest:
309
+ """Minimal mock for Playwright's Request object."""
310
+ def __init__(self, url: str, resource_type: str = "document"):
311
+ self.url = url
312
+ self.resource_type = resource_type
313
+
314
+
315
+ class _MockRoute:
316
+ """Minimal mock for Playwright's sync Route object."""
317
+ def __init__(self, url: str, resource_type: str = "document"):
318
+ self.request = _MockRequest(url, resource_type)
319
+ self.aborted = False
320
+ self.continued = False
321
+
322
+ def abort(self):
323
+ self.aborted = True
324
+
325
+ def continue_(self):
326
+ self.continued = True
327
+
328
+
329
+ class _AsyncMockRoute:
330
+ """Minimal mock for Playwright's async Route object."""
331
+ def __init__(self, url: str, resource_type: str = "document"):
332
+ self.request = _MockRequest(url, resource_type)
333
+ self.aborted = False
334
+ self.continued = False
335
+
336
+ async def abort(self):
337
+ self.aborted = True
338
+
339
+ async def continue_(self):
340
+ self.continued = True
341
+
342
+
343
+ class TestCreateInterceptHandler:
344
+ """Test the unified sync route handler factory."""
345
+
346
+ def test_blocks_disabled_resource_types(self):
347
+ handler = create_intercept_handler(disable_resources=True)
348
+ route = _MockRoute("https://example.com/image.png", resource_type="image")
349
+ handler(route)
350
+ assert route.aborted
351
+
352
+ def test_continues_allowed_resource_types(self):
353
+ handler = create_intercept_handler(disable_resources=True)
354
+ route = _MockRoute("https://example.com/page", resource_type="document")
355
+ handler(route)
356
+ assert route.continued
357
+
358
+ def test_blocks_exact_domain(self):
359
+ handler = create_intercept_handler(disable_resources=False, blocked_domains={"ads.example.com"})
360
+ route = _MockRoute("https://ads.example.com/tracker.js")
361
+ handler(route)
362
+ assert route.aborted
363
+
364
+ def test_blocks_subdomain(self):
365
+ handler = create_intercept_handler(disable_resources=False, blocked_domains={"example.com"})
366
+ route = _MockRoute("https://sub.example.com/page")
367
+ handler(route)
368
+ assert route.aborted
369
+
370
+ def test_continues_non_blocked_domain(self):
371
+ handler = create_intercept_handler(disable_resources=False, blocked_domains={"ads.example.com"})
372
+ route = _MockRoute("https://safe.example.com/page")
373
+ handler(route)
374
+ assert route.continued
375
+
376
+ def test_resource_blocking_takes_priority_over_domain(self):
377
+ """When both are active, resource type check comes first."""
378
+ handler = create_intercept_handler(disable_resources=True, blocked_domains={"example.com"})
379
+ route = _MockRoute("https://example.com/style.css", resource_type="stylesheet")
380
+ handler(route)
381
+ assert route.aborted
382
+
383
+ def test_domain_blocking_with_resources_disabled(self):
384
+ """Non-blocked resource type from a blocked domain should still be aborted."""
385
+ handler = create_intercept_handler(disable_resources=True, blocked_domains={"tracker.io"})
386
+ route = _MockRoute("https://tracker.io/api", resource_type="document")
387
+ handler(route)
388
+ assert route.aborted
389
+
390
+ def test_no_blocking_continues(self):
391
+ handler = create_intercept_handler(disable_resources=False)
392
+ route = _MockRoute("https://example.com/page")
393
+ handler(route)
394
+ assert route.continued
395
+
396
+ def test_does_not_block_partial_domain_match(self):
397
+ """'example.com' should not block 'notexample.com'."""
398
+ handler = create_intercept_handler(disable_resources=False, blocked_domains={"example.com"})
399
+ route = _MockRoute("https://notexample.com/page")
400
+ handler(route)
401
+ assert route.continued
402
+
403
+ def test_multiple_blocked_domains(self):
404
+ handler = create_intercept_handler(disable_resources=False, blocked_domains={"ads.com", "tracker.io"})
405
+ route_ads = _MockRoute("https://ads.com/banner")
406
+ route_tracker = _MockRoute("https://cdn.tracker.io/script.js")
407
+ route_safe = _MockRoute("https://example.com/page")
408
+ handler(route_ads)
409
+ handler(route_tracker)
410
+ handler(route_safe)
411
+ assert route_ads.aborted
412
+ assert route_tracker.aborted
413
+ assert route_safe.continued
414
+
415
+
416
+ class TestCreateAsyncInterceptHandler:
417
+ """Test the unified async route handler factory."""
418
+
419
+ @pytest.mark.asyncio
420
+ async def test_blocks_disabled_resource_types(self):
421
+ handler = create_async_intercept_handler(disable_resources=True)
422
+ route = _AsyncMockRoute("https://example.com/font.woff", resource_type="font")
423
+ await handler(route)
424
+ assert route.aborted
425
+
426
+ @pytest.mark.asyncio
427
+ async def test_blocks_domain(self):
428
+ handler = create_async_intercept_handler(disable_resources=False, blocked_domains={"ads.example.com"})
429
+ route = _AsyncMockRoute("https://ads.example.com/track")
430
+ await handler(route)
431
+ assert route.aborted
432
+
433
+ @pytest.mark.asyncio
434
+ async def test_continues_non_blocked(self):
435
+ handler = create_async_intercept_handler(disable_resources=False, blocked_domains={"ads.example.com"})
436
+ route = _AsyncMockRoute("https://safe.example.com/page")
437
+ await handler(route)
438
+ assert route.continued
439
+
440
+ @pytest.mark.asyncio
441
+ async def test_blocks_subdomain(self):
442
+ handler = create_async_intercept_handler(disable_resources=False, blocked_domains={"tracker.io"})
443
+ route = _AsyncMockRoute("https://cdn.tracker.io/script.js")
444
+ await handler(route)
445
+ assert route.aborted
446
+
447
+ @pytest.mark.asyncio
448
+ async def test_does_not_block_partial_domain_match(self):
449
+ handler = create_async_intercept_handler(disable_resources=False, blocked_domains={"example.com"})
450
+ route = _AsyncMockRoute("https://notexample.com/page")
451
+ await handler(route)
452
+ assert route.continued
tests/fetchers/test_validator.py CHANGED
@@ -77,3 +77,25 @@ class TestValidators:
77
  config = validate(params, StealthConfig)
78
 
79
  assert config.timeout == 60000 # Should be increased
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  config = validate(params, StealthConfig)
78
 
79
  assert config.timeout == 60000 # Should be increased
80
+
81
+ def test_playwright_config_blocked_domains(self):
82
+ """Test PlaywrightConfig with blocked_domains"""
83
+ params = {"blocked_domains": {"ads.example.com", "tracker.io"}}
84
+
85
+ config = validate(params, PlaywrightConfig)
86
+
87
+ assert config.blocked_domains == {"ads.example.com", "tracker.io"}
88
+
89
+ def test_playwright_config_blocked_domains_default_none(self):
90
+ """Test PlaywrightConfig blocked_domains defaults to None"""
91
+ config = validate({}, PlaywrightConfig)
92
+
93
+ assert config.blocked_domains is None
94
+
95
+ def test_stealth_config_blocked_domains(self):
96
+ """Test StealthConfig inherits blocked_domains"""
97
+ params = {"blocked_domains": {"ads.example.com"}}
98
+
99
+ config = validate(params, StealthConfig)
100
+
101
+ assert config.blocked_domains == {"ads.example.com"}