# Scrapling/tests/spiders/test_request.py
# Karim shoair
# test: update tests accordingly
# 5ae256c
"""Tests for the Request class."""
import pickle
import pytest
from scrapling.spiders.request import Request
from scrapling.core._types import Any, Dict, AsyncGenerator
class TestRequestCreation:
    """Construction of ``Request`` objects and the attributes they expose."""

    def test_basic_request_creation(self):
        """A request built from only a URL carries the expected default values."""
        url = "https://example.com"
        req = Request(url)

        assert req.url == url
        assert req.sid == ""
        assert req.callback is None
        assert req.priority == 0
        assert req.dont_filter is False
        assert req.meta == {}
        assert req._retry_count == 0
        assert req._session_kwargs == {}

    def test_request_with_all_parameters(self):
        """Explicit arguments are stored; extra keywords land in ``_session_kwargs``."""

        async def on_response(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield {"test": "data"}

        # Keywords that are not named constructor parameters in this test.
        extra_options = {"proxy": "http://proxy:8080", "timeout": 30}

        req = Request(
            url="https://example.com/page",
            sid="my_session",
            callback=on_response,
            priority=10,
            dont_filter=True,
            meta={"key": "value"},
            _retry_count=2,
            **extra_options,
        )

        assert req.url == "https://example.com/page"
        assert req.sid == "my_session"
        assert req.callback == on_response
        assert req.priority == 10
        assert req.dont_filter is True
        assert req.meta == {"key": "value"}
        assert req._retry_count == 2
        assert req._session_kwargs == {"proxy": "http://proxy:8080", "timeout": 30}

    def test_request_meta_default_is_empty_dict(self):
        """Mutating one request's default ``meta`` must not leak into another request."""
        first, second = (Request("https://example.com") for _ in range(2))

        first.meta["key"] = "value"

        assert first.meta == {"key": "value"}
        assert second.meta == {}
class TestRequestProperties:
    """Checks for the ``domain`` property and ``update_fingerprint()``."""

    def test_domain_extraction(self):
        """``domain`` is the host of a URL that also carries a path and a query."""
        req = Request("https://www.example.com/path/page.html?query=1")
        assert req.domain == "www.example.com"

    def test_domain_with_port(self):
        """An explicit port number is kept as part of ``domain``."""
        req = Request("http://localhost:8080/api")
        assert req.domain == "localhost:8080"

    def test_domain_with_subdomain(self):
        """Nested subdomains are preserved in ``domain``."""
        req = Request("https://api.v2.example.com/endpoint")
        assert req.domain == "api.v2.example.com"

    def test_fingerprint_returns_bytes(self):
        """``update_fingerprint()`` returns a 20-byte ``bytes`` value."""
        digest = Request("https://example.com").update_fingerprint()

        assert isinstance(digest, bytes)
        assert len(digest) == 20  # SHA1 produces 20 bytes

    def test_fingerprint_is_deterministic(self):
        """Two identically constructed requests yield the same fingerprint."""
        first, second = (
            Request("https://example.com", data={"key": "value"}) for _ in range(2)
        )
        assert first.update_fingerprint() == second.update_fingerprint()

    def test_fingerprint_different_urls(self):
        """Requests for different URLs yield different fingerprints."""
        page_one = Request("https://example.com/page1")
        page_two = Request("https://example.com/page2")
        assert page_one.update_fingerprint() != page_two.update_fingerprint()
class TestRequestCopy:
    """Checks for ``Request.copy()``."""

    def test_copy_creates_independent_request(self):
        """A copy matches the source attribute by attribute but is a distinct object."""

        async def on_response(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield None

        source = Request(
            url="https://example.com",
            sid="session",
            callback=on_response,
            priority=5,
            dont_filter=True,
            meta={"original": True},
            _retry_count=1,
            proxy="http://proxy:8080",
        )

        clone = source.copy()

        # Every attribute must compare equal between the copy and its source.
        copied_attributes = (
            "url",
            "sid",
            "callback",
            "priority",
            "dont_filter",
            "meta",
            "_retry_count",
            "_session_kwargs",
        )
        for attribute in copied_attributes:
            assert getattr(clone, attribute) == getattr(source, attribute), attribute

        # ...while the request itself and its meta dict must be separate objects.
        assert clone is not source
        assert clone.meta is not source.meta

    def test_copy_meta_is_independent(self):
        """Changes made to the copy's ``meta`` do not show up on the source."""
        source = Request("https://example.com", meta={"key": "original"})
        clone = source.copy()

        clone.meta["key"] = "modified"
        clone.meta["new_key"] = "new_value"

        assert source.meta == {"key": "original"}
        assert clone.meta == {"key": "modified", "new_key": "new_value"}
class TestRequestComparison:
    """Test Request comparison operators (``<``, ``>``, ``==``, ``!=``)."""

    def test_priority_less_than(self):
        """Test less than comparison by priority."""
        low_priority = Request("https://example.com/1", priority=1)
        high_priority = Request("https://example.com/2", priority=10)

        assert low_priority < high_priority
        assert not high_priority < low_priority

    def test_priority_greater_than(self):
        """Test greater than comparison by priority."""
        low_priority = Request("https://example.com/1", priority=1)
        high_priority = Request("https://example.com/2", priority=10)

        assert high_priority > low_priority
        assert not low_priority > high_priority

    def test_equality_by_fingerprint(self):
        """Test equality comparison by fingerprint."""
        r1 = Request("https://example.com")
        r2 = Request("https://example.com")
        r3 = Request("https://example.com/other")

        # Generate fingerprints first (required for equality)
        r1.update_fingerprint()
        r2.update_fingerprint()
        r3.update_fingerprint()

        assert r1 == r2
        assert r1 != r3

    def test_equality_different_priorities_same_fingerprint(self):
        """Test requests with same fingerprint are equal despite different priorities."""
        r1 = Request("https://example.com", priority=1)
        r2 = Request("https://example.com", priority=100)

        # Generate fingerprints first
        r1.update_fingerprint()
        r2.update_fingerprint()

        assert r1 == r2  # Same fingerprint means equal

    def test_comparison_with_non_request(self):
        """Test comparison with non-Request types returns NotImplemented."""
        request = Request("https://example.com")

        # NotImplemented is a singleton, so check identity rather than equality:
        # an ``==`` check could be satisfied by any object with a permissive
        # ``__eq__``, which would hide a wrong return value.
        assert request.__lt__("not a request") is NotImplemented
        assert request.__gt__("not a request") is NotImplemented
        assert request.__eq__("not a request") is NotImplemented
class TestRequestStringRepresentation:
    """Checks for ``str()`` and ``repr()`` of a ``Request``."""

    def test_str_returns_url(self):
        """``str(request)`` is exactly the request URL."""
        url = "https://example.com/page"
        assert str(Request(url)) == url

    def test_repr_without_callback(self):
        """``repr`` mentions the class name, URL, priority and a ``None`` callback."""
        text = repr(Request("https://example.com", priority=5))

        expected_fragments = (
            "Request",
            "https://example.com",
            "priority=5",
            "callback=None",
        )
        for fragment in expected_fragments:
            assert fragment in text

    def test_repr_with_callback(self):
        """``repr`` shows the callback by its function name."""

        # The function name is what the assertion below looks for.
        async def my_custom_callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield None

        text = repr(Request("https://example.com", callback=my_custom_callback))

        assert "callback=my_custom_callback" in text
class TestRequestPickling:
    """Checks for pickling a ``Request`` and its ``__getstate__``/``__setstate__`` hooks."""

    def test_pickle_without_callback(self):
        """A callback-less request keeps its basic attributes through a pickle round trip."""
        source = Request(
            url="https://example.com",
            sid="session",
            priority=5,
            meta={"key": "value"},
        )

        revived = pickle.loads(pickle.dumps(source))

        assert revived.url == source.url
        assert revived.sid == source.sid
        assert revived.priority == source.priority
        assert revived.meta == source.meta
        assert revived.callback is None

    def test_pickle_with_callback_stores_name(self):
        """``__getstate__`` records the callback's name and blanks the callback itself."""

        # The function name is what the state is expected to record.
        async def parse_page(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield {"data": "test"}

        snapshot = Request("https://example.com", callback=parse_page).__getstate__()

        assert snapshot["_callback_name"] == "parse_page"
        assert snapshot["callback"] is None

    def test_pickle_with_none_callback(self):
        """With no callback, both the stored name and the stored callback are ``None``."""
        snapshot = Request("https://example.com", callback=None).__getstate__()

        assert snapshot["_callback_name"] is None
        assert snapshot["callback"] is None

    def test_setstate_stores_callback_name(self):
        """``__setstate__`` exposes the stored callback name as ``_callback_name``."""
        req = Request("https://example.com")
        saved_state = {
            "url": "https://example.com",
            "sid": "",
            "callback": None,
            "priority": 0,
            "dont_filter": False,
            "meta": {},
            "_retry_count": 0,
            "_session_kwargs": {},
            "_callback_name": "custom_parse",
        }

        req.__setstate__(saved_state)

        assert hasattr(req, "_callback_name")
        assert req._callback_name == "custom_parse"

    def test_pickle_roundtrip_preserves_session_kwargs(self):
        """Extra keyword arguments survive a pickle round trip in ``_session_kwargs``."""
        source = Request(
            "https://example.com",
            proxy="http://proxy:8080",
            timeout=30,
            headers={"User-Agent": "test"},
        )

        revived = pickle.loads(pickle.dumps(source))

        assert revived._session_kwargs == {
            "proxy": "http://proxy:8080",
            "timeout": 30,
            "headers": {"User-Agent": "test"},
        }
class TestRequestRestoreCallback:
    """Checks for ``Request._restore_callback`` against stand-in spider objects."""

    def test_restore_callback_from_spider(self):
        """A stored callback name is resolved to the spider method of that name."""

        class FakeSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

            async def parse_detail(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield {"detail": True}

        crawler = FakeSpider()
        req = Request("https://example.com")
        req._callback_name = "parse_detail"

        req._restore_callback(crawler)  # type: ignore[arg-type]

        assert req.callback == crawler.parse_detail
        assert not hasattr(req, "_callback_name")

    def test_restore_callback_falls_back_to_parse(self):
        """A name the spider does not define resolves to ``spider.parse`` instead."""

        class FakeSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        crawler = FakeSpider()
        req = Request("https://example.com")
        req._callback_name = "nonexistent_method"

        req._restore_callback(crawler)  # type: ignore[arg-type]

        assert req.callback == crawler.parse
        assert not hasattr(req, "_callback_name")

    def test_restore_callback_with_none_name(self):
        """A ``None`` callback name is still removed from the request afterwards."""

        class FakeSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        crawler = FakeSpider()
        req = Request("https://example.com")
        req._callback_name = None

        req._restore_callback(crawler)  # type: ignore[arg-type]

        # The temporary attribute must be gone once restoration has run.
        assert not hasattr(req, "_callback_name")

    def test_restore_callback_without_callback_name_attr(self):
        """Restoring must not raise when the test never assigns ``_callback_name``."""

        class FakeSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        crawler = FakeSpider()
        req = Request("https://example.com")

        # _callback_name is deliberately left unset; the call simply has to succeed.
        req._restore_callback(crawler)  # type: ignore[arg-type]