File size: 13,449 Bytes
f37031b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ae256c
 
f37031b
5ae256c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f37031b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ae256c
 
 
 
 
 
 
 
f37031b
 
 
 
 
 
5ae256c
 
 
 
 
 
f37031b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
"""Tests for the Request class."""

import pickle

import pytest

from scrapling.spiders.request import Request
from scrapling.core._types import Any, Dict, AsyncGenerator


class TestRequestCreation:
    """Construction of Request objects and the attributes they expose."""

    def test_basic_request_creation(self):
        """A request built from only a URL carries the defaults asserted below."""
        req = Request("https://example.com")

        assert req.url == "https://example.com"
        assert req.sid == ""
        assert req.callback is None
        assert req.priority == 0
        assert req.dont_filter is False
        assert req.meta == {}
        assert req._retry_count == 0
        assert req._session_kwargs == {}

    def test_request_with_all_parameters(self):
        """Every explicit argument is stored; extra keywords land in _session_kwargs."""

        async def my_callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield {"test": "data"}

        # Keywords the constructor does not name explicitly; the test expects
        # them to be collected into ``_session_kwargs``.
        session_options = {"proxy": "http://proxy:8080", "timeout": 30}

        req = Request(
            url="https://example.com/page",
            sid="my_session",
            callback=my_callback,
            priority=10,
            dont_filter=True,
            meta={"key": "value"},
            _retry_count=2,
            **session_options,
        )

        assert req.url == "https://example.com/page"
        assert req.sid == "my_session"
        assert req.callback == my_callback
        assert req.priority == 10
        assert req.dont_filter is True
        assert req.meta == {"key": "value"}
        assert req._retry_count == 2
        assert req._session_kwargs == {"proxy": "http://proxy:8080", "timeout": 30}

    def test_request_meta_default_is_empty_dict(self):
        """Two requests built without meta must not share one dict object."""
        first = Request("https://example.com")
        second = Request("https://example.com")

        # Mutating one request's meta must leave the other's untouched.
        first.meta["key"] = "value"

        assert first.meta == {"key": "value"}
        assert second.meta == {}


class TestRequestProperties:
    """Checks for the ``domain`` property and ``update_fingerprint()``."""

    def test_domain_extraction(self):
        """``domain`` is the host part of the URL, without path or query."""
        req = Request("https://www.example.com/path/page.html?query=1")
        assert req.domain == "www.example.com"

    def test_domain_with_port(self):
        """An explicit port number stays part of ``domain``."""
        req = Request("http://localhost:8080/api")
        assert req.domain == "localhost:8080"

    def test_domain_with_subdomain(self):
        """Nested subdomains are kept in full."""
        req = Request("https://api.v2.example.com/endpoint")
        assert req.domain == "api.v2.example.com"

    def test_fingerprint_returns_bytes(self):
        """``update_fingerprint()`` returns a 20-byte ``bytes`` value."""
        digest = Request("https://example.com").update_fingerprint()
        assert isinstance(digest, bytes)
        assert len(digest) == 20  # SHA1 produces 20 bytes

    def test_fingerprint_is_deterministic(self):
        """Two identically constructed requests yield the same fingerprint."""
        first = Request("https://example.com", data={"key": "value"})
        second = Request("https://example.com", data={"key": "value"})

        first_digest = first.update_fingerprint()
        second_digest = second.update_fingerprint()

        assert first_digest == second_digest

    def test_fingerprint_different_urls(self):
        """Requests for different URLs yield different fingerprints."""
        page_one = Request("https://example.com/page1")
        page_two = Request("https://example.com/page2")

        digest_one = page_one.update_fingerprint()
        digest_two = page_two.update_fingerprint()

        assert digest_one != digest_two


class TestRequestCopy:
    """Checks for ``Request.copy()``."""

    def test_copy_creates_independent_request(self):
        """``copy()`` yields a distinct object whose fields equal the source's."""

        async def callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield None

        source = Request(
            url="https://example.com",
            sid="session",
            callback=callback,
            priority=5,
            dont_filter=True,
            meta={"original": True},
            _retry_count=1,
            proxy="http://proxy:8080",
        )

        duplicate = source.copy()

        # Every field must compare equal between the copy and the source.
        copied_fields = (
            "url",
            "sid",
            "callback",
            "priority",
            "dont_filter",
            "meta",
            "_retry_count",
            "_session_kwargs",
        )
        for field in copied_fields:
            assert getattr(duplicate, field) == getattr(source, field)

        # ...while the request itself and its meta dict are separate objects.
        assert duplicate is not source
        assert duplicate.meta is not source.meta  # Meta should be a copy

    def test_copy_meta_is_independent(self):
        """Writes to the copy's meta must not be visible through the source."""
        source = Request("https://example.com", meta={"key": "original"})
        duplicate = source.copy()

        duplicate.meta["key"] = "modified"
        duplicate.meta["new_key"] = "new_value"

        assert source.meta == {"key": "original"}
        assert duplicate.meta == {"key": "modified", "new_key": "new_value"}


class TestRequestComparison:
    """Test Request comparison operators.

    Ordering (``<`` / ``>``) is exercised through ``priority``; equality is
    exercised through the fingerprint set by ``update_fingerprint()``.
    """

    def test_priority_less_than(self):
        """Test less than comparison by priority."""
        low_priority = Request("https://example.com/1", priority=1)
        high_priority = Request("https://example.com/2", priority=10)

        assert low_priority < high_priority
        assert not high_priority < low_priority

    def test_priority_greater_than(self):
        """Test greater than comparison by priority."""
        low_priority = Request("https://example.com/1", priority=1)
        high_priority = Request("https://example.com/2", priority=10)

        assert high_priority > low_priority
        assert not low_priority > high_priority

    def test_equality_by_fingerprint(self):
        """Test equality comparison by fingerprint."""
        r1 = Request("https://example.com")
        r2 = Request("https://example.com")
        r3 = Request("https://example.com/other")

        # Generate fingerprints first (required for equality)
        r1.update_fingerprint()
        r2.update_fingerprint()
        r3.update_fingerprint()

        assert r1 == r2
        assert r1 != r3

    def test_equality_different_priorities_same_fingerprint(self):
        """Test requests with same fingerprint are equal despite different priorities."""
        r1 = Request("https://example.com", priority=1)
        r2 = Request("https://example.com", priority=100)

        # Generate fingerprints first
        r1.update_fingerprint()
        r2.update_fingerprint()

        assert r1 == r2  # Same fingerprint means equal

    def test_comparison_with_non_request(self):
        """Test comparison with non-Request types returns NotImplemented."""
        request = Request("https://example.com")

        # The dunder methods are called directly because the operators would
        # make Python fall back to the reflected operation or raise TypeError,
        # hiding the raw return value. NotImplemented is a singleton, so it is
        # checked by identity: a value that merely compares equal must not
        # satisfy the test.
        assert request.__lt__("not a request") is NotImplemented
        assert request.__gt__("not a request") is NotImplemented
        assert request.__eq__("not a request") is NotImplemented


class TestRequestStringRepresentation:
    """Checks for ``str()`` and ``repr()`` of a Request."""

    def test_str_returns_url(self):
        """``str(request)`` is exactly the request URL."""
        req = Request("https://example.com/page")
        rendered = str(req)
        assert rendered == "https://example.com/page"

    def test_repr_without_callback(self):
        """Without a callback, the repr names the class, URL, priority and ``callback=None``."""
        rendered = repr(Request("https://example.com", priority=5))

        # Each fragment must appear somewhere in the repr text.
        expected_fragments = (
            "Request",
            "https://example.com",
            "priority=5",
            "callback=None",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_repr_with_callback(self):
        """With a callback, the repr shows the callback function's name."""

        async def my_custom_callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield None

        req = Request("https://example.com", callback=my_custom_callback)
        rendered = repr(req)

        assert "callback=my_custom_callback" in rendered


class TestRequestPickling:
    """Checks for pickling a Request and for ``__getstate__`` / ``__setstate__``."""

    def test_pickle_without_callback(self):
        """A callback-free request survives a pickle round trip unchanged."""
        source = Request(
            url="https://example.com",
            sid="session",
            priority=5,
            meta={"key": "value"},
        )

        revived = pickle.loads(pickle.dumps(source))

        assert revived.url == source.url
        assert revived.sid == source.sid
        assert revived.priority == source.priority
        assert revived.meta == source.meta
        assert revived.callback is None

    def test_pickle_with_callback_stores_name(self):
        """``__getstate__`` stores the callback's name and blanks the callback itself."""

        async def parse_page(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield {"data": "test"}

        source = Request("https://example.com", callback=parse_page)

        # Inspect the state dict directly instead of doing a full round trip.
        snapshot = source.__getstate__()
        assert snapshot["_callback_name"] == "parse_page"
        assert snapshot["callback"] is None

    def test_pickle_with_none_callback(self):
        """With no callback, both the stored name and the callback are None."""
        source = Request("https://example.com", callback=None)

        snapshot = source.__getstate__()
        assert snapshot["_callback_name"] is None
        assert snapshot["callback"] is None

    def test_setstate_stores_callback_name(self):
        """``__setstate__`` exposes ``_callback_name`` from the state dict on the instance."""
        target = Request("https://example.com")
        incoming_state = {
            "url": "https://example.com",
            "sid": "",
            "callback": None,
            "priority": 0,
            "dont_filter": False,
            "meta": {},
            "_retry_count": 0,
            "_session_kwargs": {},
            "_callback_name": "custom_parse",
        }

        target.__setstate__(incoming_state)

        assert hasattr(target, "_callback_name")
        assert target._callback_name == "custom_parse"

    def test_pickle_roundtrip_preserves_session_kwargs(self):
        """Extra keyword arguments survive a pickle round trip in ``_session_kwargs``."""
        source = Request(
            "https://example.com",
            proxy="http://proxy:8080",
            timeout=30,
            headers={"User-Agent": "test"},
        )

        revived = pickle.loads(pickle.dumps(source))

        assert revived._session_kwargs == {
            "proxy": "http://proxy:8080",
            "timeout": 30,
            "headers": {"User-Agent": "test"},
        }


class TestRequestRestoreCallback:
    """Checks for ``Request._restore_callback()`` against a stand-in spider."""

    def test_restore_callback_from_spider(self):
        """A stored name that matches a spider method is resolved to that method."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

            async def parse_detail(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield {"detail": True}

        req = Request("https://example.com")
        req._callback_name = "parse_detail"
        owner = MockSpider()

        req._restore_callback(owner)  # type: ignore[arg-type]

        assert req.callback == owner.parse_detail
        # The temporary name attribute is expected to be removed afterwards.
        assert not hasattr(req, "_callback_name")

    def test_restore_callback_falls_back_to_parse(self):
        """A stored name the spider lacks resolves to ``spider.parse``."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        req = Request("https://example.com")
        req._callback_name = "nonexistent_method"
        owner = MockSpider()

        req._restore_callback(owner)  # type: ignore[arg-type]

        assert req.callback == owner.parse
        assert not hasattr(req, "_callback_name")

    def test_restore_callback_with_none_name(self):
        """A ``_callback_name`` of None is still removed from the request."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        req = Request("https://example.com")
        req._callback_name = None
        owner = MockSpider()

        req._restore_callback(owner)  # type: ignore[arg-type]

        # Only the cleanup of the attribute is checked here.
        assert not hasattr(req, "_callback_name")

    def test_restore_callback_without_callback_name_attr(self):
        """Restoring on a request that never had ``_callback_name`` set must not raise."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        # ``_callback_name`` is deliberately left unset on this request.
        req = Request("https://example.com")
        owner = MockSpider()

        # The call completing without an exception is the whole assertion.
        req._restore_callback(owner)  # type: ignore[arg-type]