Karim shoair committed on
Commit
f37031b
·
1 Parent(s): fab7a59

test: add tests for the spiders system

Browse files

This was generated by Opus via Claude Code. It looks very good based on my review and the instructions I gave, but I will take another look later.

tests/spiders/__init__.py ADDED
File without changes
tests/spiders/test_checkpoint.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the CheckpointManager and CheckpointData classes."""
2
+
3
+ import pickle
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+ import anyio
9
+
10
+ from scrapling.spiders.request import Request
11
+ from scrapling.spiders.checkpoint import CheckpointData, CheckpointManager
12
+
13
+
14
class TestCheckpointData:
    """Unit tests for the CheckpointData dataclass."""

    def test_default_values(self):
        """A freshly constructed CheckpointData starts out empty."""
        checkpoint = CheckpointData()

        assert checkpoint.requests == []
        assert checkpoint.seen == set()

    def test_with_requests_and_seen(self):
        """Requests and seen fingerprints are stored exactly as given."""
        pending = [
            Request("https://example.com/1", priority=10),
            Request("https://example.com/2", priority=5),
        ]
        fingerprints = {"url1", "url2", "url3"}

        checkpoint = CheckpointData(requests=pending, seen=fingerprints)

        assert len(checkpoint.requests) == 2
        assert checkpoint.requests[0].url == "https://example.com/1"
        assert checkpoint.seen == {"url1", "url2", "url3"}

    def test_pickle_roundtrip(self):
        """A pickle round-trip preserves both the requests and the seen set."""
        checkpoint = CheckpointData(
            requests=[Request("https://example.com", priority=5)],
            seen={"fingerprint1", "fingerprint2"},
        )

        restored = pickle.loads(pickle.dumps(checkpoint))

        assert len(restored.requests) == 1
        assert restored.requests[0].url == "https://example.com"
        assert restored.seen == {"fingerprint1", "fingerprint2"}
50
+
51
+
52
class TestCheckpointManagerInit:
    """Test CheckpointManager initialization.

    Path assertions are normalized through ``pathlib.Path`` on both sides
    so the tests also pass on Windows, where ``str(Path(...))`` renders
    backslash separators instead of the hard-coded POSIX ``/``.
    """

    def test_init_with_string_path(self):
        """Test initialization with a string path."""
        manager = CheckpointManager("/tmp/test_crawl")

        # Compare Path objects, not raw strings: separator-agnostic.
        assert Path(str(manager.crawldir)) == Path("/tmp/test_crawl")
        assert manager.interval == 300.0

    def test_init_with_pathlib_path(self):
        """Test initialization with a pathlib.Path."""
        path = Path("/tmp/test_crawl")
        manager = CheckpointManager(path)

        assert Path(str(manager.crawldir)) == Path("/tmp/test_crawl")

    def test_init_with_custom_interval(self):
        """Test initialization with a custom interval."""
        manager = CheckpointManager("/tmp/test", interval=60.0)
        assert manager.interval == 60.0

    def test_init_with_zero_interval(self):
        """Test initialization with zero interval (disables periodic checkpoints)."""
        manager = CheckpointManager("/tmp/test", interval=0)
        assert manager.interval == 0

    def test_init_with_negative_interval_raises(self):
        """Test that a negative interval raises ValueError."""
        with pytest.raises(ValueError, match="greater than 0"):
            CheckpointManager("/tmp/test", interval=-1)

    def test_init_with_invalid_interval_type_raises(self):
        """Test that an invalid interval type raises TypeError."""
        with pytest.raises(TypeError, match="integer or float"):
            CheckpointManager("/tmp/test", interval="invalid")  # type: ignore

    def test_checkpoint_file_path(self):
        """Test that the checkpoint file path is correctly constructed."""
        manager = CheckpointManager("/tmp/test_crawl")

        # Build the expectation with pathlib so the separator matches the OS.
        expected_path = Path("/tmp/test_crawl") / "checkpoint.pkl"
        assert Path(str(manager._checkpoint_path)) == expected_path
95
+
96
+
97
class TestCheckpointManagerOperations:
    """Test CheckpointManager save/load/cleanup operations."""

    @pytest.fixture
    def temp_dir(self):
        """Create a temporary directory for testing."""
        # Yield (not return) so the directory lives for the test's duration
        # and is removed automatically afterwards.
        with tempfile.TemporaryDirectory() as tmpdir:
            yield Path(tmpdir)

    @pytest.mark.asyncio
    async def test_has_checkpoint_false_when_no_file(self, temp_dir: Path):
        """Test has_checkpoint returns False when no checkpoint exists."""
        manager = CheckpointManager(temp_dir / "crawl")

        result = await manager.has_checkpoint()

        assert result is False

    @pytest.mark.asyncio
    async def test_save_creates_checkpoint_file(self, temp_dir: Path):
        """Test that save creates the checkpoint file."""
        crawl_dir = temp_dir / "crawl"
        manager = CheckpointManager(crawl_dir)

        data = CheckpointData(
            requests=[Request("https://example.com")],
            seen={"fp1", "fp2"},
        )

        await manager.save(data)

        # The on-disk filename is fixed to "checkpoint.pkl" inside crawldir.
        checkpoint_path = crawl_dir / "checkpoint.pkl"
        assert checkpoint_path.exists()

    @pytest.mark.asyncio
    async def test_save_creates_directory_if_not_exists(self, temp_dir: Path):
        """Test that save creates the directory if it doesn't exist."""
        # Several levels deep: save must mkdir(parents=True) equivalent.
        crawl_dir = temp_dir / "nested" / "crawl" / "dir"
        manager = CheckpointManager(crawl_dir)

        data = CheckpointData()
        await manager.save(data)

        assert crawl_dir.exists()

    @pytest.mark.asyncio
    async def test_has_checkpoint_true_after_save(self, temp_dir: Path):
        """Test has_checkpoint returns True after saving."""
        manager = CheckpointManager(temp_dir / "crawl")

        data = CheckpointData()
        await manager.save(data)

        result = await manager.has_checkpoint()
        assert result is True

    @pytest.mark.asyncio
    async def test_load_returns_none_when_no_checkpoint(self, temp_dir: Path):
        """Test load returns None when no checkpoint exists."""
        manager = CheckpointManager(temp_dir / "crawl")

        result = await manager.load()

        assert result is None

    @pytest.mark.asyncio
    async def test_save_and_load_roundtrip(self, temp_dir: Path):
        """Test saving and loading checkpoint data."""
        manager = CheckpointManager(temp_dir / "crawl")

        original_data = CheckpointData(
            requests=[
                Request("https://example.com/1", priority=10),
                Request("https://example.com/2", priority=5),
            ],
            seen={"fp1", "fp2", "fp3"},
        )

        await manager.save(original_data)
        loaded_data = await manager.load()

        # Requests, their order, priorities, and the seen set must survive.
        assert loaded_data is not None
        assert len(loaded_data.requests) == 2
        assert loaded_data.requests[0].url == "https://example.com/1"
        assert loaded_data.requests[0].priority == 10
        assert loaded_data.seen == {"fp1", "fp2", "fp3"}

    @pytest.mark.asyncio
    async def test_save_is_atomic(self, temp_dir: Path):
        """Test that save uses atomic write (temp file + rename)."""
        crawl_dir = temp_dir / "crawl"
        manager = CheckpointManager(crawl_dir)

        data = CheckpointData(requests=[Request("https://example.com")])
        await manager.save(data)

        # Temp file should not exist after successful save
        # NOTE(review): assumes the implementation names its scratch file
        # "checkpoint.tmp" — an implementation detail this test pins.
        temp_path = crawl_dir / "checkpoint.tmp"
        assert not temp_path.exists()

        # Checkpoint file should exist
        checkpoint_path = crawl_dir / "checkpoint.pkl"
        assert checkpoint_path.exists()

    @pytest.mark.asyncio
    async def test_cleanup_removes_checkpoint_file(self, temp_dir: Path):
        """Test that cleanup removes the checkpoint file."""
        crawl_dir = temp_dir / "crawl"
        manager = CheckpointManager(crawl_dir)

        # Save a checkpoint first
        data = CheckpointData()
        await manager.save(data)

        checkpoint_path = crawl_dir / "checkpoint.pkl"
        assert checkpoint_path.exists()

        # Cleanup should remove it
        await manager.cleanup()

        assert not checkpoint_path.exists()

    @pytest.mark.asyncio
    async def test_cleanup_no_error_when_no_file(self, temp_dir: Path):
        """Test that cleanup doesn't raise error when no file exists."""
        manager = CheckpointManager(temp_dir / "crawl")

        # Should not raise
        await manager.cleanup()

    @pytest.mark.asyncio
    async def test_load_returns_none_on_corrupt_file(self, temp_dir: Path):
        """Test load returns None when checkpoint file is corrupt."""
        crawl_dir = temp_dir / "crawl"
        crawl_dir.mkdir(parents=True)

        # Write garbage where the pickle is expected; load must not raise.
        checkpoint_path = crawl_dir / "checkpoint.pkl"
        checkpoint_path.write_bytes(b"not valid pickle data")

        manager = CheckpointManager(crawl_dir)

        result = await manager.load()

        assert result is None

    @pytest.mark.asyncio
    async def test_multiple_saves_overwrite(self, temp_dir: Path):
        """Test that multiple saves overwrite the checkpoint."""
        manager = CheckpointManager(temp_dir / "crawl")

        # First save
        data1 = CheckpointData(
            requests=[Request("https://example.com/1")],
            seen={"fp1"},
        )
        await manager.save(data1)

        # Second save
        data2 = CheckpointData(
            requests=[Request("https://example.com/2"), Request("https://example.com/3")],
            seen={"fp2", "fp3"},
        )
        await manager.save(data2)

        # Load should return the second save
        loaded = await manager.load()

        assert loaded is not None
        assert len(loaded.requests) == 2
        assert loaded.requests[0].url == "https://example.com/2"
        assert loaded.seen == {"fp2", "fp3"}
268
+
269
+
270
class TestCheckpointManagerEdgeCases:
    """Edge-case coverage for CheckpointManager."""

    @pytest.fixture
    def temp_dir(self):
        """Yield a throwaway directory that is cleaned up after the test."""
        with tempfile.TemporaryDirectory() as tmpdir:
            yield Path(tmpdir)

    @pytest.mark.asyncio
    async def test_save_empty_checkpoint(self, temp_dir: Path):
        """An entirely empty checkpoint survives a save/load round-trip."""
        manager = CheckpointManager(temp_dir / "crawl")

        await manager.save(CheckpointData(requests=[], seen=set()))
        loaded = await manager.load()

        assert loaded is not None
        assert loaded.requests == []
        assert loaded.seen == set()

    @pytest.mark.asyncio
    async def test_save_large_checkpoint(self, temp_dir: Path):
        """A checkpoint holding many requests round-trips intact."""
        manager = CheckpointManager(temp_dir / "crawl")

        # 1000 pending requests plus 2000 seen fingerprints.
        pending = [
            Request(f"https://example.com/{i}", priority=i % 10)
            for i in range(1000)
        ]
        fingerprints = {f"fp_{i}" for i in range(2000)}

        await manager.save(CheckpointData(requests=pending, seen=fingerprints))
        loaded = await manager.load()

        assert loaded is not None
        assert len(loaded.requests) == 1000
        assert len(loaded.seen) == 2000

    @pytest.mark.asyncio
    async def test_requests_preserve_metadata(self, temp_dir: Path):
        """Every request attribute survives the checkpoint round-trip."""
        manager = CheckpointManager(temp_dir / "crawl")

        source_request = Request(
            url="https://example.com",
            sid="my_session",
            priority=42,
            dont_filter=True,
            meta={"item_id": 123, "page": 5},
            proxy="http://proxy:8080",
        )

        await manager.save(CheckpointData(requests=[source_request], seen=set()))
        loaded = await manager.load()

        assert loaded is not None
        restored = loaded.requests[0]

        assert restored.url == "https://example.com"
        assert restored.sid == "my_session"
        assert restored.priority == 42
        assert restored.dont_filter is True
        assert restored.meta == {"item_id": 123, "page": 5}
        assert restored._session_kwargs == {"proxy": "http://proxy:8080"}
tests/spiders/test_request.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the Request class."""
2
+
3
+ import pickle
4
+
5
+ import pytest
6
+
7
+ from scrapling.spiders.request import Request
8
+ from scrapling.core._types import Any, Dict, AsyncGenerator
9
+
10
+
11
class TestRequestCreation:
    """Tests covering Request construction and default attribute values."""

    def test_basic_request_creation(self):
        """A URL-only request gets neutral defaults everywhere else."""
        request = Request("https://example.com")

        assert request.url == "https://example.com"
        assert request.sid == ""
        assert request.callback is None
        assert request.priority == 0
        assert request.dont_filter is False
        assert request.meta == {}
        assert request._retry_count == 0
        assert request._session_kwargs == {}

    def test_request_with_all_parameters(self):
        """Every constructor argument lands on the matching attribute."""

        async def my_callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield {"test": "data"}

        request = Request(
            url="https://example.com/page",
            sid="my_session",
            callback=my_callback,
            priority=10,
            dont_filter=True,
            meta={"key": "value"},
            _retry_count=2,
            proxy="http://proxy:8080",
            timeout=30,
        )

        assert request.url == "https://example.com/page"
        assert request.sid == "my_session"
        assert request.callback == my_callback
        assert request.priority == 10
        assert request.dont_filter is True
        assert request.meta == {"key": "value"}
        assert request._retry_count == 2
        # Extra keyword arguments are collected into the session kwargs.
        assert request._session_kwargs == {"proxy": "http://proxy:8080", "timeout": 30}

    def test_request_meta_default_is_empty_dict(self):
        """Default meta dicts are per-instance, never a shared reference."""
        first = Request("https://example.com")
        second = Request("https://example.com")

        first.meta["key"] = "value"

        assert first.meta == {"key": "value"}
        assert second.meta == {}
63
+
64
+
65
class TestRequestProperties:
    """Tests for Request's derived properties (domain and fingerprint)."""

    def test_domain_extraction(self):
        """The domain property is the URL's netloc, path and query dropped."""
        assert Request("https://www.example.com/path/page.html?query=1").domain == "www.example.com"

    def test_domain_with_port(self):
        """A port number stays attached to the domain."""
        assert Request("http://localhost:8080/api").domain == "localhost:8080"

    def test_domain_with_subdomain(self):
        """Subdomains are kept verbatim."""
        assert Request("https://api.v2.example.com/endpoint").domain == "api.v2.example.com"

    def test_fingerprint_includes_session_and_url(self):
        """The fingerprint is the session id and URL joined by a colon."""
        assert Request("https://example.com", sid="session1")._fp == "session1:https://example.com"

    def test_fingerprint_empty_session(self):
        """An empty session id leaves a leading colon in the fingerprint."""
        assert Request("https://example.com")._fp == ":https://example.com"
92
+
93
+
94
class TestRequestCopy:
    """Tests for Request.copy()."""

    def test_copy_creates_independent_request(self):
        """copy() duplicates every field into a brand-new object."""

        async def callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield None

        source = Request(
            url="https://example.com",
            sid="session",
            callback=callback,
            priority=5,
            dont_filter=True,
            meta={"original": True},
            _retry_count=1,
            proxy="http://proxy:8080",
        )

        clone = source.copy()

        # Field-by-field equality against the source request.
        assert clone.url == source.url
        assert clone.sid == source.sid
        assert clone.callback == source.callback
        assert clone.priority == source.priority
        assert clone.dont_filter == source.dont_filter
        assert clone.meta == source.meta
        assert clone._retry_count == source._retry_count
        assert clone._session_kwargs == source._session_kwargs

        # Distinct identities: the meta dict must be copied, not shared.
        assert clone is not source
        assert clone.meta is not source.meta

    def test_copy_meta_is_independent(self):
        """Mutating the copy's meta leaves the original untouched."""
        source = Request("https://example.com", meta={"key": "original"})
        clone = source.copy()

        clone.meta["key"] = "modified"
        clone.meta["new_key"] = "new_value"

        assert source.meta == {"key": "original"}
        assert clone.meta == {"key": "modified", "new_key": "new_value"}
140
+
141
+
142
class TestRequestComparison:
    """Test Request comparison operators."""

    def test_priority_less_than(self):
        """Test less than comparison by priority."""
        low_priority = Request("https://example.com/1", priority=1)
        high_priority = Request("https://example.com/2", priority=10)

        assert low_priority < high_priority
        assert not high_priority < low_priority

    def test_priority_greater_than(self):
        """Test greater than comparison by priority."""
        low_priority = Request("https://example.com/1", priority=1)
        high_priority = Request("https://example.com/2", priority=10)

        assert high_priority > low_priority
        assert not low_priority > high_priority

    def test_equality_by_fingerprint(self):
        """Test equality comparison by fingerprint."""
        r1 = Request("https://example.com", sid="session1")
        r2 = Request("https://example.com", sid="session1")
        r3 = Request("https://example.com", sid="session2")

        assert r1 == r2
        assert r1 != r3

    def test_equality_different_priorities_same_fingerprint(self):
        """Test requests with same fingerprint are equal despite different priorities."""
        r1 = Request("https://example.com", sid="s1", priority=1)
        r2 = Request("https://example.com", sid="s1", priority=100)

        assert r1 == r2  # Same fingerprint means equal

    def test_comparison_with_non_request(self):
        """Dunder comparisons with non-Request types return NotImplemented."""
        request = Request("https://example.com")

        # NotImplemented is a singleton: check it by identity with `is`,
        # not `==` — equality only works by accident of object identity.
        assert request.__lt__("not a request") is NotImplemented
        assert request.__gt__("not a request") is NotImplemented
        assert request.__eq__("not a request") is NotImplemented
184
+
185
+
186
class TestRequestStringRepresentation:
    """Tests for __str__ and __repr__ of Request."""

    def test_str_returns_url(self):
        """str() of a request is simply its URL."""
        assert str(Request("https://example.com/page")) == "https://example.com/page"

    def test_repr_without_callback(self):
        """repr() mentions the class, URL, priority, and a None callback."""
        rendered = repr(Request("https://example.com", priority=5))

        for expected in ("Request", "https://example.com", "priority=5", "callback=None"):
            assert expected in rendered

    def test_repr_with_callback(self):
        """repr() shows the callback's function name when one is set."""

        async def my_custom_callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield None

        rendered = repr(Request("https://example.com", callback=my_custom_callback))

        assert "callback=my_custom_callback" in rendered
214
+
215
+
216
class TestRequestPickling:
    """Tests for Request serialization used by checkpointing."""

    def test_pickle_without_callback(self):
        """A callback-less request survives pickling unchanged."""
        source = Request(
            url="https://example.com",
            sid="session",
            priority=5,
            meta={"key": "value"},
        )

        restored = pickle.loads(pickle.dumps(source))

        assert restored.url == source.url
        assert restored.sid == source.sid
        assert restored.priority == source.priority
        assert restored.meta == source.meta
        assert restored.callback is None

    def test_pickle_with_callback_stores_name(self):
        """__getstate__ swaps the callback object for its name."""

        async def parse_page(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
            yield {"data": "test"}

        source = Request("https://example.com", callback=parse_page)

        # Callables don't pickle; the state keeps only the name.
        state = source.__getstate__()
        assert state["_callback_name"] == "parse_page"
        assert state["callback"] is None

    def test_pickle_with_none_callback(self):
        """With no callback, the stored callback name is None too."""
        state = Request("https://example.com", callback=None).__getstate__()

        assert state["_callback_name"] is None
        assert state["callback"] is None

    def test_setstate_stores_callback_name(self):
        """__setstate__ keeps the _callback_name for later restoration."""
        request = Request("https://example.com")
        state = {
            "url": "https://example.com",
            "sid": "",
            "callback": None,
            "priority": 0,
            "dont_filter": False,
            "meta": {},
            "_retry_count": 0,
            "_session_kwargs": {},
            "_callback_name": "custom_parse",
        }

        request.__setstate__(state)

        assert hasattr(request, "_callback_name")
        assert request._callback_name == "custom_parse"

    def test_pickle_roundtrip_preserves_session_kwargs(self):
        """Session kwargs round-trip through pickle intact."""
        source = Request(
            "https://example.com",
            proxy="http://proxy:8080",
            timeout=30,
            headers={"User-Agent": "test"},
        )

        restored = pickle.loads(pickle.dumps(source))

        assert restored._session_kwargs == {
            "proxy": "http://proxy:8080",
            "timeout": 30,
            "headers": {"User-Agent": "test"},
        }
295
+
296
+
297
class TestRequestRestoreCallback:
    """Test callback restoration from spider."""

    def test_restore_callback_from_spider(self):
        """Test restoring callback from spider instance."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

            async def parse_detail(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield {"detail": True}

        spider = MockSpider()
        request = Request("https://example.com")
        # Simulate post-unpickle state: only the callback's name is kept.
        request._callback_name = "parse_detail"

        request._restore_callback(spider)  # type: ignore[arg-type]

        assert request.callback == spider.parse_detail
        # The temporary name attribute must be removed after restoration.
        assert not hasattr(request, "_callback_name")

    def test_restore_callback_falls_back_to_parse(self):
        """Test that missing callback falls back to spider.parse."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        spider = MockSpider()
        request = Request("https://example.com")
        # The spider has no method by this name, so parse is the fallback.
        request._callback_name = "nonexistent_method"

        request._restore_callback(spider)  # type: ignore[arg-type]

        assert request.callback == spider.parse
        assert not hasattr(request, "_callback_name")

    def test_restore_callback_with_none_name(self):
        """Test restore callback when _callback_name is None."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        spider = MockSpider()
        request = Request("https://example.com")
        request._callback_name = None

        request._restore_callback(spider)  # type: ignore[arg-type]

        # Should clean up _callback_name attribute
        assert not hasattr(request, "_callback_name")

    def test_restore_callback_without_callback_name_attr(self):
        """Test restore callback when _callback_name attribute doesn't exist."""

        class MockSpider:
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        spider = MockSpider()
        request = Request("https://example.com")
        # Don't set _callback_name

        # Should not raise an error
        request._restore_callback(spider)  # type: ignore[arg-type]
tests/spiders/test_result.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the result module (ItemList, CrawlStats, CrawlResult)."""
2
+
3
+ import json
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+ from scrapling.spiders.result import ItemList, CrawlStats, CrawlResult
10
+
11
+
12
class TestItemList:
    """Tests for ItemList and its JSON export helpers."""

    def test_itemlist_is_list(self):
        """ItemList inherits from the builtin list."""
        assert isinstance(ItemList(), list)

    def test_itemlist_basic_operations(self):
        """Standard list operations behave as expected."""
        items = ItemList()
        items.append({"id": 1})
        items.append({"id": 2})

        assert len(items) == 2
        assert items[0] == {"id": 1}

    def test_to_json_creates_file(self):
        """to_json writes the items out as a JSON array."""
        items = ItemList()
        items.append({"name": "test", "value": 123})
        items.append({"name": "test2", "value": 456})

        with tempfile.TemporaryDirectory() as tmpdir:
            target = Path(tmpdir) / "output.json"
            items.to_json(target)

            assert target.exists()

            exported = json.loads(target.read_text())
            assert len(exported) == 2
            assert exported[0]["name"] == "test"

    def test_to_json_creates_parent_directory(self):
        """to_json builds any missing parent directories."""
        items = ItemList()
        items.append({"data": "test"})

        with tempfile.TemporaryDirectory() as tmpdir:
            target = Path(tmpdir) / "nested" / "dirs" / "output.json"
            items.to_json(target)

            assert target.exists()

    def test_to_json_with_indent(self):
        """Indented output is spread over multiple lines."""
        items = ItemList()
        items.append({"key": "value"})

        with tempfile.TemporaryDirectory() as tmpdir:
            target = Path(tmpdir) / "output.json"
            items.to_json(target, indent=True)

            # Pretty-printed JSON necessarily contains newlines.
            assert "\n" in target.read_text()

    def test_to_jsonl_creates_file(self):
        """to_jsonl writes one JSON document per line."""
        items = ItemList()
        items.append({"id": 1, "name": "first"})
        items.append({"id": 2, "name": "second"})
        items.append({"id": 3, "name": "third"})

        with tempfile.TemporaryDirectory() as tmpdir:
            target = Path(tmpdir) / "output.jsonl"
            items.to_jsonl(target)

            assert target.exists()

            lines = target.read_text().strip().split("\n")
            assert len(lines) == 3

            # Each line must parse as standalone JSON carrying both keys.
            for line in lines:
                record = json.loads(line)
                assert "id" in record
                assert "name" in record

    def test_to_jsonl_one_object_per_line(self):
        """Objects come out one per line, in insertion order."""
        items = ItemList()
        items.append({"line": 1})
        items.append({"line": 2})

        with tempfile.TemporaryDirectory() as tmpdir:
            target = Path(tmpdir) / "output.jsonl"
            items.to_jsonl(target)

            lines = target.read_text().strip().split("\n")

            assert json.loads(lines[0])["line"] == 1
            assert json.loads(lines[1])["line"] == 2
107
+
108
+
109
class TestCrawlStats:
    """Test CrawlStats dataclass."""

    def test_default_values(self):
        """Test CrawlStats default values."""
        stats = CrawlStats()

        assert stats.requests_count == 0
        assert stats.concurrent_requests == 0
        assert stats.failed_requests_count == 0
        assert stats.response_bytes == 0
        assert stats.items_scraped == 0
        assert stats.items_dropped == 0
        assert stats.start_time == 0.0
        assert stats.end_time == 0.0
        assert stats.custom_stats == {}
        assert stats.response_status_count == {}
        assert stats.proxies == []

    def test_elapsed_seconds(self):
        """Test elapsed_seconds property."""
        stats = CrawlStats(start_time=100.0, end_time=150.0)

        assert stats.elapsed_seconds == 50.0

    def test_requests_per_second(self):
        """Test requests_per_second calculation."""
        stats = CrawlStats(
            requests_count=100,
            start_time=0.0,
            end_time=10.0,
        )

        assert stats.requests_per_second == 10.0

    def test_requests_per_second_zero_elapsed(self):
        """Test requests_per_second when elapsed is zero."""
        # Guards against a ZeroDivisionError in the property.
        stats = CrawlStats(
            requests_count=100,
            start_time=0.0,
            end_time=0.0,
        )

        assert stats.requests_per_second == 0.0

    def test_increment_status(self):
        """Test increment_status method."""
        stats = CrawlStats()

        stats.increment_status(200)
        stats.increment_status(200)
        stats.increment_status(404)

        # Status codes are keyed as "status_<code>" strings.
        assert stats.response_status_count == {"status_200": 2, "status_404": 1}

    def test_increment_response_bytes(self):
        """Test increment_response_bytes method."""
        stats = CrawlStats()

        stats.increment_response_bytes("example.com", 1000)
        stats.increment_response_bytes("example.com", 500)
        stats.increment_response_bytes("other.com", 2000)

        # Bytes accumulate both globally and per domain.
        assert stats.response_bytes == 3500
        assert stats.domains_response_bytes == {
            "example.com": 1500,
            "other.com": 2000,
        }

    def test_increment_requests_count(self):
        """Test increment_requests_count method."""
        stats = CrawlStats()

        stats.increment_requests_count("session1")
        stats.increment_requests_count("session1")
        stats.increment_requests_count("session2")

        # Counts accumulate both globally and per session.
        assert stats.requests_count == 3
        assert stats.sessions_requests_count == {"session1": 2, "session2": 1}

    def test_to_dict(self):
        """Test to_dict method returns all stats."""
        stats = CrawlStats(
            items_scraped=10,
            items_dropped=2,
            requests_count=15,
            start_time=0.0,
            end_time=5.0,
        )
        stats.increment_status(200)

        result = stats.to_dict()

        # The dict includes derived values (elapsed, rate), not just fields.
        assert result["items_scraped"] == 10
        assert result["items_dropped"] == 2
        assert result["requests_count"] == 15
        assert result["elapsed_seconds"] == 5.0
        assert result["requests_per_second"] == 3.0
        assert result["response_status_count"] == {"status_200": 1}

    def test_custom_stats(self):
        """Test custom_stats can be used."""
        stats = CrawlStats()
        stats.custom_stats["my_metric"] = 42
        stats.custom_stats["another"] = "value"

        assert stats.custom_stats["my_metric"] == 42
        assert stats.to_dict()["custom_stats"]["my_metric"] == 42
217
+
218
+
219
class TestCrawlResult:
    """Test CrawlResult dataclass."""

    def test_basic_creation(self):
        """A result wraps its stats and items, and is not paused by default."""
        collected = ItemList()
        collected.extend([{"id": i} for i in range(5)])

        result = CrawlResult(stats=CrawlStats(items_scraped=5), items=collected)

        assert result.stats.items_scraped == 5
        assert len(result.items) == 5
        assert result.paused is False

    def test_completed_property_true_when_not_paused(self):
        """completed mirrors the inverse of the paused flag."""
        result = CrawlResult(stats=CrawlStats(), items=ItemList(), paused=False)
        assert result.completed is True

    def test_completed_property_false_when_paused(self):
        """A paused crawl is reported as not completed."""
        result = CrawlResult(stats=CrawlStats(), items=ItemList(), paused=True)
        assert result.completed is False

    def test_len_returns_item_count(self):
        """len(result) delegates to the item list."""
        collected = ItemList()
        collected.extend([{"id": i} for i in range(10)])
        assert len(CrawlResult(stats=CrawlStats(), items=collected)) == 10

    def test_iter_yields_items(self):
        """Iterating a result walks the scraped items in order."""
        collected = ItemList()
        collected.extend([{"id": 1}, {"id": 2}, {"id": 3}])

        result = CrawlResult(stats=CrawlStats(), items=collected)

        assert [item for item in result] == [{"id": 1}, {"id": 2}, {"id": 3}]

    def test_result_with_stats(self):
        """Populated stats (including derived metrics) stay accessible on the result."""
        crawl_stats = CrawlStats(
            requests_count=100,
            items_scraped=50,
            failed_requests_count=5,
            start_time=0.0,
            end_time=10.0,
        )

        result = CrawlResult(stats=crawl_stats, items=ItemList())

        assert result.stats.requests_count == 100
        assert result.stats.items_scraped == 50
        assert result.stats.requests_per_second == 10.0
290
+
291
+
292
class TestCrawlResultIntegration:
    """Integration tests for result classes."""

    def test_full_workflow(self):
        """Drive CrawlStats, ItemList, and CrawlResult together like a real crawl."""
        stats = CrawlStats(start_time=1000.0)

        # Ten successful 5 KB responses against a single domain.
        for _ in range(10):
            stats.increment_requests_count("default")
            stats.increment_status(200)
            stats.increment_response_bytes("example.com", 5000)

        # A couple of failures on top.
        stats.failed_requests_count = 2
        stats.blocked_requests_count = 1

        # Eight items scraped.
        items = ItemList()
        for i in range(8):
            items.append({"product_id": i, "name": f"Product {i}"})
            stats.items_scraped += 1

        # The crawl ends five seconds after it started.
        stats.end_time = 1005.0

        result = CrawlResult(stats=stats, items=items, paused=False)

        assert result.completed is True
        assert len(result) == 8
        assert result.stats.requests_count == 10
        # 10 requests over 5 seconds.
        assert result.stats.requests_per_second == 2.0
        assert result.stats.response_bytes == 50000
tests/spiders/test_scheduler.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the Scheduler class."""
2
+
3
+ import pytest
4
+
5
+ from scrapling.spiders.request import Request
6
+ from scrapling.spiders.scheduler import Scheduler
7
+ from scrapling.spiders.checkpoint import CheckpointData
8
+
9
+
10
class TestSchedulerInit:
    """Test Scheduler initialization."""

    def test_scheduler_starts_empty(self):
        """A freshly constructed scheduler has no queued requests."""
        fresh = Scheduler()
        assert len(fresh) == 0
        assert fresh.is_empty is True
19
+
20
+
21
class TestSchedulerEnqueue:
    """Test Scheduler enqueue functionality."""

    @pytest.mark.asyncio
    async def test_enqueue_single_request(self):
        """Enqueueing one request grows the queue and reports acceptance."""
        queue = Scheduler()

        accepted = await queue.enqueue(Request("https://example.com"))

        assert accepted is True
        assert len(queue) == 1
        assert queue.is_empty is False

    @pytest.mark.asyncio
    async def test_enqueue_multiple_requests(self):
        """Each distinct URL enqueued adds exactly one entry."""
        queue = Scheduler()

        for index in range(5):
            await queue.enqueue(Request(f"https://example.com/{index}"))

        assert len(queue) == 5

    @pytest.mark.asyncio
    async def test_enqueue_duplicate_filtered(self):
        """Two requests with the same fingerprint collapse to one entry."""
        queue = Scheduler()

        first = await queue.enqueue(Request("https://example.com", sid="s1"))
        second = await queue.enqueue(Request("https://example.com", sid="s1"))

        assert first is True
        assert second is False  # Duplicate filtered
        assert len(queue) == 1

    @pytest.mark.asyncio
    async def test_enqueue_duplicate_allowed_with_dont_filter(self):
        """dont_filter=True bypasses the duplicate filter."""
        queue = Scheduler()

        first = await queue.enqueue(Request("https://example.com", sid="s1"))
        second = await queue.enqueue(
            Request("https://example.com", sid="s1", dont_filter=True)
        )

        assert first is True
        assert second is True
        assert len(queue) == 2

    @pytest.mark.asyncio
    async def test_enqueue_different_sessions_not_duplicate(self):
        """The session id is part of the fingerprint, so same URL + new session is new."""
        queue = Scheduler()

        first = await queue.enqueue(Request("https://example.com", sid="session1"))
        second = await queue.enqueue(Request("https://example.com", sid="session2"))

        assert first is True
        assert second is True
        assert len(queue) == 2
91
+
92
+
93
class TestSchedulerDequeue:
    """Test Scheduler dequeue functionality."""

    @pytest.mark.asyncio
    async def test_dequeue_returns_request(self):
        """Dequeue hands back the request that was enqueued."""
        queue = Scheduler()
        original = Request("https://example.com")

        await queue.enqueue(original)

        assert (await queue.dequeue()).url == original.url

    @pytest.mark.asyncio
    async def test_dequeue_respects_priority_order(self):
        """Higher-priority requests come out first regardless of insertion order."""
        queue = Scheduler()
        await queue.enqueue(Request("https://example.com/low", priority=1))
        await queue.enqueue(Request("https://example.com/high", priority=10))
        await queue.enqueue(Request("https://example.com/medium", priority=5))

        drained = [(await queue.dequeue()).url for _ in range(3)]

        assert drained == [
            "https://example.com/high",
            "https://example.com/medium",
            "https://example.com/low",
        ]

    @pytest.mark.asyncio
    async def test_dequeue_fifo_for_same_priority(self):
        """Equal-priority requests preserve insertion (FIFO) order."""
        queue = Scheduler()
        for index in range(3):
            await queue.enqueue(Request(f"https://example.com/{index}", priority=5))

        drained = [(await queue.dequeue()).url for _ in range(3)]

        assert drained == [
            "https://example.com/0",
            "https://example.com/1",
            "https://example.com/2",
        ]

    @pytest.mark.asyncio
    async def test_dequeue_updates_length(self):
        """Every dequeue shrinks the queue until it is empty."""
        queue = Scheduler()
        await queue.enqueue(Request("https://example.com/1"))
        await queue.enqueue(Request("https://example.com/2"))
        assert len(queue) == 2

        await queue.dequeue()
        assert len(queue) == 1

        await queue.dequeue()
        assert len(queue) == 0
        assert queue.is_empty is True
164
+
165
+
166
class TestSchedulerSnapshot:
    """Test Scheduler snapshot functionality for checkpointing."""

    @pytest.mark.asyncio
    async def test_snapshot_empty_scheduler(self):
        """Snapshotting an empty scheduler yields an empty queue and seen set."""
        pending, fingerprints = Scheduler().snapshot()

        assert pending == []
        assert fingerprints == set()

    @pytest.mark.asyncio
    async def test_snapshot_captures_pending_requests(self):
        """Snapshot lists every pending request, highest priority first."""
        queue = Scheduler()
        await queue.enqueue(Request("https://example.com/1", priority=5))
        await queue.enqueue(Request("https://example.com/2", priority=10))
        await queue.enqueue(Request("https://example.com/3", priority=1))

        pending, _ = queue.snapshot()

        assert [req.url for req in pending] == [
            "https://example.com/2",  # priority 10
            "https://example.com/1",  # priority 5
            "https://example.com/3",  # priority 1
        ]

    @pytest.mark.asyncio
    async def test_snapshot_captures_seen_set(self):
        """Snapshot includes the "sid:url" fingerprints of everything enqueued."""
        queue = Scheduler()
        await queue.enqueue(Request("https://example.com/1", sid="s1"))
        await queue.enqueue(Request("https://example.com/2", sid="s1"))

        _, fingerprints = queue.snapshot()

        assert fingerprints == {
            "s1:https://example.com/1",
            "s1:https://example.com/2",
        }

    @pytest.mark.asyncio
    async def test_snapshot_returns_copies(self):
        """Mutating a snapshot must not leak back into the scheduler."""
        queue = Scheduler()
        await queue.enqueue(Request("https://example.com"))

        pending, fingerprints = queue.snapshot()
        pending.append(Request("https://modified.com"))
        fingerprints.add("new_fingerprint")

        fresh_pending, fresh_seen = queue.snapshot()

        assert len(fresh_pending) == 1
        assert "new_fingerprint" not in fresh_seen

    @pytest.mark.asyncio
    async def test_snapshot_excludes_dequeued_requests(self):
        """Dequeued requests leave the snapshot queue but stay in the seen set."""
        queue = Scheduler()
        await queue.enqueue(Request("https://example.com/1"))
        await queue.enqueue(Request("https://example.com/2"))
        await queue.enqueue(Request("https://example.com/3"))

        await queue.dequeue()

        pending, fingerprints = queue.snapshot()

        assert len(pending) == 2
        assert len(fingerprints) == 3  # Dedup history keeps all three
246
+
247
+
248
class TestSchedulerRestore:
    """Test Scheduler restore functionality from checkpoint."""

    @pytest.mark.asyncio
    async def test_restore_requests(self):
        """Restoring checkpoint data re-queues the pending requests."""
        scheduler = Scheduler()

        checkpoint_requests = [
            Request("https://example.com/1", priority=10),
            Request("https://example.com/2", priority=5),
        ]
        data = CheckpointData(requests=checkpoint_requests, seen={"fp1", "fp2", "fp3"})

        scheduler.restore(data)

        assert len(scheduler) == 2

    @pytest.mark.asyncio
    async def test_restore_seen_set(self):
        """Restore repopulates the deduplication fingerprints.

        Note: the original version of this test built a Request and mutated
        its sid without ever enqueueing or asserting on it; that dead code
        has been removed — the snapshot round-trip below is the real check.
        """
        scheduler = Scheduler()

        scheduler.restore(CheckpointData(requests=[], seen={"fp1", "fp2"}))

        # The restored fingerprints must come back unchanged in a snapshot.
        _, seen = scheduler.snapshot()
        assert seen == {"fp1", "fp2"}

    @pytest.mark.asyncio
    async def test_restore_maintains_priority_order(self):
        """Restored requests keep their priority ordering when dequeued."""
        scheduler = Scheduler()

        # Requests should already be sorted by priority in checkpoint
        checkpoint_requests = [
            Request("https://example.com/high", priority=10),
            Request("https://example.com/low", priority=1),
        ]
        scheduler.restore(CheckpointData(requests=checkpoint_requests, seen=set()))

        # Dequeue should return high priority first
        first = await scheduler.dequeue()
        assert first.url == "https://example.com/high"

        second = await scheduler.dequeue()
        assert second.url == "https://example.com/low"

    @pytest.mark.asyncio
    async def test_restore_empty_checkpoint(self):
        """Restoring an empty checkpoint leaves the scheduler empty."""
        scheduler = Scheduler()

        scheduler.restore(CheckpointData(requests=[], seen=set()))

        assert len(scheduler) == 0
        assert scheduler.is_empty is True
321
+
322
+
323
class TestSchedulerIntegration:
    """Integration tests for Scheduler with checkpoint roundtrip."""

    @pytest.mark.asyncio
    async def test_snapshot_and_restore_roundtrip(self):
        """A snapshot restored into a new scheduler reproduces the queue exactly."""
        original = Scheduler()
        await original.enqueue(Request("https://example.com/1", sid="s1", priority=10))
        await original.enqueue(Request("https://example.com/2", sid="s1", priority=5))
        await original.enqueue(Request("https://example.com/3", sid="s2", priority=7))

        pending, fingerprints = original.snapshot()

        restored = Scheduler()
        restored.restore(CheckpointData(requests=pending, seen=fingerprints))

        assert len(restored) == len(original)

        # Both schedulers must drain in the same order with equal priorities.
        for _ in range(3):
            expected = await original.dequeue()
            actual = await restored.dequeue()
            assert actual.url == expected.url
            assert actual.priority == expected.priority

    @pytest.mark.asyncio
    async def test_partial_processing_then_checkpoint(self):
        """After processing some requests, the snapshot splits pending vs. seen."""
        queue = Scheduler()
        for index in range(5):
            await queue.enqueue(Request(f"https://example.com/{index}"))

        # Process two of the five.
        await queue.dequeue()
        await queue.dequeue()

        pending, fingerprints = queue.snapshot()

        assert len(pending) == 3
        assert len(fingerprints) == 5

    @pytest.mark.asyncio
    async def test_deduplication_after_restore(self):
        """The restored seen set keeps filtering previously-enqueued requests."""
        queue = Scheduler()
        await queue.enqueue(Request("https://example.com", sid="s1"))

        pending, fingerprints = queue.snapshot()

        revived = Scheduler()
        revived.restore(CheckpointData(requests=pending, seen=fingerprints))

        accepted = await revived.enqueue(Request("https://example.com", sid="s1"))

        assert accepted is False  # Duplicate filtered based on restored seen set
tests/spiders/test_session.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the SessionManager class."""
2
+
3
+ from scrapling.core._types import Any
4
+ import pytest
5
+
6
+ from scrapling.spiders.session import SessionManager
7
+
8
+
9
class MockSession:
    """Mock session for testing without actual network calls."""

    def __init__(self, name: str = "mock"):
        self.name = name
        # Lifecycle flags inspected by the tests.
        self._is_alive = False
        self._started = False
        self._closed = False

    async def __aenter__(self):
        self._started = True
        self._is_alive = True
        return self

    async def __aexit__(self, *args):
        self._closed = True
        self._is_alive = False

    async def fetch(self, url: str, **kwargs):
        pass
29
+
30
+
31
class TestSessionManagerInit:
    """Test SessionManager initialization."""

    def test_manager_starts_empty(self):
        """A new manager holds no sessions."""
        assert len(SessionManager()) == 0

    def test_manager_no_default_session_when_empty(self):
        """Reading default_session_id with no sessions registered is an error."""
        empty = SessionManager()

        with pytest.raises(RuntimeError, match="No sessions registered"):
            _ = empty.default_session_id
46
+
47
+
48
class TestSessionManagerAdd:
    """Test SessionManager add functionality."""

    def test_add_single_session(self):
        """Adding one session registers it under the given id."""
        registry = SessionManager()
        registry.add("test", MockSession())

        assert len(registry) == 1
        assert "test" in registry
        assert registry.session_ids == ["test"]

    def test_first_session_becomes_default(self):
        """The first registered session is the implicit default."""
        registry = SessionManager()
        registry.add("first", MockSession())

        assert registry.default_session_id == "first"

    def test_add_multiple_sessions(self):
        """Several sessions can be registered under distinct ids."""
        registry = SessionManager()
        for label in ("session1", "session2", "session3"):
            registry.add(label, MockSession(label))

        assert len(registry) == 3
        for label in ("session1", "session2", "session3"):
            assert label in registry

    def test_explicit_default_session(self):
        """default=True overrides the first-added default."""
        registry = SessionManager()
        registry.add("first", MockSession())
        registry.add("second", MockSession(), default=True)

        assert registry.default_session_id == "second"

    def test_add_duplicate_id_raises(self):
        """Re-using a session id is rejected."""
        registry = SessionManager()
        registry.add("test", MockSession())

        with pytest.raises(ValueError, match="already registered"):
            registry.add("test", MockSession())

    def test_add_returns_self_for_chaining(self):
        """add returns the manager itself."""
        registry = SessionManager()

        assert registry.add("test", MockSession()) is registry

    def test_method_chaining(self):
        """add calls can be chained fluently."""
        registry = SessionManager()
        registry.add("s1", MockSession()).add("s2", MockSession()).add("s3", MockSession())

        assert len(registry) == 3

    def test_add_lazy_session(self):
        """lazy=True registers the session and marks it as lazily started."""
        registry = SessionManager()
        registry.add("lazy", MockSession(), lazy=True)

        assert "lazy" in registry
        assert "lazy" in registry._lazy_sessions
125
+
126
+
127
class TestSessionManagerRemove:
    """Test SessionManager remove/pop functionality."""

    def test_remove_session(self):
        """remove drops the session entirely."""
        registry = SessionManager()
        registry.add("test", MockSession())

        registry.remove("test")

        assert "test" not in registry
        assert len(registry) == 0

    def test_remove_nonexistent_raises(self):
        """Removing an unknown id is an error."""
        registry = SessionManager()

        with pytest.raises(KeyError, match="not found"):
            registry.remove("nonexistent")

    def test_pop_returns_session(self):
        """pop removes the session and hands back the same object."""
        registry = SessionManager()
        stored = MockSession("original")
        registry.add("test", stored)

        assert registry.pop("test") is stored
        assert "test" not in registry

    def test_remove_default_updates_default(self):
        """Removing the default promotes another session to default."""
        registry = SessionManager()
        registry.add("first", MockSession())
        registry.add("second", MockSession())
        assert registry.default_session_id == "first"

        registry.remove("first")

        assert registry.default_session_id == "second"

    def test_remove_lazy_session_cleans_up(self):
        """Removing a lazy session also clears its lazy marker."""
        registry = SessionManager()
        registry.add("lazy", MockSession(), lazy=True)

        registry.remove("lazy")

        assert "lazy" not in registry._lazy_sessions
178
+
179
+
180
class TestSessionManagerGet:
    """Test SessionManager get functionality."""

    def test_get_existing_session(self):
        """get returns the exact session object that was registered."""
        registry = SessionManager()
        stored = MockSession("test")
        registry.add("test", stored)

        assert registry.get("test") is stored

    def test_get_nonexistent_raises_with_available(self):
        """A miss raises KeyError that lists the available session ids."""
        registry = SessionManager()
        registry.add("session1", MockSession())
        registry.add("session2", MockSession())

        with pytest.raises(KeyError, match="Available:"):
            registry.get("nonexistent")
201
+
202
+
203
class TestSessionManagerContains:
    """Test SessionManager contains functionality."""

    def test_contains_existing(self):
        """A registered id is reported as contained."""
        registry = SessionManager()
        registry.add("test", MockSession())

        assert "test" in registry

    def test_not_contains_missing(self):
        """An unregistered id is not contained."""
        registry = SessionManager()
        registry.add("test", MockSession())

        assert "other" not in registry
219
+
220
+
221
class TestSessionManagerAsyncContext:
    """Test SessionManager async context manager."""

    @pytest.mark.asyncio
    async def test_start_activates_sessions(self):
        """start enters every eagerly-registered session."""
        registry = SessionManager()
        conn = MockSession()
        registry.add("test", conn)

        await registry.start()

        assert conn._is_alive is True
        assert registry._started is True

    @pytest.mark.asyncio
    async def test_start_skips_lazy_sessions(self):
        """Lazy sessions are left untouched by start."""
        registry = SessionManager()
        eager = MockSession("eager")
        deferred = MockSession("lazy")
        registry.add("eager", eager)
        registry.add("lazy", deferred, lazy=True)

        await registry.start()

        assert eager._is_alive is True
        assert deferred._is_alive is False

    @pytest.mark.asyncio
    async def test_close_deactivates_sessions(self):
        """close exits every live session and resets the started flag."""
        registry = SessionManager()
        conn = MockSession()
        registry.add("test", conn)

        await registry.start()
        assert conn._is_alive is True

        await registry.close()
        assert conn._is_alive is False
        assert registry._started is False

    @pytest.mark.asyncio
    async def test_async_context_manager(self):
        """async-with starts sessions on entry and closes them on exit."""
        registry = SessionManager()
        conn = MockSession()
        registry.add("test", conn)

        async with registry:
            assert conn._is_alive is True

        assert conn._is_alive is False

    @pytest.mark.asyncio
    async def test_start_idempotent(self):
        """Repeated start calls are safe."""
        registry = SessionManager()
        conn = MockSession()
        registry.add("test", conn)

        await registry.start()
        await registry.start()  # Should not raise or double-start

        assert conn._started is True
288
+
289
+
290
class TestSessionManagerProperties:
    """Test SessionManager properties."""

    def test_session_ids_returns_list(self):
        """session_ids yields a list containing every registered id."""
        registry = SessionManager()
        for label in ("a", "b", "c"):
            registry.add(label, MockSession())

        ids = registry.session_ids

        assert isinstance(ids, list)
        assert set(ids) == {"a", "b", "c"}

    def test_len_returns_session_count(self):
        """len tracks the number of registered sessions."""
        registry = SessionManager()
        assert len(registry) == 0

        registry.add("s1", MockSession())
        assert len(registry) == 1

        registry.add("s2", MockSession())
        assert len(registry) == 2
316
+
317
+
318
class TestSessionManagerIntegration:
    """Integration tests for SessionManager."""

    def test_realistic_setup(self):
        """A mixed eager/lazy setup reports the expected state."""
        registry = SessionManager()
        registry.add("default", MockSession("default"))
        registry.add("backup", MockSession("backup"))
        registry.add("lazy_special", MockSession("special"), lazy=True)

        assert len(registry) == 3
        assert registry.default_session_id == "default"
        assert "lazy_special" in registry._lazy_sessions

    @pytest.mark.asyncio
    async def test_lifecycle_management(self):
        """Sessions are inert before start, live after start, dead after close."""
        registry = SessionManager()
        sessions = [MockSession(f"s{i}") for i in range(3)]
        for index, sess in enumerate(sessions):
            registry.add(f"session{index}", sess)

        # Before start - no sessions active
        assert not any(s._is_alive for s in sessions)

        # After start - all active
        await registry.start()
        assert all(s._is_alive for s in sessions)

        # After close - all inactive
        await registry.close()
        assert not any(s._is_alive for s in sessions)
tests/spiders/test_spider.py ADDED
@@ -0,0 +1,574 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the Spider class and related components."""
2
+
3
+ import logging
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+ from scrapling.spiders.spider import Spider, SessionConfigurationError, LogCounterHandler, BLOCKED_CODES
10
+ from scrapling.spiders.request import Request
11
+ from scrapling.spiders.session import SessionManager
12
+ from scrapling.spiders.result import CrawlStats
13
+ from scrapling.core._types import Any, Dict, AsyncGenerator
14
+
15
+
16
class TestLogCounterHandler:
    """Test LogCounterHandler for tracking log counts."""

    @staticmethod
    def _make_record(level: int) -> logging.LogRecord:
        """Build a minimal LogRecord at *level* for feeding into the handler.

        Centralizes the boilerplate that was previously duplicated in every
        per-level test method.
        """
        return logging.LogRecord(
            name="test",
            level=level,
            pathname="",
            lineno=0,
            msg="test",
            args=(),
            exc_info=None,
        )

    def test_initial_counts_are_zero(self):
        """Test that handler starts with zero counts."""
        handler = LogCounterHandler()
        counts = handler.get_counts()

        assert counts["debug"] == 0
        assert counts["info"] == 0
        assert counts["warning"] == 0
        assert counts["error"] == 0
        assert counts["critical"] == 0

    def test_counts_debug_messages(self):
        """Test counting debug level messages."""
        handler = LogCounterHandler()
        record = self._make_record(logging.DEBUG)

        handler.emit(record)
        handler.emit(record)

        assert handler.get_counts()["debug"] == 2

    def test_counts_info_messages(self):
        """Test counting info level messages."""
        handler = LogCounterHandler()

        handler.emit(self._make_record(logging.INFO))

        assert handler.get_counts()["info"] == 1

    def test_counts_warning_messages(self):
        """Test counting warning level messages."""
        handler = LogCounterHandler()

        handler.emit(self._make_record(logging.WARNING))

        assert handler.get_counts()["warning"] == 1

    def test_counts_error_messages(self):
        """Test counting error level messages."""
        handler = LogCounterHandler()

        handler.emit(self._make_record(logging.ERROR))

        assert handler.get_counts()["error"] == 1

    def test_counts_critical_messages(self):
        """Test counting critical level messages."""
        handler = LogCounterHandler()

        handler.emit(self._make_record(logging.CRITICAL))

        assert handler.get_counts()["critical"] == 1

    def test_counts_multiple_levels(self):
        """Test counting messages at different levels."""
        handler = LogCounterHandler()

        levels = [
            logging.DEBUG,
            logging.DEBUG,
            logging.INFO,
            logging.WARNING,
            logging.ERROR,
            logging.ERROR,
            logging.ERROR,
            logging.CRITICAL,
        ]

        for level in levels:
            handler.emit(self._make_record(level))

        counts = handler.get_counts()
        assert counts["debug"] == 2
        assert counts["info"] == 1
        assert counts["warning"] == 1
        assert counts["error"] == 3
        assert counts["critical"] == 1
149
+
150
+
151
class TestBlockedCodes:
    """Sanity checks on the BLOCKED_CODES constant."""

    def test_blocked_codes_contains_expected_values(self):
        """Every status code that signals blocking/throttling must be present."""
        # 401 Unauthorized, 403 Forbidden, 407 Proxy Auth Required,
        # 429 Too Many Requests, 444 nginx connection-closed,
        # 500/502/503/504 server-side failure family.
        for code in (401, 403, 407, 429, 444, 500, 502, 503, 504):
            assert code in BLOCKED_CODES

    def test_blocked_codes_does_not_contain_success(self):
        """Successful and redirect codes must never be treated as blocked."""
        for code in (200, 201, 204, 301, 302):
            assert code not in BLOCKED_CODES
173
+
174
+
175
class ConcreteSpider(Spider):
    """Minimal concrete Spider subclass used as a fixture by the tests below."""

    name = "test_spider"
    start_urls = ["https://example.com"]

    async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
        """Yield a single item recording the response's string form."""
        item = {"url": str(response)}
        yield item
183
+
184
+
185
class TestSpiderInit:
    """Spider construction behaviour."""

    def test_spider_requires_name(self):
        """A Spider subclass without a ``name`` attribute cannot be instantiated."""

        class NoNameSpider(Spider):
            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        with pytest.raises(ValueError, match="must have a name"):
            NoNameSpider()

    def test_spider_initializes_logger(self):
        """Constructing a spider wires up a namespaced logger."""
        instance = ConcreteSpider()

        assert instance.logger is not None
        assert instance.logger.name == "scrapling.spiders.test_spider"

    def test_spider_logger_has_log_counter(self):
        """The spider attaches a LogCounterHandler to its logger."""
        instance = ConcreteSpider()

        assert instance._log_counter is not None
        assert isinstance(instance._log_counter, LogCounterHandler)

    def test_spider_with_crawldir(self):
        """A crawldir argument is stored as a Path."""
        with tempfile.TemporaryDirectory() as tmpdir:
            instance = ConcreteSpider(crawldir=tmpdir)

            assert instance.crawldir == Path(tmpdir)

    def test_spider_without_crawldir(self):
        """Omitting crawldir leaves the attribute unset."""
        instance = ConcreteSpider()

        assert instance.crawldir is None

    def test_spider_custom_interval(self):
        """A custom checkpoint interval is honoured."""
        instance = ConcreteSpider(interval=60.0)

        assert instance._interval == 60.0

    def test_spider_default_interval(self):
        """Without an explicit interval, the 5-minute default applies."""
        instance = ConcreteSpider()

        assert instance._interval == 300.0

    def test_spider_repr(self):
        """repr() names both the class and the spider."""
        instance = ConcreteSpider()
        rendered = repr(instance)

        assert "ConcreteSpider" in rendered
        assert "test_spider" in rendered
245
+
246
+
247
class TestSpiderClassAttributes:
    """Defaults declared at the Spider class level."""

    def test_default_concurrent_requests(self):
        """Spiders default to 16 concurrent requests."""
        assert ConcreteSpider.concurrent_requests == 16

    def test_default_concurrent_requests_per_domain(self):
        """Per-domain concurrency is disabled (0) by default."""
        assert ConcreteSpider.concurrent_requests_per_domain == 0

    def test_default_download_delay(self):
        """No delay between downloads unless configured."""
        assert ConcreteSpider.download_delay == 0.0

    def test_default_max_blocked_retries(self):
        """Blocked requests are retried up to 3 times by default."""
        assert ConcreteSpider.max_blocked_retries == 3

    def test_default_logging_level(self):
        """The default logger verbosity is DEBUG."""
        assert ConcreteSpider.logging_level == logging.DEBUG

    def test_default_allowed_domains_empty(self):
        """No domain restriction is applied out of the box."""
        assert ConcreteSpider.allowed_domains == set()
273
+
274
+
275
class TestSpiderSessionConfiguration:
    """Behaviour of Spider session configuration."""

    def test_default_configure_sessions(self):
        """The default configure_sessions registers at least one session."""
        instance = ConcreteSpider()

        assert len(instance._session_manager) > 0

    def test_configure_sessions_error_raises_custom_exception(self):
        """An exception inside configure_sessions surfaces as SessionConfigurationError."""

        class BadSessionSpider(Spider):
            name = "bad_spider"

            def configure_sessions(self, manager: SessionManager) -> None:
                raise RuntimeError("Configuration failed!")

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        with pytest.raises(SessionConfigurationError, match="Configuration failed"):
            BadSessionSpider()

    def test_configure_sessions_no_sessions_raises(self):
        """configure_sessions that registers nothing is rejected."""

        class NoSessionSpider(Spider):
            name = "no_session_spider"

            def configure_sessions(self, manager: SessionManager) -> None:
                pass  # Don't add any sessions

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        with pytest.raises(SessionConfigurationError, match="did not add any sessions"):
            NoSessionSpider()
313
+
314
+
315
class TestSpiderStartRequests:
    """Behaviour of Spider.start_requests."""

    @pytest.mark.asyncio
    async def test_start_requests_yields_from_start_urls(self):
        """Each entry in start_urls produces one Request, in declaration order."""

        class MultiUrlSpider(Spider):
            name = "multi_url"
            start_urls = [
                "https://example.com/1",
                "https://example.com/2",
                "https://example.com/3",
            ]

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        crawler = MultiUrlSpider()
        collected = []
        async for req in crawler.start_requests():
            collected.append(req)

        assert len(collected) == 3
        assert collected[0].url == "https://example.com/1"
        assert collected[1].url == "https://example.com/2"
        assert collected[2].url == "https://example.com/3"

    @pytest.mark.asyncio
    async def test_start_requests_no_urls_raises(self):
        """Iterating start_requests with an empty start_urls raises RuntimeError."""

        class NoUrlSpider(Spider):
            name = "no_url"
            start_urls = []

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        crawler = NoUrlSpider()

        with pytest.raises(RuntimeError, match="no starting point"):
            async for _ in crawler.start_requests():
                pass

    @pytest.mark.asyncio
    async def test_start_requests_uses_default_session(self):
        """Generated requests carry the session manager's default session ID."""
        crawler = ConcreteSpider()
        collected = [req async for req in crawler.start_requests()]

        # Requests with no explicit session fall back to the manager's default.
        expected_sid = crawler._session_manager.default_session_id
        assert collected[0].sid == expected_sid
367
+
368
+
369
class TestSpiderHooks:
    """Default implementations of the Spider lifecycle hooks."""

    @pytest.mark.asyncio
    async def test_on_start_default(self):
        """The default on_start is a no-op for both fresh and resumed crawls."""
        crawler = ConcreteSpider()

        # Should not raise in either mode
        await crawler.on_start(resuming=False)
        await crawler.on_start(resuming=True)

    @pytest.mark.asyncio
    async def test_on_close_default(self):
        """The default on_close is a no-op."""
        crawler = ConcreteSpider()

        # Should not raise
        await crawler.on_close()

    @pytest.mark.asyncio
    async def test_on_error_default(self):
        """The default on_error only logs and never propagates."""
        crawler = ConcreteSpider()
        failing_request = Request("https://example.com")
        failure = ValueError("test error")

        # Should not raise
        await crawler.on_error(failing_request, failure)

    @pytest.mark.asyncio
    async def test_on_scraped_item_default_returns_item(self):
        """The default on_scraped_item passes items through untouched."""
        crawler = ConcreteSpider()
        item = {"key": "value", "nested": {"a": 1}}

        passed_through = await crawler.on_scraped_item(item)

        assert passed_through == item

    @pytest.mark.asyncio
    async def test_is_blocked_default_checks_status_codes(self):
        """The default is_blocked flags only the known blocked status codes."""

        class StubResponse:
            def __init__(self, status: int):
                self.status = status

        crawler = ConcreteSpider()

        # Codes in BLOCKED_CODES are reported as blocked
        for code in (403, 429, 503):
            assert await crawler.is_blocked(StubResponse(code)) is True

        # Ordinary success/not-found codes are not
        for code in (200, 404):
            assert await crawler.is_blocked(StubResponse(code)) is False

    @pytest.mark.asyncio
    async def test_retry_blocked_request_default_returns_request(self):
        """The default retry_blocked_request hands back the same Request object."""

        class StubResponse:
            status = 429

        crawler = ConcreteSpider()
        blocked_request = Request("https://example.com", priority=5)

        retried = await crawler.retry_blocked_request(blocked_request, StubResponse())

        assert retried is blocked_request
441
+
442
+
443
class TestSpiderPause:
    """Spider.pause outside of a running crawl."""

    def test_pause_without_engine_raises(self):
        """Pausing before any crawl engine exists is an error."""
        crawler = ConcreteSpider()

        with pytest.raises(RuntimeError, match="no crawl engine started"):
            crawler.pause()
452
+
453
+
454
class TestSpiderStats:
    """The Spider.stats property outside of a running crawl."""

    def test_stats_without_engine_raises(self):
        """Reading stats before a crawl has started is an error."""
        crawler = ConcreteSpider()

        with pytest.raises(RuntimeError, match="No active crawl"):
            _ = crawler.stats
463
+
464
+
465
class TestSpiderCustomization:
    """Overriding Spider class attributes in subclasses."""

    def test_custom_concurrent_requests(self):
        """A subclass can raise the concurrency limit."""

        class CustomSpider(Spider):
            name = "custom"
            concurrent_requests = 32
            start_urls = ["https://example.com"]

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        crawler = CustomSpider()
        assert crawler.concurrent_requests == 32

    def test_custom_allowed_domains(self):
        """A subclass can restrict crawling to specific domains."""

        class DomainSpider(Spider):
            name = "domain_spider"
            start_urls = ["https://example.com"]
            allowed_domains = {"example.com", "api.example.com"}

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        crawler = DomainSpider()
        for domain in ("example.com", "api.example.com"):
            assert domain in crawler.allowed_domains

    def test_custom_download_delay(self):
        """A subclass can throttle itself with a download delay."""

        class SlowSpider(Spider):
            name = "slow"
            download_delay = 1.5
            start_urls = ["https://example.com"]

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        crawler = SlowSpider()
        assert crawler.download_delay == 1.5
510
+
511
+
512
class TestSpiderLogging:
    """Test Spider logging configuration."""

    def test_custom_logging_level(self):
        """Test spider with custom logging level."""

        class QuietSpider(Spider):
            name = "quiet"
            logging_level = logging.WARNING
            start_urls = ["https://example.com"]

            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                yield None

        spider = QuietSpider()
        assert spider.logger.level == logging.WARNING

    def test_log_file_creates_handler(self):
        """Test spider with log file creates file handler."""
        with tempfile.TemporaryDirectory() as tmpdir:
            log_path = Path(tmpdir) / "spider.log"

            class FileLogSpider(Spider):
                name = "file_log"
                log_file = str(log_path)
                start_urls = ["https://example.com"]

                async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
                    yield None

            spider = FileLogSpider()

            # Should have a file handler
            file_handlers = [
                h for h in spider.logger.handlers if isinstance(h, logging.FileHandler)
            ]
            try:
                assert len(file_handlers) == 1
            finally:
                # Always close the handler(s), even when the assertion fails;
                # otherwise the open log file can make TemporaryDirectory
                # cleanup fail on Windows and leak an OS-level file handle.
                for h in file_handlers:
                    h.close()

    def test_logger_does_not_propagate(self):
        """Test that spider logger does not propagate to parent."""
        spider = ConcreteSpider()

        assert spider.logger.propagate is False
559
+
560
+
561
class TestSessionConfigurationError:
    """Behaviour of the SessionConfigurationError exception type."""

    def test_exception_message(self):
        """The constructor argument becomes the exception's string form."""
        exc = SessionConfigurationError("Custom error message")

        assert str(exc) == "Custom error message"

    def test_exception_is_exception(self):
        """SessionConfigurationError participates in the Exception hierarchy."""
        exc = SessionConfigurationError("test")

        assert isinstance(exc, Exception)