Spaces:

lenson78
/

Scrapling

Paused

App Files Files Community

Karim shoair committed on Aug 3, 2025

Commit

a2d3f34

1 Parent(s): 7e18800

test: adding new tests and updating existing ones

Browse files

Files changed (6) hide show

tests/fetchers/sync/test_requests_session.py +56 -0
tests/fetchers/test_base.py +84 -0
tests/fetchers/test_constants.py +27 -0
tests/fetchers/test_pages.py +154 -0
tests/fetchers/test_utils.py +218 -2
tests/fetchers/test_validator.py +79 -0

tests/fetchers/sync/test_requests_session.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import pytest
+from scrapling.engines.static import FetcherSession, FetcherClient, AsyncFetcherClient
+class TestFetcherSession:
+    """Test FetcherSession functionality"""
+    def test_fetcher_session_creation(self):
+        """Test FetcherSession creation"""
+        session = FetcherSession(
+            timeout=30,
+            retries=3,
+            stealthy_headers=True
+        )
+        assert session.default_timeout == 30
+        assert session.default_retries == 3
+        assert session.stealth is True
+    def test_fetcher_session_context_manager(self):
+        """Test FetcherSession as a context manager"""
+        session = FetcherSession()
+        with session as s:
+            assert s == session
+            assert session._curl_session is not None
+        # Session should be cleaned up
+    def test_fetcher_session_double_enter(self):
+        """Test error on double entering"""
+        session = FetcherSession()
+        with session:
+            with pytest.raises(RuntimeError):
+                session.__enter__()
+    def test_fetcher_client_creation(self):
+        """Test FetcherClient creation"""
+        client = FetcherClient()
+        # Should not have context manager methods
+        assert client.__enter__ is None
+        assert client.__exit__ is None
+        assert client._curl_session is True  # Special marker
+    def test_async_fetcher_client_creation(self):
+        """Test AsyncFetcherClient creation"""
+        client = AsyncFetcherClient()
+        # Should not have context manager methods
+        assert client.__aenter__ is None
+        assert client.__aexit__ is None
+        assert client._async_curl_session is True  # Special marker

tests/fetchers/test_base.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import pytest
+from scrapling.engines.toolbelt.custom import BaseFetcher
+class TestBaseFetcher:
+    """Test BaseFetcher configuration functionality"""
+    def test_default_configuration(self):
+        """Test default configuration values"""
+        config = BaseFetcher.display_config()
+        assert config['huge_tree'] is True
+        assert config['adaptive'] is False
+        assert config['keep_comments'] is False
+        assert config['keep_cdata'] is False
+    def test_configure_single_parameter(self):
+        """Test configuring single parameter"""
+        BaseFetcher.configure(adaptive=True)
+        config = BaseFetcher.display_config()
+        assert config['adaptive'] is True
+        # Reset
+        BaseFetcher.configure(adaptive=False)
+    def test_configure_multiple_parameters(self):
+        """Test configuring multiple parameters"""
+        BaseFetcher.configure(
+            huge_tree=False,
+            keep_comments=True,
+            adaptive=True
+        )
+        config = BaseFetcher.display_config()
+        assert config['huge_tree'] is False
+        assert config['keep_comments'] is True
+        assert config['adaptive'] is True
+        # Reset
+        BaseFetcher.configure(
+            huge_tree=True,
+            keep_comments=False,
+            adaptive=False
+        )
+    def test_configure_invalid_parameter(self):
+        """Test configuring invalid parameter"""
+        with pytest.raises(ValueError):
+            BaseFetcher.configure(invalid_param=True)
+    def test_configure_no_parameters(self):
+        """Test configure with no parameters"""
+        with pytest.raises(AttributeError):
+            BaseFetcher.configure()
+    def test_configure_non_parser_keyword(self):
+        """Test configuring non-parser keyword"""
+        with pytest.raises(AttributeError):
+            # Assuming there's some attribute that's not in parser_keywords
+            BaseFetcher.some_other_attr = "test"
+            BaseFetcher.configure(some_other_attr="new_value")
+    def test_generate_parser_arguments(self):
+        """Test parser arguments generation"""
+        BaseFetcher.configure(
+            huge_tree=False,
+            adaptive=True,
+            adaptive_domain="example.com"
+        )
+        args = BaseFetcher._generate_parser_arguments()
+        assert args['huge_tree'] is False
+        assert args['adaptive'] is True
+        assert args['adaptive_domain'] == "example.com"
+        # Reset
+        BaseFetcher.configure(
+            huge_tree=True,
+            adaptive=False
+        )
+        BaseFetcher.adaptive_domain = None

tests/fetchers/test_constants.py ADDED Viewed

	@@ -0,0 +1,27 @@

+from scrapling.engines.constants import (
+    DEFAULT_DISABLED_RESOURCES,
+    DEFAULT_STEALTH_FLAGS,
+    HARMFUL_DEFAULT_ARGS
+)
+class TestConstants:
+    """Test constant values"""
+    def test_default_disabled_resources(self):
+        """Test default disabled resources"""
+        assert "image" in DEFAULT_DISABLED_RESOURCES
+        assert "font" in DEFAULT_DISABLED_RESOURCES
+        assert "stylesheet" in DEFAULT_DISABLED_RESOURCES
+        assert "media" in DEFAULT_DISABLED_RESOURCES
+    def test_harmful_default_args(self):
+        """Test harmful default arguments"""
+        assert "--enable-automation" in HARMFUL_DEFAULT_ARGS
+        assert "--disable-popup-blocking" in HARMFUL_DEFAULT_ARGS
+    def test_default_stealth_flags(self):
+        """Test default stealth flags"""
+        assert "--no-pings" in DEFAULT_STEALTH_FLAGS
+        assert "--incognito" in DEFAULT_STEALTH_FLAGS
+        assert "--disable-blink-features=AutomationControlled" in DEFAULT_STEALTH_FLAGS

tests/fetchers/test_pages.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import pytest
+from unittest.mock import Mock
+from scrapling.engines._browsers._page import PageInfo, PagePool
+class TestPageInfo:
+    """Test PageInfo functionality"""
+    def test_page_info_creation(self):
+        """Test PageInfo creation"""
+        mock_page = Mock()
+        page_info = PageInfo(mock_page, "ready", "https://example.com")
+        assert page_info.page == mock_page
+        assert page_info.state == "ready"
+        assert page_info.url == "https://example.com"
+    def test_page_info_marking(self):
+        """Test marking page"""
+        mock_page = Mock()
+        page_info = PageInfo(mock_page, "ready", None)
+        page_info.mark_busy("https://example.com")
+        assert page_info.state == "busy"
+        assert page_info.url == "https://example.com"
+        page_info.mark_ready()
+        assert page_info.state == "ready"
+        assert page_info.url == ""
+        page_info.mark_error()
+        assert page_info.state == "error"
+    def test_page_info_equality(self):
+        """Test PageInfo equality comparison"""
+        mock_page1 = Mock()
+        mock_page2 = Mock()
+        page_info1 = PageInfo(mock_page1, "ready", None)
+        page_info2 = PageInfo(mock_page1, "busy", None)  # Same page, different state
+        page_info3 = PageInfo(mock_page2, "ready", None)  # Different page
+        assert page_info1 == page_info2  # Same page
+        assert page_info1 != page_info3  # Different page
+        assert page_info1 != "not a page info"  # Different type
+    def test_page_info_repr(self):
+        """Test PageInfo string representation"""
+        mock_page = Mock()
+        page_info = PageInfo(mock_page, "ready", "https://example.com")
+        repr_str = repr(page_info)
+        assert "ready" in repr_str
+        assert "https://example.com" in repr_str
+class TestPagePool:
+    """Test PagePool functionality"""
+    def test_page_pool_creation(self):
+        """Test PagePool creation"""
+        pool = PagePool(max_pages=5)
+        assert pool.max_pages == 5
+        assert pool.pages_count == 0
+        assert pool.ready_count == 0
+        assert pool.busy_count == 0
+    def test_add_page(self):
+        """Test adding page to pool"""
+        pool = PagePool(max_pages=2)
+        mock_page = Mock()
+        page_info = pool.add_page(mock_page)
+        assert isinstance(page_info, PageInfo)
+        assert page_info.page == mock_page
+        assert page_info.state == "ready"
+        assert pool.pages_count == 1
+    def test_add_page_limit_exceeded(self):
+        """Test adding page when limit exceeded"""
+        pool = PagePool(max_pages=1)
+        # Add first page
+        pool.add_page(Mock())
+        # Try to add a second page
+        with pytest.raises(RuntimeError):
+            pool.add_page(Mock())
+    def test_get_ready_page(self):
+        """Test getting ready page"""
+        pool = PagePool(max_pages=3)
+        # Add pages
+        page1 = pool.add_page(Mock())
+        page2 = pool.add_page(Mock())
+        # Mark one as busy
+        page1.mark_busy("https://example.com")
+        # Should get the ready page
+        ready_page = pool.get_ready_page()
+        assert ready_page == page2
+    def test_get_ready_page_none_available(self):
+        """Test getting ready page when none available"""
+        pool = PagePool(max_pages=2)
+        # Add pages and mark all as busy
+        page1 = pool.add_page(Mock())
+        page2 = pool.add_page(Mock())
+        page1.mark_busy("https://example1.com")
+        page2.mark_busy("https://example2.com")
+        # Should return None
+        ready_page = pool.get_ready_page()
+        assert ready_page is None
+    def test_page_counts(self):
+        """Test page count properties"""
+        pool = PagePool(max_pages=3)
+        # Add pages with different states
+        page1 = pool.add_page(Mock())
+        page2 = pool.add_page(Mock())
+        page3 = pool.add_page(Mock())
+        page1.mark_busy("https://example.com")
+        page3.mark_error()
+        assert pool.pages_count == 3
+        assert pool.ready_count == 1
+        assert pool.busy_count == 1
+    def test_cleanup_error_pages(self):
+        """Test cleaning up error pages"""
+        pool = PagePool(max_pages=3)
+        # Add pages
+        page1 = pool.add_page(Mock())
+        page2 = pool.add_page(Mock())
+        page3 = pool.add_page(Mock())
+        # Mark some as error
+        page1.mark_error()
+        page3.mark_error()
+        assert pool.pages_count == 3
+        pool.cleanup_error_pages()
+        assert pool.pages_count == 1  # Only page2 should remain

tests/fetchers/test_utils.py CHANGED Viewed

@@ -1,6 +1,17 @@
 import pytest
-from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText
 @pytest.fixture
@@ -122,7 +133,7 @@ def status_map():
 def test_parsing_content_type(content_type_map):
-    """Test if parsing different types of content-type returns the expected result"""
     for header_value, expected_encoding in content_type_map.items():
         assert ResponseEncoding.get_value(header_value) == expected_encoding
@@ -136,3 +147,208 @@ def test_parsing_response_status(status_map):
 def test_unknown_status_code():
     """Test handling of an unknown status code"""
     assert StatusText.get(1000) == "Unknown Status Code"

 import pytest
+from pathlib import Path
+from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText, Response
+from scrapling.engines.toolbelt.navigation import (
+    construct_proxy_dict,
+    construct_cdp_url,
+    js_bypass_path
+)
+from scrapling.engines.toolbelt.fingerprints import (
+    generate_convincing_referer,
+    get_os_name,
+    generate_headers
+)
 @pytest.fixture
 def test_parsing_content_type(content_type_map):
+    """Test if parsing different types of 'content-type' returns the expected result"""
     for header_value, expected_encoding in content_type_map.items():
         assert ResponseEncoding.get_value(header_value) == expected_encoding
 def test_unknown_status_code():
     """Test handling of an unknown status code"""
     assert StatusText.get(1000) == "Unknown Status Code"
+class TestConstructProxyDict:
+    """Test proxy dictionary construction"""
+    def test_proxy_string_basic(self):
+        """Test a basic proxy string"""
+        result = construct_proxy_dict("http://proxy.example.com:8080")
+        expected = {
+            "server": "http://proxy.example.com:8080",
+            "username": "",
+            "password": ""
+        }
+        assert result == expected
+    def test_proxy_string_with_auth(self):
+        """Test proxy string with authentication"""
+        result = construct_proxy_dict("http://user:pass@proxy.example.com:8080")
+        expected = {
+            "server": "http://proxy.example.com:8080",
+            "username": "user",
+            "password": "pass"
+        }
+        assert result == expected
+    def test_proxy_dict_input(self):
+        """Test proxy dictionary input"""
+        input_dict = {
+            "server": "http://proxy.example.com:8080",
+            "username": "user",
+            "password": "pass"
+        }
+        result = construct_proxy_dict(input_dict)
+        assert result == input_dict
+    def test_proxy_dict_minimal(self):
+        """Test minimal proxy dictionary"""
+        input_dict = {"server": "http://proxy.example.com:8080"}
+        result = construct_proxy_dict(input_dict)
+        expected = {
+            "server": "http://proxy.example.com:8080",
+            "username": "",
+            "password": ""
+        }
+        assert result == expected
+    def test_proxy_as_tuple(self):
+        """Test returning proxy as a tuple"""
+        result = construct_proxy_dict("http://proxy.example.com:8080", as_tuple=True)
+        assert isinstance(result, tuple)
+        result_dict = dict(result)
+        assert result_dict["server"] == "http://proxy.example.com:8080"
+    def test_invalid_proxy_string(self):
+        """Test invalid proxy string"""
+        with pytest.raises(ValueError):
+            construct_proxy_dict("invalid-proxy-format")
+    def test_invalid_proxy_dict(self):
+        """Test invalid proxy dictionary"""
+        with pytest.raises(TypeError):
+            construct_proxy_dict({"invalid": "structure"})
+class TestConstructCdpUrl:
+    """Test CDP URL construction"""
+    def test_basic_cdp_url(self):
+        """Test basic CDP URL"""
+        result = construct_cdp_url("ws://localhost:9222/devtools/browser")
+        assert result == "ws://localhost:9222/devtools/browser"
+    def test_cdp_url_with_params(self):
+        """Test CDP URL with query parameters"""
+        params = {"timeout": "30000", "headless": "true"}
+        result = construct_cdp_url("ws://localhost:9222/devtools/browser", params)
+        assert "timeout=30000" in result
+        assert "headless=true" in result
+    def test_cdp_url_without_leading_slash(self):
+        """Test CDP URL without a leading slash in the path"""
+        with pytest.raises(ValueError):
+            construct_cdp_url("ws://localhost:9222devtools/browser")
+    def test_invalid_cdp_scheme(self):
+        """Test invalid CDP URL scheme"""
+        with pytest.raises(ValueError):
+            construct_cdp_url("http://localhost:9222/devtools/browser")
+    def test_invalid_cdp_netloc(self):
+        """Test invalid CDP URL network location"""
+        with pytest.raises(ValueError):
+            construct_cdp_url("ws:///devtools/browser")
+    def test_malformed_cdp_url(self):
+        """Test malformed CDP URL"""
+        with pytest.raises(ValueError):
+            construct_cdp_url("not-a-url")
+class TestJsBypassPath:
+    """Test JavaScript bypass path utility"""
+    def test_js_bypass_path(self):
+        """Test getting JavaScript bypass file path"""
+        result = js_bypass_path("webdriver_fully.js")
+        assert isinstance(result, str)
+        assert result.endswith("webdriver_fully.js")
+        assert Path(result).exists()
+    def test_js_bypass_path_caching(self):
+        """Test that js_bypass_path is cached"""
+        result1 = js_bypass_path("webdriver_fully.js")
+        result2 = js_bypass_path("webdriver_fully.js")
+        assert result1 == result2
+class TestFingerprintFunctions:
+    """Test fingerprint generation functions"""
+    def test_generate_convincing_referer(self):
+        """Test referer generation"""
+        url = "https://sub.example.com/page.html"
+        result = generate_convincing_referer(url)
+        assert result.startswith("https://www.google.com/search?q=")
+        assert "example" in result
+    def test_generate_convincing_referer_caching(self):
+        """Test referer generation caching"""
+        url = "https://example.com"
+        result1 = generate_convincing_referer(url)
+        result2 = generate_convincing_referer(url)
+        assert result1 == result2
+    def test_get_os_name(self):
+        """Test OS name detection"""
+        result = get_os_name()
+        # Should return one of the known OS names or None
+        valid_names = ["linux", "macos", "windows", "ios"]
+        assert result is None or result in valid_names
+    def test_generate_headers_basic(self):
+        """Test basic header generation"""
+        headers = generate_headers()
+        assert isinstance(headers, dict)
+        assert "User-Agent" in headers
+        assert len(headers["User-Agent"]) > 0
+    def test_generate_headers_browser_mode(self):
+        """Test header generation in browser mode"""
+        headers = generate_headers(browser_mode=True)
+        assert isinstance(headers, dict)
+        assert "User-Agent" in headers
+class TestResponse:
+    """Test Response class functionality"""
+    def test_response_creation(self):
+        """Test Response object creation"""
+        response = Response(
+            url="https://example.com",
+            content="<html><body>Test</body></html>",
+            status=200,
+            reason="OK",
+            cookies={"session": "abc123"},
+            headers={"Content-Type": "text/html"},
+            request_headers={"User-Agent": "Test"},
+            encoding="utf-8"
+        )
+        assert response.url == "https://example.com"
+        assert response.status == 200
+        assert response.reason == "OK"
+        assert response.cookies == {"session": "abc123"}
+    def test_response_with_bytes_content(self):
+        """Test Response with 'bytes' content"""
+        content_bytes = "<html><body>Test</body></html>".encode('utf-8')
+        response = Response(
+            url="https://example.com",
+            content=content_bytes,
+            status=200,
+            reason="OK",
+            cookies={},
+            headers={},
+            request_headers={}
+        )
+        # Should handle 'bytes' content properly
+        assert response.status == 200

tests/fetchers/test_validator.py ADDED Viewed

	@@ -0,0 +1,79 @@

+import pytest
+from scrapling.engines._browsers._validators import (
+    validate,
+    PlaywrightConfig,
+    CamoufoxConfig
+)
+class TestValidators:
+    """Test configuration validators"""
+    def test_playwright_config_valid(self):
+        """Test valid PlaywrightConfig"""
+        params = {
+            "max_pages": 2,
+            "headless": True,
+            "timeout": 30000,
+            "proxy": "http://proxy.example.com:8080"
+        }
+        config = validate(params, PlaywrightConfig)
+        assert config.max_pages == 2
+        assert config.headless is True
+        assert config.timeout == 30000
+        assert isinstance(config.proxy, tuple)  # Should be converted to tuple
+    def test_playwright_config_invalid_max_pages(self):
+        """Test PlaywrightConfig with invalid max_pages"""
+        params = {"max_pages": 0}
+        with pytest.raises(TypeError):
+            validate(params, PlaywrightConfig)
+        params = {"max_pages": 51}
+        with pytest.raises(TypeError):
+            validate(params, PlaywrightConfig)
+    def test_playwright_config_invalid_timeout(self):
+        """Test PlaywrightConfig with an invalid timeout"""
+        params = {"timeout": -1}
+        with pytest.raises(TypeError):
+            validate(params, PlaywrightConfig)
+    def test_playwright_config_invalid_cdp_url(self):
+        """Test PlaywrightConfig with invalid CDP URL"""
+        params = {"cdp_url": "invalid-url"}
+        with pytest.raises(TypeError):
+            validate(params, PlaywrightConfig)
+    def test_camoufox_config_valid(self):
+        """Test valid CamoufoxConfig"""
+        params = {
+            "max_pages": 1,
+            "headless": True,
+            "solve_cloudflare": False,
+            "timeout": 30000
+        }
+        config = validate(params, CamoufoxConfig)
+        assert config.max_pages == 1
+        assert config.headless is True
+        assert config.solve_cloudflare is False
+        assert config.timeout == 30000
+    def test_camoufox_config_cloudflare_timeout(self):
+        """Test CamoufoxConfig timeout adjustment for Cloudflare"""
+        params = {
+            "solve_cloudflare": True,
+            "timeout": 10000  # Less than the required 60,000
+        }
+        config = validate(params, CamoufoxConfig)
+        assert config.timeout == 60000  # Should be increased