Karim shoair committed on
Commit ·
4e9888e
1
Parent(s): 9b40891
test: adding new tests and updating existing ones
Browse files- tests/ai/test_ai_mcp.py +35 -54
- tests/core/__init__.py +0 -0
- tests/core/test_shell_core.py +243 -0
- tests/core/test_storage_core.py +37 -0
- tests/fetchers/async/test_camoufox.py +6 -1
- tests/fetchers/async/test_dynamic.py +1 -1
- tests/fetchers/sync/test_camoufox.py +6 -2
- tests/fetchers/sync/test_camoufox_session.py +97 -0
- tests/fetchers/sync/test_dynamic.py +1 -1
- tests/parser/test_general.py +4 -2
- tests/parser/test_parser_advanced.py +50 -0
tests/ai/test_ai_mcp.py
CHANGED
|
@@ -1,12 +1,18 @@
|
|
| 1 |
import pytest
|
|
|
|
| 2 |
from unittest.mock import Mock, patch
|
| 3 |
|
| 4 |
from scrapling.core.ai import ScraplingMCPServer, ResponseModel
|
| 5 |
|
| 6 |
|
|
|
|
| 7 |
class TestMCPServer:
|
| 8 |
"""Test MCP server functionality"""
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
@pytest.fixture
|
| 11 |
def server(self):
|
| 12 |
return ScraplingMCPServer()
|
|
@@ -16,71 +22,46 @@ class TestMCPServer:
|
|
| 16 |
assert server._server is not None
|
| 17 |
assert server._server.name == "Scrapling"
|
| 18 |
|
| 19 |
-
def test_get_tool(self):
|
| 20 |
"""Test the get tool method"""
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
mock_get.return_value = mock_response
|
| 26 |
-
|
| 27 |
-
with patch('scrapling.core.ai.Convertor._extract_content') as mock_extract:
|
| 28 |
-
mock_extract.return_value = iter(["Content"])
|
| 29 |
-
|
| 30 |
-
result = ScraplingMCPServer.get(
|
| 31 |
-
url="https://example.com",
|
| 32 |
-
extraction_type="markdown"
|
| 33 |
-
)
|
| 34 |
-
|
| 35 |
-
assert isinstance(result, ResponseModel)
|
| 36 |
-
assert result.status == 200
|
| 37 |
-
assert result.url == "https://example.com"
|
| 38 |
|
| 39 |
@pytest.mark.asyncio
|
| 40 |
-
async def test_bulk_get_tool(self):
|
| 41 |
"""Test the bulk_get tool method"""
|
| 42 |
-
|
| 43 |
-
mock_instance = Mock()
|
| 44 |
-
mock_session.return_value.__aenter__.return_value = mock_instance
|
| 45 |
-
|
| 46 |
-
# Mock async get method
|
| 47 |
-
async def mock_async_get(*args, **kwargs):
|
| 48 |
-
mock_resp = Mock()
|
| 49 |
-
mock_resp.status = 200
|
| 50 |
-
mock_resp.url = args[0]
|
| 51 |
-
return mock_resp
|
| 52 |
-
|
| 53 |
-
mock_instance.get = mock_async_get
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
results = await ScraplingMCPServer.bulk_get(
|
| 59 |
-
urls=("https://example1.com", "https://example2.com"),
|
| 60 |
-
extraction_type="html"
|
| 61 |
-
)
|
| 62 |
-
|
| 63 |
-
assert len(results) == 2
|
| 64 |
-
assert all(isinstance(r, ResponseModel) for r in results)
|
| 65 |
|
| 66 |
@pytest.mark.asyncio
|
| 67 |
-
async def test_fetch_tool(self):
|
| 68 |
"""Test the fetch tool method"""
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
mock_response.url = "https://example.com"
|
| 73 |
-
mock_fetch.return_value = mock_response
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
| 82 |
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
def test_serve_method(self, server):
|
| 86 |
"""Test the serve method"""
|
|
|
|
| 1 |
import pytest
|
| 2 |
+
import pytest_httpbin
|
| 3 |
from unittest.mock import Mock, patch
|
| 4 |
|
| 5 |
from scrapling.core.ai import ScraplingMCPServer, ResponseModel
|
| 6 |
|
| 7 |
|
| 8 |
+
@pytest_httpbin.use_class_based_httpbin
|
| 9 |
class TestMCPServer:
|
| 10 |
"""Test MCP server functionality"""
|
| 11 |
|
| 12 |
+
@pytest.fixture(scope="class")
|
| 13 |
+
def test_url(self, httpbin):
|
| 14 |
+
return f"{httpbin.url}/html"
|
| 15 |
+
|
| 16 |
@pytest.fixture
|
| 17 |
def server(self):
|
| 18 |
return ScraplingMCPServer()
|
|
|
|
| 22 |
assert server._server is not None
|
| 23 |
assert server._server.name == "Scrapling"
|
| 24 |
|
| 25 |
+
def test_get_tool(self, server, test_url):
|
| 26 |
"""Test the get tool method"""
|
| 27 |
+
result = server.get(url=test_url, extraction_type="markdown")
|
| 28 |
+
assert isinstance(result, ResponseModel)
|
| 29 |
+
assert result.status == 200
|
| 30 |
+
assert result.url == test_url
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
@pytest.mark.asyncio
|
| 33 |
+
async def test_bulk_get_tool(self, server, test_url):
|
| 34 |
"""Test the bulk_get tool method"""
|
| 35 |
+
results = await server.bulk_get(urls=(test_url, test_url), extraction_type="html")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
+
assert len(results) == 2
|
| 38 |
+
assert all(isinstance(r, ResponseModel) for r in results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
@pytest.mark.asyncio
|
| 41 |
+
async def test_fetch_tool(self, server, test_url):
|
| 42 |
"""Test the fetch tool method"""
|
| 43 |
+
result = await server.fetch(url=test_url, headless=True)
|
| 44 |
+
assert isinstance(result, ResponseModel)
|
| 45 |
+
assert result.status == 200
|
|
|
|
|
|
|
| 46 |
|
| 47 |
+
@pytest.mark.asyncio
|
| 48 |
+
async def test_bulk_fetch_tool(self, server, test_url):
|
| 49 |
+
"""Test the bulk_fetch tool method"""
|
| 50 |
+
result = await server.bulk_fetch(urls=(test_url, test_url), headless=True)
|
| 51 |
+
assert all(isinstance(r, ResponseModel) for r in result)
|
| 52 |
|
| 53 |
+
@pytest.mark.asyncio
|
| 54 |
+
async def test_stealthy_fetch_tool(self, server, test_url):
|
| 55 |
+
"""Test the stealthy_fetch tool method"""
|
| 56 |
+
result = await server.stealthy_fetch(url=test_url, headless=True)
|
| 57 |
+
assert isinstance(result, ResponseModel)
|
| 58 |
+
assert result.status == 200
|
| 59 |
|
| 60 |
+
@pytest.mark.asyncio
|
| 61 |
+
async def test_bulk_stealthy_fetch_tool(self, server, test_url):
|
| 62 |
+
"""Test the bulk_stealthy_fetch tool method"""
|
| 63 |
+
result = await server.bulk_stealthy_fetch(urls=(test_url, test_url), headless=True)
|
| 64 |
+
assert all(isinstance(r, ResponseModel) for r in result)
|
| 65 |
|
| 66 |
def test_serve_method(self, server):
|
| 67 |
"""Test the serve method"""
|
tests/core/__init__.py
ADDED
|
File without changes
|
tests/core/test_shell_core.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from scrapling.core.shell import (
|
| 4 |
+
_CookieParser,
|
| 5 |
+
_ParseHeaders,
|
| 6 |
+
Request,
|
| 7 |
+
_known_logging_levels,
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class TestCookieParser:
    """Checks for ``_CookieParser`` on well-formed, empty and malformed input."""

    def test_simple_cookie_parsing(self):
        """A single ``name=value`` pair is expected to yield one tuple."""
        parsed = list(_CookieParser("session_id=abc123"))
        assert len(parsed) == 1
        first = parsed[0]
        assert first == ("session_id", "abc123")

    def test_multiple_cookies_parsing(self):
        """Three ``;``-separated pairs are expected to yield three cookies."""
        parsed = list(_CookieParser("session_id=abc123; theme=dark; lang=en"))
        assert len(parsed) == 3
        by_name = dict(parsed)
        expected_pairs = (
            ("session_id", "abc123"),
            ("theme", "dark"),
            ("lang", "en"),
        )
        for name, value in expected_pairs:
            assert by_name[name] == value

    def test_cookie_with_attributes(self):
        """Only the cookie itself is expected back, not Path/HttpOnly/Secure."""
        raw = "session_id=abc123; Path=/; HttpOnly; Secure"
        parsed = list(_CookieParser(raw))
        assert len(parsed) == 1
        first = parsed[0]
        assert first == ("session_id", "abc123")

    def test_empty_cookie_string(self):
        """An empty string is expected to produce no cookies."""
        parsed = list(_CookieParser(""))
        assert len(parsed) == 0

    def test_malformed_cookie_handling(self):
        """Malformed input must be consumable without raising."""
        # Only the absence of an exception is checked; the resulting list
        # may legitimately be empty.
        parsed = list(_CookieParser("invalid_cookie_format"))
        assert isinstance(parsed, list)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class TestParseHeaders:
    """Checks for ``_ParseHeaders``, which returns a ``(headers, cookies)`` pair."""

    def test_simple_headers(self):
        """Plain ``Name: value`` lines end up in the headers mapping."""
        lines = [
            "Content-Type: text/html",
            "Content-Length: 1234",
            "User-Agent: TestAgent/1.0",
        ]
        headers, cookies = _ParseHeaders(lines)

        expected = (
            ("Content-Type", "text/html"),
            ("Content-Length", "1234"),
            ("User-Agent", "TestAgent/1.0"),
        )
        for name, value in expected:
            assert headers[name] == value
        assert len(cookies) == 0

    def test_headers_with_cookies(self):
        """``Set-Cookie`` lines are expected to remain visible as a header."""
        lines = [
            "Content-Type: text/html",
            "Set-Cookie: session_id=abc123",
            "Set-Cookie: theme=dark; Path=/",
        ]
        headers, cookies = _ParseHeaders(lines)

        assert headers["Content-Type"] == "text/html"
        # The returned cookies are deliberately not asserted on here.
        assert "Set-Cookie" in headers

    def test_headers_without_colons(self):
        """A colon-less line ending in ``;`` is expected to map to an empty value."""
        lines = [
            "Content-Type: text/html",
            "InvalidHeader;",  # no colon, but terminated by a semicolon
        ]
        headers, cookies = _ParseHeaders(lines)

        assert headers["Content-Type"] == "text/html"
        assert "InvalidHeader" in headers
        assert headers["InvalidHeader"] == ""

    def test_invalid_header_format(self):
        """A line with neither colon nor trailing semicolon must raise ``ValueError``."""
        lines = [
            "Content-Type: text/html",
            "InvalidHeaderWithoutColon",  # neither ':' nor a trailing ';'
        ]

        with pytest.raises(ValueError, match="Could not parse header without colon"):
            _ParseHeaders(lines)

    def test_headers_with_multiple_colons(self):
        """Everything after the first colon is expected to be kept as the value."""
        lines = [
            "Authorization: Bearer: token123",
            "X-Custom: value:with:colons",
        ]
        headers, cookies = _ParseHeaders(lines)

        assert headers["Authorization"] == "Bearer: token123"
        assert headers["X-Custom"] == "value:with:colons"

    def test_headers_with_whitespace(self):
        """Padded names and values are accepted, whether stripped or kept verbatim."""
        lines = [
            " Content-Type : text/html ",
            "\tUser-Agent\t:\tTestAgent/1.0\t",
        ]
        headers, cookies = _ParseHeaders(lines)

        # Either the trimmed or the untrimmed spelling is tolerated.
        accepted_keys = ("Content-Type", " Content-Type ")
        assert any(key in headers for key in accepted_keys)

        accepted_values = ("text/html", " text/html ")
        assert any(value in str(headers.values()) for value in accepted_values)

    def test_parse_cookies_disabled(self):
        """With ``parse_cookies=False``: no cookies, or ``Set-Cookie`` stays a header."""
        lines = [
            "Content-Type: text/html",
            "Set-Cookie: session_id=abc123",
        ]
        headers, cookies = _ParseHeaders(lines, parse_cookies=False)

        assert headers["Content-Type"] == "text/html"
        # Either outcome is accepted when cookie parsing is switched off.
        assert len(cookies) == 0 or "Set-Cookie" in headers

    def test_empty_header_lines(self):
        """An empty list of lines yields empty headers and empty cookies."""
        headers, cookies = _ParseHeaders([])
        assert len(headers) == 0
        assert len(cookies) == 0
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class TestRequestNamedTuple:
    """Checks construction and field access of the ``Request`` record."""

    def test_request_creation(self):
        """Keyword construction stores each value under its field name."""
        fields = dict(
            method="GET",
            url="https://example.com",
            params={"q": "test"},
            data=None,
            json_data=None,
            headers={"User-Agent": "Test"},
            cookies={"session": "abc123"},
            proxy=None,
            follow_redirects=True,
        )
        req = Request(**fields)

        assert req.method == "GET"
        assert req.url == "https://example.com"
        assert req.params == {"q": "test"}
        assert req.headers == {"User-Agent": "Test"}
        assert req.follow_redirects is True

    def test_request_defaults(self):
        """``None``, empty and populated values are all stored unchanged."""
        fields = dict(
            method="POST",
            url="https://api.example.com",
            params=None,
            data='{"key": "value"}',
            json_data={"key": "value"},
            headers={},
            cookies={},
            proxy="http://proxy:8080",
            follow_redirects=False,
        )
        req = Request(**fields)

        assert req.method == "POST"
        assert req.data == '{"key": "value"}'
        assert req.json_data == {"key": "value"}
        assert req.proxy == "http://proxy:8080"
        assert req.follow_redirects is False

    def test_request_field_access(self):
        """Fields are reachable both by attribute name and by position."""
        positional = ("GET", "https://example.com", {}, None, None, {}, {}, None, True)
        req = Request(*positional)

        # Access by name
        field_names = (
            'method',
            'url',
            'params',
            'data',
            'json_data',
            'headers',
            'cookies',
            'proxy',
            'follow_redirects',
        )
        for field_name in field_names:
            assert hasattr(req, field_name)

        # Access by index
        assert req[0] == "GET"
        assert req[1] == "https://example.com"
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
class TestLoggingLevels:
    """Checks for the ``_known_logging_levels`` name-to-level mapping."""

    def test_known_logging_levels(self):
        """Every expected level name is present and maps to an ``int``."""
        expected_levels = ("debug", "info", "warning", "error", "critical", "fatal")

        for level in expected_levels:
            assert level in _known_logging_levels
            assert isinstance(_known_logging_levels[level], int)

    def test_logging_level_values(self):
        """Each name maps to the stdlib ``logging`` constant of the same name."""
        import logging

        for name in ("debug", "info", "warning", "error", "critical", "fatal"):
            # e.g. "debug" -> logging.DEBUG
            assert _known_logging_levels[name] == getattr(logging, name.upper())

    def test_level_hierarchy(self):
        """Levels from ``debug`` to ``critical`` are strictly ascending."""
        ordered_names = ("debug", "info", "warning", "error", "critical")
        levels = [_known_logging_levels[name] for name in ordered_names]

        # Compare each level with its successor; "fatal" is left out, as in
        # the list of names above.
        for lower, higher in zip(levels, levels[1:]):
            assert lower < higher
|
tests/core/test_storage_core.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tempfile
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
from scrapling.core.storage import SQLiteStorageSystem
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class TestSQLiteStorageSystem:
    """Construction checks for ``SQLiteStorageSystem``."""

    def test_sqlite_storage_creation(self):
        """The storage can be built on an in-memory database."""
        storage = SQLiteStorageSystem(storage_file=":memory:")
        assert storage is not None

    def test_sqlite_storage_with_file(self, tmp_path):
        """The storage can be built on a real file inside pytest's temp directory.

        ``tmp_path`` hands out a fresh directory per test and pytest cleans
        it up, so no manual ``unlink`` is needed while the storage object may
        still hold the file open. The path points at a file that does not
        exist yet, so the final existence check shows the file appeared
        during construction instead of being trivially true.
        """
        db_path = str(tmp_path / "storage.db")
        assert not os.path.exists(db_path)

        storage = SQLiteStorageSystem(storage_file=db_path)
        assert storage is not None
        # NOTE(review): assumes the constructor opens/creates the database
        # file eagerly - confirm against SQLiteStorageSystem.__init__.
        assert os.path.exists(db_path)

    def test_sqlite_storage_initialization_args(self):
        """The ``url`` argument is exposed as the ``url`` attribute."""
        storage = SQLiteStorageSystem(
            storage_file=":memory:",
            url="https://example.com",
        )
        assert storage is not None
        assert storage.url == "https://example.com"
|
tests/fetchers/async/test_camoufox.py
CHANGED
|
@@ -24,8 +24,13 @@ class TestStealthyFetcher:
|
|
| 24 |
"html_url": f"{url}/html",
|
| 25 |
"delayed_url": f"{url}/delay/10", # 10 Seconds delay response
|
| 26 |
"cookies_url": f"{url}/cookies/set/test/value",
|
|
|
|
| 27 |
}
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
async def test_basic_fetch(self, fetcher, urls):
|
| 30 |
"""Test doing a basic fetch request with multiple statuses"""
|
| 31 |
assert (await fetcher.async_fetch(urls["status_200"])).status == 200
|
|
@@ -63,7 +68,7 @@ class TestStealthyFetcher:
|
|
| 63 |
{
|
| 64 |
"network_idle": True,
|
| 65 |
"wait": 10,
|
| 66 |
-
"cookies": [],
|
| 67 |
"google_search": True,
|
| 68 |
"extra_headers": {"ayo": ""},
|
| 69 |
"os_randomize": True,
|
|
|
|
| 24 |
"html_url": f"{url}/html",
|
| 25 |
"delayed_url": f"{url}/delay/10", # 10 Seconds delay response
|
| 26 |
"cookies_url": f"{url}/cookies/set/test/value",
|
| 27 |
+
"cloudflare_url": "https://nopecha.com/demo/cloudflare", # Interactive turnstile page
|
| 28 |
}
|
| 29 |
|
| 30 |
+
async def test_cloudflare_fetch(self, fetcher, urls):
|
| 31 |
+
"""Test if Cloudflare bypass is working"""
|
| 32 |
+
assert (await fetcher.async_fetch(urls["cloudflare_url"], solve_cloudflare=True)).status == 200
|
| 33 |
+
|
| 34 |
async def test_basic_fetch(self, fetcher, urls):
|
| 35 |
"""Test doing a basic fetch request with multiple statuses"""
|
| 36 |
assert (await fetcher.async_fetch(urls["status_200"])).status == 200
|
|
|
|
| 68 |
{
|
| 69 |
"network_idle": True,
|
| 70 |
"wait": 10,
|
| 71 |
+
"cookies": [{"name": "test", "value": "123", "domain": "example.com", "path": "/"}],
|
| 72 |
"google_search": True,
|
| 73 |
"extra_headers": {"ayo": ""},
|
| 74 |
"os_randomize": True,
|
tests/fetchers/async/test_dynamic.py
CHANGED
|
@@ -65,7 +65,7 @@ class TestDynamicFetcherAsync:
|
|
| 65 |
"locale": "en-US",
|
| 66 |
"extra_headers": {"ayo": ""},
|
| 67 |
"useragent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0",
|
| 68 |
-
"cookies": [],
|
| 69 |
"network_idle": True,
|
| 70 |
"custom_config": {"keep_comments": False, "keep_cdata": False},
|
| 71 |
},
|
|
|
|
| 65 |
"locale": "en-US",
|
| 66 |
"extra_headers": {"ayo": ""},
|
| 67 |
"useragent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0",
|
| 68 |
+
"cookies": [{"name": "test", "value": "123", "domain": "example.com", "path": "/"}],
|
| 69 |
"network_idle": True,
|
| 70 |
"custom_config": {"keep_comments": False, "keep_cdata": False},
|
| 71 |
},
|
tests/fetchers/sync/test_camoufox.py
CHANGED
|
@@ -2,7 +2,6 @@ import pytest
|
|
| 2 |
import pytest_httpbin
|
| 3 |
|
| 4 |
from scrapling import StealthyFetcher
|
| 5 |
-
|
| 6 |
StealthyFetcher.adaptive = True
|
| 7 |
|
| 8 |
|
|
@@ -23,6 +22,11 @@ class TestStealthyFetcher:
|
|
| 23 |
self.html_url = f"{httpbin.url}/html"
|
| 24 |
self.delayed_url = f"{httpbin.url}/delay/10" # 10 Seconds delay response
|
| 25 |
self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
def test_basic_fetch(self, fetcher):
|
| 28 |
"""Test doing a basic fetch request with multiple statuses"""
|
|
@@ -60,7 +64,7 @@ class TestStealthyFetcher:
|
|
| 60 |
"network_idle": True,
|
| 61 |
"wait": 10,
|
| 62 |
"timeout": 30_000,
|
| 63 |
-
"cookies": [],
|
| 64 |
"google_search": True,
|
| 65 |
"extra_headers": {"ayo": ""},
|
| 66 |
"os_randomize": True,
|
|
|
|
| 2 |
import pytest_httpbin
|
| 3 |
|
| 4 |
from scrapling import StealthyFetcher
|
|
|
|
| 5 |
StealthyFetcher.adaptive = True
|
| 6 |
|
| 7 |
|
|
|
|
| 22 |
self.html_url = f"{httpbin.url}/html"
|
| 23 |
self.delayed_url = f"{httpbin.url}/delay/10" # 10 Seconds delay response
|
| 24 |
self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
|
| 25 |
+
self.cloudflare_url = "https://nopecha.com/demo/cloudflare" # Interactive turnstile page
|
| 26 |
+
|
| 27 |
+
def test_cloudflare_fetch(self, fetcher):
|
| 28 |
+
"""Test if Cloudflare bypass is working"""
|
| 29 |
+
assert fetcher.fetch(self.cloudflare_url, solve_cloudflare=True).status == 200
|
| 30 |
|
| 31 |
def test_basic_fetch(self, fetcher):
|
| 32 |
"""Test doing a basic fetch request with multiple statuses"""
|
|
|
|
| 64 |
"network_idle": True,
|
| 65 |
"wait": 10,
|
| 66 |
"timeout": 30_000,
|
| 67 |
+
"cookies": [{"name": "test", "value": "123", "domain": "example.com", "path": "/"}],
|
| 68 |
"google_search": True,
|
| 69 |
"extra_headers": {"ayo": ""},
|
| 70 |
"os_randomize": True,
|
tests/fetchers/sync/test_camoufox_session.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import pytest
|
| 3 |
+
import pytest_httpbin
|
| 4 |
+
|
| 5 |
+
from scrapling.engines._browsers._camoufox import StealthySession, __CF_PATTERN__
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class TestCamoufoxConstants:
    """Checks for module-level constants of the Camoufox engine."""

    def test_cf_pattern_regex(self):
        """``__CF_PATTERN__`` is a compiled regex that matches challenge-platform URLs only."""
        assert isinstance(__CF_PATTERN__, re.Pattern)

        # URLs that must be recognised
        challenge_urls = (
            "https://challenges.cloudflare.com/cdn-cgi/challenge-platform/h/123456",
            "https://challenges.cloudflare.com/cdn-cgi/challenge-platform/orchestrate/jsch/v1",
            "http://challenges.cloudflare.com/cdn-cgi/challenge-platform/scripts/abc",
        )
        for candidate in challenge_urls:
            assert __CF_PATTERN__.search(candidate) is not None

        # URLs that must be ignored
        unrelated_urls = (
            "https://example.com/challenge",
            "https://cloudflare.com/something",
            "https://challenges.cloudflare.com/other-path",
        )
        for candidate in unrelated_urls:
            assert __CF_PATTERN__.search(candidate) is None
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@pytest_httpbin.use_class_based_httpbin
class TestStealthySession:
    """Smoke test for the synchronous ``StealthySession``.

    Per the original author's note, the bulk of the behaviour is covered by
    the async test-suite (the async class inherits from this one), so it is
    not repeated here.
    """

    @pytest.fixture(autouse=True)
    def setup_urls(self, httpbin):
        """Store the local httpbin endpoints used by the tests on ``self``."""
        base = httpbin.url
        self.status_200 = f"{base}/status/200"
        self.status_404 = f"{base}/status/404"
        self.status_501 = f"{base}/status/501"
        self.basic_url = f"{base}/get"
        self.html_url = f"{base}/html"
        self.delayed_url = f"{base}/delay/10"  # responds after 10 seconds
        self.cookies_url = f"{base}/cookies/set/test/value"

    def test_session_creation(self):
        """Open a session, check its settings, Cloudflare detection and one fetch."""
        session_cookies = [{"name": "test", "value": "123", "domain": "example.com", "path": "/"}]

        with StealthySession(
            max_pages=3,
            headless=True,
            block_images=True,
            disable_resources=True,
            solve_cloudflare=True,
            wait=1000,
            timeout=60000,
            cookies=session_cookies,
        ) as session:
            # Constructor arguments are reflected on the session object.
            assert session.max_pages == 3
            assert session.headless is True
            assert session.block_images is True
            assert session.disable_resources is True
            assert session.solve_cloudflare is True
            assert session.wait == 1000
            assert session.timeout == 60000
            assert session.context is not None

            # Each challenge type embedded in the page must be reported back.
            for cloudflare_type in ('managed', 'interactive', 'non-interactive'):
                challenge_page = (
                    "\n"
                    "<html>\n"
                    "<script>\n"
                    f"cType: '{cloudflare_type}'\n"
                    "</script>\n"
                    "</html>\n"
                )
                detected = session._detect_cloudflare(challenge_page)
                assert detected == cloudflare_type

            # A page without any challenge marker is reported as None.
            plain_page = (
                "\n"
                "<html>\n"
                "<body>\n"
                "<p>Regular page content</p>\n"
                "</body>\n"
                "</html>\n"
            )
            detected = StealthySession._detect_cloudflare(plain_page)
            assert detected is None

            response = session.fetch(self.status_200)
            assert response.status == 200
|
tests/fetchers/sync/test_dynamic.py
CHANGED
|
@@ -63,7 +63,7 @@ class TestDynamicFetcher:
|
|
| 63 |
"locale": "en-US",
|
| 64 |
"extra_headers": {"ayo": ""},
|
| 65 |
"useragent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0",
|
| 66 |
-
"cookies": [],
|
| 67 |
"network_idle": True,
|
| 68 |
"custom_config": {"keep_comments": False, "keep_cdata": False},
|
| 69 |
},
|
|
|
|
| 63 |
"locale": "en-US",
|
| 64 |
"extra_headers": {"ayo": ""},
|
| 65 |
"useragent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0",
|
| 66 |
+
"cookies": [{"name": "test", "value": "123", "domain": "example.com", "path": "/"}],
|
| 67 |
"network_idle": True,
|
| 68 |
"custom_config": {"keep_comments": False, "keep_cdata": False},
|
| 69 |
},
|
tests/parser/test_general.py
CHANGED
|
@@ -221,7 +221,7 @@ class TestElementNavigation:
|
|
| 221 |
"""Test parent and sibling navigation"""
|
| 222 |
table = page.css(".product-list")[0]
|
| 223 |
parent = table.parent
|
| 224 |
-
assert parent
|
| 225 |
|
| 226 |
parent_siblings = parent.siblings
|
| 227 |
assert len(parent_siblings) == 1
|
|
@@ -267,7 +267,7 @@ class TestJSONAndAttributes:
|
|
| 267 |
products = page.css(".product")
|
| 268 |
product_ids = [product.attrib["data-id"] for product in products]
|
| 269 |
assert product_ids == ["1", "2", "3"]
|
| 270 |
-
assert "data-id" in products[0]
|
| 271 |
|
| 272 |
# Review rating calculations
|
| 273 |
reviews = page.css(".review")
|
|
@@ -316,7 +316,9 @@ def test_selectors_generation(page):
|
|
| 316 |
|
| 317 |
def _traverse(element: Selector):
|
| 318 |
assert isinstance(element.generate_css_selector, str)
|
|
|
|
| 319 |
assert isinstance(element.generate_xpath_selector, str)
|
|
|
|
| 320 |
for branch in element.children:
|
| 321 |
_traverse(branch)
|
| 322 |
|
|
|
|
| 221 |
"""Test parent and sibling navigation"""
|
| 222 |
table = page.css(".product-list")[0]
|
| 223 |
parent = table.parent
|
| 224 |
+
assert parent["id"] == "products"
|
| 225 |
|
| 226 |
parent_siblings = parent.siblings
|
| 227 |
assert len(parent_siblings) == 1
|
|
|
|
| 267 |
products = page.css(".product")
|
| 268 |
product_ids = [product.attrib["data-id"] for product in products]
|
| 269 |
assert product_ids == ["1", "2", "3"]
|
| 270 |
+
assert "data-id" in products[0]
|
| 271 |
|
| 272 |
# Review rating calculations
|
| 273 |
reviews = page.css(".review")
|
|
|
|
| 316 |
|
| 317 |
def _traverse(element: Selector):
|
| 318 |
assert isinstance(element.generate_css_selector, str)
|
| 319 |
+
assert isinstance(element.generate_full_css_selector, str)
|
| 320 |
assert isinstance(element.generate_xpath_selector, str)
|
| 321 |
+
assert isinstance(element.generate_full_xpath_selector, str)
|
| 322 |
for branch in element.children:
|
| 323 |
_traverse(branch)
|
| 324 |
|
tests/parser/test_parser_advanced.py
CHANGED
|
@@ -1,8 +1,58 @@
|
|
| 1 |
import re
|
| 2 |
import pytest
|
|
|
|
| 3 |
|
| 4 |
from scrapling import Selector, Selectors
|
| 5 |
from scrapling.core.custom_types import TextHandler, TextHandlers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
class TestAdvancedSelectors:
|
|
|
|
| 1 |
import re
|
| 2 |
import pytest
|
| 3 |
+
from unittest.mock import Mock
|
| 4 |
|
| 5 |
from scrapling import Selector, Selectors
|
| 6 |
from scrapling.core.custom_types import TextHandler, TextHandlers
|
| 7 |
+
from scrapling.core.storage import SQLiteStorageSystem
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class TestSelectorAdvancedFeatures:
    """Checks how ``Selector`` sets up its storage when adaptive mode is on."""

    def test_adaptive_initialization_with_storage(self):
        """A storage class plus explicit ``storage_args`` gives an adaptive selector."""
        storage_args = {"storage_file": ":memory:", "url": "https://example.com"}

        # The real SQLiteStorageSystem is used here rather than a stand-in.
        selector = Selector(
            content="<html><body><p>Test</p></body></html>",
            adaptive=True,
            storage=SQLiteStorageSystem,
            storage_args=storage_args,
        )

        # Name-mangled private flag of Selector.
        assert selector._Selector__adaptive_enabled is True
        assert selector._storage is not None

    def test_adaptive_initialization_with_default_storage_args(self):
        """With only ``url`` and ``adaptive=True`` a storage is still created."""
        markup = "<html><body><p>Test</p></body></html>"
        page_url = "https://example.com"

        # No storage / storage_args are passed, so the defaults apply.
        selector = Selector(content=markup, url=page_url, adaptive=True)

        assert selector._storage is not None

    def test_adaptive_with_existing_storage(self):
        """A storage object passed as ``_storage`` is used as-is."""
        existing_storage = Mock()

        selector = Selector(
            content="<html><body><p>Test</p></body></html>",
            adaptive=True,
            _storage=existing_storage,
        )

        assert selector._storage is existing_storage
|
| 56 |
|
| 57 |
|
| 58 |
class TestAdvancedSelectors:
|