Karim shoair committed on
Commit ·
c2dbf4c
1
Parent(s): ac1e174
style: Use shorter and more accurate naming for constants
Browse files
scrapling/engines/_browsers/_base.py
CHANGED
|
@@ -40,11 +40,7 @@ from scrapling.core._types import (
|
|
| 40 |
Generator,
|
| 41 |
AsyncGenerator,
|
| 42 |
)
|
| 43 |
-
from scrapling.engines.constants import
|
| 44 |
-
DEFAULT_STEALTH_FLAGS,
|
| 45 |
-
HARMFUL_DEFAULT_ARGS,
|
| 46 |
-
DEFAULT_FLAGS,
|
| 47 |
-
)
|
| 48 |
|
| 49 |
|
| 50 |
class SyncSession:
|
|
@@ -389,8 +385,8 @@ class BaseSessionMixin:
|
|
| 389 |
# Dark color scheme bypasses the 'prefersLightColor' check in creepjs
|
| 390 |
self._context_options: Dict[str, Any] = {"color_scheme": "dark", "device_scale_factor": 2}
|
| 391 |
self._browser_options: Dict[str, Any] = {
|
| 392 |
-
"args":
|
| 393 |
-
"ignore_default_args":
|
| 394 |
}
|
| 395 |
if "__max_pages" in params:
|
| 396 |
params["max_pages"] = params.pop("__max_pages")
|
|
@@ -484,7 +480,7 @@ class StealthySessionMixin(BaseSessionMixin):
|
|
| 484 |
config = cast(StealthConfig, self._config)
|
| 485 |
flags: Tuple[str, ...] = tuple()
|
| 486 |
if not config.cdp_url:
|
| 487 |
-
flags =
|
| 488 |
|
| 489 |
if config.block_webrtc:
|
| 490 |
flags += (
|
|
|
|
| 40 |
Generator,
|
| 41 |
AsyncGenerator,
|
| 42 |
)
|
| 43 |
+
from scrapling.engines.constants import STEALTH_ARGS, HARMFUL_ARGS, DEFAULT_ARGS
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
|
| 46 |
class SyncSession:
|
|
|
|
| 385 |
# Dark color scheme bypasses the 'prefersLightColor' check in creepjs
|
| 386 |
self._context_options: Dict[str, Any] = {"color_scheme": "dark", "device_scale_factor": 2}
|
| 387 |
self._browser_options: Dict[str, Any] = {
|
| 388 |
+
"args": DEFAULT_ARGS,
|
| 389 |
+
"ignore_default_args": HARMFUL_ARGS,
|
| 390 |
}
|
| 391 |
if "__max_pages" in params:
|
| 392 |
params["max_pages"] = params.pop("__max_pages")
|
|
|
|
| 480 |
config = cast(StealthConfig, self._config)
|
| 481 |
flags: Tuple[str, ...] = tuple()
|
| 482 |
if not config.cdp_url:
|
| 483 |
+
flags = DEFAULT_ARGS + STEALTH_ARGS
|
| 484 |
|
| 485 |
if config.block_webrtc:
|
| 486 |
flags += (
|
scrapling/engines/constants.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# Disable loading these resources for speed
|
| 2 |
-
|
| 3 |
"font",
|
| 4 |
"image",
|
| 5 |
"media",
|
|
@@ -12,7 +12,7 @@ DEFAULT_DISABLED_RESOURCES = {
|
|
| 12 |
"stylesheet",
|
| 13 |
}
|
| 14 |
|
| 15 |
-
|
| 16 |
# This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
|
| 17 |
"--enable-automation",
|
| 18 |
"--disable-popup-blocking",
|
|
@@ -21,7 +21,7 @@ HARMFUL_DEFAULT_ARGS = (
|
|
| 21 |
"--disable-extensions",
|
| 22 |
)
|
| 23 |
|
| 24 |
-
|
| 25 |
# Speed up chromium browsers by default
|
| 26 |
"--no-pings",
|
| 27 |
"--no-first-run",
|
|
@@ -36,7 +36,7 @@ DEFAULT_FLAGS = (
|
|
| 36 |
"--disable-search-engine-choice-screen",
|
| 37 |
)
|
| 38 |
|
| 39 |
-
|
| 40 |
# Explanation: https://peter.sh/experiments/chromium-command-line-switches/
|
| 41 |
# Generally this will make the browser faster and less detectable
|
| 42 |
# "--incognito",
|
|
|
|
| 1 |
# Disable loading these resources for speed
|
| 2 |
+
EXTRA_RESOURCES = {
|
| 3 |
"font",
|
| 4 |
"image",
|
| 5 |
"media",
|
|
|
|
| 12 |
"stylesheet",
|
| 13 |
}
|
| 14 |
|
| 15 |
+
HARMFUL_ARGS = (
|
| 16 |
# This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
|
| 17 |
"--enable-automation",
|
| 18 |
"--disable-popup-blocking",
|
|
|
|
| 21 |
"--disable-extensions",
|
| 22 |
)
|
| 23 |
|
| 24 |
+
DEFAULT_ARGS = (
|
| 25 |
# Speed up chromium browsers by default
|
| 26 |
"--no-pings",
|
| 27 |
"--no-first-run",
|
|
|
|
| 36 |
"--disable-search-engine-choice-screen",
|
| 37 |
)
|
| 38 |
|
| 39 |
+
STEALTH_ARGS = (
|
| 40 |
# Explanation: https://peter.sh/experiments/chromium-command-line-switches/
|
| 41 |
# Generally this will make the browser faster and less detectable
|
| 42 |
# "--incognito",
|
scrapling/engines/toolbelt/navigation.py
CHANGED
|
@@ -12,7 +12,7 @@ from playwright.sync_api import Route
|
|
| 12 |
|
| 13 |
from scrapling.core.utils import log
|
| 14 |
from scrapling.core._types import Dict, Set, Tuple, Optional, Callable
|
| 15 |
-
from scrapling.engines.constants import
|
| 16 |
|
| 17 |
__BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
|
| 18 |
|
|
@@ -30,7 +30,7 @@ def create_intercept_handler(disable_resources: bool, blocked_domains: Optional[
|
|
| 30 |
:param blocked_domains: Set of domain names to block requests to.
|
| 31 |
:return: A sync route handler function.
|
| 32 |
"""
|
| 33 |
-
disabled_resources =
|
| 34 |
domains = blocked_domains or set()
|
| 35 |
|
| 36 |
def handler(route: Route):
|
|
@@ -57,7 +57,7 @@ def create_async_intercept_handler(disable_resources: bool, blocked_domains: Opt
|
|
| 57 |
:param blocked_domains: Set of domain names to block requests to.
|
| 58 |
:return: An async route handler function.
|
| 59 |
"""
|
| 60 |
-
disabled_resources =
|
| 61 |
domains = blocked_domains or set()
|
| 62 |
|
| 63 |
async def handler(route: async_Route):
|
|
|
|
| 12 |
|
| 13 |
from scrapling.core.utils import log
|
| 14 |
from scrapling.core._types import Dict, Set, Tuple, Optional, Callable
|
| 15 |
+
from scrapling.engines.constants import EXTRA_RESOURCES
|
| 16 |
|
| 17 |
__BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
|
| 18 |
|
|
|
|
| 30 |
:param blocked_domains: Set of domain names to block requests to.
|
| 31 |
:return: A sync route handler function.
|
| 32 |
"""
|
| 33 |
+
disabled_resources = EXTRA_RESOURCES if disable_resources else set()
|
| 34 |
domains = blocked_domains or set()
|
| 35 |
|
| 36 |
def handler(route: Route):
|
|
|
|
| 57 |
:param blocked_domains: Set of domain names to block requests to.
|
| 58 |
:return: An async route handler function.
|
| 59 |
"""
|
| 60 |
+
disabled_resources = EXTRA_RESOURCES if disable_resources else set()
|
| 61 |
domains = blocked_domains or set()
|
| 62 |
|
| 63 |
async def handler(route: async_Route):
|
tests/fetchers/test_constants.py
CHANGED
|
@@ -1,9 +1,4 @@
|
|
| 1 |
-
from scrapling.engines.constants import
|
| 2 |
-
DEFAULT_DISABLED_RESOURCES,
|
| 3 |
-
DEFAULT_STEALTH_FLAGS,
|
| 4 |
-
HARMFUL_DEFAULT_ARGS,
|
| 5 |
-
DEFAULT_FLAGS,
|
| 6 |
-
)
|
| 7 |
|
| 8 |
|
| 9 |
class TestConstants:
|
|
@@ -11,18 +6,18 @@ class TestConstants:
|
|
| 11 |
|
| 12 |
def test_default_disabled_resources(self):
|
| 13 |
"""Test default disabled resources"""
|
| 14 |
-
assert "image" in
|
| 15 |
-
assert "font" in
|
| 16 |
-
assert "stylesheet" in
|
| 17 |
-
assert "media" in
|
| 18 |
|
| 19 |
def test_harmful_default_args(self):
|
| 20 |
"""Test harmful default arguments"""
|
| 21 |
-
assert "--enable-automation" in
|
| 22 |
-
assert "--disable-popup-blocking" in
|
| 23 |
|
| 24 |
def test_flags(self):
|
| 25 |
"""Test default stealth flags"""
|
| 26 |
-
assert "--no-pings" in
|
| 27 |
-
# assert "--incognito" in
|
| 28 |
-
assert "--disable-blink-features=AutomationControlled" in
|
|
|
|
| 1 |
+
from scrapling.engines.constants import EXTRA_RESOURCES, STEALTH_ARGS, HARMFUL_ARGS, DEFAULT_ARGS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
class TestConstants:
|
|
|
|
| 6 |
|
| 7 |
def test_default_disabled_resources(self):
|
| 8 |
"""Test default disabled resources"""
|
| 9 |
+
assert "image" in EXTRA_RESOURCES
|
| 10 |
+
assert "font" in EXTRA_RESOURCES
|
| 11 |
+
assert "stylesheet" in EXTRA_RESOURCES
|
| 12 |
+
assert "media" in EXTRA_RESOURCES
|
| 13 |
|
| 14 |
def test_harmful_default_args(self):
|
| 15 |
"""Test harmful default arguments"""
|
| 16 |
+
assert "--enable-automation" in HARMFUL_ARGS
|
| 17 |
+
assert "--disable-popup-blocking" in HARMFUL_ARGS
|
| 18 |
|
| 19 |
def test_flags(self):
|
| 20 |
"""Test default stealth flags"""
|
| 21 |
+
assert "--no-pings" in DEFAULT_ARGS
|
| 22 |
+
# assert "--incognito" in STEALTH_ARGS
|
| 23 |
+
assert "--disable-blink-features=AutomationControlled" in STEALTH_ARGS
|
tests/fetchers/test_utils.py
CHANGED
|
@@ -8,7 +8,6 @@ from scrapling.engines.toolbelt.navigation import (
|
|
| 8 |
create_async_intercept_handler,
|
| 9 |
js_bypass_path,
|
| 10 |
)
|
| 11 |
-
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
|
| 12 |
from scrapling.engines.toolbelt.fingerprints import (
|
| 13 |
generate_convincing_referer,
|
| 14 |
get_os_name,
|
|
|
|
| 8 |
create_async_intercept_handler,
|
| 9 |
js_bypass_path,
|
| 10 |
)
|
|
|
|
| 11 |
from scrapling.engines.toolbelt.fingerprints import (
|
| 12 |
generate_convincing_referer,
|
| 13 |
get_os_name,
|