Karim shoair commited on
Commit
cf06b6e
·
1 Parent(s): 130d1d8

fix(stealth): improve stealth mode by removing unnecessary scripts

Browse files

These scripts don't add anything to the table anymore (it might turn out I was wrong, and I'll add them back again lol)

MANIFEST.in CHANGED
@@ -1,7 +1,6 @@
1
  include LICENSE
2
  include *.db
3
  include *.js
4
- include scrapling/engines/toolbelt/bypasses/*.js
5
  include scrapling/*.db
6
  include scrapling/*.db*
7
  include scrapling/*.db-*
 
1
  include LICENSE
2
  include *.db
3
  include *.js
 
4
  include scrapling/*.db
5
  include scrapling/*.db*
6
  include scrapling/*.db-*
scrapling/engines/_browsers/_config_tools.py CHANGED
@@ -1,38 +1,4 @@
1
- from functools import lru_cache
2
-
3
- from scrapling.engines.toolbelt.navigation import js_bypass_path
4
  from scrapling.engines.toolbelt.fingerprints import generate_headers
5
 
6
  __default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")
7
  __default_chrome_useragent__ = generate_headers(browser_mode="chrome").get("User-Agent")
8
-
9
-
10
- @lru_cache(1)
11
- def _compiled_stealth_scripts():
12
- """Pre-read and compile stealth scripts"""
13
- # Basic bypasses nothing fancy as I'm still working on it
14
- # But with adding these bypasses to the above config, it bypasses many online tests like
15
- # https://bot.sannysoft.com/
16
- # https://kaliiiiiiiiii.github.io/brotector/
17
- # https://pixelscan.net/
18
- # https://iphey.com/
19
- # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
20
- # https://arh.antoinevastel.com/bots/areyouheadless/
21
- # https://prescience-data.github.io/execution-monitor.html
22
- stealth_scripts_paths = tuple(
23
- js_bypass_path(script)
24
- for script in (
25
- # Order is important
26
- "webdriver_fully.js",
27
- "window_chrome.js",
28
- "navigator_plugins.js",
29
- "notification_permission.js",
30
- "screen_props.js",
31
- "playwright_fingerprint.js",
32
- )
33
- )
34
- scripts = []
35
- for script_path in stealth_scripts_paths:
36
- with open(script_path, "r") as f:
37
- scripts.append(f.read())
38
- return tuple(scripts)
 
 
 
 
1
  from scrapling.engines.toolbelt.fingerprints import generate_headers
2
 
3
  __default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")
4
  __default_chrome_useragent__ = generate_headers(browser_mode="chrome").get("User-Agent")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/engines/_browsers/_stealth.py CHANGED
@@ -17,7 +17,6 @@ from scrapling.core._types import Any, Optional, ProxyType, Unpack
17
  from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
18
  from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
19
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
20
- from scrapling.engines._browsers._config_tools import _compiled_stealth_scripts
21
  from scrapling.engines._browsers._types import StealthSession, StealthFetchParams
22
  from scrapling.engines._browsers._base import SyncSession, AsyncSession, StealthySessionMixin
23
  from scrapling.engines._browsers._validators import validate_fetch as _validate, StealthConfig
@@ -109,14 +108,6 @@ class StealthySession(SyncSession, StealthySessionMixin):
109
  else:
110
  raise RuntimeError("Session has been already started")
111
 
112
- def _initialize_context(self, config, ctx: BrowserContext) -> BrowserContext:
113
- """Initialize the browser context."""
114
- for script in _compiled_stealth_scripts():
115
- ctx.add_init_script(script=script)
116
-
117
- ctx = super()._initialize_context(config, ctx)
118
- return ctx
119
-
120
  def _cloudflare_solver(self, page: Page) -> None: # pragma: no cover
121
  """Solve the cloudflare challenge displayed on the playwright page passed
122
 
@@ -372,14 +363,6 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
372
  else:
373
  raise RuntimeError("Session has been already started")
374
 
375
- async def _initialize_context(self, config: Any, ctx: AsyncBrowserContext) -> AsyncBrowserContext:
376
- """Initialize the browser context."""
377
- for script in _compiled_stealth_scripts():
378
- await ctx.add_init_script(script=script)
379
-
380
- ctx = await super()._initialize_context(config, ctx)
381
- return ctx
382
-
383
  async def _cloudflare_solver(self, page: async_Page) -> None: # pragma: no cover
384
  """Solve the cloudflare challenge displayed on the playwright page passed
385
 
 
17
  from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
18
  from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
19
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
 
20
  from scrapling.engines._browsers._types import StealthSession, StealthFetchParams
21
  from scrapling.engines._browsers._base import SyncSession, AsyncSession, StealthySessionMixin
22
  from scrapling.engines._browsers._validators import validate_fetch as _validate, StealthConfig
 
108
  else:
109
  raise RuntimeError("Session has been already started")
110
 
 
 
 
 
 
 
 
 
111
  def _cloudflare_solver(self, page: Page) -> None: # pragma: no cover
112
  """Solve the cloudflare challenge displayed on the playwright page passed
113
 
 
363
  else:
364
  raise RuntimeError("Session has been already started")
365
 
 
 
 
 
 
 
 
 
366
  async def _cloudflare_solver(self, page: async_Page) -> None: # pragma: no cover
367
  """Solve the cloudflare challenge displayed on the playwright page passed
368
 
scrapling/engines/toolbelt/bypasses/navigator_plugins.js DELETED
@@ -1,40 +0,0 @@
1
- if(navigator.plugins.length == 0){
2
- Object.defineProperty(navigator, 'plugins', {
3
- get: () => {
4
- const PDFViewerPlugin = Object.create(Plugin.prototype, {
5
- description: { value: 'Portable Document Format', enumerable: false },
6
- filename: { value: 'internal-pdf-viewer', enumerable: false },
7
- name: { value: 'PDF Viewer', enumerable: false },
8
- });
9
- const ChromePDFViewer = Object.create(Plugin.prototype, {
10
- description: { value: 'Portable Document Format', enumerable: false },
11
- filename: { value: 'internal-pdf-viewer', enumerable: false },
12
- name: { value: 'Chrome PDF Viewer', enumerable: false },
13
- });
14
- const ChromiumPDFViewer = Object.create(Plugin.prototype, {
15
- description: { value: 'Portable Document Format', enumerable: false },
16
- filename: { value: 'internal-pdf-viewer', enumerable: false },
17
- name: { value: 'Chromium PDF Viewer', enumerable: false },
18
- });
19
- const EdgePDFViewer = Object.create(Plugin.prototype, {
20
- description: { value: 'Portable Document Format', enumerable: false },
21
- filename: { value: 'internal-pdf-viewer', enumerable: false },
22
- name: { value: 'Microsoft Edge PDF Viewer', enumerable: false },
23
- });
24
- const WebKitPDFPlugin = Object.create(Plugin.prototype, {
25
- description: { value: 'Portable Document Format', enumerable: false },
26
- filename: { value: 'internal-pdf-viewer', enumerable: false },
27
- name: { value: 'WebKit built-in PDF', enumerable: false },
28
- });
29
-
30
- return Object.create(PluginArray.prototype, {
31
- length: { value: 5 },
32
- 0: { value: PDFViewerPlugin },
33
- 1: { value: ChromePDFViewer },
34
- 2: { value: ChromiumPDFViewer },
35
- 3: { value: EdgePDFViewer },
36
- 4: { value: WebKitPDFPlugin },
37
- });
38
- },
39
- });
40
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/engines/toolbelt/bypasses/notification_permission.js DELETED
@@ -1,5 +0,0 @@
1
- // Bypasses `notificationIsDenied` test in creepsjs's 'Like Headless' sections
2
- const isSecure = document.location.protocol.startsWith('https')
3
- if (isSecure){
4
- Object.defineProperty(Notification, 'permission', {get: () => 'default'})
5
- }
 
 
 
 
 
 
scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js DELETED
@@ -1,3 +0,0 @@
1
- // Remove playwright fingerprint => https://github.com/microsoft/playwright/commit/c9e673c6dca746384338ab6bb0cf63c7e7caa9b2#diff-087773eea292da9db5a3f27de8f1a2940cdb895383ad750c3cd8e01772a35b40R915
2
- delete window.__pwInitScripts;
3
- delete window.__playwright__binding__;
 
 
 
 
scrapling/engines/toolbelt/bypasses/screen_props.js DELETED
@@ -1,27 +0,0 @@
1
- const windowScreenProps = {
2
- // Dimensions
3
- innerHeight: 0,
4
- innerWidth: 0,
5
- outerHeight: 754,
6
- outerWidth: 1313,
7
-
8
- // Position
9
- screenX: 19,
10
- pageXOffset: 0,
11
- pageYOffset: 0,
12
-
13
- // Display
14
- devicePixelRatio: 2
15
- };
16
-
17
- try {
18
- for (const [prop, value] of Object.entries(windowScreenProps)) {
19
- if (value > 0) {
20
- // The 0 values are introduced by collecting in the hidden iframe.
21
- // They are document sizes anyway so no need to test them or inject them.
22
- window[prop] = value;
23
- }
24
- }
25
- } catch (e) {
26
- console.warn(e);
27
- };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/engines/toolbelt/bypasses/webdriver_fully.js DELETED
@@ -1,27 +0,0 @@
1
- // Create a function that looks like a native getter
2
- const nativeGetter = function get webdriver() {
3
- return false;
4
- };
5
-
6
- // Copy over native function properties
7
- Object.defineProperties(nativeGetter, {
8
- name: { value: 'get webdriver', configurable: true },
9
- length: { value: 0, configurable: true },
10
- toString: {
11
- value: function() {
12
- return `function get webdriver() { [native code] }`;
13
- },
14
- configurable: true
15
- }
16
- });
17
-
18
- // Make it look native
19
- Object.setPrototypeOf(nativeGetter, Function.prototype);
20
-
21
- // Apply the modified descriptor
22
- Object.defineProperty(Navigator.prototype, 'webdriver', {
23
- get: nativeGetter,
24
- set: undefined,
25
- enumerable: true,
26
- configurable: true
27
- });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/engines/toolbelt/bypasses/window_chrome.js DELETED
@@ -1,213 +0,0 @@
1
- // To escape `HEADCHR_CHROME_OBJ` test in headless mode => https://github.com/antoinevastel/fp-collect/blob/master/src/fpCollect.js#L322
2
- // Faking window.chrome fully
3
-
4
- if (!window.chrome) {
5
- // First, save all existing properties
6
- const originalKeys = Object.getOwnPropertyNames(window);
7
- const tempObj = {};
8
-
9
- // Recreate all properties in original order
10
- for (const key of originalKeys) {
11
- const descriptor = Object.getOwnPropertyDescriptor(window, key);
12
- const value = window[key];
13
- // delete window[key];
14
- Object.defineProperty(tempObj, key, descriptor);
15
- }
16
-
17
- // Use the exact property descriptor found in headful Chrome
18
- // fetch it via `Object.getOwnPropertyDescriptor(window, 'chrome')`
19
- const mockChrome = {
20
- loadTimes: {},
21
- csi: {},
22
- app: {
23
- isInstalled: false
24
- },
25
- // Add other Chrome-specific properties
26
- };
27
-
28
- Object.defineProperty(tempObj, 'chrome', {
29
- writable: true,
30
- enumerable: true,
31
- configurable: false,
32
- value: mockChrome
33
- });
34
- for (const key of Object.getOwnPropertyNames(tempObj)) {
35
- try {
36
- Object.defineProperty(window, key,
37
- Object.getOwnPropertyDescriptor(tempObj, key));
38
- } catch (e) {}
39
- };
40
- // todo: solve this
41
- // Using line below bypasses the hasHighChromeIndex test in creepjs ==> https://github.com/abrahamjuliot/creepjs/blob/master/src/headless/index.ts#L121
42
- // Chrome object have to be in the end of the window properties
43
- // Object.assign(window, tempObj);
44
- // But makes window.chrome unreadable on 'https://bot.sannysoft.com/'
45
- }
46
-
47
- // That means we're running headful and don't need to mock anything
48
- if ('app' in window.chrome) {
49
- return; // Nothing to do here
50
- }
51
- const makeError = {
52
- ErrorInInvocation: fn => {
53
- const err = new TypeError(`Error in invocation of app.${fn}()`);
54
- return utils.stripErrorWithAnchor(
55
- err,
56
- `at ${fn} (eval at <anonymous>`,
57
- );
58
- },
59
- };
60
- // check with: `JSON.stringify(window.chrome['app'])`
61
- const STATIC_DATA = JSON.parse(
62
- `
63
- {
64
- "isInstalled": false,
65
- "InstallState": {
66
- "DISABLED": "disabled",
67
- "INSTALLED": "installed",
68
- "NOT_INSTALLED": "not_installed"
69
- },
70
- "RunningState": {
71
- "CANNOT_RUN": "cannot_run",
72
- "READY_TO_RUN": "ready_to_run",
73
- "RUNNING": "running"
74
- }
75
- }
76
- `.trim(),
77
- );
78
- window.chrome.app = {
79
- ...STATIC_DATA,
80
-
81
- get isInstalled() {
82
- return false;
83
- },
84
-
85
- getDetails: function getDetails() {
86
- if (arguments.length) {
87
- throw makeError.ErrorInInvocation(`getDetails`);
88
- }
89
- return null;
90
- },
91
- getIsInstalled: function getDetails() {
92
- if (arguments.length) {
93
- throw makeError.ErrorInInvocation(`getIsInstalled`);
94
- }
95
- return false;
96
- },
97
- runningState: function getDetails() {
98
- if (arguments.length) {
99
- throw makeError.ErrorInInvocation(`runningState`);
100
- }
101
- return 'cannot_run';
102
- },
103
- };
104
- // Check that the Navigation Timing API v1 is available, we need that
105
- if (!window.performance || !window.performance.timing) {
106
- return;
107
- }
108
- const {timing} = window.performance;
109
- window.chrome.csi = function () {
110
- return {
111
- onloadT: timing.domContentLoadedEventEnd,
112
- startE: timing.navigationStart,
113
- pageT: Date.now() - timing.navigationStart,
114
- tran: 15, // Transition type or something
115
- };
116
- };
117
- if (!window.PerformancePaintTiming){
118
- return;
119
- }
120
- const {performance} = window;
121
- // Some stuff is not available on about:blank as it requires a navigation to occur,
122
- // let's harden the code to not fail then:
123
- const ntEntryFallback = {
124
- nextHopProtocol: 'h2',
125
- type: 'other',
126
- };
127
-
128
- // The API exposes some funky info regarding the connection
129
- const protocolInfo = {
130
- get connectionInfo() {
131
- const ntEntry =
132
- performance.getEntriesByType('navigation')[0] || ntEntryFallback;
133
- return ntEntry.nextHopProtocol;
134
- },
135
- get npnNegotiatedProtocol() {
136
- // NPN is deprecated in favor of ALPN, but this implementation returns the
137
- // HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.
138
- const ntEntry =
139
- performance.getEntriesByType('navigation')[0] || ntEntryFallback;
140
- return ['h2', 'hq'].includes(ntEntry.nextHopProtocol)
141
- ? ntEntry.nextHopProtocol
142
- : 'unknown';
143
- },
144
- get navigationType() {
145
- const ntEntry =
146
- performance.getEntriesByType('navigation')[0] || ntEntryFallback;
147
- return ntEntry.type;
148
- },
149
- get wasAlternateProtocolAvailable() {
150
- // The Alternate-Protocol header is deprecated in favor of Alt-Svc
151
- // (https://www.mnot.net/blog/2016/03/09/alt-svc), so technically this
152
- // should always return false.
153
- return false;
154
- },
155
- get wasFetchedViaSpdy() {
156
- // SPDY is deprecated in favor of HTTP/2, but this implementation returns
157
- // true for HTTP/2 or HTTP2+QUIC/39 as well.
158
- const ntEntry =
159
- performance.getEntriesByType('navigation')[0] || ntEntryFallback;
160
- return ['h2', 'hq'].includes(ntEntry.nextHopProtocol);
161
- },
162
- get wasNpnNegotiated() {
163
- // NPN is deprecated in favor of ALPN, but this implementation returns true
164
- // for HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.
165
- const ntEntry =
166
- performance.getEntriesByType('navigation')[0] || ntEntryFallback;
167
- return ['h2', 'hq'].includes(ntEntry.nextHopProtocol);
168
- },
169
- };
170
-
171
- // Truncate number to specific number of decimals, most of the `loadTimes` stuff has 3
172
- function toFixed(num, fixed) {
173
- var re = new RegExp('^-?\\d+(?:.\\d{0,' + (fixed || -1) + '})?');
174
- return num.toString().match(re)[0];
175
- }
176
-
177
- const timingInfo = {
178
- get firstPaintAfterLoadTime() {
179
- // This was never actually implemented and always returns 0.
180
- return 0;
181
- },
182
- get requestTime() {
183
- return timing.navigationStart / 1000;
184
- },
185
- get startLoadTime() {
186
- return timing.navigationStart / 1000;
187
- },
188
- get commitLoadTime() {
189
- return timing.responseStart / 1000;
190
- },
191
- get finishDocumentLoadTime() {
192
- return timing.domContentLoadedEventEnd / 1000;
193
- },
194
- get finishLoadTime() {
195
- return timing.loadEventEnd / 1000;
196
- },
197
- get firstPaintTime() {
198
- const fpEntry = performance.getEntriesByType('paint')[0] || {
199
- startTime: timing.loadEventEnd / 1000, // Fallback if no navigation occured (`about:blank`)
200
- };
201
- return toFixed(
202
- (fpEntry.startTime + performance.timeOrigin) / 1000,
203
- 3,
204
- );
205
- },
206
- };
207
-
208
- window.chrome.loadTimes = function () {
209
- return {
210
- ...protocolInfo,
211
- ...timingInfo,
212
- };
213
- };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/engines/toolbelt/navigation.py CHANGED
@@ -2,8 +2,6 @@
2
  Functions related to files and URLs
3
  """
4
 
5
- from pathlib import Path
6
- from functools import lru_cache
7
  from urllib.parse import urlparse
8
 
9
  from playwright.async_api import Route as async_Route
@@ -14,8 +12,6 @@ from scrapling.core.utils import log
14
  from scrapling.core._types import Dict, Set, Tuple, Optional, Callable
15
  from scrapling.engines.constants import EXTRA_RESOURCES
16
 
17
- __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
18
-
19
 
20
  class ProxyDict(Struct):
21
  server: str
@@ -111,13 +107,3 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple) -> Dict:
111
  raise TypeError(f"Invalid proxy dictionary: {e}")
112
 
113
  raise TypeError(f"Invalid proxy string: {proxy_string}")
114
-
115
-
116
- @lru_cache(10, typed=True)
117
- def js_bypass_path(filename: str) -> str:
118
- """Takes the base filename of a JS file inside the `bypasses` folder, then return the full path of it
119
-
120
- :param filename: The base filename of the JS file.
121
- :return: The full path of the JS file.
122
- """
123
- return str(__BYPASSES_DIR__ / filename)
 
2
  Functions related to files and URLs
3
  """
4
 
 
 
5
  from urllib.parse import urlparse
6
 
7
  from playwright.async_api import Route as async_Route
 
12
  from scrapling.core._types import Dict, Set, Tuple, Optional, Callable
13
  from scrapling.engines.constants import EXTRA_RESOURCES
14
 
 
 
15
 
16
  class ProxyDict(Struct):
17
  server: str
 
107
  raise TypeError(f"Invalid proxy dictionary: {e}")
108
 
109
  raise TypeError(f"Invalid proxy string: {proxy_string}")
 
 
 
 
 
 
 
 
 
 
tests/fetchers/test_utils.py CHANGED
@@ -1,12 +1,10 @@
1
  import pytest
2
- from pathlib import Path
3
 
4
  from scrapling.engines.toolbelt.custom import StatusText, Response
5
  from scrapling.engines.toolbelt.navigation import (
6
  construct_proxy_dict,
7
  create_intercept_handler,
8
  create_async_intercept_handler,
9
- js_bypass_path,
10
  )
11
  from scrapling.engines.toolbelt.fingerprints import (
12
  generate_convincing_referer,
@@ -203,25 +201,6 @@ class TestConstructProxyDict:
203
  construct_proxy_dict({"invalid": "structure"})
204
 
205
 
206
- class TestJsBypassPath:
207
- """Test JavaScript bypass path utility"""
208
-
209
- def test_js_bypass_path(self):
210
- """Test getting JavaScript bypass file path"""
211
- result = js_bypass_path("webdriver_fully.js")
212
-
213
- assert isinstance(result, str)
214
- assert result.endswith("webdriver_fully.js")
215
- assert Path(result).exists()
216
-
217
- def test_js_bypass_path_caching(self):
218
- """Test that js_bypass_path is cached"""
219
- result1 = js_bypass_path("webdriver_fully.js")
220
- result2 = js_bypass_path("webdriver_fully.js")
221
-
222
- assert result1 == result2
223
-
224
-
225
  class TestFingerprintFunctions:
226
  """Test fingerprint generation functions"""
227
 
 
1
  import pytest
 
2
 
3
  from scrapling.engines.toolbelt.custom import StatusText, Response
4
  from scrapling.engines.toolbelt.navigation import (
5
  construct_proxy_dict,
6
  create_intercept_handler,
7
  create_async_intercept_handler,
 
8
  )
9
  from scrapling.engines.toolbelt.fingerprints import (
10
  generate_convincing_referer,
 
201
  construct_proxy_dict({"invalid": "structure"})
202
 
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  class TestFingerprintFunctions:
205
  """Test fingerprint generation functions"""
206