Karim shoair commited on
Commit
cf06b6e
·
1 Parent(s): 130d1d8

fix(stealth): improve stealth mode by removing unnecessary scripts

Browse files

These scripts don't add anything to the table anymore (it might turn out I was wrong, and I'll add them back again lol)

MANIFEST.in CHANGED
@@ -1,7 +1,6 @@
1
  include LICENSE
2
  include *.db
3
  include *.js
4
- include scrapling/engines/toolbelt/bypasses/*.js
5
  include scrapling/*.db
6
  include scrapling/*.db*
7
  include scrapling/*.db-*
 
1
  include LICENSE
2
  include *.db
3
  include *.js
 
4
  include scrapling/*.db
5
  include scrapling/*.db*
6
  include scrapling/*.db-*
scrapling/engines/_browsers/_config_tools.py CHANGED
@@ -1,38 +1,4 @@
1
- from functools import lru_cache
2
-
3
- from scrapling.engines.toolbelt.navigation import js_bypass_path
4
  from scrapling.engines.toolbelt.fingerprints import generate_headers
5
 
6
  __default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")
7
  __default_chrome_useragent__ = generate_headers(browser_mode="chrome").get("User-Agent")
8
-
9
-
10
- @lru_cache(1)
11
- def _compiled_stealth_scripts():
12
- """Pre-read and compile stealth scripts"""
13
- # Basic bypasses nothing fancy as I'm still working on it
14
- # But with adding these bypasses to the above config, it bypasses many online tests like
15
- # https://bot.sannysoft.com/
16
- # https://kaliiiiiiiiii.github.io/brotector/
17
- # https://pixelscan.net/
18
- # https://iphey.com/
19
- # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
20
- # https://arh.antoinevastel.com/bots/areyouheadless/
21
- # https://prescience-data.github.io/execution-monitor.html
22
- stealth_scripts_paths = tuple(
23
- js_bypass_path(script)
24
- for script in (
25
- # Order is important
26
- "webdriver_fully.js",
27
- "window_chrome.js",
28
- "navigator_plugins.js",
29
- "notification_permission.js",
30
- "screen_props.js",
31
- "playwright_fingerprint.js",
32
- )
33
- )
34
- scripts = []
35
- for script_path in stealth_scripts_paths:
36
- with open(script_path, "r") as f:
37
- scripts.append(f.read())
38
- return tuple(scripts)
 
 
 
 
1
  from scrapling.engines.toolbelt.fingerprints import generate_headers
2
 
3
  __default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")
4
  __default_chrome_useragent__ = generate_headers(browser_mode="chrome").get("User-Agent")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/engines/_browsers/_stealth.py CHANGED
@@ -17,7 +17,6 @@ from scrapling.core._types import Any, Optional, ProxyType, Unpack
17
  from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
18
  from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
19
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
20
- from scrapling.engines._browsers._config_tools import _compiled_stealth_scripts
21
  from scrapling.engines._browsers._types import StealthSession, StealthFetchParams
22
  from scrapling.engines._browsers._base import SyncSession, AsyncSession, StealthySessionMixin
23
  from scrapling.engines._browsers._validators import validate_fetch as _validate, StealthConfig
@@ -109,14 +108,6 @@ class StealthySession(SyncSession, StealthySessionMixin):
109
  else:
110
  raise RuntimeError("Session has been already started")
111
 
112
- def _initialize_context(self, config, ctx: BrowserContext) -> BrowserContext:
113
- """Initialize the browser context."""
114
- for script in _compiled_stealth_scripts():
115
- ctx.add_init_script(script=script)
116
-
117
- ctx = super()._initialize_context(config, ctx)
118
- return ctx
119
-
120
  def _cloudflare_solver(self, page: Page) -> None: # pragma: no cover
121
  """Solve the cloudflare challenge displayed on the playwright page passed
122
 
@@ -372,14 +363,6 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
372
  else:
373
  raise RuntimeError("Session has been already started")
374
 
375
- async def _initialize_context(self, config: Any, ctx: AsyncBrowserContext) -> AsyncBrowserContext:
376
- """Initialize the browser context."""
377
- for script in _compiled_stealth_scripts():
378
- await ctx.add_init_script(script=script)
379
-
380
- ctx = await super()._initialize_context(config, ctx)
381
- return ctx
382
-
383
  async def _cloudflare_solver(self, page: async_Page) -> None: # pragma: no cover
384
  """Solve the cloudflare challenge displayed on the playwright page passed
385
 
 
17
  from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
18
  from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
19
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
 
20
  from scrapling.engines._browsers._types import StealthSession, StealthFetchParams
21
  from scrapling.engines._browsers._base import SyncSession, AsyncSession, StealthySessionMixin
22
  from scrapling.engines._browsers._validators import validate_fetch as _validate, StealthConfig
 
108
  else:
109
  raise RuntimeError("Session has been already started")
110
 
 
 
 
 
 
 
 
 
111
  def _cloudflare_solver(self, page: Page) -> None: # pragma: no cover
112
  """Solve the cloudflare challenge displayed on the playwright page passed
113
 
 
363
  else:
364
  raise RuntimeError("Session has been already started")
365
 
 
 
 
 
 
 
 
 
366
  async def _cloudflare_solver(self, page: async_Page) -> None: # pragma: no cover
367
  """Solve the cloudflare challenge displayed on the playwright page passed
368
 
scrapling/engines/toolbelt/bypasses/navigator_plugins.js DELETED
@@ -1,40 +0,0 @@
1
- if(navigator.plugins.length == 0){
2
- Object.defineProperty(navigator, 'plugins', {
3
- get: () => {
4
- const PDFViewerPlugin = Object.create(Plugin.prototype, {
5
- description: { value: 'Portable Document Format', enumerable: false },
6
- filename: { value: 'internal-pdf-viewer', enumerable: false },
7
- name: { value: 'PDF Viewer', enumerable: false },
8
- });
9
- const ChromePDFViewer = Object.create(Plugin.prototype, {
10
- description: { value: 'Portable Document Format', enumerable: false },
11
- filename: { value: 'internal-pdf-viewer', enumerable: false },
12
- name: { value: 'Chrome PDF Viewer', enumerable: false },
13
- });
14
- const ChromiumPDFViewer = Object.create(Plugin.prototype, {
15
- description: { value: 'Portable Document Format', enumerable: false },
16
- filename: { value: 'internal-pdf-viewer', enumerable: false },
17
- name: { value: 'Chromium PDF Viewer', enumerable: false },
18
- });
19
- const EdgePDFViewer = Object.create(Plugin.prototype, {
20
- description: { value: 'Portable Document Format', enumerable: false },
21
- filename: { value: 'internal-pdf-viewer', enumerable: false },
22
- name: { value: 'Microsoft Edge PDF Viewer', enumerable: false },
23
- });
24
- const WebKitPDFPlugin = Object.create(Plugin.prototype, {
25
- description: { value: 'Portable Document Format', enumerable: false },
26
- filename: { value: 'internal-pdf-viewer', enumerable: false },
27
- name: { value: 'WebKit built-in PDF', enumerable: false },
28
- });
29
-
30
- return Object.create(PluginArray.prototype, {
31
- length: { value: 5 },
32
- 0: { value: PDFViewerPlugin },
33
- 1: { value: ChromePDFViewer },
34
- 2: { value: ChromiumPDFViewer },
35
- 3: { value: EdgePDFViewer },
36
- 4: { value: WebKitPDFPlugin },
37
- });
38
- },
39
- });
40
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/engines/toolbelt/bypasses/notification_permission.js DELETED
@@ -1,5 +0,0 @@
1
- // Bypasses `notificationIsDenied` test in creepsjs's 'Like Headless' sections
2
- const isSecure = document.location.protocol.startsWith('https')
3
- if (isSecure){
4
- Object.defineProperty(Notification, 'permission', {get: () => 'default'})
5
- }
 
 
 
 
 
 
scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js DELETED
@@ -1,3 +0,0 @@
1
- // Remove playwright fingerprint => https://github.com/microsoft/playwright/commit/c9e673c6dca746384338ab6bb0cf63c7e7caa9b2#diff-087773eea292da9db5a3f27de8f1a2940cdb895383ad750c3cd8e01772a35b40R915
2
- delete window.__pwInitScripts;
3
- delete window.__playwright__binding__;
 
 
 
 
scrapling/engines/toolbelt/bypasses/screen_props.js DELETED
@@ -1,27 +0,0 @@
1
- const windowScreenProps = {
2
- // Dimensions
3
- innerHeight: 0,
4
- innerWidth: 0,
5
- outerHeight: 754,
6
- outerWidth: 1313,
7
-
8
- // Position
9
- screenX: 19,
10
- pageXOffset: 0,
11
- pageYOffset: 0,
12
-
13
- // Display
14
- devicePixelRatio: 2
15
- };
16
-
17
- try {
18
- for (const [prop, value] of Object.entries(windowScreenProps)) {
19
- if (value > 0) {
20
- // The 0 values are introduced by collecting in the hidden iframe.
21
- // They are document sizes anyway so no need to test them or inject them.
22
- window[prop] = value;
23
- }
24
- }
25
- } catch (e) {
26
- console.warn(e);
27
- };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/engines/toolbelt/bypasses/webdriver_fully.js DELETED
@@ -1,27 +0,0 @@
1
- // Create a function that looks like a native getter
2
- const nativeGetter = function get webdriver() {
3
- return false;
4
- };
5
-
6
- // Copy over native function properties
7
- Object.defineProperties(nativeGetter, {
8
- name: { value: 'get webdriver', configurable: true },
9
- length: { value: 0, configurable: true },
10
- toString: {
11
- value: function() {
12
- return `function get webdriver() { [native code] }`;
13
- },
14
- configurable: true
15
- }
16
- });
17
-
18
- // Make it look native
19
- Object.setPrototypeOf(nativeGetter, Function.prototype);
20
-
21
- // Apply the modified descriptor
22
- Object.defineProperty(Navigator.prototype, 'webdriver', {
23
- get: nativeGetter,
24
- set: undefined,
25
- enumerable: true,
26
- configurable: true
27
- });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/engines/toolbelt/bypasses/window_chrome.js DELETED
@@ -1,213 +0,0 @@
1
- // To escape `HEADCHR_CHROME_OBJ` test in headless mode => https://github.com/antoinevastel/fp-collect/blob/master/src/fpCollect.js#L322
2
- // Faking window.chrome fully
3
-
4
- if (!window.chrome) {
5
- // First, save all existing properties
6
- const originalKeys = Object.getOwnPropertyNames(window);
7
- const tempObj = {};
8
-
9
- // Recreate all properties in original order
10
- for (const key of originalKeys) {
11
- const descriptor = Object.getOwnPropertyDescriptor(window, key);
12
- const value = window[key];
13
- // delete window[key];
14
- Object.defineProperty(tempObj, key, descriptor);
15
- }
16
-
17
- // Use the exact property descriptor found in headful Chrome
18
- // fetch it via `Object.getOwnPropertyDescriptor(window, 'chrome')`
19
- const mockChrome = {
20
- loadTimes: {},
21
- csi: {},
22
- app: {
23
- isInstalled: false
24
- },
25
- // Add other Chrome-specific properties
26
- };
27
-
28
- Object.defineProperty(tempObj, 'chrome', {
29
- writable: true,
30
- enumerable: true,
31
- configurable: false,
32
- value: mockChrome
33
- });
34
- for (const key of Object.getOwnPropertyNames(tempObj)) {
35
- try {
36
- Object.defineProperty(window, key,
37
- Object.getOwnPropertyDescriptor(tempObj, key));
38
- } catch (e) {}
39
- };
40
- // todo: solve this
41
- // Using line below bypasses the hasHighChromeIndex test in creepjs ==> https://github.com/abrahamjuliot/creepjs/blob/master/src/headless/index.ts#L121
42
- // Chrome object have to be in the end of the window properties
43
- // Object.assign(window, tempObj);
44
- // But makes window.chrome unreadable on 'https://bot.sannysoft.com/'
45
- }
46
-
47
- // That means we're running headful and don't need to mock anything
48
- if ('app' in window.chrome) {
49
- return; // Nothing to do here
50
- }
51
- const makeError = {
52
- ErrorInInvocation: fn => {
53
- const err = new TypeError(`Error in invocation of app.${fn}()`);
54
- return utils.stripErrorWithAnchor(
55
- err,
56
- `at ${fn} (eval at <anonymous>`,
57
- );
58
- },
59
- };
60
- // check with: `JSON.stringify(window.chrome['app'])`
61
- const STATIC_DATA = JSON.parse(
62
- `
63
- {
64
- "isInstalled": false,
65
- "InstallState": {
66
- "DISABLED": "disabled",
67
- "INSTALLED": "installed",
68
- "NOT_INSTALLED": "not_installed"
69
- },
70
- "RunningState": {
71
- "CANNOT_RUN": "cannot_run",
72
- "READY_TO_RUN": "ready_to_run",
73
- "RUNNING": "running"
74
- }
75
- }
76
- `.trim(),
77
- );
78
- window.chrome.app = {
79
- ...STATIC_DATA,
80
-
81
- get isInstalled() {
82
- return false;
83
- },
84
-
85
- getDetails: function getDetails() {
86
- if (arguments.length) {
87
- throw makeError.ErrorInInvocation(`getDetails`);
88
- }
89
- return null;
90
- },
91
- getIsInstalled: function getDetails() {
92
- if (arguments.length) {
93
- throw makeError.ErrorInInvocation(`getIsInstalled`);
94
- }
95
- return false;
96
- },
97
- runningState: function getDetails() {
98
- if (arguments.length) {
99
- throw makeError.ErrorInInvocation(`runningState`);
100
- }
101
- return 'cannot_run';
102
- },
103
- };
104
- // Check that the Navigation Timing API v1 is available, we need that
105
- if (!window.performance || !window.performance.timing) {
106
- return;
107
- }
108
- const {timing} = window.performance;
109
- window.chrome.csi = function () {
110
- return {
111
- onloadT: timing.domContentLoadedEventEnd,
112
- startE: timing.navigationStart,
113
- pageT: Date.now() - timing.navigationStart,
114
- tran: 15, // Transition type or something
115
- };
116
- };
117
- if (!window.PerformancePaintTiming){
118
- return;
119
- }
120
- const {performance} = window;
121
- // Some stuff is not available on about:blank as it requires a navigation to occur,
122
- // let's harden the code to not fail then:
123
- const ntEntryFallback = {
124
- nextHopProtocol: 'h2',
125
- type: 'other',
126
- };
127
-
128
- // The API exposes some funky info regarding the connection
129
- const protocolInfo = {
130
- get connectionInfo() {
131
- const ntEntry =
132
- performance.getEntriesByType('navigation')[0] || ntEntryFallback;
133
- return ntEntry.nextHopProtocol;
134
- },
135
- get npnNegotiatedProtocol() {
136
- // NPN is deprecated in favor of ALPN, but this implementation returns the
137
- // HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.
138
- const ntEntry =
139
- performance.getEntriesByType('navigation')[0] || ntEntryFallback;
140
- return ['h2', 'hq'].includes(ntEntry.nextHopProtocol)
141
- ? ntEntry.nextHopProtocol
142
- : 'unknown';
143
- },
144
- get navigationType() {
145
- const ntEntry =
146
- performance.getEntriesByType('navigation')[0] || ntEntryFallback;
147
- return ntEntry.type;
148
- },
149
- get wasAlternateProtocolAvailable() {
150
- // The Alternate-Protocol header is deprecated in favor of Alt-Svc
151
- // (https://www.mnot.net/blog/2016/03/09/alt-svc), so technically this
152
- // should always return false.
153
- return false;
154
- },
155
- get wasFetchedViaSpdy() {
156
- // SPDY is deprecated in favor of HTTP/2, but this implementation returns
157
- // true for HTTP/2 or HTTP2+QUIC/39 as well.
158
- const ntEntry =
159
- performance.getEntriesByType('navigation')[0] || ntEntryFallback;
160
- return ['h2', 'hq'].includes(ntEntry.nextHopProtocol);
161
- },
162
- get wasNpnNegotiated() {
163
- // NPN is deprecated in favor of ALPN, but this implementation returns true
164
- // for HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.
165
- const ntEntry =
166
- performance.getEntriesByType('navigation')[0] || ntEntryFallback;
167
- return ['h2', 'hq'].includes(ntEntry.nextHopProtocol);
168
- },
169
- };
170
-
171
- // Truncate number to specific number of decimals, most of the `loadTimes` stuff has 3
172
- function toFixed(num, fixed) {
173
- var re = new RegExp('^-?\\d+(?:.\\d{0,' + (fixed || -1) + '})?');
174
- return num.toString().match(re)[0];
175
- }
176
-
177
- const timingInfo = {
178
- get firstPaintAfterLoadTime() {
179
- // This was never actually implemented and always returns 0.
180
- return 0;
181
- },
182
- get requestTime() {
183
- return timing.navigationStart / 1000;
184
- },
185
- get startLoadTime() {
186
- return timing.navigationStart / 1000;
187
- },
188
- get commitLoadTime() {
189
- return timing.responseStart / 1000;
190
- },
191
- get finishDocumentLoadTime() {
192
- return timing.domContentLoadedEventEnd / 1000;
193
- },
194
- get finishLoadTime() {
195
- return timing.loadEventEnd / 1000;
196
- },
197
- get firstPaintTime() {
198
- const fpEntry = performance.getEntriesByType('paint')[0] || {
199
- startTime: timing.loadEventEnd / 1000, // Fallback if no navigation occured (`about:blank`)
200
- };
201
- return toFixed(
202
- (fpEntry.startTime + performance.timeOrigin) / 1000,
203
- 3,
204
- );
205
- },
206
- };
207
-
208
- window.chrome.loadTimes = function () {
209
- return {
210
- ...protocolInfo,
211
- ...timingInfo,
212
- };
213
- };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/engines/toolbelt/navigation.py CHANGED
@@ -2,8 +2,6 @@
2
  Functions related to files and URLs
3
  """
4
 
5
- from pathlib import Path
6
- from functools import lru_cache
7
  from urllib.parse import urlparse
8
 
9
  from playwright.async_api import Route as async_Route
@@ -14,8 +12,6 @@ from scrapling.core.utils import log
14
  from scrapling.core._types import Dict, Set, Tuple, Optional, Callable
15
  from scrapling.engines.constants import EXTRA_RESOURCES
16
 
17
- __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
18
-
19
 
20
  class ProxyDict(Struct):
21
  server: str
@@ -111,13 +107,3 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple) -> Dict:
111
  raise TypeError(f"Invalid proxy dictionary: {e}")
112
 
113
  raise TypeError(f"Invalid proxy string: {proxy_string}")
114
-
115
-
116
- @lru_cache(10, typed=True)
117
- def js_bypass_path(filename: str) -> str:
118
- """Takes the base filename of a JS file inside the `bypasses` folder, then return the full path of it
119
-
120
- :param filename: The base filename of the JS file.
121
- :return: The full path of the JS file.
122
- """
123
- return str(__BYPASSES_DIR__ / filename)
 
2
  Functions related to files and URLs
3
  """
4
 
 
 
5
  from urllib.parse import urlparse
6
 
7
  from playwright.async_api import Route as async_Route
 
12
  from scrapling.core._types import Dict, Set, Tuple, Optional, Callable
13
  from scrapling.engines.constants import EXTRA_RESOURCES
14
 
 
 
15
 
16
  class ProxyDict(Struct):
17
  server: str
 
107
  raise TypeError(f"Invalid proxy dictionary: {e}")
108
 
109
  raise TypeError(f"Invalid proxy string: {proxy_string}")
 
 
 
 
 
 
 
 
 
 
tests/fetchers/test_utils.py CHANGED
@@ -1,12 +1,10 @@
1
  import pytest
2
- from pathlib import Path
3
 
4
  from scrapling.engines.toolbelt.custom import StatusText, Response
5
  from scrapling.engines.toolbelt.navigation import (
6
  construct_proxy_dict,
7
  create_intercept_handler,
8
  create_async_intercept_handler,
9
- js_bypass_path,
10
  )
11
  from scrapling.engines.toolbelt.fingerprints import (
12
  generate_convincing_referer,
@@ -203,25 +201,6 @@ class TestConstructProxyDict:
203
  construct_proxy_dict({"invalid": "structure"})
204
 
205
 
206
- class TestJsBypassPath:
207
- """Test JavaScript bypass path utility"""
208
-
209
- def test_js_bypass_path(self):
210
- """Test getting JavaScript bypass file path"""
211
- result = js_bypass_path("webdriver_fully.js")
212
-
213
- assert isinstance(result, str)
214
- assert result.endswith("webdriver_fully.js")
215
- assert Path(result).exists()
216
-
217
- def test_js_bypass_path_caching(self):
218
- """Test that js_bypass_path is cached"""
219
- result1 = js_bypass_path("webdriver_fully.js")
220
- result2 = js_bypass_path("webdriver_fully.js")
221
-
222
- assert result1 == result2
223
-
224
-
225
  class TestFingerprintFunctions:
226
  """Test fingerprint generation functions"""
227
 
 
1
  import pytest
 
2
 
3
  from scrapling.engines.toolbelt.custom import StatusText, Response
4
  from scrapling.engines.toolbelt.navigation import (
5
  construct_proxy_dict,
6
  create_intercept_handler,
7
  create_async_intercept_handler,
 
8
  )
9
  from scrapling.engines.toolbelt.fingerprints import (
10
  generate_convincing_referer,
 
201
  construct_proxy_dict({"invalid": "structure"})
202
 
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  class TestFingerprintFunctions:
205
  """Test fingerprint generation functions"""
206