Karim Shoair committed on
Commit ·
34c0fee
1
Parent(s): e07b1cd
refactor(Playwright Engine): Separate what we can for cleaner code and the async function later
Browse files
scrapling/engines/pw.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
-
from scrapling.core._types import Callable, Dict,
|
| 4 |
-
from scrapling.core.utils import log
|
| 5 |
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
|
| 6 |
NSTBROWSER_DEFAULT_QUERY)
|
| 7 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
| 8 |
check_type_validity, construct_cdp_url,
|
| 9 |
construct_proxy_dict, do_nothing,
|
|
|
|
| 10 |
generate_convincing_referer,
|
| 11 |
generate_headers, intercept_route,
|
| 12 |
js_bypass_path)
|
|
@@ -94,10 +95,8 @@ class PlaywrightEngine:
|
|
| 94 |
# '--disable-extensions',
|
| 95 |
]
|
| 96 |
|
| 97 |
-
def _cdp_url_logic(self
|
| 98 |
"""Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
|
| 99 |
-
|
| 100 |
-
:param flags: Chrome flags to be added to NSTBrowser query
|
| 101 |
:return: CDP URL
|
| 102 |
"""
|
| 103 |
cdp_url = self.cdp_url
|
|
@@ -106,7 +105,8 @@ class PlaywrightEngine:
|
|
| 106 |
config = self.nstbrowser_config
|
| 107 |
else:
|
| 108 |
query = NSTBROWSER_DEFAULT_QUERY.copy()
|
| 109 |
-
if
|
|
|
|
| 110 |
query.update({
|
| 111 |
"args": dict(zip(flags, [''] * len(flags))), # browser args should be a dictionary
|
| 112 |
})
|
|
@@ -122,6 +122,68 @@ class PlaywrightEngine:
|
|
| 122 |
|
| 123 |
return cdp_url
|
| 124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
def fetch(self, url: str) -> Response:
|
| 126 |
"""Opens up the browser and do your request based on your chosen options.
|
| 127 |
|
|
@@ -135,61 +197,14 @@ class PlaywrightEngine:
|
|
| 135 |
from rebrowser_playwright.sync_api import sync_playwright
|
| 136 |
|
| 137 |
with sync_playwright() as p:
|
| 138 |
-
# Handle the UserAgent early
|
| 139 |
-
if self.useragent:
|
| 140 |
-
extra_headers = {}
|
| 141 |
-
useragent = self.useragent
|
| 142 |
-
else:
|
| 143 |
-
extra_headers = {}
|
| 144 |
-
useragent = generate_headers(browser_mode=True).get('User-Agent')
|
| 145 |
-
|
| 146 |
-
# Prepare the flags before diving
|
| 147 |
-
flags = DEFAULT_STEALTH_FLAGS
|
| 148 |
-
if self.hide_canvas:
|
| 149 |
-
flags += ['--fingerprinting-canvas-image-data-noise']
|
| 150 |
-
if self.disable_webgl:
|
| 151 |
-
flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']
|
| 152 |
-
|
| 153 |
# Creating the browser
|
| 154 |
if self.cdp_url:
|
| 155 |
-
cdp_url = self._cdp_url_logic(
|
| 156 |
browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
|
| 157 |
else:
|
| 158 |
-
|
| 159 |
-
browser = p.chromium.launch(
|
| 160 |
-
headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
|
| 161 |
-
)
|
| 162 |
-
else:
|
| 163 |
-
browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
|
| 164 |
-
|
| 165 |
-
# Creating the context
|
| 166 |
-
if self.stealth:
|
| 167 |
-
context = browser.new_context(
|
| 168 |
-
locale=self.locale,
|
| 169 |
-
is_mobile=False,
|
| 170 |
-
has_touch=False,
|
| 171 |
-
proxy=self.proxy,
|
| 172 |
-
color_scheme='dark', # Bypasses the 'prefersLightColor' check in creepjs
|
| 173 |
-
user_agent=useragent,
|
| 174 |
-
device_scale_factor=2,
|
| 175 |
-
# I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
|
| 176 |
-
service_workers="allow",
|
| 177 |
-
ignore_https_errors=True,
|
| 178 |
-
extra_http_headers=extra_headers,
|
| 179 |
-
screen={"width": 1920, "height": 1080},
|
| 180 |
-
viewport={"width": 1920, "height": 1080},
|
| 181 |
-
permissions=["geolocation", 'notifications'],
|
| 182 |
-
)
|
| 183 |
-
else:
|
| 184 |
-
context = browser.new_context(
|
| 185 |
-
locale=self.locale,
|
| 186 |
-
proxy=self.proxy,
|
| 187 |
-
color_scheme='dark',
|
| 188 |
-
user_agent=useragent,
|
| 189 |
-
device_scale_factor=2,
|
| 190 |
-
extra_http_headers=extra_headers
|
| 191 |
-
)
|
| 192 |
|
|
|
|
| 193 |
# Finally we are in business
|
| 194 |
page = context.new_page()
|
| 195 |
page.set_default_navigation_timeout(self.timeout)
|
|
@@ -202,22 +217,8 @@ class PlaywrightEngine:
|
|
| 202 |
page.route("**/*", intercept_route)
|
| 203 |
|
| 204 |
if self.stealth:
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
# https://bot.sannysoft.com/
|
| 208 |
-
# https://kaliiiiiiiiii.github.io/brotector/
|
| 209 |
-
# https://pixelscan.net/
|
| 210 |
-
# https://iphey.com/
|
| 211 |
-
# https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
|
| 212 |
-
# https://arh.antoinevastel.com/bots/areyouheadless/
|
| 213 |
-
# https://prescience-data.github.io/execution-monitor.html
|
| 214 |
-
page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
|
| 215 |
-
page.add_init_script(path=js_bypass_path('window_chrome.js'))
|
| 216 |
-
page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
|
| 217 |
-
page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
|
| 218 |
-
page.add_init_script(path=js_bypass_path('notification_permission.js'))
|
| 219 |
-
page.add_init_script(path=js_bypass_path('screen_props.js'))
|
| 220 |
-
page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
|
| 221 |
|
| 222 |
res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
|
| 223 |
page.wait_for_load_state(state="domcontentloaded")
|
|
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
+
from scrapling.core._types import Callable, Dict, Optional, Union
|
| 4 |
+
from scrapling.core.utils import log, lru_cache
|
| 5 |
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
|
| 6 |
NSTBROWSER_DEFAULT_QUERY)
|
| 7 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
| 8 |
check_type_validity, construct_cdp_url,
|
| 9 |
construct_proxy_dict, do_nothing,
|
| 10 |
+
do_nothing_async,
|
| 11 |
generate_convincing_referer,
|
| 12 |
generate_headers, intercept_route,
|
| 13 |
js_bypass_path)
|
|
|
|
| 95 |
# '--disable-extensions',
|
| 96 |
]
|
| 97 |
|
| 98 |
+
def _cdp_url_logic(self) -> str:
|
| 99 |
"""Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
|
|
|
|
|
|
|
| 100 |
:return: CDP URL
|
| 101 |
"""
|
| 102 |
cdp_url = self.cdp_url
|
|
|
|
| 105 |
config = self.nstbrowser_config
|
| 106 |
else:
|
| 107 |
query = NSTBROWSER_DEFAULT_QUERY.copy()
|
| 108 |
+
if self.stealth:
|
| 109 |
+
flags = self.__set_flags()
|
| 110 |
query.update({
|
| 111 |
"args": dict(zip(flags, [''] * len(flags))), # browser args should be a dictionary
|
| 112 |
})
|
|
|
|
| 122 |
|
| 123 |
return cdp_url
|
| 124 |
|
| 125 |
+
def __set_flags(self):
    """Return the Chrome flags to use while launching the browser when stealth mode is enabled.

    :return: tuple of Chrome command-line flag strings
    """
    # NOTE(review): the previous version cached this with `@lru_cache(typed=True)`.
    # `lru_cache` on an instance method keys on `self` and keeps every engine
    # instance alive for the lifetime of the cache (ruff B019); the computation
    # below is trivial, so the cache is dropped rather than leaked.
    #
    # Copy into a fresh tuple so the shared DEFAULT_STEALTH_FLAGS constant is
    # never mutated in place (`flags += (...)` would extend it if it were a list).
    flags = tuple(DEFAULT_STEALTH_FLAGS)
    if self.hide_canvas:
        flags += ('--fingerprinting-canvas-image-data-noise',)
    if self.disable_webgl:
        flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2',)

    return flags
|
| 135 |
+
|
| 136 |
+
def __launch_kwargs(self):
    """Build the keyword arguments used while launching playwright's browser."""
    browser_channel = 'chrome' if self.real_chrome else 'chromium'
    launch_kwargs = {
        'headless': self.headless,
        'ignore_default_args': self.harmful_default_args,
        'channel': browser_channel,
    }
    if self.stealth:
        # Stealth mode adds the anti-detection flags and enables the sandbox.
        launch_kwargs['args'] = self.__set_flags()
        launch_kwargs['chromium_sandbox'] = True

    return launch_kwargs
|
| 143 |
+
|
| 144 |
+
def __context_kwargs(self):
    """Build the keyword arguments for creating the browser context."""
    if self.useragent:
        agent = self.useragent
    else:
        agent = generate_headers(browser_mode=True).get('User-Agent')

    context_kwargs = {
        "proxy": self.proxy,
        "locale": self.locale,
        "color_scheme": 'dark',  # Bypasses the 'prefersLightColor' check in creepjs
        "device_scale_factor": 2,
        "extra_http_headers": self.extra_headers if self.extra_headers else {},
        "user_agent": agent,
    }
    if self.stealth:
        stealth_kwargs = {
            'is_mobile': False,
            'has_touch': False,
            # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
            'service_workers': 'allow',
            'ignore_https_errors': True,
            'screen': {'width': 1920, 'height': 1080},
            'viewport': {'width': 1920, 'height': 1080},
            'permissions': ['geolocation', 'notifications'],
        }
        context_kwargs.update(stealth_kwargs)

    return context_kwargs
|
| 167 |
+
|
| 168 |
+
def __stealth_scripts(self):
    """Return the paths of the JS bypass scripts injected when stealth mode is enabled.

    :return: tuple of filesystem paths, in injection order
    """
    # NOTE(review): the previous version decorated this with `@lru_cache()`.
    # An lru_cache on an instance method keys on `self` and keeps every engine
    # instance alive for the cache's lifetime (ruff B019); building this small
    # tuple per call is cheap, so the cache is dropped rather than leaked.
    #
    # Basic bypasses nothing fancy as I'm still working on it
    # But with adding these bypasses to the above config, it bypasses many online tests like
    # https://bot.sannysoft.com/
    # https://kaliiiiiiiiii.github.io/brotector/
    # https://pixelscan.net/
    # https://iphey.com/
    # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
    # https://arh.antoinevastel.com/bots/areyouheadless/
    # https://prescience-data.github.io/execution-monitor.html
    return tuple(
        js_bypass_path(script) for script in (
            # Order is important
            'webdriver_fully.js', 'window_chrome.js', 'navigator_plugins.js', 'pdf_viewer.js',
            'notification_permission.js', 'screen_props.js', 'playwright_fingerprint.js'
        )
    )
|
| 186 |
+
|
| 187 |
def fetch(self, url: str) -> Response:
|
| 188 |
"""Opens up the browser and do your request based on your chosen options.
|
| 189 |
|
|
|
|
| 197 |
from rebrowser_playwright.sync_api import sync_playwright
|
| 198 |
|
| 199 |
with sync_playwright() as p:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
# Creating the browser
|
| 201 |
if self.cdp_url:
|
| 202 |
+
cdp_url = self._cdp_url_logic()
|
| 203 |
browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
|
| 204 |
else:
|
| 205 |
+
browser = p.chromium.launch(**self.__launch_kwargs())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
+
context = browser.new_context(**self.__context_kwargs())
|
| 208 |
# Finally we are in business
|
| 209 |
page = context.new_page()
|
| 210 |
page.set_default_navigation_timeout(self.timeout)
|
|
|
|
| 217 |
page.route("**/*", intercept_route)
|
| 218 |
|
| 219 |
if self.stealth:
|
| 220 |
+
for script in self.__stealth_scripts():
|
| 221 |
+
page.add_init_script(path=script)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
|
| 224 |
page.wait_for_load_state(state="domcontentloaded")
|
scrapling/engines/toolbelt/__init__.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
|
| 2 |
-
check_type_validity, do_nothing,
|
|
|
|
| 3 |
from .fingerprints import (generate_convincing_referer, generate_headers,
|
| 4 |
get_os_name)
|
| 5 |
from .navigation import (construct_cdp_url, construct_proxy_dict,
|
|
|
|
| 1 |
from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
|
| 2 |
+
check_type_validity, do_nothing, do_nothing_async,
|
| 3 |
+
get_variable_name)
|
| 4 |
from .fingerprints import (generate_convincing_referer, generate_headers,
|
| 5 |
get_os_name)
|
| 6 |
from .navigation import (construct_cdp_url, construct_proxy_dict,
|
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -302,3 +302,8 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
|
|
| 302 |
def do_nothing(page):
|
| 303 |
# Just works as a filler for `page_action` argument in browser engines
|
| 304 |
return page
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
def do_nothing(page):
    """No-op filler for the `page_action` argument in browser engines; returns the page unchanged."""
    return page
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
async def do_nothing_async(page):
    """Async no-op filler for the `page_action` argument in browser engines; returns the page unchanged."""
    return page
|