Karim shoair committed on
Commit
34c0fee
·
1 Parent(s): e07b1cd

refactor(Playwright Engine): Separate what we can for cleaner code and the async function later

Browse files
scrapling/engines/pw.py CHANGED
@@ -1,12 +1,13 @@
1
  import json
2
 
3
- from scrapling.core._types import Callable, Dict, List, Optional, Union
4
- from scrapling.core.utils import log
5
  from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
6
  NSTBROWSER_DEFAULT_QUERY)
7
  from scrapling.engines.toolbelt import (Response, StatusText,
8
  check_type_validity, construct_cdp_url,
9
  construct_proxy_dict, do_nothing,
 
10
  generate_convincing_referer,
11
  generate_headers, intercept_route,
12
  js_bypass_path)
@@ -94,10 +95,8 @@ class PlaywrightEngine:
94
  # '--disable-extensions',
95
  ]
96
 
97
- def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
98
  """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
99
-
100
- :param flags: Chrome flags to be added to NSTBrowser query
101
  :return: CDP URL
102
  """
103
  cdp_url = self.cdp_url
@@ -106,7 +105,8 @@ class PlaywrightEngine:
106
  config = self.nstbrowser_config
107
  else:
108
  query = NSTBROWSER_DEFAULT_QUERY.copy()
109
- if flags:
 
110
  query.update({
111
  "args": dict(zip(flags, [''] * len(flags))), # browser args should be a dictionary
112
  })
@@ -122,6 +122,68 @@ class PlaywrightEngine:
122
 
123
  return cdp_url
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  def fetch(self, url: str) -> Response:
126
  """Opens up the browser and do your request based on your chosen options.
127
 
@@ -135,61 +197,14 @@ class PlaywrightEngine:
135
  from rebrowser_playwright.sync_api import sync_playwright
136
 
137
  with sync_playwright() as p:
138
- # Handle the UserAgent early
139
- if self.useragent:
140
- extra_headers = {}
141
- useragent = self.useragent
142
- else:
143
- extra_headers = {}
144
- useragent = generate_headers(browser_mode=True).get('User-Agent')
145
-
146
- # Prepare the flags before diving
147
- flags = DEFAULT_STEALTH_FLAGS
148
- if self.hide_canvas:
149
- flags += ['--fingerprinting-canvas-image-data-noise']
150
- if self.disable_webgl:
151
- flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']
152
-
153
  # Creating the browser
154
  if self.cdp_url:
155
- cdp_url = self._cdp_url_logic(flags if self.stealth else None)
156
  browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
157
  else:
158
- if self.stealth:
159
- browser = p.chromium.launch(
160
- headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
161
- )
162
- else:
163
- browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
164
-
165
- # Creating the context
166
- if self.stealth:
167
- context = browser.new_context(
168
- locale=self.locale,
169
- is_mobile=False,
170
- has_touch=False,
171
- proxy=self.proxy,
172
- color_scheme='dark', # Bypasses the 'prefersLightColor' check in creepjs
173
- user_agent=useragent,
174
- device_scale_factor=2,
175
- # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
176
- service_workers="allow",
177
- ignore_https_errors=True,
178
- extra_http_headers=extra_headers,
179
- screen={"width": 1920, "height": 1080},
180
- viewport={"width": 1920, "height": 1080},
181
- permissions=["geolocation", 'notifications'],
182
- )
183
- else:
184
- context = browser.new_context(
185
- locale=self.locale,
186
- proxy=self.proxy,
187
- color_scheme='dark',
188
- user_agent=useragent,
189
- device_scale_factor=2,
190
- extra_http_headers=extra_headers
191
- )
192
 
 
193
  # Finally we are in business
194
  page = context.new_page()
195
  page.set_default_navigation_timeout(self.timeout)
@@ -202,22 +217,8 @@ class PlaywrightEngine:
202
  page.route("**/*", intercept_route)
203
 
204
  if self.stealth:
205
- # Basic bypasses nothing fancy as I'm still working on it
206
- # But with adding these bypasses to the above config, it bypasses many online tests like
207
- # https://bot.sannysoft.com/
208
- # https://kaliiiiiiiiii.github.io/brotector/
209
- # https://pixelscan.net/
210
- # https://iphey.com/
211
- # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
212
- # https://arh.antoinevastel.com/bots/areyouheadless/
213
- # https://prescience-data.github.io/execution-monitor.html
214
- page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
215
- page.add_init_script(path=js_bypass_path('window_chrome.js'))
216
- page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
217
- page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
218
- page.add_init_script(path=js_bypass_path('notification_permission.js'))
219
- page.add_init_script(path=js_bypass_path('screen_props.js'))
220
- page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
221
 
222
  res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
223
  page.wait_for_load_state(state="domcontentloaded")
 
1
  import json
2
 
3
+ from scrapling.core._types import Callable, Dict, Optional, Union
4
+ from scrapling.core.utils import log, lru_cache
5
  from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
6
  NSTBROWSER_DEFAULT_QUERY)
7
  from scrapling.engines.toolbelt import (Response, StatusText,
8
  check_type_validity, construct_cdp_url,
9
  construct_proxy_dict, do_nothing,
10
+ do_nothing_async,
11
  generate_convincing_referer,
12
  generate_headers, intercept_route,
13
  js_bypass_path)
 
95
  # '--disable-extensions',
96
  ]
97
 
98
+ def _cdp_url_logic(self) -> str:
99
  """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
 
 
100
  :return: CDP URL
101
  """
102
  cdp_url = self.cdp_url
 
105
  config = self.nstbrowser_config
106
  else:
107
  query = NSTBROWSER_DEFAULT_QUERY.copy()
108
+ if self.stealth:
109
+ flags = self.__set_flags()
110
  query.update({
111
  "args": dict(zip(flags, [''] * len(flags))), # browser args should be a dictionary
112
  })
 
122
 
123
  return cdp_url
124
 
125
def __set_flags(self):
    """Returns the flags that will be used while launching the browser if stealth mode is enabled

    Starts from `DEFAULT_STEALTH_FLAGS`, then appends the canvas-noise flag when `self.hide_canvas` is set
    and the WebGL-disabling flags when `self.disable_webgl` is set.

    :return: A new tuple of Chrome command-line flags
    """
    # Deliberately not memoised: the result depends on mutable instance state, and an `lru_cache` on a
    # method keys on `self`, which keeps every instance alive and can hand back stale flags.
    # Copy into a fresh tuple so `+=` can never extend the shared constant in place (it would if it were a list).
    flags = tuple(DEFAULT_STEALTH_FLAGS)
    if self.hide_canvas:
        flags += ('--fingerprinting-canvas-image-data-noise',)
    if self.disable_webgl:
        flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2',)

    return flags
135
+
136
def __launch_kwargs(self):
    """Build the keyword arguments used to launch playwright's browser.

    `headless`, `ignore_default_args` and `channel` are always set; `args` and `chromium_sandbox`
    are added only when stealth mode is enabled.

    :return: Dictionary of launch options
    """
    options = dict(
        headless=self.headless,
        ignore_default_args=self.harmful_default_args,
        channel='chrome' if self.real_chrome else 'chromium',
    )
    if self.stealth:
        # Stealth mode launches with the stealth flags and the Chromium sandbox turned on
        options['args'] = self.__set_flags()
        options['chromium_sandbox'] = True

    return options
143
+
144
def __context_kwargs(self):
    """Build the keyword arguments used to create the browser context.

    The base options are always present; the stealth-only options are merged in when `self.stealth` is truthy.
    A User-Agent is generated with `generate_headers` only when `self.useragent` is not set.

    :return: Dictionary of context options
    """
    options = dict(
        proxy=self.proxy,
        locale=self.locale,
        color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
        device_scale_factor=2,
        extra_http_headers=self.extra_headers or {},
        user_agent=self.useragent or generate_headers(browser_mode=True).get('User-Agent'),
    )
    if not self.stealth:
        return options

    options['is_mobile'] = False
    options['has_touch'] = False
    # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
    options['service_workers'] = 'allow'
    options['ignore_https_errors'] = True
    options['screen'] = {'width': 1920, 'height': 1080}
    options['viewport'] = {'width': 1920, 'height': 1080}
    options['permissions'] = ['geolocation', 'notifications']
    return options
167
+
168
@lru_cache()
def __stealth_scripts(self):
    """Return the paths of the stealth bypass scripts, in the order they have to be added to the page.

    NOTE(review): `lru_cache` on a method keys the cache on `self`, so cached instances stay referenced
    for the lifetime of the cache - confirm this is acceptable.

    :return: Tuple with the result of `js_bypass_path` for every bypass script
    """
    # Basic bypasses nothing fancy as I'm still working on it
    # But with adding these bypasses to the above config, it bypasses many online tests like
    # https://bot.sannysoft.com/
    # https://kaliiiiiiiiii.github.io/brotector/
    # https://pixelscan.net/
    # https://iphey.com/
    # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
    # https://arh.antoinevastel.com/bots/areyouheadless/
    # https://prescience-data.github.io/execution-monitor.html
    # Order is important
    ordered_names = (
        'webdriver_fully.js',
        'window_chrome.js',
        'navigator_plugins.js',
        'pdf_viewer.js',
        'notification_permission.js',
        'screen_props.js',
        'playwright_fingerprint.js',
    )
    return tuple(map(js_bypass_path, ordered_names))
186
+
187
  def fetch(self, url: str) -> Response:
188
  """Opens up the browser and do your request based on your chosen options.
189
 
 
197
  from rebrowser_playwright.sync_api import sync_playwright
198
 
199
  with sync_playwright() as p:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  # Creating the browser
201
  if self.cdp_url:
202
+ cdp_url = self._cdp_url_logic()
203
  browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
204
  else:
205
+ browser = p.chromium.launch(**self.__launch_kwargs())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
+ context = browser.new_context(**self.__context_kwargs())
208
  # Finally we are in business
209
  page = context.new_page()
210
  page.set_default_navigation_timeout(self.timeout)
 
217
  page.route("**/*", intercept_route)
218
 
219
  if self.stealth:
220
+ for script in self.__stealth_scripts():
221
+ page.add_init_script(path=script)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
  res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
224
  page.wait_for_load_state(state="domcontentloaded")
scrapling/engines/toolbelt/__init__.py CHANGED
@@ -1,5 +1,6 @@
1
  from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
2
- check_type_validity, do_nothing, get_variable_name)
 
3
  from .fingerprints import (generate_convincing_referer, generate_headers,
4
  get_os_name)
5
  from .navigation import (construct_cdp_url, construct_proxy_dict,
 
1
  from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
2
+ check_type_validity, do_nothing, do_nothing_async,
3
+ get_variable_name)
4
  from .fingerprints import (generate_convincing_referer, generate_headers,
5
  get_os_name)
6
  from .navigation import (construct_cdp_url, construct_proxy_dict,
scrapling/engines/toolbelt/custom.py CHANGED
@@ -302,3 +302,8 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
302
  def do_nothing(page):
303
  # Just works as a filler for `page_action` argument in browser engines
304
  return page
 
 
 
 
 
 
302
  def do_nothing(page):
303
  # Just works as a filler for `page_action` argument in browser engines
304
  return page
305
+
306
+
307
async def do_nothing_async(page):
    """Coroutine that hands back ``page`` unchanged.

    Just works as a filler for `page_action` argument in browser engines.
    """
    return page