Karim Shoair committed on
Commit ·
34c0fee
1
Parent(s): e07b1cd
refactor(Playwright Engine): Separate what we can for cleaner code and the async function later
Browse files
scrapling/engines/pw.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
-
from scrapling.core._types import Callable, Dict,
|
| 4 |
-
from scrapling.core.utils import log
|
| 5 |
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
|
| 6 |
NSTBROWSER_DEFAULT_QUERY)
|
| 7 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
| 8 |
check_type_validity, construct_cdp_url,
|
| 9 |
construct_proxy_dict, do_nothing,
|
|
|
|
| 10 |
generate_convincing_referer,
|
| 11 |
generate_headers, intercept_route,
|
| 12 |
js_bypass_path)
|
|
@@ -94,10 +95,8 @@ class PlaywrightEngine:
|
|
| 94 |
# '--disable-extensions',
|
| 95 |
]
|
| 96 |
|
| 97 |
-
def _cdp_url_logic(self
|
| 98 |
"""Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
|
| 99 |
-
|
| 100 |
-
:param flags: Chrome flags to be added to NSTBrowser query
|
| 101 |
:return: CDP URL
|
| 102 |
"""
|
| 103 |
cdp_url = self.cdp_url
|
|
@@ -106,7 +105,8 @@ class PlaywrightEngine:
|
|
| 106 |
config = self.nstbrowser_config
|
| 107 |
else:
|
| 108 |
query = NSTBROWSER_DEFAULT_QUERY.copy()
|
| 109 |
-
if
|
|
|
|
| 110 |
query.update({
|
| 111 |
"args": dict(zip(flags, [''] * len(flags))), # browser args should be a dictionary
|
| 112 |
})
|
|
@@ -122,6 +122,68 @@ class PlaywrightEngine:
|
|
| 122 |
|
| 123 |
return cdp_url
|
| 124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
def fetch(self, url: str) -> Response:
|
| 126 |
"""Opens up the browser and do your request based on your chosen options.
|
| 127 |
|
|
@@ -135,61 +197,14 @@ class PlaywrightEngine:
|
|
| 135 |
from rebrowser_playwright.sync_api import sync_playwright
|
| 136 |
|
| 137 |
with sync_playwright() as p:
|
| 138 |
-
# Handle the UserAgent early
|
| 139 |
-
if self.useragent:
|
| 140 |
-
extra_headers = {}
|
| 141 |
-
useragent = self.useragent
|
| 142 |
-
else:
|
| 143 |
-
extra_headers = {}
|
| 144 |
-
useragent = generate_headers(browser_mode=True).get('User-Agent')
|
| 145 |
-
|
| 146 |
-
# Prepare the flags before diving
|
| 147 |
-
flags = DEFAULT_STEALTH_FLAGS
|
| 148 |
-
if self.hide_canvas:
|
| 149 |
-
flags += ['--fingerprinting-canvas-image-data-noise']
|
| 150 |
-
if self.disable_webgl:
|
| 151 |
-
flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']
|
| 152 |
-
|
| 153 |
# Creating the browser
|
| 154 |
if self.cdp_url:
|
| 155 |
-
cdp_url = self._cdp_url_logic(
|
| 156 |
browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
|
| 157 |
else:
|
| 158 |
-
|
| 159 |
-
browser = p.chromium.launch(
|
| 160 |
-
headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
|
| 161 |
-
)
|
| 162 |
-
else:
|
| 163 |
-
browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
|
| 164 |
-
|
| 165 |
-
# Creating the context
|
| 166 |
-
if self.stealth:
|
| 167 |
-
context = browser.new_context(
|
| 168 |
-
locale=self.locale,
|
| 169 |
-
is_mobile=False,
|
| 170 |
-
has_touch=False,
|
| 171 |
-
proxy=self.proxy,
|
| 172 |
-
color_scheme='dark', # Bypasses the 'prefersLightColor' check in creepjs
|
| 173 |
-
user_agent=useragent,
|
| 174 |
-
device_scale_factor=2,
|
| 175 |
-
# I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
|
| 176 |
-
service_workers="allow",
|
| 177 |
-
ignore_https_errors=True,
|
| 178 |
-
extra_http_headers=extra_headers,
|
| 179 |
-
screen={"width": 1920, "height": 1080},
|
| 180 |
-
viewport={"width": 1920, "height": 1080},
|
| 181 |
-
permissions=["geolocation", 'notifications'],
|
| 182 |
-
)
|
| 183 |
-
else:
|
| 184 |
-
context = browser.new_context(
|
| 185 |
-
locale=self.locale,
|
| 186 |
-
proxy=self.proxy,
|
| 187 |
-
color_scheme='dark',
|
| 188 |
-
user_agent=useragent,
|
| 189 |
-
device_scale_factor=2,
|
| 190 |
-
extra_http_headers=extra_headers
|
| 191 |
-
)
|
| 192 |
|
|
|
|
| 193 |
# Finally we are in business
|
| 194 |
page = context.new_page()
|
| 195 |
page.set_default_navigation_timeout(self.timeout)
|
|
@@ -202,22 +217,8 @@ class PlaywrightEngine:
|
|
| 202 |
page.route("**/*", intercept_route)
|
| 203 |
|
| 204 |
if self.stealth:
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
# https://bot.sannysoft.com/
|
| 208 |
-
# https://kaliiiiiiiiii.github.io/brotector/
|
| 209 |
-
# https://pixelscan.net/
|
| 210 |
-
# https://iphey.com/
|
| 211 |
-
# https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
|
| 212 |
-
# https://arh.antoinevastel.com/bots/areyouheadless/
|
| 213 |
-
# https://prescience-data.github.io/execution-monitor.html
|
| 214 |
-
page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
|
| 215 |
-
page.add_init_script(path=js_bypass_path('window_chrome.js'))
|
| 216 |
-
page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
|
| 217 |
-
page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
|
| 218 |
-
page.add_init_script(path=js_bypass_path('notification_permission.js'))
|
| 219 |
-
page.add_init_script(path=js_bypass_path('screen_props.js'))
|
| 220 |
-
page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
|
| 221 |
|
| 222 |
res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
|
| 223 |
page.wait_for_load_state(state="domcontentloaded")
|
|
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
+
from scrapling.core._types import Callable, Dict, Optional, Union
|
| 4 |
+
from scrapling.core.utils import log, lru_cache
|
| 5 |
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
|
| 6 |
NSTBROWSER_DEFAULT_QUERY)
|
| 7 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
| 8 |
check_type_validity, construct_cdp_url,
|
| 9 |
construct_proxy_dict, do_nothing,
|
| 10 |
+
do_nothing_async,
|
| 11 |
generate_convincing_referer,
|
| 12 |
generate_headers, intercept_route,
|
| 13 |
js_bypass_path)
|
|
|
|
| 95 |
# '--disable-extensions',
|
| 96 |
]
|
| 97 |
|
| 98 |
+
def _cdp_url_logic(self) -> str:
|
| 99 |
"""Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
|
|
|
|
|
|
|
| 100 |
:return: CDP URL
|
| 101 |
"""
|
| 102 |
cdp_url = self.cdp_url
|
|
|
|
| 105 |
config = self.nstbrowser_config
|
| 106 |
else:
|
| 107 |
query = NSTBROWSER_DEFAULT_QUERY.copy()
|
| 108 |
+
if self.stealth:
|
| 109 |
+
flags = self.__set_flags()
|
| 110 |
query.update({
|
| 111 |
"args": dict(zip(flags, [''] * len(flags))), # browser args should be a dictionary
|
| 112 |
})
|
|
|
|
| 122 |
|
| 123 |
return cdp_url
|
| 124 |
|
| 125 |
+
def __set_flags(self):
    """Return the Chrome flags to use while launching the browser when stealth mode is enabled.

    :return: tuple of Chrome command-line flag strings
    """
    # NOTE(review): the previous version cached this with `@lru_cache(typed=True)`.
    # `lru_cache` on an instance method keys on `self` and keeps every engine
    # instance alive for the lifetime of the cache (ruff B019); the computation
    # below is trivial, so the cache is dropped rather than leaked.
    #
    # Copy into a fresh tuple so the shared DEFAULT_STEALTH_FLAGS constant is
    # never mutated in place (`flags += (...)` would extend it if it were a list).
    flags = tuple(DEFAULT_STEALTH_FLAGS)
    if self.hide_canvas:
        flags += ('--fingerprinting-canvas-image-data-noise',)
    if self.disable_webgl:
        flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2',)

    return flags
|
| 135 |
+
|
| 136 |
+
def __launch_kwargs(self):
    """Build the keyword arguments used while launching playwright's browser."""
    browser_channel = 'chrome' if self.real_chrome else 'chromium'
    launch_kwargs = {
        'headless': self.headless,
        'ignore_default_args': self.harmful_default_args,
        'channel': browser_channel,
    }
    if self.stealth:
        # Stealth mode adds the anti-detection flags and enables the sandbox.
        launch_kwargs['args'] = self.__set_flags()
        launch_kwargs['chromium_sandbox'] = True

    return launch_kwargs
|
| 143 |
+
|
| 144 |
+
def __context_kwargs(self):
    """Build the keyword arguments for creating the browser context."""
    if self.useragent:
        agent = self.useragent
    else:
        agent = generate_headers(browser_mode=True).get('User-Agent')

    context_kwargs = {
        "proxy": self.proxy,
        "locale": self.locale,
        "color_scheme": 'dark',  # Bypasses the 'prefersLightColor' check in creepjs
        "device_scale_factor": 2,
        "extra_http_headers": self.extra_headers if self.extra_headers else {},
        "user_agent": agent,
    }
    if self.stealth:
        stealth_kwargs = {
            'is_mobile': False,
            'has_touch': False,
            # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
            'service_workers': 'allow',
            'ignore_https_errors': True,
            'screen': {'width': 1920, 'height': 1080},
            'viewport': {'width': 1920, 'height': 1080},
            'permissions': ['geolocation', 'notifications'],
        }
        context_kwargs.update(stealth_kwargs)

    return context_kwargs
|
| 167 |
+
|
| 168 |
+
def __stealth_scripts(self):
    """Return the paths of the JS bypass scripts injected when stealth mode is enabled.

    :return: tuple of filesystem paths, in injection order
    """
    # NOTE(review): the previous version decorated this with `@lru_cache()`.
    # An lru_cache on an instance method keys on `self` and keeps every engine
    # instance alive for the cache's lifetime (ruff B019); building this small
    # tuple per call is cheap, so the cache is dropped rather than leaked.
    #
    # Basic bypasses nothing fancy as I'm still working on it
    # But with adding these bypasses to the above config, it bypasses many online tests like
    # https://bot.sannysoft.com/
    # https://kaliiiiiiiiii.github.io/brotector/
    # https://pixelscan.net/
    # https://iphey.com/
    # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
    # https://arh.antoinevastel.com/bots/areyouheadless/
    # https://prescience-data.github.io/execution-monitor.html
    return tuple(
        js_bypass_path(script) for script in (
            # Order is important
            'webdriver_fully.js', 'window_chrome.js', 'navigator_plugins.js', 'pdf_viewer.js',
            'notification_permission.js', 'screen_props.js', 'playwright_fingerprint.js'
        )
    )
|
| 186 |
+
|
| 187 |
def fetch(self, url: str) -> Response:
|
| 188 |
"""Opens up the browser and do your request based on your chosen options.
|
| 189 |
|
|
|
|
| 197 |
from rebrowser_playwright.sync_api import sync_playwright
|
| 198 |
|
| 199 |
with sync_playwright() as p:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
# Creating the browser
|
| 201 |
if self.cdp_url:
|
| 202 |
+
cdp_url = self._cdp_url_logic()
|
| 203 |
browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
|
| 204 |
else:
|
| 205 |
+
browser = p.chromium.launch(**self.__launch_kwargs())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
+
context = browser.new_context(**self.__context_kwargs())
|
| 208 |
# Finally we are in business
|
| 209 |
page = context.new_page()
|
| 210 |
page.set_default_navigation_timeout(self.timeout)
|
|
|
|
| 217 |
page.route("**/*", intercept_route)
|
| 218 |
|
| 219 |
if self.stealth:
|
| 220 |
+
for script in self.__stealth_scripts():
|
| 221 |
+
page.add_init_script(path=script)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
|
| 224 |
page.wait_for_load_state(state="domcontentloaded")
|
scrapling/engines/toolbelt/__init__.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
|
| 2 |
-
check_type_validity, do_nothing,
|
|
|
|
| 3 |
from .fingerprints import (generate_convincing_referer, generate_headers,
|
| 4 |
get_os_name)
|
| 5 |
from .navigation import (construct_cdp_url, construct_proxy_dict,
|
|
|
|
| 1 |
from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
|
| 2 |
+
check_type_validity, do_nothing, do_nothing_async,
|
| 3 |
+
get_variable_name)
|
| 4 |
from .fingerprints import (generate_convincing_referer, generate_headers,
|
| 5 |
get_os_name)
|
| 6 |
from .navigation import (construct_cdp_url, construct_proxy_dict,
|
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -302,3 +302,8 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
|
|
| 302 |
def do_nothing(page):
|
| 303 |
# Just works as a filler for `page_action` argument in browser engines
|
| 304 |
return page
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
def do_nothing(page):
    """No-op filler for the `page_action` argument in browser engines; returns the page unchanged."""
    return page
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
async def do_nothing_async(page):
    """Async no-op filler for the `page_action` argument in browser engines; returns the page unchanged."""
    return page
|