Spaces:

lenson78
/

Scrapling

Paused

App Files Files Community

Karim shoair committed on Dec 15, 2024

Commit

a11649b

1 Parent(s): 34c0fee

feat(PlaywrightFetcher): Add async support for PlaywrightFetcher

Browse files

Files changed (7) hide show

pytest.ini +2 -0
scrapling/engines/camo.py +5 -4
scrapling/engines/pw.py +81 -10
scrapling/engines/toolbelt/__init__.py +3 -4
scrapling/engines/toolbelt/custom.py +0 -11
scrapling/engines/toolbelt/navigation.py +16 -3
scrapling/fetchers.py +63 -3

pytest.ini CHANGED Viewed

@@ -1,2 +1,4 @@
 [pytest]
 addopts = -p no:warnings --doctest-modules --ignore=setup.py --verbose

 [pytest]
+asyncio_mode = auto
+asyncio_default_fixture_loop_scope = function
 addopts = -p no:warnings --doctest-modules --ignore=setup.py --verbose

scrapling/engines/camo.py CHANGED Viewed

@@ -6,7 +6,7 @@ from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
 from scrapling.core.utils import log
 from scrapling.engines.toolbelt import (Response, StatusText,
                                         check_type_validity,
-                                        construct_proxy_dict, do_nothing,
                                         generate_convincing_referer,
                                         get_os_name, intercept_route)
@@ -15,7 +15,7 @@ class CamoufoxEngine:
     def __init__(
             self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
-            timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
             geoip: Optional[bool] = False,
@@ -65,7 +65,7 @@ class CamoufoxEngine:
         if callable(page_action):
             self.page_action = page_action
         else:
-            self.page_action = do_nothing
             log.error('[Ignored] Argument "page_action" must be callable')
         self.wait_selector = wait_selector
@@ -106,7 +106,8 @@ class CamoufoxEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
-            page = self.page_action(page)
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)

 from scrapling.core.utils import log
 from scrapling.engines.toolbelt import (Response, StatusText,
                                         check_type_validity,
+                                        construct_proxy_dict,
                                         generate_convincing_referer,
                                         get_os_name, intercept_route)
     def __init__(
             self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
+            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
             geoip: Optional[bool] = False,
         if callable(page_action):
             self.page_action = page_action
         else:
+            self.page_action = None
             log.error('[Ignored] Argument "page_action" must be callable')
         self.wait_selector = wait_selector
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
+            if self.page_action is not None:
+                page = self.page_action(page)
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)

scrapling/engines/pw.py CHANGED Viewed

@@ -5,9 +5,9 @@ from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                          NSTBROWSER_DEFAULT_QUERY)
 from scrapling.engines.toolbelt import (Response, StatusText,
                                         check_type_validity, construct_cdp_url,
-                                        construct_proxy_dict, do_nothing,
-                                        do_nothing_async,
                                         generate_convincing_referer,
                                         generate_headers, intercept_route,
                                         js_bypass_path)
@@ -20,7 +20,7 @@ class PlaywrightEngine:
             useragent: Optional[str] = None,
             network_idle: Optional[bool] = False,
             timeout: Optional[float] = 30000,
-            page_action: Callable = do_nothing,
             wait_selector: Optional[str] = None,
             locale: Optional[str] = 'en-US',
             wait_selector_state: Optional[str] = 'attached',
@@ -75,10 +75,10 @@ class PlaywrightEngine:
         self.cdp_url = cdp_url
         self.useragent = useragent
         self.timeout = check_type_validity(timeout, [int, float], 30000)
-        if callable(page_action):
             self.page_action = page_action
         else:
-            self.page_action = do_nothing
             log.error('[Ignored] Argument "page_action" must be callable')
         self.wait_selector = wait_selector
@@ -225,7 +225,8 @@ class PlaywrightEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
-            page = self.page_action(page)
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
@@ -238,11 +239,8 @@ class PlaywrightEngine:
             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
-            status_text = res.status_text
             # PlayWright API sometimes give empty status text for some reason!
-            if not status_text:
-                status_text = StatusText.get(res.status)
             response = Response(
                 url=res.url,
@@ -258,3 +256,76 @@ class PlaywrightEngine:
             )
             page.close()
         return response

 from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                          NSTBROWSER_DEFAULT_QUERY)
 from scrapling.engines.toolbelt import (Response, StatusText,
+                                        async_intercept_route,
                                         check_type_validity, construct_cdp_url,
+                                        construct_proxy_dict,
                                         generate_convincing_referer,
                                         generate_headers, intercept_route,
                                         js_bypass_path)
             useragent: Optional[str] = None,
             network_idle: Optional[bool] = False,
             timeout: Optional[float] = 30000,
+            page_action: Callable = None,
             wait_selector: Optional[str] = None,
             locale: Optional[str] = 'en-US',
             wait_selector_state: Optional[str] = 'attached',
         self.cdp_url = cdp_url
         self.useragent = useragent
         self.timeout = check_type_validity(timeout, [int, float], 30000)
+        if page_action is not None and callable(page_action):
             self.page_action = page_action
         else:
+            self.page_action = None
             log.error('[Ignored] Argument "page_action" must be callable')
         self.wait_selector = wait_selector
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
+            if self.page_action is not None:
+                page = self.page_action(page)
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
+            status_text = res.status_text or StatusText.get(res.status)
             response = Response(
                 url=res.url,
             )
             page.close()
         return response
+    async def async_fetch(self, url: str) -> Response:
+        """Async version of `fetch`
+        :param url: Target url.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        if not self.stealth or self.real_chrome:
+            # Because rebrowser_playwright doesn't play well with real browsers
+            from playwright.async_api import async_playwright
+        else:
+            from rebrowser_playwright.async_api import async_playwright
+        async with async_playwright() as p:
+            # Creating the browser
+            if self.cdp_url:
+                cdp_url = self._cdp_url_logic()
+                browser = await p.chromium.connect_over_cdp(endpoint_url=cdp_url)
+            else:
+                browser = await p.chromium.launch(**self.__launch_kwargs())
+            context = await browser.new_context(**self.__context_kwargs())
+            # Finally we are in business
+            page = await context.new_page()
+            page.set_default_navigation_timeout(self.timeout)
+            page.set_default_timeout(self.timeout)
+            if self.extra_headers:
+                await page.set_extra_http_headers(self.extra_headers)
+            if self.disable_resources:
+                await page.route("**/*", async_intercept_route)
+            if self.stealth:
+                for script in self.__stealth_scripts():
+                    await page.add_init_script(path=script)
+            res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            await page.wait_for_load_state(state="domcontentloaded")
+            if self.network_idle:
+                await page.wait_for_load_state('networkidle')
+            if self.page_action is not None:
+                page = await self.page_action(page)
+            if self.wait_selector and type(self.wait_selector) is str:
+                waiter = page.locator(self.wait_selector)
+                await waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                await page.wait_for_load_state(state="load")
+                await page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    await page.wait_for_load_state('networkidle')
+            # This will be parsed inside `Response`
+            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            # PlayWright API sometimes give empty status text for some reason!
+            status_text = res.status_text or StatusText.get(res.status)
+            response = Response(
+                url=res.url,
+                text=await page.content(),
+                body=(await page.content()).encode('utf-8'),
+                status=res.status,
+                reason=status_text,
+                encoding=encoding,
+                cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
+                headers=await res.all_headers(),
+                request_headers=await res.request.all_headers(),
+                **self.adaptor_arguments
+            )
+            await page.close()
+        return response

scrapling/engines/toolbelt/__init__.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
-                     check_type_validity, do_nothing, do_nothing_async,
-                     get_variable_name)
 from .fingerprints import (generate_convincing_referer, generate_headers,
                            get_os_name)
-from .navigation import (construct_cdp_url, construct_proxy_dict,
-                         intercept_route, js_bypass_path)

 from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
+                     check_type_validity, get_variable_name)
 from .fingerprints import (generate_convincing_referer, generate_headers,
                            get_os_name)
+from .navigation import (async_intercept_route, construct_cdp_url,
+                         construct_proxy_dict, intercept_route, js_bypass_path)

scrapling/engines/toolbelt/custom.py CHANGED Viewed

@@ -296,14 +296,3 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
         return default_value
     return variable
-# Pew Pew
-def do_nothing(page):
-    # Just works as a filler for `page_action` argument in browser engines
-    return page
-async def do_nothing_async(page):
-    # Just works as a filler for `page_action` argument in browser engines
-    return page


296	return default_value
297
298	return variable

scrapling/engines/toolbelt/navigation.py CHANGED Viewed

@@ -4,6 +4,7 @@ Functions related to files and URLs
 import os
 from urllib.parse import urlencode, urlparse
 from playwright.sync_api import Route
 from scrapling.core._types import Dict, Optional, Union
@@ -11,7 +12,7 @@ from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
-def intercept_route(route: Route) -> Union[Route, None]:
     """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
     :param route: PlayWright `Route` object of the current page
@@ -19,8 +20,20 @@ def intercept_route(route: Route) -> Union[Route, None]:
     """
     if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
         log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
-        return route.abort()
-    return route.continue_()
 def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict, None]:

 import os
 from urllib.parse import urlencode, urlparse
+from playwright.async_api import Route as async_Route
 from playwright.sync_api import Route
 from scrapling.core._types import Dict, Optional, Union
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
+def intercept_route(route: Route):
     """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
     :param route: PlayWright `Route` object of the current page
     """
     if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
         log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
+        route.abort()
+    route.continue_()
+async def async_intercept_route(route: async_Route):
+    """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
+    :param route: PlayWright `Route` object of the current page
+    :return: PlayWright `Route` object
+    """
+    if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
+        log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
+        await route.abort()
+    await route.continue_()
 def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict, None]:

scrapling/fetchers.py CHANGED Viewed

@@ -2,7 +2,7 @@ from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
                                    Union)
 from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
                                check_if_engine_usable)
-from scrapling.engines.toolbelt import BaseFetcher, Response, do_nothing
 class Fetcher(BaseFetcher):
@@ -175,7 +175,7 @@ class StealthyFetcher(BaseFetcher):
     def fetch(
             self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
-            timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
             os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
     ) -> Response:
@@ -250,7 +250,7 @@ class PlayWrightFetcher(BaseFetcher):
     def fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
@@ -307,6 +307,66 @@ class PlayWrightFetcher(BaseFetcher):
         )
         return engine.fetch(url)
 class CustomFetcher(BaseFetcher):
     def fetch(self, url: str, browser_engine, **kwargs) -> Response:

                                    Union)
 from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
                                check_if_engine_usable)
+from scrapling.engines.toolbelt import BaseFetcher, Response
 class Fetcher(BaseFetcher):
     def fetch(
             self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
+            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
             os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
     ) -> Response:
     def fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
         )
         return engine.fetch(url)
+    async def async_fetch(
+            self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
+            useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+            hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
+            stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
+            cdp_url: Optional[str] = None,
+            nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
+    ) -> Response:
+        """Opens up a browser and do your request based on your chosen options below.
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
+        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+        :param wait_selector: Wait for a specific css selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
+        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
+        :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
+        :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        engine = PlaywrightEngine(
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            headless=headless,
+            useragent=useragent,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            nstbrowser_mode=nstbrowser_mode,
+            nstbrowser_config=nstbrowser_config,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+            adaptor_arguments=self.adaptor_arguments,
+        )
+        return await engine.async_fetch(url)
 class CustomFetcher(BaseFetcher):
     def fetch(self, url: str, browser_engine, **kwargs) -> Response: