Karim shoair committed on
Commit ·
a11649b
1
Parent(s): 34c0fee
feat(PlaywrightFetcher): Add async support for PlaywrightFetcher
Browse files
- pytest.ini +2 -0
- scrapling/engines/camo.py +5 -4
- scrapling/engines/pw.py +81 -10
- scrapling/engines/toolbelt/__init__.py +3 -4
- scrapling/engines/toolbelt/custom.py +0 -11
- scrapling/engines/toolbelt/navigation.py +16 -3
- scrapling/fetchers.py +63 -3
pytest.ini
CHANGED
|
@@ -1,2 +1,4 @@
|
|
| 1 |
[pytest]
|
|
|
|
|
|
|
| 2 |
addopts = -p no:warnings --doctest-modules --ignore=setup.py --verbose
|
|
|
|
| 1 |
[pytest]
|
| 2 |
+
asyncio_mode = auto
|
| 3 |
+
asyncio_default_fixture_loop_scope = function
|
| 4 |
addopts = -p no:warnings --doctest-modules --ignore=setup.py --verbose
|
scrapling/engines/camo.py
CHANGED
|
@@ -6,7 +6,7 @@ from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
|
|
| 6 |
from scrapling.core.utils import log
|
| 7 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
| 8 |
check_type_validity,
|
| 9 |
-
construct_proxy_dict,
|
| 10 |
generate_convincing_referer,
|
| 11 |
get_os_name, intercept_route)
|
| 12 |
|
|
@@ -15,7 +15,7 @@ class CamoufoxEngine:
|
|
| 15 |
def __init__(
|
| 16 |
self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
|
| 17 |
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
|
| 18 |
-
timeout: Optional[float] = 30000, page_action: Callable =
|
| 19 |
wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
|
| 20 |
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
|
| 21 |
geoip: Optional[bool] = False,
|
|
@@ -65,7 +65,7 @@ class CamoufoxEngine:
|
|
| 65 |
if callable(page_action):
|
| 66 |
self.page_action = page_action
|
| 67 |
else:
|
| 68 |
-
self.page_action =
|
| 69 |
log.error('[Ignored] Argument "page_action" must be callable')
|
| 70 |
|
| 71 |
self.wait_selector = wait_selector
|
|
@@ -106,7 +106,8 @@ class CamoufoxEngine:
|
|
| 106 |
if self.network_idle:
|
| 107 |
page.wait_for_load_state('networkidle')
|
| 108 |
|
| 109 |
-
|
|
|
|
| 110 |
|
| 111 |
if self.wait_selector and type(self.wait_selector) is str:
|
| 112 |
waiter = page.locator(self.wait_selector)
|
|
|
|
| 6 |
from scrapling.core.utils import log
|
| 7 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
| 8 |
check_type_validity,
|
| 9 |
+
construct_proxy_dict,
|
| 10 |
generate_convincing_referer,
|
| 11 |
get_os_name, intercept_route)
|
| 12 |
|
|
|
|
| 15 |
def __init__(
|
| 16 |
self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
|
| 17 |
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
|
| 18 |
+
timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
|
| 19 |
wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
|
| 20 |
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
|
| 21 |
geoip: Optional[bool] = False,
|
|
|
|
| 65 |
if callable(page_action):
|
| 66 |
self.page_action = page_action
|
| 67 |
else:
|
| 68 |
+
self.page_action = None
|
| 69 |
log.error('[Ignored] Argument "page_action" must be callable')
|
| 70 |
|
| 71 |
self.wait_selector = wait_selector
|
|
|
|
| 106 |
if self.network_idle:
|
| 107 |
page.wait_for_load_state('networkidle')
|
| 108 |
|
| 109 |
+
if self.page_action is not None:
|
| 110 |
+
page = self.page_action(page)
|
| 111 |
|
| 112 |
if self.wait_selector and type(self.wait_selector) is str:
|
| 113 |
waiter = page.locator(self.wait_selector)
|
scrapling/engines/pw.py
CHANGED
|
@@ -5,9 +5,9 @@ from scrapling.core.utils import log, lru_cache
|
|
| 5 |
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
|
| 6 |
NSTBROWSER_DEFAULT_QUERY)
|
| 7 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
|
|
|
| 8 |
check_type_validity, construct_cdp_url,
|
| 9 |
-
construct_proxy_dict,
|
| 10 |
-
do_nothing_async,
|
| 11 |
generate_convincing_referer,
|
| 12 |
generate_headers, intercept_route,
|
| 13 |
js_bypass_path)
|
|
@@ -20,7 +20,7 @@ class PlaywrightEngine:
|
|
| 20 |
useragent: Optional[str] = None,
|
| 21 |
network_idle: Optional[bool] = False,
|
| 22 |
timeout: Optional[float] = 30000,
|
| 23 |
-
page_action: Callable =
|
| 24 |
wait_selector: Optional[str] = None,
|
| 25 |
locale: Optional[str] = 'en-US',
|
| 26 |
wait_selector_state: Optional[str] = 'attached',
|
|
@@ -75,10 +75,10 @@ class PlaywrightEngine:
|
|
| 75 |
self.cdp_url = cdp_url
|
| 76 |
self.useragent = useragent
|
| 77 |
self.timeout = check_type_validity(timeout, [int, float], 30000)
|
| 78 |
-
if callable(page_action):
|
| 79 |
self.page_action = page_action
|
| 80 |
else:
|
| 81 |
-
self.page_action =
|
| 82 |
log.error('[Ignored] Argument "page_action" must be callable')
|
| 83 |
|
| 84 |
self.wait_selector = wait_selector
|
|
@@ -225,7 +225,8 @@ class PlaywrightEngine:
|
|
| 225 |
if self.network_idle:
|
| 226 |
page.wait_for_load_state('networkidle')
|
| 227 |
|
| 228 |
-
|
|
|
|
| 229 |
|
| 230 |
if self.wait_selector and type(self.wait_selector) is str:
|
| 231 |
waiter = page.locator(self.wait_selector)
|
|
@@ -238,11 +239,8 @@ class PlaywrightEngine:
|
|
| 238 |
|
| 239 |
# This will be parsed inside `Response`
|
| 240 |
encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
|
| 241 |
-
|
| 242 |
-
status_text = res.status_text
|
| 243 |
# PlayWright API sometimes give empty status text for some reason!
|
| 244 |
-
|
| 245 |
-
status_text = StatusText.get(res.status)
|
| 246 |
|
| 247 |
response = Response(
|
| 248 |
url=res.url,
|
|
@@ -258,3 +256,76 @@ class PlaywrightEngine:
|
|
| 258 |
)
|
| 259 |
page.close()
|
| 260 |
return response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
|
| 6 |
NSTBROWSER_DEFAULT_QUERY)
|
| 7 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
| 8 |
+
async_intercept_route,
|
| 9 |
check_type_validity, construct_cdp_url,
|
| 10 |
+
construct_proxy_dict,
|
|
|
|
| 11 |
generate_convincing_referer,
|
| 12 |
generate_headers, intercept_route,
|
| 13 |
js_bypass_path)
|
|
|
|
| 20 |
useragent: Optional[str] = None,
|
| 21 |
network_idle: Optional[bool] = False,
|
| 22 |
timeout: Optional[float] = 30000,
|
| 23 |
+
page_action: Callable = None,
|
| 24 |
wait_selector: Optional[str] = None,
|
| 25 |
locale: Optional[str] = 'en-US',
|
| 26 |
wait_selector_state: Optional[str] = 'attached',
|
|
|
|
| 75 |
self.cdp_url = cdp_url
|
| 76 |
self.useragent = useragent
|
| 77 |
self.timeout = check_type_validity(timeout, [int, float], 30000)
|
| 78 |
+
if page_action is not None and callable(page_action):
|
| 79 |
self.page_action = page_action
|
| 80 |
else:
|
| 81 |
+
self.page_action = None
|
| 82 |
log.error('[Ignored] Argument "page_action" must be callable')
|
| 83 |
|
| 84 |
self.wait_selector = wait_selector
|
|
|
|
| 225 |
if self.network_idle:
|
| 226 |
page.wait_for_load_state('networkidle')
|
| 227 |
|
| 228 |
+
if self.page_action is not None:
|
| 229 |
+
page = self.page_action(page)
|
| 230 |
|
| 231 |
if self.wait_selector and type(self.wait_selector) is str:
|
| 232 |
waiter = page.locator(self.wait_selector)
|
|
|
|
| 239 |
|
| 240 |
# This will be parsed inside `Response`
|
| 241 |
encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
|
|
|
|
|
|
|
| 242 |
# PlayWright API sometimes give empty status text for some reason!
|
| 243 |
+
status_text = res.status_text or StatusText.get(res.status)
|
|
|
|
| 244 |
|
| 245 |
response = Response(
|
| 246 |
url=res.url,
|
|
|
|
| 256 |
)
|
| 257 |
page.close()
|
| 258 |
return response
|
| 259 |
+
|
| 260 |
+
async def async_fetch(self, url: str) -> Response:
|
| 261 |
+
"""Async version of `fetch`
|
| 262 |
+
|
| 263 |
+
:param url: Target url.
|
| 264 |
+
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 265 |
+
"""
|
| 266 |
+
if not self.stealth or self.real_chrome:
|
| 267 |
+
# Because rebrowser_playwright doesn't play well with real browsers
|
| 268 |
+
from playwright.async_api import async_playwright
|
| 269 |
+
else:
|
| 270 |
+
from rebrowser_playwright.async_api import async_playwright
|
| 271 |
+
|
| 272 |
+
async with async_playwright() as p:
|
| 273 |
+
# Creating the browser
|
| 274 |
+
if self.cdp_url:
|
| 275 |
+
cdp_url = self._cdp_url_logic()
|
| 276 |
+
browser = await p.chromium.connect_over_cdp(endpoint_url=cdp_url)
|
| 277 |
+
else:
|
| 278 |
+
browser = await p.chromium.launch(**self.__launch_kwargs())
|
| 279 |
+
|
| 280 |
+
context = await browser.new_context(**self.__context_kwargs())
|
| 281 |
+
# Finally we are in business
|
| 282 |
+
page = await context.new_page()
|
| 283 |
+
page.set_default_navigation_timeout(self.timeout)
|
| 284 |
+
page.set_default_timeout(self.timeout)
|
| 285 |
+
|
| 286 |
+
if self.extra_headers:
|
| 287 |
+
await page.set_extra_http_headers(self.extra_headers)
|
| 288 |
+
|
| 289 |
+
if self.disable_resources:
|
| 290 |
+
await page.route("**/*", async_intercept_route)
|
| 291 |
+
|
| 292 |
+
if self.stealth:
|
| 293 |
+
for script in self.__stealth_scripts():
|
| 294 |
+
await page.add_init_script(path=script)
|
| 295 |
+
|
| 296 |
+
res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
|
| 297 |
+
await page.wait_for_load_state(state="domcontentloaded")
|
| 298 |
+
if self.network_idle:
|
| 299 |
+
await page.wait_for_load_state('networkidle')
|
| 300 |
+
|
| 301 |
+
if self.page_action is not None:
|
| 302 |
+
page = await self.page_action(page)
|
| 303 |
+
|
| 304 |
+
if self.wait_selector and type(self.wait_selector) is str:
|
| 305 |
+
waiter = page.locator(self.wait_selector)
|
| 306 |
+
await waiter.first.wait_for(state=self.wait_selector_state)
|
| 307 |
+
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
| 308 |
+
await page.wait_for_load_state(state="load")
|
| 309 |
+
await page.wait_for_load_state(state="domcontentloaded")
|
| 310 |
+
if self.network_idle:
|
| 311 |
+
await page.wait_for_load_state('networkidle')
|
| 312 |
+
|
| 313 |
+
# This will be parsed inside `Response`
|
| 314 |
+
encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
|
| 315 |
+
# PlayWright API sometimes give empty status text for some reason!
|
| 316 |
+
status_text = res.status_text or StatusText.get(res.status)
|
| 317 |
+
|
| 318 |
+
response = Response(
|
| 319 |
+
url=res.url,
|
| 320 |
+
text=await page.content(),
|
| 321 |
+
body=(await page.content()).encode('utf-8'),
|
| 322 |
+
status=res.status,
|
| 323 |
+
reason=status_text,
|
| 324 |
+
encoding=encoding,
|
| 325 |
+
cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
|
| 326 |
+
headers=await res.all_headers(),
|
| 327 |
+
request_headers=await res.request.all_headers(),
|
| 328 |
+
**self.adaptor_arguments
|
| 329 |
+
)
|
| 330 |
+
await page.close()
|
| 331 |
+
return response
|
scrapling/engines/toolbelt/__init__.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
|
| 2 |
-
check_type_validity,
|
| 3 |
-
get_variable_name)
|
| 4 |
from .fingerprints import (generate_convincing_referer, generate_headers,
|
| 5 |
get_os_name)
|
| 6 |
-
from .navigation import (
|
| 7 |
-
intercept_route, js_bypass_path)
|
|
|
|
| 1 |
from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
|
| 2 |
+
check_type_validity, get_variable_name)
|
|
|
|
| 3 |
from .fingerprints import (generate_convincing_referer, generate_headers,
|
| 4 |
get_os_name)
|
| 5 |
+
from .navigation import (async_intercept_route, construct_cdp_url,
|
| 6 |
+
construct_proxy_dict, intercept_route, js_bypass_path)
|
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -296,14 +296,3 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
|
|
| 296 |
return default_value
|
| 297 |
|
| 298 |
return variable
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
# Pew Pew
|
| 302 |
-
def do_nothing(page):
|
| 303 |
-
# Just works as a filler for `page_action` argument in browser engines
|
| 304 |
-
return page
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
async def do_nothing_async(page):
|
| 308 |
-
# Just works as a filler for `page_action` argument in browser engines
|
| 309 |
-
return page
|
|
|
|
| 296 |
return default_value
|
| 297 |
|
| 298 |
return variable
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scrapling/engines/toolbelt/navigation.py
CHANGED
|
@@ -4,6 +4,7 @@ Functions related to files and URLs
|
|
| 4 |
import os
|
| 5 |
from urllib.parse import urlencode, urlparse
|
| 6 |
|
|
|
|
| 7 |
from playwright.sync_api import Route
|
| 8 |
|
| 9 |
from scrapling.core._types import Dict, Optional, Union
|
|
@@ -11,7 +12,7 @@ from scrapling.core.utils import log, lru_cache
|
|
| 11 |
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
|
| 12 |
|
| 13 |
|
| 14 |
-
def intercept_route(route: Route)
|
| 15 |
"""This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
|
| 16 |
|
| 17 |
:param route: PlayWright `Route` object of the current page
|
|
@@ -19,8 +20,20 @@ def intercept_route(route: Route) -> Union[Route, None]:
|
|
| 19 |
"""
|
| 20 |
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 21 |
log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict, None]:
|
|
|
|
| 4 |
import os
|
| 5 |
from urllib.parse import urlencode, urlparse
|
| 6 |
|
| 7 |
+
from playwright.async_api import Route as async_Route
|
| 8 |
from playwright.sync_api import Route
|
| 9 |
|
| 10 |
from scrapling.core._types import Dict, Optional, Union
|
|
|
|
| 12 |
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
|
| 13 |
|
| 14 |
|
| 15 |
+
def intercept_route(route: Route):
|
| 16 |
"""This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
|
| 17 |
|
| 18 |
:param route: PlayWright `Route` object of the current page
|
|
|
|
| 20 |
"""
|
| 21 |
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 22 |
log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
|
| 23 |
+
route.abort()
|
| 24 |
+
route.continue_()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
async def async_intercept_route(route: async_Route):
|
| 28 |
+
"""This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
|
| 29 |
+
|
| 30 |
+
:param route: PlayWright `Route` object of the current page
|
| 31 |
+
:return: PlayWright `Route` object
|
| 32 |
+
"""
|
| 33 |
+
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 34 |
+
log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
|
| 35 |
+
await route.abort()
|
| 36 |
+
await route.continue_()
|
| 37 |
|
| 38 |
|
| 39 |
def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict, None]:
|
scrapling/fetchers.py
CHANGED
|
@@ -2,7 +2,7 @@ from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
|
|
| 2 |
Union)
|
| 3 |
from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
|
| 4 |
check_if_engine_usable)
|
| 5 |
-
from scrapling.engines.toolbelt import BaseFetcher, Response
|
| 6 |
|
| 7 |
|
| 8 |
class Fetcher(BaseFetcher):
|
|
@@ -175,7 +175,7 @@ class StealthyFetcher(BaseFetcher):
|
|
| 175 |
def fetch(
|
| 176 |
self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
|
| 177 |
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
|
| 178 |
-
timeout: Optional[float] = 30000, page_action: Callable =
|
| 179 |
wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 180 |
os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
|
| 181 |
) -> Response:
|
|
@@ -250,7 +250,7 @@ class PlayWrightFetcher(BaseFetcher):
|
|
| 250 |
def fetch(
|
| 251 |
self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
|
| 252 |
useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
|
| 253 |
-
page_action: Optional[Callable] =
|
| 254 |
hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
|
| 255 |
proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
|
| 256 |
stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
|
|
@@ -307,6 +307,66 @@ class PlayWrightFetcher(BaseFetcher):
|
|
| 307 |
)
|
| 308 |
return engine.fetch(url)
|
| 309 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
|
| 311 |
class CustomFetcher(BaseFetcher):
|
| 312 |
def fetch(self, url: str, browser_engine, **kwargs) -> Response:
|
|
|
|
| 2 |
Union)
|
| 3 |
from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
|
| 4 |
check_if_engine_usable)
|
| 5 |
+
from scrapling.engines.toolbelt import BaseFetcher, Response
|
| 6 |
|
| 7 |
|
| 8 |
class Fetcher(BaseFetcher):
|
|
|
|
| 175 |
def fetch(
|
| 176 |
self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
|
| 177 |
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
|
| 178 |
+
timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
|
| 179 |
wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 180 |
os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
|
| 181 |
) -> Response:
|
|
|
|
| 250 |
def fetch(
|
| 251 |
self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
|
| 252 |
useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
|
| 253 |
+
page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
|
| 254 |
hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
|
| 255 |
proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
|
| 256 |
stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
|
|
|
|
| 307 |
)
|
| 308 |
return engine.fetch(url)
|
| 309 |
|
| 310 |
+
async def async_fetch(
|
| 311 |
+
self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
|
| 312 |
+
useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
|
| 313 |
+
page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
|
| 314 |
+
hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
|
| 315 |
+
proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
|
| 316 |
+
stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
|
| 317 |
+
cdp_url: Optional[str] = None,
|
| 318 |
+
nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
|
| 319 |
+
) -> Response:
|
| 320 |
+
"""Opens up a browser and do your request based on your chosen options below.
|
| 321 |
+
|
| 322 |
+
:param url: Target url.
|
| 323 |
+
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 324 |
+
:param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
|
| 325 |
+
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 326 |
+
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 327 |
+
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 328 |
+
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 329 |
+
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
|
| 330 |
+
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
|
| 331 |
+
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
| 332 |
+
:param wait_selector: Wait for a specific css selector to be in a specific state.
|
| 333 |
+
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
|
| 334 |
+
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
| 335 |
+
:param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
|
| 336 |
+
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 337 |
+
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
| 338 |
+
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
|
| 339 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 340 |
+
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 341 |
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
|
| 342 |
+
:param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
|
| 343 |
+
:param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
|
| 344 |
+
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 345 |
+
"""
|
| 346 |
+
engine = PlaywrightEngine(
|
| 347 |
+
proxy=proxy,
|
| 348 |
+
locale=locale,
|
| 349 |
+
timeout=timeout,
|
| 350 |
+
stealth=stealth,
|
| 351 |
+
cdp_url=cdp_url,
|
| 352 |
+
headless=headless,
|
| 353 |
+
useragent=useragent,
|
| 354 |
+
real_chrome=real_chrome,
|
| 355 |
+
page_action=page_action,
|
| 356 |
+
hide_canvas=hide_canvas,
|
| 357 |
+
network_idle=network_idle,
|
| 358 |
+
google_search=google_search,
|
| 359 |
+
extra_headers=extra_headers,
|
| 360 |
+
wait_selector=wait_selector,
|
| 361 |
+
disable_webgl=disable_webgl,
|
| 362 |
+
nstbrowser_mode=nstbrowser_mode,
|
| 363 |
+
nstbrowser_config=nstbrowser_config,
|
| 364 |
+
disable_resources=disable_resources,
|
| 365 |
+
wait_selector_state=wait_selector_state,
|
| 366 |
+
adaptor_arguments=self.adaptor_arguments,
|
| 367 |
+
)
|
| 368 |
+
return await engine.async_fetch(url)
|
| 369 |
+
|
| 370 |
|
| 371 |
class CustomFetcher(BaseFetcher):
|
| 372 |
def fetch(self, url: str, browser_engine, **kwargs) -> Response:
|