Karim shoair committed on
Commit ·
e39bf62
1
Parent(s): 76e6484
refactor(DynamicSession): Optimization + Removing `max_pages` from sync version
Browse files
scrapling/engines/_browsers/_controllers.py
CHANGED
|
@@ -80,7 +80,7 @@ class DynamicSession:
|
|
| 80 |
|
| 81 |
def __init__(
|
| 82 |
self,
|
| 83 |
-
|
| 84 |
headless: bool = True,
|
| 85 |
google_search: bool = True,
|
| 86 |
hide_canvas: bool = False,
|
|
@@ -102,7 +102,7 @@ class DynamicSession:
|
|
| 102 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 103 |
adaptor_arguments: Optional[Dict] = None,
|
| 104 |
):
|
| 105 |
-
"""A Browser session manager with page pooling
|
| 106 |
|
| 107 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 108 |
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
|
@@ -125,12 +125,11 @@ class DynamicSession:
|
|
| 125 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 126 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 127 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 128 |
-
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 129 |
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
|
| 130 |
"""
|
| 131 |
|
| 132 |
params = {
|
| 133 |
-
"max_pages":
|
| 134 |
"headless": headless,
|
| 135 |
"google_search": google_search,
|
| 136 |
"hide_canvas": hide_canvas,
|
|
@@ -188,38 +187,46 @@ class DynamicSession:
|
|
| 188 |
self.__initiate_browser_options__()
|
| 189 |
|
| 190 |
def __initiate_browser_options__(self):
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
)
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
self.launch_options["extra_http_headers"]
|
| 207 |
-
)
|
| 208 |
-
self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
|
| 209 |
-
# while `context_options` is left to be used when cdp mode is enabled
|
| 210 |
-
self.context_options = dict(
|
| 211 |
-
_context_kwargs(
|
| 212 |
-
self.proxy,
|
| 213 |
-
self.locale,
|
| 214 |
-
tuple(self.extra_headers.items()) if self.extra_headers else tuple(),
|
| 215 |
-
self.useragent,
|
| 216 |
-
self.stealth,
|
| 217 |
)
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
def __create__(self):
|
| 225 |
"""Create a browser for this instance and context."""
|
|
@@ -386,7 +393,7 @@ class DynamicSession:
|
|
| 386 |
|
| 387 |
|
| 388 |
class AsyncDynamicSession(DynamicSession):
|
| 389 |
-
"""
|
| 390 |
|
| 391 |
def __init__(
|
| 392 |
self,
|
|
|
|
| 80 |
|
| 81 |
def __init__(
|
| 82 |
self,
|
| 83 |
+
__max_pages: int = 1,
|
| 84 |
headless: bool = True,
|
| 85 |
google_search: bool = True,
|
| 86 |
hide_canvas: bool = False,
|
|
|
|
| 102 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 103 |
adaptor_arguments: Optional[Dict] = None,
|
| 104 |
):
|
| 105 |
+
"""A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
|
| 106 |
|
| 107 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 108 |
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
|
|
|
| 125 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 126 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 127 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
|
|
|
| 128 |
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
|
| 129 |
"""
|
| 130 |
|
| 131 |
params = {
|
| 132 |
+
"max_pages": __max_pages,
|
| 133 |
"headless": headless,
|
| 134 |
"google_search": google_search,
|
| 135 |
"hide_canvas": hide_canvas,
|
|
|
|
| 187 |
self.__initiate_browser_options__()
|
| 188 |
|
| 189 |
def __initiate_browser_options__(self):
    """Build the option dictionaries used to start or attach to the browser.

    Exactly one of the two dictionaries is populated; the other is set to an
    empty dict:

    - ``self.launch_options``: filled when no ``cdp_url`` is configured (the
      default, persistent-context mode).
    - ``self.context_options``: filled when ``cdp_url`` is configured (CDP mode).

    Reads ``self.cdp_url``, ``self.headless``, ``self.proxy``, ``self.locale``,
    ``self.extra_headers``, ``self.useragent``, ``self.real_chrome``,
    ``self.stealth``, ``self.hide_canvas`` and ``self.disable_webgl``.
    """
    # The helpers receive the headers as a tuple of (name, value) pairs rather
    # than a dict -- presumably so the arguments stay hashable; TODO confirm
    # against `_launch_kwargs` / `_context_kwargs`.
    headers = tuple(self.extra_headers.items()) if self.extra_headers else tuple()

    # NOTE: the condition used to be `if self.cdp_url:`, which contradicted the
    # comments on both branches: the default (non-CDP) session ended up with
    # empty launch options, and the CDP session with empty context options.
    if not self.cdp_url:
        # `launch_options` is used with persistent context
        self.launch_options = dict(
            _launch_kwargs(
                self.headless,
                self.proxy,
                self.locale,
                headers,
                self.useragent,
                self.real_chrome,
                self.stealth,
                self.hide_canvas,
                self.disable_webgl,
            )
        )
        # Convert the nested values back into dicts; an empty proxy mapping
        # collapses to None.
        self.launch_options["extra_http_headers"] = dict(
            self.launch_options["extra_http_headers"]
        )
        self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
        self.context_options = dict()
    else:
        # while `context_options` is left to be used when cdp mode is enabled
        self.launch_options = dict()
        self.context_options = dict(
            _context_kwargs(
                self.proxy,
                self.locale,
                headers,
                self.useragent,
                self.stealth,
            )
        )
        # Same normalisation as in the launch branch.
        self.context_options["extra_http_headers"] = dict(
            self.context_options["extra_http_headers"]
        )
        self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
|
| 230 |
|
| 231 |
def __create__(self):
|
| 232 |
"""Create a browser for this instance and context."""
|
|
|
|
| 393 |
|
| 394 |
|
| 395 |
class AsyncDynamicSession(DynamicSession):
|
| 396 |
+
"""An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""
|
| 397 |
|
| 398 |
def __init__(
|
| 399 |
self,
|