Karim shoair committed on
Commit ·
1803348
1
Parent(s): a7eadeb
refactor: internal API changes to be easily used as indicators for spiders
Browse files
scrapling/engines/_browsers/_base.py
CHANGED
|
@@ -37,14 +37,14 @@ class SyncSession:
|
|
| 37 |
self._max_wait_for_page = 60
|
| 38 |
self.playwright: Playwright | Any = None
|
| 39 |
self.context: BrowserContext | Any = None
|
| 40 |
-
self.
|
| 41 |
|
| 42 |
def start(self):
|
| 43 |
pass
|
| 44 |
|
| 45 |
def close(self): # pragma: no cover
|
| 46 |
"""Close all resources"""
|
| 47 |
-
if self.
|
| 48 |
return
|
| 49 |
|
| 50 |
if self.context:
|
|
@@ -55,7 +55,7 @@ class SyncSession:
|
|
| 55 |
self.playwright.stop()
|
| 56 |
self.playwright = None # pyright: ignore
|
| 57 |
|
| 58 |
-
self.
|
| 59 |
|
| 60 |
def __enter__(self):
|
| 61 |
self.start()
|
|
@@ -137,7 +137,7 @@ class AsyncSession:
|
|
| 137 |
self._max_wait_for_page = 60
|
| 138 |
self.playwright: AsyncPlaywright | Any = None
|
| 139 |
self.context: AsyncBrowserContext | Any = None
|
| 140 |
-
self.
|
| 141 |
self._lock = Lock()
|
| 142 |
|
| 143 |
async def start(self):
|
|
@@ -145,7 +145,7 @@ class AsyncSession:
|
|
| 145 |
|
| 146 |
async def close(self):
|
| 147 |
"""Close all resources"""
|
| 148 |
-
if self.
|
| 149 |
return
|
| 150 |
|
| 151 |
if self.context:
|
|
@@ -156,7 +156,7 @@ class AsyncSession:
|
|
| 156 |
await self.playwright.stop()
|
| 157 |
self.playwright = None # pyright: ignore
|
| 158 |
|
| 159 |
-
self.
|
| 160 |
|
| 161 |
async def __aenter__(self):
|
| 162 |
await self.start()
|
|
|
|
| 37 |
self._max_wait_for_page = 60
|
| 38 |
self.playwright: Playwright | Any = None
|
| 39 |
self.context: BrowserContext | Any = None
|
| 40 |
+
self._is_alive = False
|
| 41 |
|
| 42 |
def start(self):
|
| 43 |
pass
|
| 44 |
|
| 45 |
def close(self): # pragma: no cover
|
| 46 |
"""Close all resources"""
|
| 47 |
+
if not self._is_alive:
|
| 48 |
return
|
| 49 |
|
| 50 |
if self.context:
|
|
|
|
| 55 |
self.playwright.stop()
|
| 56 |
self.playwright = None # pyright: ignore
|
| 57 |
|
| 58 |
+
self._is_alive = False
|
| 59 |
|
| 60 |
def __enter__(self):
|
| 61 |
self.start()
|
|
|
|
| 137 |
self._max_wait_for_page = 60
|
| 138 |
self.playwright: AsyncPlaywright | Any = None
|
| 139 |
self.context: AsyncBrowserContext | Any = None
|
| 140 |
+
self._is_alive = False
|
| 141 |
self._lock = Lock()
|
| 142 |
|
| 143 |
async def start(self):
|
|
|
|
| 145 |
|
| 146 |
async def close(self):
|
| 147 |
"""Close all resources"""
|
| 148 |
+
if not self._is_alive: # pragma: no cover
|
| 149 |
return
|
| 150 |
|
| 151 |
if self.context:
|
|
|
|
| 156 |
await self.playwright.stop()
|
| 157 |
self.playwright = None # pyright: ignore
|
| 158 |
|
| 159 |
+
self._is_alive = False
|
| 160 |
|
| 161 |
async def __aenter__(self):
|
| 162 |
await self.start()
|
scrapling/engines/_browsers/_controllers.py
CHANGED
|
@@ -31,7 +31,6 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
|
|
| 31 |
"_max_wait_for_page",
|
| 32 |
"playwright",
|
| 33 |
"context",
|
| 34 |
-
"_closed",
|
| 35 |
)
|
| 36 |
|
| 37 |
def __init__(self, **kwargs: Unpack[PlaywrightSession]):
|
|
@@ -82,6 +81,8 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
|
|
| 82 |
|
| 83 |
if self._config.cookies: # pragma: no cover
|
| 84 |
self.context.add_cookies(self._config.cookies)
|
|
|
|
|
|
|
| 85 |
else:
|
| 86 |
raise RuntimeError("Session has been already started")
|
| 87 |
|
|
@@ -104,7 +105,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
|
|
| 104 |
:return: A `Response` object.
|
| 105 |
"""
|
| 106 |
params = _validate(kwargs, self, PlaywrightConfig)
|
| 107 |
-
if self.
|
| 108 |
raise RuntimeError("Context manager has been closed")
|
| 109 |
|
| 110 |
referer = (
|
|
@@ -211,6 +212,8 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
|
| 211 |
|
| 212 |
if self._config.cookies:
|
| 213 |
await self.context.add_cookies(self._config.cookies) # pyright: ignore
|
|
|
|
|
|
|
| 214 |
else:
|
| 215 |
raise RuntimeError("Session has been already started")
|
| 216 |
|
|
@@ -234,7 +237,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
|
| 234 |
"""
|
| 235 |
params = _validate(kwargs, self, PlaywrightConfig)
|
| 236 |
|
| 237 |
-
if self.
|
| 238 |
raise RuntimeError("Context manager has been closed")
|
| 239 |
|
| 240 |
referer = (
|
|
|
|
| 31 |
"_max_wait_for_page",
|
| 32 |
"playwright",
|
| 33 |
"context",
|
|
|
|
| 34 |
)
|
| 35 |
|
| 36 |
def __init__(self, **kwargs: Unpack[PlaywrightSession]):
|
|
|
|
| 81 |
|
| 82 |
if self._config.cookies: # pragma: no cover
|
| 83 |
self.context.add_cookies(self._config.cookies)
|
| 84 |
+
|
| 85 |
+
self._is_alive = True
|
| 86 |
else:
|
| 87 |
raise RuntimeError("Session has been already started")
|
| 88 |
|
|
|
|
| 105 |
:return: A `Response` object.
|
| 106 |
"""
|
| 107 |
params = _validate(kwargs, self, PlaywrightConfig)
|
| 108 |
+
if not self._is_alive: # pragma: no cover
|
| 109 |
raise RuntimeError("Context manager has been closed")
|
| 110 |
|
| 111 |
referer = (
|
|
|
|
| 212 |
|
| 213 |
if self._config.cookies:
|
| 214 |
await self.context.add_cookies(self._config.cookies) # pyright: ignore
|
| 215 |
+
|
| 216 |
+
self._is_alive = True
|
| 217 |
else:
|
| 218 |
raise RuntimeError("Session has been already started")
|
| 219 |
|
|
|
|
| 237 |
"""
|
| 238 |
params = _validate(kwargs, self, PlaywrightConfig)
|
| 239 |
|
| 240 |
+
if not self._is_alive: # pragma: no cover
|
| 241 |
raise RuntimeError("Context manager has been closed")
|
| 242 |
|
| 243 |
referer = (
|
scrapling/engines/_browsers/_stealth.py
CHANGED
|
@@ -39,7 +39,6 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 39 |
"_max_wait_for_page",
|
| 40 |
"playwright",
|
| 41 |
"context",
|
| 42 |
-
"_closed",
|
| 43 |
)
|
| 44 |
|
| 45 |
def __init__(self, **kwargs: Unpack[StealthSession]):
|
|
@@ -191,7 +190,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 191 |
:return: A `Response` object.
|
| 192 |
"""
|
| 193 |
params = _validate(kwargs, self, StealthConfig)
|
| 194 |
-
if self.
|
| 195 |
raise RuntimeError("Context manager has been closed")
|
| 196 |
|
| 197 |
referer = (
|
|
@@ -404,7 +403,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 404 |
"""
|
| 405 |
params = _validate(kwargs, self, StealthConfig)
|
| 406 |
|
| 407 |
-
if self.
|
| 408 |
raise RuntimeError("Context manager has been closed")
|
| 409 |
|
| 410 |
referer = (
|
|
|
|
| 39 |
"_max_wait_for_page",
|
| 40 |
"playwright",
|
| 41 |
"context",
|
|
|
|
| 42 |
)
|
| 43 |
|
| 44 |
def __init__(self, **kwargs: Unpack[StealthSession]):
|
|
|
|
| 190 |
:return: A `Response` object.
|
| 191 |
"""
|
| 192 |
params = _validate(kwargs, self, StealthConfig)
|
| 193 |
+
if not self._is_alive: # pragma: no cover
|
| 194 |
raise RuntimeError("Context manager has been closed")
|
| 195 |
|
| 196 |
referer = (
|
|
|
|
| 403 |
"""
|
| 404 |
params = _validate(kwargs, self, StealthConfig)
|
| 405 |
|
| 406 |
+
if not self._is_alive: # pragma: no cover
|
| 407 |
raise RuntimeError("Context manager has been closed")
|
| 408 |
|
| 409 |
referer = (
|
scrapling/engines/static.py
CHANGED
|
@@ -62,6 +62,7 @@ class _ConfigurationLogic(ABC):
|
|
| 62 |
"_default_cert",
|
| 63 |
"_default_http3",
|
| 64 |
"selector_config",
|
|
|
|
| 65 |
)
|
| 66 |
|
| 67 |
def __init__(self, **kwargs: Unpack[RequestsSession]):
|
|
@@ -80,6 +81,7 @@ class _ConfigurationLogic(ABC):
|
|
| 80 |
self._default_cert = kwargs.get("cert") or None
|
| 81 |
self._default_http3 = kwargs.get("http3", False)
|
| 82 |
self.selector_config = kwargs.get("selector_config") or {}
|
|
|
|
| 83 |
|
| 84 |
@staticmethod
|
| 85 |
def _get_param(kwargs: Dict, key: str, default: Any) -> Any:
|
|
@@ -183,10 +185,11 @@ class _SyncSessionLogic(_ConfigurationLogic):
|
|
| 183 |
|
| 184 |
def __enter__(self):
|
| 185 |
"""Creates and returns a new synchronous Fetcher Session"""
|
| 186 |
-
if self.
|
| 187 |
raise RuntimeError("This FetcherSession instance already has an active synchronous session.")
|
| 188 |
|
| 189 |
self._curl_session = CurlSession()
|
|
|
|
| 190 |
return self
|
| 191 |
|
| 192 |
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
@@ -201,7 +204,9 @@ class _SyncSessionLogic(_ConfigurationLogic):
|
|
| 201 |
self._curl_session.close()
|
| 202 |
self._curl_session = None
|
| 203 |
|
| 204 |
-
|
|
|
|
|
|
|
| 205 |
"""
|
| 206 |
Perform an HTTP request using the configured session.
|
| 207 |
"""
|
|
@@ -267,7 +272,7 @@ class _SyncSessionLogic(_ConfigurationLogic):
|
|
| 267 |
:return: A `Response` object.
|
| 268 |
"""
|
| 269 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 270 |
-
return self.
|
| 271 |
|
| 272 |
def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
|
| 273 |
"""
|
|
@@ -299,7 +304,7 @@ class _SyncSessionLogic(_ConfigurationLogic):
|
|
| 299 |
:return: A `Response` object.
|
| 300 |
"""
|
| 301 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 302 |
-
return self.
|
| 303 |
|
| 304 |
def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
|
| 305 |
"""
|
|
@@ -331,7 +336,7 @@ class _SyncSessionLogic(_ConfigurationLogic):
|
|
| 331 |
:return: A `Response` object.
|
| 332 |
"""
|
| 333 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 334 |
-
return self.
|
| 335 |
|
| 336 |
def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
|
| 337 |
"""
|
|
@@ -365,7 +370,7 @@ class _SyncSessionLogic(_ConfigurationLogic):
|
|
| 365 |
# Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
|
| 366 |
# But some websites accept it, it depends on the implementation used.
|
| 367 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 368 |
-
return self.
|
| 369 |
|
| 370 |
|
| 371 |
class _ASyncSessionLogic(_ConfigurationLogic):
|
|
@@ -377,10 +382,11 @@ class _ASyncSessionLogic(_ConfigurationLogic):
|
|
| 377 |
|
| 378 |
async def __aenter__(self): # pragma: no cover
|
| 379 |
"""Creates and returns a new asynchronous Session."""
|
| 380 |
-
if self.
|
| 381 |
raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")
|
| 382 |
|
| 383 |
self._async_curl_session = AsyncCurlSession()
|
|
|
|
| 384 |
return self
|
| 385 |
|
| 386 |
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
@@ -395,9 +401,9 @@ class _ASyncSessionLogic(_ConfigurationLogic):
|
|
| 395 |
await self._async_curl_session.close()
|
| 396 |
self._async_curl_session = None
|
| 397 |
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
) -> Response:
|
| 401 |
"""
|
| 402 |
Perform an HTTP request using the configured session.
|
| 403 |
"""
|
|
@@ -465,7 +471,7 @@ class _ASyncSessionLogic(_ConfigurationLogic):
|
|
| 465 |
:return: A `Response` object.
|
| 466 |
"""
|
| 467 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 468 |
-
return self.
|
| 469 |
|
| 470 |
def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
|
| 471 |
"""
|
|
@@ -497,7 +503,7 @@ class _ASyncSessionLogic(_ConfigurationLogic):
|
|
| 497 |
:return: A `Response` object.
|
| 498 |
"""
|
| 499 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 500 |
-
return self.
|
| 501 |
|
| 502 |
def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
|
| 503 |
"""
|
|
@@ -529,7 +535,7 @@ class _ASyncSessionLogic(_ConfigurationLogic):
|
|
| 529 |
:return: A `Response` object.
|
| 530 |
"""
|
| 531 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 532 |
-
return self.
|
| 533 |
|
| 534 |
def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
|
| 535 |
"""
|
|
@@ -563,7 +569,7 @@ class _ASyncSessionLogic(_ConfigurationLogic):
|
|
| 563 |
# Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
|
| 564 |
# But some websites accept it, it depends on the implementation used.
|
| 565 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 566 |
-
return self.
|
| 567 |
|
| 568 |
|
| 569 |
class FetcherSession:
|
|
|
|
| 62 |
"_default_cert",
|
| 63 |
"_default_http3",
|
| 64 |
"selector_config",
|
| 65 |
+
"_is_alive",
|
| 66 |
)
|
| 67 |
|
| 68 |
def __init__(self, **kwargs: Unpack[RequestsSession]):
|
|
|
|
| 81 |
self._default_cert = kwargs.get("cert") or None
|
| 82 |
self._default_http3 = kwargs.get("http3", False)
|
| 83 |
self.selector_config = kwargs.get("selector_config") or {}
|
| 84 |
+
self._is_alive = False
|
| 85 |
|
| 86 |
@staticmethod
|
| 87 |
def _get_param(kwargs: Dict, key: str, default: Any) -> Any:
|
|
|
|
| 185 |
|
| 186 |
def __enter__(self):
|
| 187 |
"""Creates and returns a new synchronous Fetcher Session"""
|
| 188 |
+
if self._is_alive:
|
| 189 |
raise RuntimeError("This FetcherSession instance already has an active synchronous session.")
|
| 190 |
|
| 191 |
self._curl_session = CurlSession()
|
| 192 |
+
self._is_alive = True
|
| 193 |
return self
|
| 194 |
|
| 195 |
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
|
|
| 204 |
self._curl_session.close()
|
| 205 |
self._curl_session = None
|
| 206 |
|
| 207 |
+
self._is_alive = False
|
| 208 |
+
|
| 209 |
+
def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:
|
| 210 |
"""
|
| 211 |
Perform an HTTP request using the configured session.
|
| 212 |
"""
|
|
|
|
| 272 |
:return: A `Response` object.
|
| 273 |
"""
|
| 274 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 275 |
+
return self._make_request("GET", stealth=stealthy_headers, url=url, **kwargs)
|
| 276 |
|
| 277 |
def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
|
| 278 |
"""
|
|
|
|
| 304 |
:return: A `Response` object.
|
| 305 |
"""
|
| 306 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 307 |
+
return self._make_request("POST", stealth=stealthy_headers, url=url, **kwargs)
|
| 308 |
|
| 309 |
def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
|
| 310 |
"""
|
|
|
|
| 336 |
:return: A `Response` object.
|
| 337 |
"""
|
| 338 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 339 |
+
return self._make_request("PUT", stealth=stealthy_headers, url=url, **kwargs)
|
| 340 |
|
| 341 |
def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
|
| 342 |
"""
|
|
|
|
| 370 |
# Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
|
| 371 |
# But some websites accept it, it depends on the implementation used.
|
| 372 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 373 |
+
return self._make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs)
|
| 374 |
|
| 375 |
|
| 376 |
class _ASyncSessionLogic(_ConfigurationLogic):
|
|
|
|
| 382 |
|
| 383 |
async def __aenter__(self): # pragma: no cover
|
| 384 |
"""Creates and returns a new asynchronous Session."""
|
| 385 |
+
if self._is_alive:
|
| 386 |
raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")
|
| 387 |
|
| 388 |
self._async_curl_session = AsyncCurlSession()
|
| 389 |
+
self._is_alive = True
|
| 390 |
return self
|
| 391 |
|
| 392 |
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
|
|
| 401 |
await self._async_curl_session.close()
|
| 402 |
self._async_curl_session = None
|
| 403 |
|
| 404 |
+
self._is_alive = False
|
| 405 |
+
|
| 406 |
+
async def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:
|
| 407 |
"""
|
| 408 |
Perform an HTTP request using the configured session.
|
| 409 |
"""
|
|
|
|
| 471 |
:return: A `Response` object.
|
| 472 |
"""
|
| 473 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 474 |
+
return self._make_request("GET", stealth=stealthy_headers, url=url, **kwargs)
|
| 475 |
|
| 476 |
def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
|
| 477 |
"""
|
|
|
|
| 503 |
:return: A `Response` object.
|
| 504 |
"""
|
| 505 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 506 |
+
return self._make_request("POST", stealth=stealthy_headers, url=url, **kwargs)
|
| 507 |
|
| 508 |
def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
|
| 509 |
"""
|
|
|
|
| 535 |
:return: A `Response` object.
|
| 536 |
"""
|
| 537 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 538 |
+
return self._make_request("PUT", stealth=stealthy_headers, url=url, **kwargs)
|
| 539 |
|
| 540 |
def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
|
| 541 |
"""
|
|
|
|
| 569 |
# Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
|
| 570 |
# But some websites accept it, it depends on the implementation used.
|
| 571 |
stealthy_headers = kwargs.pop("stealthy_headers", None)
|
| 572 |
+
return self._make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs)
|
| 573 |
|
| 574 |
|
| 575 |
class FetcherSession:
|