Karim shoair committed on
Commit
1803348
·
1 Parent(s): a7eadeb

refactor: internal API changes to be easily used as indicators for spiders

Browse files
scrapling/engines/_browsers/_base.py CHANGED
@@ -37,14 +37,14 @@ class SyncSession:
37
  self._max_wait_for_page = 60
38
  self.playwright: Playwright | Any = None
39
  self.context: BrowserContext | Any = None
40
- self._closed = False
41
 
42
  def start(self):
43
  pass
44
 
45
  def close(self): # pragma: no cover
46
  """Close all resources"""
47
- if self._closed:
48
  return
49
 
50
  if self.context:
@@ -55,7 +55,7 @@ class SyncSession:
55
  self.playwright.stop()
56
  self.playwright = None # pyright: ignore
57
 
58
- self._closed = True
59
 
60
  def __enter__(self):
61
  self.start()
@@ -137,7 +137,7 @@ class AsyncSession:
137
  self._max_wait_for_page = 60
138
  self.playwright: AsyncPlaywright | Any = None
139
  self.context: AsyncBrowserContext | Any = None
140
- self._closed = False
141
  self._lock = Lock()
142
 
143
  async def start(self):
@@ -145,7 +145,7 @@ class AsyncSession:
145
 
146
  async def close(self):
147
  """Close all resources"""
148
- if self._closed: # pragma: no cover
149
  return
150
 
151
  if self.context:
@@ -156,7 +156,7 @@ class AsyncSession:
156
  await self.playwright.stop()
157
  self.playwright = None # pyright: ignore
158
 
159
- self._closed = True
160
 
161
  async def __aenter__(self):
162
  await self.start()
 
37
  self._max_wait_for_page = 60
38
  self.playwright: Playwright | Any = None
39
  self.context: BrowserContext | Any = None
40
+ self._is_alive = False
41
 
42
  def start(self):
43
  pass
44
 
45
  def close(self): # pragma: no cover
46
  """Close all resources"""
47
+ if not self._is_alive:
48
  return
49
 
50
  if self.context:
 
55
  self.playwright.stop()
56
  self.playwright = None # pyright: ignore
57
 
58
+ self._is_alive = False
59
 
60
  def __enter__(self):
61
  self.start()
 
137
  self._max_wait_for_page = 60
138
  self.playwright: AsyncPlaywright | Any = None
139
  self.context: AsyncBrowserContext | Any = None
140
+ self._is_alive = False
141
  self._lock = Lock()
142
 
143
  async def start(self):
 
145
 
146
  async def close(self):
147
  """Close all resources"""
148
+ if not self._is_alive: # pragma: no cover
149
  return
150
 
151
  if self.context:
 
156
  await self.playwright.stop()
157
  self.playwright = None # pyright: ignore
158
 
159
+ self._is_alive = False
160
 
161
  async def __aenter__(self):
162
  await self.start()
scrapling/engines/_browsers/_controllers.py CHANGED
@@ -31,7 +31,6 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
31
  "_max_wait_for_page",
32
  "playwright",
33
  "context",
34
- "_closed",
35
  )
36
 
37
  def __init__(self, **kwargs: Unpack[PlaywrightSession]):
@@ -82,6 +81,8 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
82
 
83
  if self._config.cookies: # pragma: no cover
84
  self.context.add_cookies(self._config.cookies)
 
 
85
  else:
86
  raise RuntimeError("Session has been already started")
87
 
@@ -104,7 +105,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
104
  :return: A `Response` object.
105
  """
106
  params = _validate(kwargs, self, PlaywrightConfig)
107
- if self._closed: # pragma: no cover
108
  raise RuntimeError("Context manager has been closed")
109
 
110
  referer = (
@@ -211,6 +212,8 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
211
 
212
  if self._config.cookies:
213
  await self.context.add_cookies(self._config.cookies) # pyright: ignore
 
 
214
  else:
215
  raise RuntimeError("Session has been already started")
216
 
@@ -234,7 +237,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
234
  """
235
  params = _validate(kwargs, self, PlaywrightConfig)
236
 
237
- if self._closed: # pragma: no cover
238
  raise RuntimeError("Context manager has been closed")
239
 
240
  referer = (
 
31
  "_max_wait_for_page",
32
  "playwright",
33
  "context",
 
34
  )
35
 
36
  def __init__(self, **kwargs: Unpack[PlaywrightSession]):
 
81
 
82
  if self._config.cookies: # pragma: no cover
83
  self.context.add_cookies(self._config.cookies)
84
+
85
+ self._is_alive = True
86
  else:
87
  raise RuntimeError("Session has been already started")
88
 
 
105
  :return: A `Response` object.
106
  """
107
  params = _validate(kwargs, self, PlaywrightConfig)
108
+ if not self._is_alive: # pragma: no cover
109
  raise RuntimeError("Context manager has been closed")
110
 
111
  referer = (
 
212
 
213
  if self._config.cookies:
214
  await self.context.add_cookies(self._config.cookies) # pyright: ignore
215
+
216
+ self._is_alive = True
217
  else:
218
  raise RuntimeError("Session has been already started")
219
 
 
237
  """
238
  params = _validate(kwargs, self, PlaywrightConfig)
239
 
240
+ if not self._is_alive: # pragma: no cover
241
  raise RuntimeError("Context manager has been closed")
242
 
243
  referer = (
scrapling/engines/_browsers/_stealth.py CHANGED
@@ -39,7 +39,6 @@ class StealthySession(SyncSession, StealthySessionMixin):
39
  "_max_wait_for_page",
40
  "playwright",
41
  "context",
42
- "_closed",
43
  )
44
 
45
  def __init__(self, **kwargs: Unpack[StealthSession]):
@@ -191,7 +190,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
191
  :return: A `Response` object.
192
  """
193
  params = _validate(kwargs, self, StealthConfig)
194
- if self._closed: # pragma: no cover
195
  raise RuntimeError("Context manager has been closed")
196
 
197
  referer = (
@@ -404,7 +403,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
404
  """
405
  params = _validate(kwargs, self, StealthConfig)
406
 
407
- if self._closed: # pragma: no cover
408
  raise RuntimeError("Context manager has been closed")
409
 
410
  referer = (
 
39
  "_max_wait_for_page",
40
  "playwright",
41
  "context",
 
42
  )
43
 
44
  def __init__(self, **kwargs: Unpack[StealthSession]):
 
190
  :return: A `Response` object.
191
  """
192
  params = _validate(kwargs, self, StealthConfig)
193
+ if not self._is_alive: # pragma: no cover
194
  raise RuntimeError("Context manager has been closed")
195
 
196
  referer = (
 
403
  """
404
  params = _validate(kwargs, self, StealthConfig)
405
 
406
+ if not self._is_alive: # pragma: no cover
407
  raise RuntimeError("Context manager has been closed")
408
 
409
  referer = (
scrapling/engines/static.py CHANGED
@@ -62,6 +62,7 @@ class _ConfigurationLogic(ABC):
62
  "_default_cert",
63
  "_default_http3",
64
  "selector_config",
 
65
  )
66
 
67
  def __init__(self, **kwargs: Unpack[RequestsSession]):
@@ -80,6 +81,7 @@ class _ConfigurationLogic(ABC):
80
  self._default_cert = kwargs.get("cert") or None
81
  self._default_http3 = kwargs.get("http3", False)
82
  self.selector_config = kwargs.get("selector_config") or {}
 
83
 
84
  @staticmethod
85
  def _get_param(kwargs: Dict, key: str, default: Any) -> Any:
@@ -183,10 +185,11 @@ class _SyncSessionLogic(_ConfigurationLogic):
183
 
184
  def __enter__(self):
185
  """Creates and returns a new synchronous Fetcher Session"""
186
- if self._curl_session:
187
  raise RuntimeError("This FetcherSession instance already has an active synchronous session.")
188
 
189
  self._curl_session = CurlSession()
 
190
  return self
191
 
192
  def __exit__(self, exc_type, exc_val, exc_tb):
@@ -201,7 +204,9 @@ class _SyncSessionLogic(_ConfigurationLogic):
201
  self._curl_session.close()
202
  self._curl_session = None
203
 
204
- def __make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:
 
 
205
  """
206
  Perform an HTTP request using the configured session.
207
  """
@@ -267,7 +272,7 @@ class _SyncSessionLogic(_ConfigurationLogic):
267
  :return: A `Response` object.
268
  """
269
  stealthy_headers = kwargs.pop("stealthy_headers", None)
270
- return self.__make_request("GET", stealth=stealthy_headers, url=url, **kwargs)
271
 
272
  def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
273
  """
@@ -299,7 +304,7 @@ class _SyncSessionLogic(_ConfigurationLogic):
299
  :return: A `Response` object.
300
  """
301
  stealthy_headers = kwargs.pop("stealthy_headers", None)
302
- return self.__make_request("POST", stealth=stealthy_headers, url=url, **kwargs)
303
 
304
  def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
305
  """
@@ -331,7 +336,7 @@ class _SyncSessionLogic(_ConfigurationLogic):
331
  :return: A `Response` object.
332
  """
333
  stealthy_headers = kwargs.pop("stealthy_headers", None)
334
- return self.__make_request("PUT", stealth=stealthy_headers, url=url, **kwargs)
335
 
336
  def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
337
  """
@@ -365,7 +370,7 @@ class _SyncSessionLogic(_ConfigurationLogic):
365
  # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
366
  # But some websites accept it, it depends on the implementation used.
367
  stealthy_headers = kwargs.pop("stealthy_headers", None)
368
- return self.__make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs)
369
 
370
 
371
  class _ASyncSessionLogic(_ConfigurationLogic):
@@ -377,10 +382,11 @@ class _ASyncSessionLogic(_ConfigurationLogic):
377
 
378
  async def __aenter__(self): # pragma: no cover
379
  """Creates and returns a new asynchronous Session."""
380
- if self._async_curl_session:
381
  raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")
382
 
383
  self._async_curl_session = AsyncCurlSession()
 
384
  return self
385
 
386
  async def __aexit__(self, exc_type, exc_val, exc_tb):
@@ -395,9 +401,9 @@ class _ASyncSessionLogic(_ConfigurationLogic):
395
  await self._async_curl_session.close()
396
  self._async_curl_session = None
397
 
398
- async def __make_request(
399
- self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs
400
- ) -> Response:
401
  """
402
  Perform an HTTP request using the configured session.
403
  """
@@ -465,7 +471,7 @@ class _ASyncSessionLogic(_ConfigurationLogic):
465
  :return: A `Response` object.
466
  """
467
  stealthy_headers = kwargs.pop("stealthy_headers", None)
468
- return self.__make_request("GET", stealth=stealthy_headers, url=url, **kwargs)
469
 
470
  def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
471
  """
@@ -497,7 +503,7 @@ class _ASyncSessionLogic(_ConfigurationLogic):
497
  :return: A `Response` object.
498
  """
499
  stealthy_headers = kwargs.pop("stealthy_headers", None)
500
- return self.__make_request("POST", stealth=stealthy_headers, url=url, **kwargs)
501
 
502
  def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
503
  """
@@ -529,7 +535,7 @@ class _ASyncSessionLogic(_ConfigurationLogic):
529
  :return: A `Response` object.
530
  """
531
  stealthy_headers = kwargs.pop("stealthy_headers", None)
532
- return self.__make_request("PUT", stealth=stealthy_headers, url=url, **kwargs)
533
 
534
  def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
535
  """
@@ -563,7 +569,7 @@ class _ASyncSessionLogic(_ConfigurationLogic):
563
  # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
564
  # But some websites accept it, it depends on the implementation used.
565
  stealthy_headers = kwargs.pop("stealthy_headers", None)
566
- return self.__make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs)
567
 
568
 
569
  class FetcherSession:
 
62
  "_default_cert",
63
  "_default_http3",
64
  "selector_config",
65
+ "_is_alive",
66
  )
67
 
68
  def __init__(self, **kwargs: Unpack[RequestsSession]):
 
81
  self._default_cert = kwargs.get("cert") or None
82
  self._default_http3 = kwargs.get("http3", False)
83
  self.selector_config = kwargs.get("selector_config") or {}
84
+ self._is_alive = False
85
 
86
  @staticmethod
87
  def _get_param(kwargs: Dict, key: str, default: Any) -> Any:
 
185
 
186
  def __enter__(self):
187
  """Creates and returns a new synchronous Fetcher Session"""
188
+ if self._is_alive:
189
  raise RuntimeError("This FetcherSession instance already has an active synchronous session.")
190
 
191
  self._curl_session = CurlSession()
192
+ self._is_alive = True
193
  return self
194
 
195
  def __exit__(self, exc_type, exc_val, exc_tb):
 
204
  self._curl_session.close()
205
  self._curl_session = None
206
 
207
+ self._is_alive = False
208
+
209
+ def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:
210
  """
211
  Perform an HTTP request using the configured session.
212
  """
 
272
  :return: A `Response` object.
273
  """
274
  stealthy_headers = kwargs.pop("stealthy_headers", None)
275
+ return self._make_request("GET", stealth=stealthy_headers, url=url, **kwargs)
276
 
277
  def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
278
  """
 
304
  :return: A `Response` object.
305
  """
306
  stealthy_headers = kwargs.pop("stealthy_headers", None)
307
+ return self._make_request("POST", stealth=stealthy_headers, url=url, **kwargs)
308
 
309
  def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
310
  """
 
336
  :return: A `Response` object.
337
  """
338
  stealthy_headers = kwargs.pop("stealthy_headers", None)
339
+ return self._make_request("PUT", stealth=stealthy_headers, url=url, **kwargs)
340
 
341
  def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
342
  """
 
370
  # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
371
  # But some websites accept it, it depends on the implementation used.
372
  stealthy_headers = kwargs.pop("stealthy_headers", None)
373
+ return self._make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs)
374
 
375
 
376
  class _ASyncSessionLogic(_ConfigurationLogic):
 
382
 
383
  async def __aenter__(self): # pragma: no cover
384
  """Creates and returns a new asynchronous Session."""
385
+ if self._is_alive:
386
  raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")
387
 
388
  self._async_curl_session = AsyncCurlSession()
389
+ self._is_alive = True
390
  return self
391
 
392
  async def __aexit__(self, exc_type, exc_val, exc_tb):
 
401
  await self._async_curl_session.close()
402
  self._async_curl_session = None
403
 
404
+ self._is_alive = False
405
+
406
+ async def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:
407
  """
408
  Perform an HTTP request using the configured session.
409
  """
 
471
  :return: A `Response` object.
472
  """
473
  stealthy_headers = kwargs.pop("stealthy_headers", None)
474
+ return self._make_request("GET", stealth=stealthy_headers, url=url, **kwargs)
475
 
476
  def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
477
  """
 
503
  :return: A `Response` object.
504
  """
505
  stealthy_headers = kwargs.pop("stealthy_headers", None)
506
+ return self._make_request("POST", stealth=stealthy_headers, url=url, **kwargs)
507
 
508
  def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
509
  """
 
535
  :return: A `Response` object.
536
  """
537
  stealthy_headers = kwargs.pop("stealthy_headers", None)
538
+ return self._make_request("PUT", stealth=stealthy_headers, url=url, **kwargs)
539
 
540
  def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
541
  """
 
569
  # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
570
  # But some websites accept it, it depends on the implementation used.
571
  stealthy_headers = kwargs.pop("stealthy_headers", None)
572
+ return self._make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs)
573
 
574
 
575
  class FetcherSession: